[med-svn] [python-pysam] 02/21: Imported Upstream version 0.9.0+ds
Afif Elghraoui
afif at moszumanska.debian.org
Sun Mar 6 07:57:38 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository python-pysam.
commit 364e45ae3dbbca99801346c2e0bb976c5c8d2299
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Sat Mar 5 13:41:37 2016 -0800
Imported Upstream version 0.9.0+ds
---
.gitignore | 16 +
.hgignore | 7 -
.hgtags | 4 -
.travis.yml | 26 +-
INSTALL | 2 +
MANIFEST.in | 10 +
README.rst | 22 +-
THANKS | 4 -
bcftools/HMM.c | 435 ++++
bcftools/HMM.c.pysam.c | 437 +++++
bcftools/HMM.h | 115 ++
bcftools/bcftools.h | 71 +
bcftools/call.h | 126 ++
bcftools/ccall.c | 343 ++++
bcftools/ccall.c.pysam.c | 345 ++++
bcftools/consensus.c | 658 +++++++
bcftools/consensus.c.pysam.c | 660 +++++++
bcftools/convert.c | 1056 ++++++++++
bcftools/convert.c.pysam.c | 1058 ++++++++++
samtools/samtools.h => bcftools/convert.h | 41 +-
bcftools/em.c | 259 +++
bcftools/em.c.pysam.c | 261 +++
bcftools/filter.c | 1788 +++++++++++++++++
bcftools/filter.c.pysam.c | 1790 +++++++++++++++++
bcftools/filter.h | 52 +
bcftools/gvcf.c | 227 +++
bcftools/gvcf.c.pysam.c | 229 +++
bcftools/gvcf.h | 41 +
bcftools/khash_str2str.h | 89 +
bcftools/kmin.c | 209 ++
bcftools/kmin.c.pysam.c | 211 ++
bcftools/kmin.h | 46 +
bcftools/main.c | 264 +++
bcftools/main.c.pysam.c | 266 +++
bcftools/mcall.c | 1537 +++++++++++++++
bcftools/mcall.c.pysam.c | 1539 +++++++++++++++
bcftools/ploidy.c | 254 +++
bcftools/ploidy.c.pysam.c | 256 +++
bcftools/ploidy.h | 129 ++
bcftools/prob1.c | 529 +++++
bcftools/prob1.c.pysam.c | 531 +++++
bcftools/prob1.h | 93 +
bcftools/pysam.h | 5 +
bcftools/rbuf.h | 201 ++
bcftools/tabix.c | 129 ++
bcftools/tabix.c.pysam.c | 131 ++
bcftools/tsv2vcf.c | 121 ++
bcftools/tsv2vcf.c.pysam.c | 123 ++
bcftools/tsv2vcf.h | 85 +
bcftools/vcfannotate.c | 1760 +++++++++++++++++
bcftools/vcfannotate.c.pysam.c | 1762 +++++++++++++++++
bcftools/vcfcall.c | 822 ++++++++
bcftools/vcfcall.c.pysam.c | 824 ++++++++
bcftools/vcfcnv.c | 1386 +++++++++++++
bcftools/vcfcnv.c.pysam.c | 1388 +++++++++++++
bcftools/vcfconcat.c | 662 +++++++
bcftools/vcfconcat.c.pysam.c | 664 +++++++
bcftools/vcfconvert.c | 1448 ++++++++++++++
bcftools/vcfconvert.c.pysam.c | 1450 ++++++++++++++
bcftools/vcffilter.c | 568 ++++++
bcftools/vcffilter.c.pysam.c | 570 ++++++
bcftools/vcfgtcheck.c | 804 ++++++++
bcftools/vcfgtcheck.c.pysam.c | 806 ++++++++
bcftools/vcfindex.c | 240 +++
bcftools/vcfindex.c.pysam.c | 242 +++
bcftools/vcfisec.c | 596 ++++++
bcftools/vcfisec.c.pysam.c | 598 ++++++
bcftools/vcfmerge.c | 2067 +++++++++++++++++++
bcftools/vcfmerge.c.pysam.c | 2069 ++++++++++++++++++++
bcftools/vcfnorm.c | 1810 +++++++++++++++++
bcftools/vcfnorm.c.pysam.c | 1812 +++++++++++++++++
bcftools/vcfplugin.c | 614 ++++++
bcftools/vcfplugin.c.pysam.c | 616 ++++++
bcftools/vcfquery.c | 373 ++++
bcftools/vcfquery.c.pysam.c | 375 ++++
bcftools/vcfroh.c | 794 ++++++++
bcftools/vcfroh.c.pysam.c | 796 ++++++++
bcftools/vcfsom.c | 715 +++++++
bcftools/vcfsom.c.pysam.c | 717 +++++++
bcftools/vcfstats.c | 1590 +++++++++++++++
bcftools/vcfstats.c.pysam.c | 1592 +++++++++++++++
bcftools/vcfview.c | 746 +++++++
bcftools/vcfview.c.pysam.c | 748 +++++++
bcftools/vcmp.c | 132 ++
bcftools/vcmp.c.pysam.c | 134 ++
bcftools/vcmp.h | 62 +
samtools/samtools.h => bcftools/version.c | 40 +-
samtools/samtools.h => bcftools/version.c.pysam.c | 42 +-
bcftools/version.h | 1 +
ci/conda-recipe/build.sh | 8 +
ci/conda-recipe/meta.yaml | 29 +
install-CGAT-tools.sh => ci/install-CGAT-tools.sh | 62 +-
cy_build.py | 87 +
doc/api.rst | 30 +-
doc/conf.py | 76 +-
doc/faq.rst | 42 +-
doc/index.rst | 29 +-
doc/installation.rst | 65 +
doc/release.rst | 67 +
doc/usage.rst | 196 +-
import.py | 140 ++
pysam/__init__.py | 202 +-
pysam/bcftools.py | 24 +
pysam/calignedsegment.pxd | 7 +-
pysam/calignedsegment.pyx | 527 ++---
pysam/calignmentfile.pyx | 314 +--
pysam/cbcf.pxd | 9 +-
pysam/cbcf.pyx | 1757 +++++++++++++----
pysam/cfaidx.pxd | 1 +
pysam/cfaidx.pyx | 101 +-
pysam/chtslib.pxd | 286 ++-
pysam/csamtools.pxd | 8 -
pysam/csamtools.pyx | 146 --
pysam/ctabix.pxd | 6 +-
pysam/ctabix.pyx | 90 +-
pysam/cutils.pxd | 16 +-
pysam/cutils.pyx | 243 ++-
pysam/cvcf.pyx | 3 +
pysam/htslib_util.h | 1 +
pysam/pysam_util.c | 106 +-
pysam/pysam_util.h | 2 +
pysam/samtools.py | 46 +
pysam/utils.py | 88 +
pysam/version.py | 6 +-
run_tests_travis.sh | 113 ++
samtools/bam.c | 67 +-
samtools/bam.c.pysam.c | 67 +-
samtools/bam.h | 7 +-
samtools/bam2bcf.c | 75 +-
samtools/bam2bcf.c.pysam.c | 75 +-
samtools/bam2bcf.h | 13 +-
samtools/bam2bcf_indel.c | 14 +-
samtools/bam2bcf_indel.c.pysam.c | 14 +-
samtools/bam2depth.c | 142 +-
samtools/bam2depth.c.pysam.c | 142 +-
samtools/bam_addrprg.c | 476 +++++
samtools/bam_addrprg.c.pysam.c | 478 +++++
samtools/bam_aux.c | 18 +-
samtools/bam_aux.c.pysam.c | 18 +-
samtools/bam_cat.c | 432 +++-
samtools/bam_cat.c.pysam.c | 432 +++-
samtools/bam_index.c | 29 +-
samtools/bam_index.c.pysam.c | 29 +-
samtools/bam_mate.c | 58 +-
samtools/bam_mate.c.pysam.c | 58 +-
samtools/bam_md.c | 173 +-
samtools/bam_md.c.pysam.c | 173 +-
samtools/bam_plcmd.c | 232 ++-
samtools/bam_plcmd.c.pysam.c | 232 ++-
samtools/bam_quickcheck.c | 134 ++
samtools/bam_quickcheck.c.pysam.c | 136 ++
samtools/bam_reheader.c | 437 ++++-
samtools/bam_reheader.c.pysam.c | 438 ++++-
samtools/bam_rmdup.c | 94 +-
samtools/bam_rmdup.c.pysam.c | 94 +-
samtools/bam_rmdupse.c | 31 +-
samtools/bam_rmdupse.c.pysam.c | 31 +-
samtools/bam_sort.c | 1472 ++++++++++----
samtools/bam_sort.c.pysam.c | 1472 ++++++++++----
samtools/bam_split.c | 99 +-
samtools/bam_split.c.pysam.c | 99 +-
samtools/bam_stat.c | 66 +-
samtools/bam_stat.c.pysam.c | 66 +-
samtools/bam_tview.c | 88 +-
samtools/bam_tview.c.pysam.c | 88 +-
samtools/bam_tview.h | 6 +-
samtools/bam_tview_curses.c | 112 +-
samtools/bam_tview_curses.c.pysam.c | 112 +-
samtools/bam_tview_html.c | 10 +-
samtools/bam_tview_html.c.pysam.c | 10 +-
samtools/bamshuf.c | 112 +-
samtools/bamshuf.c.pysam.c | 112 +-
samtools/bamtk.c | 227 +++
samtools/bamtk.c.pysam.c | 229 +++
samtools/bedcov.c | 31 +-
samtools/bedcov.c.pysam.c | 31 +-
samtools/cut_target.c | 70 +-
samtools/cut_target.c.pysam.c | 70 +-
samtools/dict.c | 151 ++
samtools/dict.c.pysam.c | 153 ++
samtools/errmod.c | 4 +-
samtools/errmod.c.pysam.c | 4 +-
samtools/misc/md5.c | 298 ---
samtools/misc/md5.c.pysam.c | 300 ---
samtools/misc/md5.h | 57 -
samtools/padding.c | 287 ++-
samtools/padding.c.pysam.c | 287 ++-
samtools/phase.c | 44 +-
samtools/phase.c.pysam.c | 44 +-
samtools/sam.c | 29 +-
samtools/sam.c.pysam.c | 29 +-
samtools/sam.h | 26 +-
samtools/sam_opts.c | 153 ++
samtools/sam_opts.c.pysam.c | 155 ++
samtools/sam_opts.h | 99 +
samtools/sam_view.c | 795 +++++---
samtools/sam_view.c.pysam.c | 795 +++++---
samtools/samtools.h | 12 +-
samtools/stats.c | 1075 +++++-----
samtools/stats.c.pysam.c | 1075 +++++-----
samtools/test/merge/test_bam_translate.c | 35 +-
samtools/test/merge/test_bam_translate.c.pysam.c | 35 +-
samtools/test/merge/test_pretty_header.c | 87 -
samtools/test/merge/test_pretty_header.c.pysam.c | 89 -
samtools/test/merge/test_trans_tbl_init.c | 415 ++--
samtools/test/merge/test_trans_tbl_init.c.pysam.c | 415 ++--
samtools/test/split/test_count_rg.c | 9 +-
samtools/test/split/test_count_rg.c.pysam.c | 9 +-
samtools/test/split/test_expand_format_string.c | 13 +-
.../test/split/test_expand_format_string.c.pysam.c | 13 +-
samtools/test/split/test_filter_header_rg.c | 15 +-
.../test/split/test_filter_header_rg.c.pysam.c | 15 +-
samtools/test/split/test_parse_args.c | 32 +-
samtools/test/split/test_parse_args.c.pysam.c | 32 +-
samtools/version.h | 2 +-
setup.cfg | 6 -
setup.py | 531 ++---
tests/AlignedSegment_test.py | 94 +-
tests/AlignmentFile_test.py | 288 ++-
tests/SamFile_test.py | 81 +-
tests/TestUtils.py | 55 +-
tests/VariantFile_test.py | 418 ++++
tests/cbcf_data/Makefile | 23 +
KNOWN_BUGS => tests/cbcf_data/example_empty.vcf | 0
tests/cbcf_data/example_vcf40.vcf | 24 +
tests/cbcf_data/example_vcf42.vcf | 24 +
tests/cbcf_data/example_vcf42_only_header.vcf | 19 +
tests/cbcf_data/example_vcf42_withcontigs.vcf | 27 +
tests/cbcf_data/missing_genotypes.vcf | 6 +
tests/faidx_test.py | 51 +
tests/pysam_data/Makefile | 22 +-
tests/pysam_data/ex1.vcf.gz | Bin 0 -> 16982 bytes
tests/pysam_data/ex1.vcf.gz.tbi | Bin 0 -> 5636 bytes
tests/pysam_data/example_aligned_pairs.sam | 81 +
tests/pysam_data/example_empty_with_header.sam | 1 +
tests/pysam_data/faidx_empty_seq.fq | 40 +
tests/pysam_data/test_mapped_unmapped.sam | 17 +
tests/pysam_data/test_query_position.sam | 9 +
tests/samtools_test.py | 535 +++--
tests/tabix_data/empty.bed.gz | Bin 0 -> 28 bytes
tests/tabix_data/empty.bed.gz.tbi | Bin 0 -> 75 bytes
tests/tabix_data/example_0v23.bed.gz | Bin 0 -> 819 bytes
tests/tabix_data/example_0v23.bed.gz.tbi | Bin 0 -> 244 bytes
tests/tabix_data/example_0v23.vcf.gz | Bin 0 -> 328 bytes
tests/tabix_data/example_0v23.vcf.gz.tbi | Bin 0 -> 155 bytes
tests/tabix_data/example_0v26.bed.gz | Bin 0 -> 819 bytes
tests/tabix_data/example_0v26.bed.gz.tbi | Bin 0 -> 244 bytes
tests/tabix_data/example_0v26.vcf.gz | Bin 0 -> 328 bytes
tests/tabix_data/example_0v26.vcf.gz.tbi | Bin 0 -> 155 bytes
tests/tabix_data/vcf/vcf_v42.vcf | 25 +
tests/tabix_test.py | 636 ++++--
251 files changed, 68626 insertions(+), 7105 deletions(-)
diff --git a/.gitignore b/.gitignore
index 4bd469f..598948d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,14 @@ tests/*.pyxbldc
tests/*.sam
tests/*.fai
tests/pysam_data
+tests/cbcf_data
+
+samtools/config.h
+htslib/config.status
+htslib/config.h
+htslib/config.log
+htslib/config.mk
+pysam/config.py
# cython files
pysam/TabProxies.c
@@ -20,6 +28,14 @@ pysam/csamtools.c
pysam/ctabix.c
pysam/cvcf.c
pysam/chtslib.c
+pysam/cutils.c
+pysam/calignedsegment.c
+pysam/calignmentfile.c
+pysam/cbcf.c
+pysam/cfaidx.c
+pysam/chtslib.c
+pysam/csamfile.c
+pysam/ctabixproxies.c
###### Generic python ignores below ######
diff --git a/.hgignore b/.hgignore
deleted file mode 100644
index 7669bcd..0000000
--- a/.hgignore
+++ /dev/null
@@ -1,7 +0,0 @@
-syntax: glob
-*~
-*#
-
-syntax: regexp
-/doc/_build/
-^/build
diff --git a/.hgtags b/.hgtags
deleted file mode 100644
index 18eb274..0000000
--- a/.hgtags
+++ /dev/null
@@ -1,4 +0,0 @@
-109bf83bbf339aef68ebd85535340511c958e8cc v0.1.1
-cbc81e01406d12faa21113da298dd3092f7442a4 v0.1.2
-ea373dc77c476185be8cae556e7b5ed453e6231c 0.2
-8aeaaf0d7ca171fc655208af00c82b9a8298d40c 0.2 release
diff --git a/.travis.yml b/.travis.yml
index fcb3cbe..1482ed7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,13 +1,25 @@
+os:
+ - linux
+ - osx
-language: python
+language: c
+sudo: required
-python:
- - "2.7"
- - "3.3"
- - "3.4"
+env:
+ matrix:
+ - CONDA_PY=2.7
+ - CONDA_PY=3.3
+ - CONDA_PY=3.4
+ - CONDA_PY=3.5
-script:
- - sudo ./install-CGAT-tools.sh --travis
+addons:
+ apt:
+ packages:
+ - gcc
+ - g++
+
+script:
+ - ./run_tests_travis.sh
notifications:
email:
diff --git a/INSTALL b/INSTALL
index 865daa7..30fe770 100644
--- a/INSTALL
+++ b/INSTALL
@@ -13,6 +13,8 @@ most of the modern Linux/Unix distributions. If you do not have this
library installed, you can still compile the rest of SAMtools by
manually modifying one line in Makefile.
+curl
+
Pysam requires Python (2.6 or greater) and Cython (0.22 or greater).
It has not been tested on many other platforms.
diff --git a/MANIFEST.in b/MANIFEST.in
index 9df7dae..7b1cbda 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -8,19 +8,29 @@ include COPYING
include INSTALL
include KNOWN_BUGS
include THANKS
+include cy_build.py
+include requirements.txt
include pysam/c*.pxd
include pysam/c*.pyx
include pysam/c*.c
include pysam/*.c
include pysam/*.h
+include samtools/configure
+include samtools/config.mk.in
+include samtools/config.h.in
include samtools/*.h
include samtools/*/*.h
include htslib/*.c
include htslib/*.h
+include htslib/configure
+include htslib/config.mk.in
+include htslib/config.h.in
include htslib/htslib/*.h
include htslib/cram/*.c
include htslib/cram/*.h
+include cy_build.py
include pysam.py
+include requirements.txt
# pysam tests
include tests/00README.txt
diff --git a/README.rst b/README.rst
index cf3c260..ab9e612 100644
--- a/README.rst
+++ b/README.rst
@@ -2,20 +2,18 @@
Pysam
=====
-.. image:: https://travis-ci.org/pysam-developers/pysam.svg
- :alt: pysam build status
+|build-status| |docs|
Pysam is a python module for reading and manipulating files in the
SAM/BAM format. The SAM/BAM format is a way to store efficiently large
numbers of alignments (`Li 2009`_), such as those routinely created by
next-generation sequencing methods.
-Pysam is a lightweight wrapper of the samtools_ C-API. Pysam also includes an
-interface for tabix_.
+Pysam is a lightweight wrapper of the samtools_ C-API. Pysam also
+includes an interface for tabix_.
-The latest version is available through
-`pypi <https://pypi.python.org/pypi/pysam>`_. To install, simply
-type::
+The latest version is available through `pypi
+<https://pypi.python.org/pypi/pysam>`_. To install, simply type::
pip install pysam
@@ -28,3 +26,13 @@ Questions and comments are very welcome and should be sent to the
.. _samtools: http://samtools.sourceforge.net/
.. _tabix: http://samtools.sourceforge.net/tabix.shtml
.. _Li 2009: http://www.ncbi.nlm.nih.gov/pubmed/19505943
+
+.. |build-status| image:: https://travis-ci.org/pysam-developers/pysam.svg
+ :alt: build status
+ :scale: 100%
+ :target: https://travis-ci.org/pysam-developers/pysam
+
+.. |docs| image:: https://readthedocs.org/projects/pysam/badge/?version=latest
+ :alt: Documentation Status
+ :scale: 100%
+ :target: https://pysam.readthedocs.org/en/latest/?badge=latest
diff --git a/THANKS b/THANKS
deleted file mode 100644
index a9fa15a..0000000
--- a/THANKS
+++ /dev/null
@@ -1,4 +0,0 @@
-We would like to thank Heng Li and the other samtools contributors for their support
-and their hard work. As a wrapper, pysam merely tries to make their code accessible
-to the python community - the heavy lifting has been done by the samtools developers.
-
diff --git a/bcftools/HMM.c b/bcftools/HMM.c
new file mode 100644
index 0000000..9196544
--- /dev/null
+++ b/bcftools/HMM.c
@@ -0,0 +1,435 @@
+/* The MIT License
+
+ Copyright (c) 2014-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <htslib/hts.h>
+#include "HMM.h"
+
+struct _hmm_t
+{
+ int nstates; // number of states
+
+ double *vprob, *vprob_tmp; // viterbi probs [nstates]
+ uint8_t *vpath; // viterbi path [nstates*nvpath]
+ double *bwd, *bwd_tmp; // bwd probs [nstates]
+ double *fwd; // fwd probs [nstates*(nfwd+1)]
+ int nvpath, nfwd;
+
+ int ntprob_arr; // number of pre-calculated tprob matrices
+ double *curr_tprob, *tmp; // Temporary arrays; curr_tprob is short lived, valid only for
+ // one site (that is, one step of Viterbi algorithm)
+ double *tprob_arr; // Array of transition matrices, precalculated to ntprob_arr
+ // positions. The first matrix is the initial tprob matrix
+ // set by hmm_init() or hmm_set_tprob()
+ set_tprob_f set_tprob; // Optional user function to set / modify transition probabilities
+ // at each site (one step of Viterbi algorithm)
+ void *set_tprob_data;
+ double *init_probs; // Initial state probabilities, NULL for uniform probs
+};
+
+uint8_t *hmm_get_viterbi_path(hmm_t *hmm) { return hmm->vpath; }
+double *hmm_get_tprob(hmm_t *hmm) { return hmm->tprob_arr; }
+int hmm_get_nstates(hmm_t *hmm) { return hmm->nstates; }
+double *hmm_get_fwd_bwd_prob(hmm_t *hmm) { return hmm->fwd; }
+
+static inline void multiply_matrix(int n, double *a, double *b, double *dst, double *tmp)
+{
+ double *out = dst;
+ if ( a==dst || b==dst )
+ out = tmp;
+
+ int i,j,k;
+ for (i=0; i<n; i++)
+ {
+ for (j=0; j<n; j++)
+ {
+ double val = 0;
+ for (k=0; k<n; k++) val += MAT(a,n,i,k)*MAT(b,n,k,j);
+ MAT(out,n,i,j) = val;
+ }
+ }
+ if ( out!=dst )
+ memcpy(dst,out,sizeof(double)*n*n);
+}
+
+hmm_t *hmm_init(int nstates, double *tprob, int ntprob)
+{
+ hmm_t *hmm = (hmm_t*) calloc(1,sizeof(hmm_t));
+ hmm->nstates = nstates;
+ hmm->curr_tprob = (double*) malloc(sizeof(double)*nstates*nstates);
+ hmm->tmp = (double*) malloc(sizeof(double)*nstates*nstates);
+
+ hmm_set_tprob(hmm, tprob, ntprob);
+
+ return hmm;
+}
+
+void hmm_init_states(hmm_t *hmm, double *probs)
+{
+ if ( !probs )
+ {
+ free(hmm->init_probs);
+ hmm->init_probs = NULL;
+ }
+
+ if ( !hmm->init_probs ) hmm->init_probs = (double*) malloc(sizeof(double)*hmm->nstates);
+ memcpy(hmm->init_probs,probs,sizeof(double)*hmm->nstates);
+}
+
+void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob)
+{
+ hmm->ntprob_arr = ntprob;
+ if ( ntprob<=0 ) ntprob = 1;
+
+ if ( !hmm->tprob_arr )
+ hmm->tprob_arr = (double*) malloc(sizeof(double)*hmm->nstates*hmm->nstates*ntprob);
+
+ memcpy(hmm->tprob_arr,tprob,sizeof(double)*hmm->nstates*hmm->nstates);
+
+ int i;
+ for (i=1; i<ntprob; i++)
+ multiply_matrix(hmm->nstates, hmm->tprob_arr, hmm->tprob_arr+(i-1)*hmm->nstates*hmm->nstates, hmm->tprob_arr+i*hmm->nstates*hmm->nstates, hmm->tmp);
+}
+
+void hmm_set_tprob_func(hmm_t *hmm, set_tprob_f set_tprob, void *data)
+{
+ hmm->set_tprob = set_tprob;
+ hmm->set_tprob_data = data;
+}
+
+static void _set_tprob(hmm_t *hmm, int pos_diff)
+{
+ assert( pos_diff>=0 );
+
+ int i, n;
+
+ n = hmm->ntprob_arr ? pos_diff % hmm->ntprob_arr : 0; // n-th precalculated matrix
+ memcpy(hmm->curr_tprob, hmm->tprob_arr+n*hmm->nstates*hmm->nstates, sizeof(*hmm->curr_tprob)*hmm->nstates*hmm->nstates);
+
+ if ( hmm->ntprob_arr > 0 )
+ {
+ n = pos_diff / hmm->ntprob_arr; // number of full blocks to jump
+ for (i=0; i<n; i++)
+ multiply_matrix(hmm->nstates, hmm->tprob_arr+(hmm->ntprob_arr-1)*hmm->nstates*hmm->nstates, hmm->curr_tprob, hmm->curr_tprob, hmm->tmp);
+ }
+}
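/*
 * Illustrative note (not part of the upstream file): hmm_set_tprob() above
 * stores tprob_arr[i] = T^(i+1) -- block 0 is the user matrix T and each
 * further block is T times the previous one.  For a gap of pos_diff sites and
 * N = ntprob_arr, _set_tprob() therefore assembles
 *
 *     curr_tprob = (T^N)^(pos_diff / N) * T^((pos_diff % N) + 1) = T^(pos_diff + 1)
 *
 * i.e. the effective transition matrix always matches the distance between
 * consecutive sites; with ntprob_arr == 0 the distance is ignored and the
 * single matrix T is applied at every step.
 */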
+
+void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
+{
+ // Init arrays when run for the first time
+ if ( hmm->nvpath < n )
+ {
+ hmm->nvpath = n;
+ hmm->vpath = (uint8_t*) realloc(hmm->vpath, sizeof(uint8_t)*hmm->nvpath*hmm->nstates);
+ }
+ if ( !hmm->vprob )
+ {
+ hmm->vprob = (double*) malloc(sizeof(double)*hmm->nstates);
+ hmm->vprob_tmp = (double*) malloc(sizeof(double)*hmm->nstates);
+ }
+
+
+ // Init all states with equal likelihood
+ int i,j, nstates = hmm->nstates;
+ if ( hmm->init_probs )
+ for (i=0; i<nstates; i++) hmm->vprob[i] = hmm->init_probs[i];
+ else
+ for (i=0; i<nstates; i++) hmm->vprob[i] = 1./nstates;
+
+ // Run Viterbi
+ uint32_t prev_pos = sites[0];
+ for (i=0; i<n; i++)
+ {
+ uint8_t *vpath = &hmm->vpath[i*nstates];
+ double *eprob = &eprobs[i*nstates];
+
+ int pos_diff = sites[i] == prev_pos ? 0 : sites[i] - prev_pos - 1;
+
+ _set_tprob(hmm, pos_diff);
+ if ( hmm->set_tprob ) hmm->set_tprob(hmm, prev_pos, sites[i], hmm->set_tprob_data, hmm->curr_tprob);
+ prev_pos = sites[i];
+
+ double vnorm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ double vmax = 0;
+ int k, k_vmax = 0;
+ for (k=0; k<nstates; k++)
+ {
+ double pval = hmm->vprob[k] * MAT(hmm->curr_tprob,hmm->nstates,j,k);
+ if ( vmax < pval ) { vmax = pval; k_vmax = k; }
+ }
+ vpath[j] = k_vmax;
+ hmm->vprob_tmp[j] = vmax * eprob[j];
+ vnorm += hmm->vprob_tmp[j];
+ }
+ for (j=0; j<nstates; j++) hmm->vprob_tmp[j] /= vnorm;
+ double *tmp = hmm->vprob; hmm->vprob = hmm->vprob_tmp; hmm->vprob_tmp = tmp;
+ }
+
+ // Find the most likely state
+ int iptr = 0;
+ for (i=1; i<nstates; i++)
+ if ( hmm->vprob[iptr] < hmm->vprob[i] ) iptr = i;
+
+ // Trace back the Viterbi path, we are reusing vpath for storing the states (vpath[i*nstates])
+ for (i=n-1; i>=0; i--)
+ {
+ assert( iptr<nstates && hmm->vpath[i*nstates + iptr]<nstates );
+ iptr = hmm->vpath[i*nstates + iptr];
+ hmm->vpath[i*nstates] = iptr; // reusing the array for different purpose here
+ }
+}
+
+void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
+{
+ // Init arrays when run for the first time
+ if ( hmm->nfwd < n )
+ {
+ hmm->nfwd = n;
+ hmm->fwd = (double*) realloc(hmm->fwd, sizeof(double)*(hmm->nfwd+1)*hmm->nstates);
+ }
+ if ( !hmm->bwd )
+ {
+ hmm->bwd = (double*) malloc(sizeof(double)*hmm->nstates);
+ hmm->bwd_tmp = (double*) malloc(sizeof(double)*hmm->nstates);
+ }
+
+
+ // Init all states with equal likelihood
+ int i,j,k, nstates = hmm->nstates;
+ if ( hmm->init_probs )
+ {
+ for (i=0; i<nstates; i++) hmm->fwd[i] = hmm->init_probs[i];
+ for (i=0; i<nstates; i++) hmm->bwd[i] = hmm->init_probs[i];
+ }
+ else
+ {
+ for (i=0; i<nstates; i++) hmm->fwd[i] = 1./hmm->nstates;
+ for (i=0; i<nstates; i++) hmm->bwd[i] = 1./hmm->nstates;
+ }
+
+ // Run fwd
+ uint32_t prev_pos = sites[0];
+ for (i=0; i<n; i++)
+ {
+ double *fwd_prev = &hmm->fwd[i*nstates];
+ double *fwd = &hmm->fwd[(i+1)*nstates];
+ double *eprob = &eprobs[i*nstates];
+
+ int pos_diff = sites[i] == prev_pos ? 0 : sites[i] - prev_pos - 1;
+
+ _set_tprob(hmm, pos_diff);
+ if ( hmm->set_tprob ) hmm->set_tprob(hmm, prev_pos, sites[i], hmm->set_tprob_data, hmm->curr_tprob);
+ prev_pos = sites[i];
+
+ double norm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ double pval = 0;
+ for (k=0; k<nstates; k++)
+ pval += fwd_prev[k] * MAT(hmm->curr_tprob,hmm->nstates,j,k);
+ fwd[j] = pval * eprob[j];
+ norm += fwd[j];
+ }
+ for (j=0; j<nstates; j++) fwd[j] /= norm;
+ }
+
+ // Run bwd
+ double *bwd = hmm->bwd, *bwd_tmp = hmm->bwd_tmp;
+ prev_pos = sites[n-1];
+ for (i=0; i<n; i++)
+ {
+ double *fwd = &hmm->fwd[(n-i)*nstates];
+ double *eprob = &eprobs[(n-i-1)*nstates];
+
+ int pos_diff = sites[n-i-1] == prev_pos ? 0 : prev_pos - sites[n-i-1] - 1;
+
+ _set_tprob(hmm, pos_diff);
+ if ( hmm->set_tprob ) hmm->set_tprob(hmm, sites[n-i-1], prev_pos, hmm->set_tprob_data, hmm->curr_tprob);
+ prev_pos = sites[n-i-1];
+
+ double bwd_norm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ double pval = 0;
+ for (k=0; k<nstates; k++)
+ pval += bwd[k] * eprob[k] * MAT(hmm->curr_tprob,hmm->nstates,k,j);
+ bwd_tmp[j] = pval;
+ bwd_norm += pval;
+ }
+ double norm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ bwd_tmp[j] /= bwd_norm;
+ fwd[j] *= bwd_tmp[j]; // fwd now stores fwd*bwd
+ norm += fwd[j];
+ }
+ for (j=0; j<nstates; j++) fwd[j] /= norm;
+ double *tmp = bwd_tmp; bwd_tmp = bwd; bwd = tmp;
+ }
+}
+
+void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
+{
+ // Init arrays when run for the first time
+ if ( hmm->nfwd < n )
+ {
+ hmm->nfwd = n;
+ hmm->fwd = (double*) realloc(hmm->fwd, sizeof(double)*(hmm->nfwd+1)*hmm->nstates);
+ }
+ if ( !hmm->bwd )
+ {
+ hmm->bwd = (double*) malloc(sizeof(double)*hmm->nstates);
+ hmm->bwd_tmp = (double*) malloc(sizeof(double)*hmm->nstates);
+ }
+
+ // Init all states with equal likelihood
+ int i,j,k, nstates = hmm->nstates;
+ if ( hmm->init_probs )
+ {
+ for (i=0; i<nstates; i++) hmm->fwd[i] = hmm->init_probs[i];
+ for (i=0; i<nstates; i++) hmm->bwd[i] = hmm->init_probs[i];
+ }
+ else
+ {
+ for (i=0; i<nstates; i++) hmm->fwd[i] = 1./hmm->nstates;
+ for (i=0; i<nstates; i++) hmm->bwd[i] = 1./hmm->nstates;
+ }
+
+ // New transition matrix: temporary values
+ double *tmp_xi = (double*) calloc(nstates*nstates,sizeof(double));
+ double *tmp_gamma = (double*) calloc(nstates,sizeof(double));
+ double *fwd_bwd = (double*) malloc(sizeof(double)*nstates);
+
+ // Run fwd
+ uint32_t prev_pos = sites[0];
+ for (i=0; i<n; i++)
+ {
+ double *fwd_prev = &hmm->fwd[i*nstates];
+ double *fwd = &hmm->fwd[(i+1)*nstates];
+ double *eprob = &eprobs[i*nstates];
+
+ int pos_diff = sites[i] == prev_pos ? 0 : sites[i] - prev_pos - 1;
+
+ _set_tprob(hmm, pos_diff);
+ if ( hmm->set_tprob ) hmm->set_tprob(hmm, prev_pos, sites[i], hmm->set_tprob_data, hmm->curr_tprob);
+ prev_pos = sites[i];
+
+ double norm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ double pval = 0;
+ for (k=0; k<nstates; k++)
+ pval += fwd_prev[k] * MAT(hmm->curr_tprob,hmm->nstates,j,k);
+ fwd[j] = pval * eprob[j];
+ norm += fwd[j];
+ }
+ for (j=0; j<nstates; j++) fwd[j] /= norm;
+ }
+
+ // Run bwd
+ double *bwd = hmm->bwd, *bwd_tmp = hmm->bwd_tmp;
+ prev_pos = sites[n-1];
+ for (i=0; i<n; i++)
+ {
+ double *fwd = &hmm->fwd[(n-i)*nstates];
+ double *eprob = &eprobs[(n-i-1)*nstates];
+
+ int pos_diff = sites[n-i-1] == prev_pos ? 0 : prev_pos - sites[n-i-1] - 1;
+
+ _set_tprob(hmm, pos_diff);
+ if ( hmm->set_tprob ) hmm->set_tprob(hmm, sites[n-i-1], prev_pos, hmm->set_tprob_data, hmm->curr_tprob);
+ prev_pos = sites[n-i-1];
+
+ double bwd_norm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ double pval = 0;
+ for (k=0; k<nstates; k++)
+ pval += bwd[k] * eprob[k] * MAT(hmm->curr_tprob,hmm->nstates,k,j);
+ bwd_tmp[j] = pval;
+ bwd_norm += pval;
+ }
+ double norm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ bwd_tmp[j] /= bwd_norm;
+ fwd_bwd[j] = fwd[j]*bwd_tmp[j];
+ norm += fwd_bwd[j];
+ }
+ for (j=0; j<nstates; j++)
+ {
+ fwd_bwd[j] /= norm;
+ tmp_gamma[j] += fwd_bwd[j];
+ }
+
+ for (j=0; j<nstates; j++)
+ {
+ for (k=0; k<nstates; k++)
+ {
+ MAT(tmp_xi,nstates,k,j) += fwd[j]*bwd[k]*MAT(hmm->tprob_arr,hmm->nstates,k,j)*eprob[k] / norm;
+ }
+ }
+
+ for (j=0; j<nstates; j++) fwd[j] = fwd_bwd[j]; // fwd now stores fwd*bwd
+
+ double *tmp = bwd_tmp; bwd_tmp = bwd; bwd = tmp;
+ }
+ for (j=0; j<nstates; j++)
+ {
+ double norm = 0;
+ for (k=0; k<nstates; k++)
+ {
+ MAT(hmm->curr_tprob,nstates,k,j) = MAT(tmp_xi,nstates,k,j) / tmp_gamma[j];
+ norm += MAT(hmm->curr_tprob,nstates,k,j);
+ }
+ for (k=0; k<nstates; k++)
+ MAT(hmm->curr_tprob,nstates,k,j) /= norm;
+ }
+ free(tmp_gamma);
+ free(tmp_xi);
+ free(fwd_bwd);
+}
+
+void hmm_destroy(hmm_t *hmm)
+{
+ free(hmm->init_probs);
+ free(hmm->vprob);
+ free(hmm->vprob_tmp);
+ free(hmm->vpath);
+ free(hmm->curr_tprob);
+ free(hmm->tmp);
+ free(hmm->tprob_arr);
+ free(hmm->fwd);
+ free(hmm->bwd);
+ free(hmm->bwd_tmp);
+ free(hmm);
+}
+
diff --git a/bcftools/HMM.c.pysam.c b/bcftools/HMM.c.pysam.c
new file mode 100644
index 0000000..a3b91ff
--- /dev/null
+++ b/bcftools/HMM.c.pysam.c
@@ -0,0 +1,437 @@
+#include "pysam.h"
+
+/* The MIT License
+
+ Copyright (c) 2014-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <htslib/hts.h>
+#include "HMM.h"
+
+struct _hmm_t
+{
+ int nstates; // number of states
+
+ double *vprob, *vprob_tmp; // viterbi probs [nstates]
+ uint8_t *vpath; // viterbi path [nstates*nvpath]
+ double *bwd, *bwd_tmp; // bwd probs [nstates]
+ double *fwd; // fwd probs [nstates*(nfwd+1)]
+ int nvpath, nfwd;
+
+ int ntprob_arr; // number of pre-calculated tprob matrices
+ double *curr_tprob, *tmp; // Temporary arrays; curr_tprob is short lived, valid only for
+ // one site (that is, one step of Viterbi algorithm)
+ double *tprob_arr; // Array of transition matrices, precalculated to ntprob_arr
+ // positions. The first matrix is the initial tprob matrix
+ // set by hmm_init() or hmm_set_tprob()
+ set_tprob_f set_tprob; // Optional user function to set / modify transition probabilities
+ // at each site (one step of Viterbi algorithm)
+ void *set_tprob_data;
+ double *init_probs; // Initial state probabilities, NULL for uniform probs
+};
+
+uint8_t *hmm_get_viterbi_path(hmm_t *hmm) { return hmm->vpath; }
+double *hmm_get_tprob(hmm_t *hmm) { return hmm->tprob_arr; }
+int hmm_get_nstates(hmm_t *hmm) { return hmm->nstates; }
+double *hmm_get_fwd_bwd_prob(hmm_t *hmm) { return hmm->fwd; }
+
+static inline void multiply_matrix(int n, double *a, double *b, double *dst, double *tmp)
+{
+ double *out = dst;
+ if ( a==dst || b==dst )
+ out = tmp;
+
+ int i,j,k;
+ for (i=0; i<n; i++)
+ {
+ for (j=0; j<n; j++)
+ {
+ double val = 0;
+ for (k=0; k<n; k++) val += MAT(a,n,i,k)*MAT(b,n,k,j);
+ MAT(out,n,i,j) = val;
+ }
+ }
+ if ( out!=dst )
+ memcpy(dst,out,sizeof(double)*n*n);
+}
+
+hmm_t *hmm_init(int nstates, double *tprob, int ntprob)
+{
+ hmm_t *hmm = (hmm_t*) calloc(1,sizeof(hmm_t));
+ hmm->nstates = nstates;
+ hmm->curr_tprob = (double*) malloc(sizeof(double)*nstates*nstates);
+ hmm->tmp = (double*) malloc(sizeof(double)*nstates*nstates);
+
+ hmm_set_tprob(hmm, tprob, ntprob);
+
+ return hmm;
+}
+
+void hmm_init_states(hmm_t *hmm, double *probs)
+{
+ if ( !probs )
+ {
+ free(hmm->init_probs);
+ hmm->init_probs = NULL;
+ }
+
+ if ( !hmm->init_probs ) hmm->init_probs = (double*) malloc(sizeof(double)*hmm->nstates);
+ memcpy(hmm->init_probs,probs,sizeof(double)*hmm->nstates);
+}
+
+void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob)
+{
+ hmm->ntprob_arr = ntprob;
+ if ( ntprob<=0 ) ntprob = 1;
+
+ if ( !hmm->tprob_arr )
+ hmm->tprob_arr = (double*) malloc(sizeof(double)*hmm->nstates*hmm->nstates*ntprob);
+
+ memcpy(hmm->tprob_arr,tprob,sizeof(double)*hmm->nstates*hmm->nstates);
+
+ int i;
+ for (i=1; i<ntprob; i++)
+ multiply_matrix(hmm->nstates, hmm->tprob_arr, hmm->tprob_arr+(i-1)*hmm->nstates*hmm->nstates, hmm->tprob_arr+i*hmm->nstates*hmm->nstates, hmm->tmp);
+}
+
+void hmm_set_tprob_func(hmm_t *hmm, set_tprob_f set_tprob, void *data)
+{
+ hmm->set_tprob = set_tprob;
+ hmm->set_tprob_data = data;
+}
+
+static void _set_tprob(hmm_t *hmm, int pos_diff)
+{
+ assert( pos_diff>=0 );
+
+ int i, n;
+
+ n = hmm->ntprob_arr ? pos_diff % hmm->ntprob_arr : 0; // n-th precalculated matrix
+ memcpy(hmm->curr_tprob, hmm->tprob_arr+n*hmm->nstates*hmm->nstates, sizeof(*hmm->curr_tprob)*hmm->nstates*hmm->nstates);
+
+ if ( hmm->ntprob_arr > 0 )
+ {
+ n = pos_diff / hmm->ntprob_arr; // number of full blocks to jump
+ for (i=0; i<n; i++)
+ multiply_matrix(hmm->nstates, hmm->tprob_arr+(hmm->ntprob_arr-1)*hmm->nstates*hmm->nstates, hmm->curr_tprob, hmm->curr_tprob, hmm->tmp);
+ }
+}
+
+void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
+{
+ // Init arrays when run for the first time
+ if ( hmm->nvpath < n )
+ {
+ hmm->nvpath = n;
+ hmm->vpath = (uint8_t*) realloc(hmm->vpath, sizeof(uint8_t)*hmm->nvpath*hmm->nstates);
+ }
+ if ( !hmm->vprob )
+ {
+ hmm->vprob = (double*) malloc(sizeof(double)*hmm->nstates);
+ hmm->vprob_tmp = (double*) malloc(sizeof(double)*hmm->nstates);
+ }
+
+
+ // Init all states with equal likelihood
+ int i,j, nstates = hmm->nstates;
+ if ( hmm->init_probs )
+ for (i=0; i<nstates; i++) hmm->vprob[i] = hmm->init_probs[i];
+ else
+ for (i=0; i<nstates; i++) hmm->vprob[i] = 1./nstates;
+
+ // Run Viterbi
+ uint32_t prev_pos = sites[0];
+ for (i=0; i<n; i++)
+ {
+ uint8_t *vpath = &hmm->vpath[i*nstates];
+ double *eprob = &eprobs[i*nstates];
+
+ int pos_diff = sites[i] == prev_pos ? 0 : sites[i] - prev_pos - 1;
+
+ _set_tprob(hmm, pos_diff);
+ if ( hmm->set_tprob ) hmm->set_tprob(hmm, prev_pos, sites[i], hmm->set_tprob_data, hmm->curr_tprob);
+ prev_pos = sites[i];
+
+ double vnorm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ double vmax = 0;
+ int k, k_vmax = 0;
+ for (k=0; k<nstates; k++)
+ {
+ double pval = hmm->vprob[k] * MAT(hmm->curr_tprob,hmm->nstates,j,k);
+ if ( vmax < pval ) { vmax = pval; k_vmax = k; }
+ }
+ vpath[j] = k_vmax;
+ hmm->vprob_tmp[j] = vmax * eprob[j];
+ vnorm += hmm->vprob_tmp[j];
+ }
+ for (j=0; j<nstates; j++) hmm->vprob_tmp[j] /= vnorm;
+ double *tmp = hmm->vprob; hmm->vprob = hmm->vprob_tmp; hmm->vprob_tmp = tmp;
+ }
+
+ // Find the most likely state
+ int iptr = 0;
+ for (i=1; i<nstates; i++)
+ if ( hmm->vprob[iptr] < hmm->vprob[i] ) iptr = i;
+
+ // Trace back the Viterbi path, we are reusing vpath for storing the states (vpath[i*nstates])
+ for (i=n-1; i>=0; i--)
+ {
+ assert( iptr<nstates && hmm->vpath[i*nstates + iptr]<nstates );
+ iptr = hmm->vpath[i*nstates + iptr];
+ hmm->vpath[i*nstates] = iptr; // reusing the array for different purpose here
+ }
+}
+
+void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
+{
+ // Init arrays when run for the first time
+ if ( hmm->nfwd < n )
+ {
+ hmm->nfwd = n;
+ hmm->fwd = (double*) realloc(hmm->fwd, sizeof(double)*(hmm->nfwd+1)*hmm->nstates);
+ }
+ if ( !hmm->bwd )
+ {
+ hmm->bwd = (double*) malloc(sizeof(double)*hmm->nstates);
+ hmm->bwd_tmp = (double*) malloc(sizeof(double)*hmm->nstates);
+ }
+
+
+ // Init all states with equal likelihood
+ int i,j,k, nstates = hmm->nstates;
+ if ( hmm->init_probs )
+ {
+ for (i=0; i<nstates; i++) hmm->fwd[i] = hmm->init_probs[i];
+ for (i=0; i<nstates; i++) hmm->bwd[i] = hmm->init_probs[i];
+ }
+ else
+ {
+ for (i=0; i<nstates; i++) hmm->fwd[i] = 1./hmm->nstates;
+ for (i=0; i<nstates; i++) hmm->bwd[i] = 1./hmm->nstates;
+ }
+
+ // Run fwd
+ uint32_t prev_pos = sites[0];
+ for (i=0; i<n; i++)
+ {
+ double *fwd_prev = &hmm->fwd[i*nstates];
+ double *fwd = &hmm->fwd[(i+1)*nstates];
+ double *eprob = &eprobs[i*nstates];
+
+ int pos_diff = sites[i] == prev_pos ? 0 : sites[i] - prev_pos - 1;
+
+ _set_tprob(hmm, pos_diff);
+ if ( hmm->set_tprob ) hmm->set_tprob(hmm, prev_pos, sites[i], hmm->set_tprob_data, hmm->curr_tprob);
+ prev_pos = sites[i];
+
+ double norm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ double pval = 0;
+ for (k=0; k<nstates; k++)
+ pval += fwd_prev[k] * MAT(hmm->curr_tprob,hmm->nstates,j,k);
+ fwd[j] = pval * eprob[j];
+ norm += fwd[j];
+ }
+ for (j=0; j<nstates; j++) fwd[j] /= norm;
+ }
+
+ // Run bwd
+ double *bwd = hmm->bwd, *bwd_tmp = hmm->bwd_tmp;
+ prev_pos = sites[n-1];
+ for (i=0; i<n; i++)
+ {
+ double *fwd = &hmm->fwd[(n-i)*nstates];
+ double *eprob = &eprobs[(n-i-1)*nstates];
+
+ int pos_diff = sites[n-i-1] == prev_pos ? 0 : prev_pos - sites[n-i-1] - 1;
+
+ _set_tprob(hmm, pos_diff);
+ if ( hmm->set_tprob ) hmm->set_tprob(hmm, sites[n-i-1], prev_pos, hmm->set_tprob_data, hmm->curr_tprob);
+ prev_pos = sites[n-i-1];
+
+ double bwd_norm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ double pval = 0;
+ for (k=0; k<nstates; k++)
+ pval += bwd[k] * eprob[k] * MAT(hmm->curr_tprob,hmm->nstates,k,j);
+ bwd_tmp[j] = pval;
+ bwd_norm += pval;
+ }
+ double norm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ bwd_tmp[j] /= bwd_norm;
+ fwd[j] *= bwd_tmp[j]; // fwd now stores fwd*bwd
+ norm += fwd[j];
+ }
+ for (j=0; j<nstates; j++) fwd[j] /= norm;
+ double *tmp = bwd_tmp; bwd_tmp = bwd; bwd = tmp;
+ }
+}
+
+void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
+{
+ // Init arrays when run for the first time
+ if ( hmm->nfwd < n )
+ {
+ hmm->nfwd = n;
+ hmm->fwd = (double*) realloc(hmm->fwd, sizeof(double)*(hmm->nfwd+1)*hmm->nstates);
+ }
+ if ( !hmm->bwd )
+ {
+ hmm->bwd = (double*) malloc(sizeof(double)*hmm->nstates);
+ hmm->bwd_tmp = (double*) malloc(sizeof(double)*hmm->nstates);
+ }
+
+ // Init all states with equal likelihood
+ int i,j,k, nstates = hmm->nstates;
+ if ( hmm->init_probs )
+ {
+ for (i=0; i<nstates; i++) hmm->fwd[i] = hmm->init_probs[i];
+ for (i=0; i<nstates; i++) hmm->bwd[i] = hmm->init_probs[i];
+ }
+ else
+ {
+ for (i=0; i<nstates; i++) hmm->fwd[i] = 1./hmm->nstates;
+ for (i=0; i<nstates; i++) hmm->bwd[i] = 1./hmm->nstates;
+ }
+
+ // New transition matrix: temporary values
+ double *tmp_xi = (double*) calloc(nstates*nstates,sizeof(double));
+ double *tmp_gamma = (double*) calloc(nstates,sizeof(double));
+ double *fwd_bwd = (double*) malloc(sizeof(double)*nstates);
+
+ // Run fwd
+ uint32_t prev_pos = sites[0];
+ for (i=0; i<n; i++)
+ {
+ double *fwd_prev = &hmm->fwd[i*nstates];
+ double *fwd = &hmm->fwd[(i+1)*nstates];
+ double *eprob = &eprobs[i*nstates];
+
+ int pos_diff = sites[i] == prev_pos ? 0 : sites[i] - prev_pos - 1;
+
+ _set_tprob(hmm, pos_diff);
+ if ( hmm->set_tprob ) hmm->set_tprob(hmm, prev_pos, sites[i], hmm->set_tprob_data, hmm->curr_tprob);
+ prev_pos = sites[i];
+
+ double norm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ double pval = 0;
+ for (k=0; k<nstates; k++)
+ pval += fwd_prev[k] * MAT(hmm->curr_tprob,hmm->nstates,j,k);
+ fwd[j] = pval * eprob[j];
+ norm += fwd[j];
+ }
+ for (j=0; j<nstates; j++) fwd[j] /= norm;
+ }
+
+ // Run bwd
+ double *bwd = hmm->bwd, *bwd_tmp = hmm->bwd_tmp;
+ prev_pos = sites[n-1];
+ for (i=0; i<n; i++)
+ {
+ double *fwd = &hmm->fwd[(n-i)*nstates];
+ double *eprob = &eprobs[(n-i-1)*nstates];
+
+ int pos_diff = sites[n-i-1] == prev_pos ? 0 : prev_pos - sites[n-i-1] - 1;
+
+ _set_tprob(hmm, pos_diff);
+ if ( hmm->set_tprob ) hmm->set_tprob(hmm, sites[n-i-1], prev_pos, hmm->set_tprob_data, hmm->curr_tprob);
+ prev_pos = sites[n-i-1];
+
+ double bwd_norm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ double pval = 0;
+ for (k=0; k<nstates; k++)
+ pval += bwd[k] * eprob[k] * MAT(hmm->curr_tprob,hmm->nstates,k,j);
+ bwd_tmp[j] = pval;
+ bwd_norm += pval;
+ }
+ double norm = 0;
+ for (j=0; j<nstates; j++)
+ {
+ bwd_tmp[j] /= bwd_norm;
+ fwd_bwd[j] = fwd[j]*bwd_tmp[j];
+ norm += fwd_bwd[j];
+ }
+ for (j=0; j<nstates; j++)
+ {
+ fwd_bwd[j] /= norm;
+ tmp_gamma[j] += fwd_bwd[j];
+ }
+
+ for (j=0; j<nstates; j++)
+ {
+ for (k=0; k<nstates; k++)
+ {
+ MAT(tmp_xi,nstates,k,j) += fwd[j]*bwd[k]*MAT(hmm->tprob_arr,hmm->nstates,k,j)*eprob[k] / norm;
+ }
+ }
+
+ for (j=0; j<nstates; j++) fwd[j] = fwd_bwd[j]; // fwd now stores fwd*bwd
+
+ double *tmp = bwd_tmp; bwd_tmp = bwd; bwd = tmp;
+ }
+ for (j=0; j<nstates; j++)
+ {
+ double norm = 0;
+ for (k=0; k<nstates; k++)
+ {
+ MAT(hmm->curr_tprob,nstates,k,j) = MAT(tmp_xi,nstates,k,j) / tmp_gamma[j];
+ norm += MAT(hmm->curr_tprob,nstates,k,j);
+ }
+ for (k=0; k<nstates; k++)
+ MAT(hmm->curr_tprob,nstates,k,j) /= norm;
+ }
+ free(tmp_gamma);
+ free(tmp_xi);
+ free(fwd_bwd);
+}
+
+void hmm_destroy(hmm_t *hmm)
+{
+ free(hmm->init_probs);
+ free(hmm->vprob);
+ free(hmm->vprob_tmp);
+ free(hmm->vpath);
+ free(hmm->curr_tprob);
+ free(hmm->tmp);
+ free(hmm->tprob_arr);
+ free(hmm->fwd);
+ free(hmm->bwd);
+ free(hmm->bwd_tmp);
+ free(hmm);
+}
+
diff --git a/bcftools/HMM.h b/bcftools/HMM.h
new file mode 100644
index 0000000..7f01245
--- /dev/null
+++ b/bcftools/HMM.h
@@ -0,0 +1,115 @@
+/* The MIT License
+
+ Copyright (c) 2014-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#ifndef __HMM_H__
+#define __HMM_H__
+
+#define MAT(matrix,ndim,i,j) (matrix)[(ndim)*(i)+(j)] // P(i|j), that is, transition j->i
+
+typedef struct _hmm_t hmm_t;
+
+typedef void (*set_tprob_f) (hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
+
+/**
+ * hmm_init() - initialize HMM
+ * @nstates: number of states
+ * @tprob: transition probabilities matrix (nstates x nstates), for elements ordering
+ * see the MAT macro above.
+ * @ntprob: number of precalculated tprob matrices or 0 for constant probs, independent
+ * of distance
+ */
+hmm_t *hmm_init(int nstates, double *tprob, int ntprob);
+void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob);
+
+/**
+ * hmm_init_states() - initial state probabilities
+ * @probs: initial state probabilities or NULL to reset to default
+ *
+ * If uncalled, all states are initialized with the same likelihood
+ */
+void hmm_init_states(hmm_t *hmm, double *probs);
+
+/**
+ * hmm_get_tprob() - return the array of transition matrices, precalculated
+ * to ntprob positions. The first matrix is the initial tprob matrix
+ * set by hmm_init() or hmm_set_tprob()
+ */
+double *hmm_get_tprob(hmm_t *hmm);
+int hmm_get_nstates(hmm_t *hmm);
+
+/**
+ * hmm_set_tprob_func() - custom setter of transition probabilities
+ */
+void hmm_set_tprob_func(hmm_t *hmm, set_tprob_f set_tprob, void *data);
+
+/**
+ * hmm_run_viterbi() - run Viterbi algorithm
+ * @nsites: number of sites
+ * @eprob: emission probabilities for each site and state (nsites x nstates)
+ * @sites: list of positions
+ *
+ * When done, hmm->vpath[] contains the calculated Viterbi path. The states
+ * are indexed starting from 0, a state at i-th site can be accessed as
+ * vpath[nstates*i].
+ */
+void hmm_run_viterbi(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites);
+
+/**
+ * hmm_get_viterbi_path() - the viterbi path: state at ith site is the
+ * (nstates*isite)-th element
+ */
+uint8_t *hmm_get_viterbi_path(hmm_t *hmm);
+
+/**
+ * hmm_run_fwd_bwd() - run the forward-backward algorithm
+ * @nsites: number of sites
+ * @eprob: emission probabilities for each site and state (nsites x nstates)
+ * @sites: list of positions
+ */
+void hmm_run_fwd_bwd(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites);
+
+/**
+ * hmm_get_fwd_bwd_prob() - the probability of i-th state at j-th site can
+ * be accessed as fwd_bwd[j*nstates+i].
+ */
+double *hmm_get_fwd_bwd_prob(hmm_t *hmm);
+
+/**
+ * hmm_run_baum_welch() - run one iteration of Baum-Welch algorithm
+ * @nsites: number of sites
+ * @eprob: emission probabilities for each site and state (nsites x nstates)
+ * @sites: list of positions
+ *
+ * Same as hmm_run_fwd_bwd, in addition curr_tprob contains the new
+ * transition probabilities. In this version, emission probabilities
+ * are not updated.
+ */
+void hmm_run_baum_welch(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites);
+
+void hmm_destroy(hmm_t *hmm);
+
+#endif
+
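For orientation, here is a minimal sketch of how the HMM API declared above can
be driven end to end. The two-state transition matrix, the sites and the
emission probabilities are invented purely for illustration (they are not part
of the imported sources), and MAT(t,n,i,j) is P(i|j) exactly as documented in
HMM.h:

    #include <stdio.h>
    #include <stdint.h>
    #include "HMM.h"

    int main(void)
    {
        /* two hypothetical states; MAT(tprob,2,i,j) = P(i|j), i.e. transition j -> i */
        double tprob[4];
        MAT(tprob,2,0,0) = 0.99; MAT(tprob,2,0,1) = 0.01;
        MAT(tprob,2,1,0) = 0.01; MAT(tprob,2,1,1) = 0.99;

        hmm_t *hmm = hmm_init(2, tprob, 0);      /* ntprob=0: distance-independent */

        /* three made-up sites; emission probabilities laid out as nsites x nstates */
        uint32_t sites[3] = {100, 200, 300};
        double   eprob[6] = {0.9, 0.1,   0.8, 0.2,   0.2, 0.8};

        hmm_run_viterbi(hmm, 3, eprob, sites);
        uint8_t *vpath = hmm_get_viterbi_path(hmm);
        int i, nstates = hmm_get_nstates(hmm);
        for (i = 0; i < 3; i++)                  /* state at site i is vpath[nstates*i] */
            printf("site %u -> state %d\n", (unsigned)sites[i], vpath[i*nstates]);

        hmm_destroy(hmm);
        return 0;
    }

hmm_run_fwd_bwd() takes the same arguments; afterwards the posterior of state i
at site j can be read from hmm_get_fwd_bwd_prob() as fwd_bwd[j*nstates+i], as
the header notes.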
diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h
new file mode 100644
index 0000000..6f22272
--- /dev/null
+++ b/bcftools/bcftools.h
@@ -0,0 +1,71 @@
+/* bcftools.h -- utility function declarations.
+
+ Copyright (C) 2013 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#ifndef BCFTOOLS_H
+#define BCFTOOLS_H
+
+#include <stdarg.h>
+#include <htslib/vcf.h>
+#include <math.h>
+
+#define FT_GZ 1
+#define FT_VCF 2
+#define FT_VCF_GZ (FT_GZ|FT_VCF)
+#define FT_BCF (1<<2)
+#define FT_BCF_GZ (FT_GZ|FT_BCF)
+#define FT_STDIN (1<<3)
+
+char *bcftools_version(void);
+void error(const char *format, ...);
+void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
+const char *hts_bcf_wmode(int file_type);
+
+void *smalloc(size_t size); // safe malloc
+
+static inline char gt2iupac(char a, char b)
+{
+ static const char iupac[4][4] = { {'A','M','R','W'},{'M','C','S','Y'},{'R','S','G','K'},{'W','Y','K','T'} };
+ if ( a>='a' ) a -= 'a' - 'A';
+ if ( b>='a' ) b -= 'a' - 'A';
+ if ( a=='A' ) a = 0;
+ else if ( a=='C' ) a = 1;
+ else if ( a=='G' ) a = 2;
+ else if ( a=='T' ) a = 3;
+ else return 'N';
+ if ( b=='A' ) b = 0;
+ else if ( b=='C' ) b = 1;
+ else if ( b=='G' ) b = 2;
+ else if ( b=='T' ) b = 3;
+ else return 'N';
+ return iupac[(int)a][(int)b];
+}
+
+static inline double phred_score(double prob)
+{
+ if ( prob==0 ) return 99;
+ prob = -4.3429*log(prob);
+ return prob>99 ? 99 : prob;
+}
+
+#endif
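The two inline helpers above invite a quick numeric sanity check. Since 4.3429
is roughly 10/ln(10), phred_score(p) is essentially -10*log10(p) capped at 99.
A small hypothetical test snippet (build setup assumed):

    #include <assert.h>
    #include <stdio.h>
    #include "bcftools.h"

    int main(void)
    {
        assert(gt2iupac('A','G') == 'R');     /* het A/G -> IUPAC code R */
        assert(gt2iupac('c','t') == 'Y');     /* lower case is folded to upper case */
        printf("%.1f\n", phred_score(1e-3));  /* prints ~30.0 */
        printf("%.1f\n", phred_score(0));     /* prob 0 returns the 99 cap */
        return 0;
    }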
diff --git a/bcftools/call.h b/bcftools/call.h
new file mode 100644
index 0000000..bbf0a52
--- /dev/null
+++ b/bcftools/call.h
@@ -0,0 +1,126 @@
+/* call.h -- variant calling declarations.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#ifndef __CALL_H__
+#define __CALL_H__
+
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include "vcmp.h"
+
+#define CALL_KEEPALT 1
+#define CALL_VARONLY (1<<1)
+#define CALL_CONSTR_TRIO (1<<2)
+#define CALL_CONSTR_ALLELES (1<<3)
+//
+//
+#define CALL_FMT_GQ (1<<6)
+#define CALL_FMT_GP (1<<7)
+
+#define FATHER 0
+#define MOTHER 1
+#define CHILD 2
+typedef struct
+{
+ char *name;
+ int sample[3]; // father, mother, child
+ int type; // see FTYPE_* definitions in mcall.c
+}
+family_t;
+
+typedef struct _ccall_t ccall_t;
+typedef struct
+{
+ // mcall only
+ float *qsum; // QS(sum) values
+ int nqsum, npdg;
+ int *als_map, nals_map; // mapping from full set of alleles to trimmed set of alleles (old -> new)
+ int *pl_map, npl_map; // same as above for PLs, but reverse (new -> old)
+ char **als; // array to hold the trimmed set of alleles to appear on output
+ int nals; // size of the als array
+ family_t *fams; // list of families and samples for trio calling
+ int nfams, mfams;
+ int ntrio[5][5]; // possible trio genotype combinations and their counts; first idx:
+ uint16_t *trio[5][5]; // family type, second index: allele count (2-4, first two are unused)
+ double *GLs;
+ float *GPs; // FORMAT/GP: posterior probabilities
+ int32_t *GQs; // FORMAT/GQ: genotype qualities
+ int32_t *itmp; // temporary int array, used for new PLs with CALL_CONSTR_ALLELES
+ int n_itmp, nGPs;
+ vcmp_t *vcmp;
+ double trio_Pm_SNPs, trio_Pm_del, trio_Pm_ins; // P(mendelian) for trio calling, see mcall_call_trio_genotypes()
+ int32_t *ugts, *cgts; // unconstrained and constrained GTs
+ uint32_t output_tags;
+
+ // ccall only
+ double indel_frac, min_perm_p, min_lrt;
+ double prior_type, pref;
+ double ref_lk, lk_sum;
+ int ngrp1_samples, n_perm;
+ int nhets, ndiploid;
+ char *prior_file;
+ ccall_t *cdat;
+
+ // shared
+ bcf_srs_t *srs; // BCF synced readers holding target alleles for CALL_CONSTR_ALLELES
+ bcf1_t *rec;
+ bcf_hdr_t *hdr;
+ uint32_t flag; // One or more of the CALL_* flags defined above
+ uint8_t *ploidy, all_diploid, unseen;
+
+ double pl2p[256]; // PL to 10^(-PL/10) table
+ int32_t *PLs; // VCF PL likelihoods (rw)
+ int nPLs, mPLs, nac;
+ int32_t *gts, *ac; // GTs and AC (w)
+ double *pdg; // PLs converted to P(D|G)
+ float *anno16; int n16; // see anno[16] in bam2bcf.h
+ double theta; // prior
+}
+call_t;
+
+void error(const char *format, ...);
+
+/*
+ * *call() - return negative value on error or the number of non-reference
+ * alleles on success.
+ */
+int mcall(call_t *call, bcf1_t *rec); // multiallelic and rare-variant calling model
+int ccall(call_t *call, bcf1_t *rec); // the default consensus calling model
+int qcall(call_t *call, bcf1_t *rec); // QCall output
+
+void mcall_init(call_t *call);
+void ccall_init(call_t *call);
+void qcall_init(call_t *call);
+
+void mcall_destroy(call_t *call);
+void ccall_destroy(call_t *call);
+void qcall_destroy(call_t *call);
+
+void call_init_pl2p(call_t *call);
+uint32_t *call_trio_prep(int is_x, int is_son);
+
+void init_allele_trimming_maps(call_t *call, int als, int nals);
+void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als);
+
+#endif
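Taken together, the call_t fields and the prototypes above imply an
init/call/destroy lifecycle along the following lines. This is only a sketch
under assumptions (the synced-reader driver and the chosen flags are
illustrative, not code from this import):

    #include <string.h>
    #include <htslib/synced_bcf_reader.h>
    #include "call.h"

    /* hypothetical driver: run the multiallelic caller on every record of an
       already-opened synced reader */
    static void call_all(bcf_srs_t *sr, bcf_hdr_t *hdr)
    {
        call_t call;
        memset(&call, 0, sizeof(call));
        call.hdr  = hdr;                         /* header of the stream being called */
        call.flag = CALL_VARONLY | CALL_FMT_GQ;  /* variant sites only, emit FORMAT/GQ */
        mcall_init(&call);                       /* or ccall_init()/qcall_init() */

        while ( bcf_sr_next_line(sr) )
        {
            bcf1_t *rec = bcf_sr_get_line(sr, 0);
            int nalt = mcall(&call, rec);        /* <0 on error, else number of non-reference alleles */
            if ( nalt < 0 ) error("calling failed\n");
        }
        mcall_destroy(&call);
    }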
diff --git a/bcftools/ccall.c b/bcftools/ccall.c
new file mode 100644
index 0000000..bb43d61
--- /dev/null
+++ b/bcftools/ccall.c
@@ -0,0 +1,343 @@
+/* ccall.c -- consensus variant calling.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+ Portions copyright (C) 2010 Broad Institute.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <math.h>
+#include <htslib/kfunc.h>
+#include "call.h"
+#include "kmin.h"
+#include "prob1.h"
+
+// Most of the original -c calling was moved to bcftools as it was
+// and its data structures were wrapped into the ccall_t to make it
+// functional quickly. This is not the desired state.
+struct _ccall_t
+{
+ bcf_p1aux_t *p1;
+};
+
+void ccall_init(call_t *call)
+{
+ call->cdat = (ccall_t*) calloc(1,sizeof(ccall_t));
+ call_init_pl2p(call);
+ call->cdat->p1 = bcf_p1_init(bcf_hdr_nsamples(call->hdr), call->ploidy);
+ call->gts = (int*) calloc(bcf_hdr_nsamples(call->hdr)*2,sizeof(int)); // assuming at most diploid everywhere
+ call->nals_map = 5;
+ call->als_map = (int*) malloc(sizeof(int)*call->nals_map);
+
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+ if ( call->output_tags & CALL_FMT_GQ )
+ {
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">");
+ call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr));
+ }
+ if ( call->output_tags & CALL_FMT_GP )
+ error("Sorry, -f GP is not supported with -c\n");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele frequency (assuming HWE)\">");
+ // Todo: groups not migrated to 'bcftools call' yet
+ bcf_hdr_append(call->hdr,"##INFO=<ID=AF2,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first and second group ALT allele frequency (assuming HWE)\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=AC1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele count (no HWE assumption)\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root-mean-square mapping quality of covering reads\">\n");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test P-value based on G3\">\n");
+ // bcf_hdr_append(call->hdr,);
+ // bcf_hdr_append(call->hdr,);
+ bcf_hdr_append(call->hdr,"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">");
+
+ return;
+}
+void ccall_destroy(call_t *call)
+{
+ free(call->itmp);
+ free(call->als_map);
+ free(call->gts);
+ free(call->anno16);
+ free(call->PLs);
+ free(call->GQs);
+ free(call->pdg);
+ bcf_p1_destroy(call->cdat->p1);
+ free(call->cdat);
+ return;
+}
+
+// Inits P(D|G): convert PLs from log space, only two alleles (three PLs) are used.
+// NB: The original samtools calling code uses pdgs in reverse order (AA comes
+// first, RR last), while the -m calling model uses the canonical order.
+static void set_pdg3(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt)
+{
+ int i;
+ for (i=0; i<n_smpl; i++)
+ {
+ pdg[2] = pl2p[ PLs[0] ];
+ pdg[1] = pl2p[ PLs[1] ];
+ pdg[0] = pl2p[ PLs[2] ];
+ PLs += n_gt;
+ pdg += 3;
+ }
+}
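/*
 * Illustrative example (not part of the upstream file): for one diploid
 * sample with PL = {0, 30, 200} in the usual RR,RA,AA order and
 * pl2p[x] = 10^(-x/10) (see call.h), set_pdg3() writes pdg = {1e-20, 1e-3, 1},
 * i.e. pdg[0] = P(D|AA), pdg[1] = P(D|RA), pdg[2] = P(D|RR) -- the reversed
 * ordering mentioned in the comment above.
 */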
+
+static double ttest(int n1, int n2, float a[4])
+{
+ extern double kf_betai(double a, double b, double x);
+ double t, v, u1, u2;
+ if (n1 == 0 || n2 == 0 || n1 + n2 < 3) return 1.0;
+ u1 = (double)a[0] / n1; u2 = (double)a[2] / n2;
+ if (u1 <= u2) return 1.;
+ t = (u1 - u2) / sqrt(((a[1] - n1 * u1 * u1) + (a[3] - n2 * u2 * u2)) / (n1 + n2 - 2) * (1./n1 + 1./n2));
+ v = n1 + n2 - 2;
+ return t < 0.? 1. : .5 * kf_betai(.5*v, .5, v/(v+t*t));
+}
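/*
 * Illustrative note (not part of the upstream file): for t > 0 with
 * v = n1+n2-2 degrees of freedom, 0.5 * I_{v/(v+t^2)}(v/2, 1/2) equals the
 * one-sided Student-t tail probability P(T > t), so ttest() returns a
 * one-tailed p-value for "group 1 has the larger mean"; it returns 1.0
 * whenever the test is not applicable (too few observations, or u1 <= u2).
 */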
+
+static int test16_core(float anno[16], anno16_t *a)
+{
+ extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
+ double left, right;
+ int i;
+ a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
+ for (i=0; i<4; i++) a->d[i] = anno[i];
+ a->depth = anno[0] + anno[1] + anno[2] + anno[3];
+ a->is_tested = (anno[0] + anno[1] > 0 && anno[2] + anno[3] > 0);
+ if (a->depth == 0) return -1;
+ a->mq = (int)(sqrt((anno[9] + anno[11]) / a->depth) + .499);
+ kt_fisher_exact(anno[0], anno[1], anno[2], anno[3], &left, &right, &a->p[0]);
+ for (i = 1; i < 4; ++i)
+ a->p[i] = ttest(anno[0] + anno[1], anno[2] + anno[3], anno+4*i);
+ return 0;
+}
+
+int test16(float *anno16, anno16_t *a)
+{
+ a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
+ a->d[0] = a->d[1] = a->d[2] = a->d[3] = 0.;
+ a->mq = a->depth = a->is_tested = 0;
+ return test16_core(anno16, a);
+}
+static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double em[10])
+{
+ int has_I16, is_var;
+ float fq, r;
+ anno16_t a;
+ float tmpf[4]; int32_t tmpi; // tmpi is passed to bcf_update_info_int32 for CLR, so it must be an int32_t
+
+ bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16);
+
+ has_I16 = test16(call->anno16, &a) >= 0? 1 : 0;
+
+ // print EM
+ if (em[0] >= 0)
+ {
+ tmpf[0] = 1 - em[0];
+ bcf_update_info_float(call->hdr, rec, "AF1", tmpf, 1);
+ }
+ if (em[4] >= 0 && em[4] <= 0.05)
+ {
+ tmpf[0] = em[3]; tmpf[1] = em[2]; tmpf[2] = em[1]; tmpf[3] = em[4];
+ bcf_update_info_float(call->hdr, rec, "G3", tmpf, 3);
+ bcf_update_info_float(call->hdr, rec, "HWE", &tmpf[3], 1);
+ }
+ if (em[5] >= 0 && em[6] >= 0)
+ {
+ tmpf[0] = 1 - em[5]; tmpf[1] = 1 - em[6];
+ bcf_update_info_float(call->hdr, rec, "AF2", tmpf, 2);
+ }
+ if (em[7] >= 0)
+ {
+ tmpf[0] = em[7];
+ bcf_update_info_float(call->hdr, rec, "LRT", tmpf, 1);
+ }
+ if (em[8] >= 0)
+ {
+ tmpf[0] = em[8];
+ bcf_update_info_float(call->hdr, rec, "LRT2", tmpf, 1);
+ }
+
+ bcf_p1aux_t *p1 = call->cdat->p1;
+ if (p1->cons_llr > 0)
+ {
+ tmpi = p1->cons_llr;
+ bcf_update_info_int32(call->hdr, rec, "CLR", &tmpi, 1);
+ // todo: trio calling with -c
+ if (p1->cons_gt > 0)
+ {
+ char tmp[4];
+ tmp[0] = p1->cons_gt&0xff; tmp[1] = p1->cons_gt>>8&0xff; tmp[2] = p1->cons_gt>>16&0xff; tmp[3] = 0;
+ bcf_update_info_string(call->hdr, rec, "UGT", tmp);
+ tmp[0] = p1->cons_gt>>32&0xff; tmp[1] = p1->cons_gt>>40&0xff; tmp[2] = p1->cons_gt>>48&0xff;
+ bcf_update_info_string(call->hdr, rec, "CGT", tmp);
+ }
+ }
+ if (pr == 0) return 1;
+
+ is_var = (pr->p_ref < call->pref);
+ r = is_var? pr->p_ref : pr->p_var;
+
+ bcf_update_info_int32(call->hdr, rec, "AC1", &pr->ac, 1);
+ int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3];
+ bcf_update_info_int32(call->hdr, rec, "DP4", dp, 4);
+ bcf_update_info_int32(call->hdr, rec, "MQ", &a.mq, 1);
+
+ fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
+ if (fq < -999) fq = -999;
+ if (fq > 999) fq = 999;
+ bcf_update_info_float(call->hdr, rec, "FQ", &fq, 1);
+
+ assert( pr->cmp[0]<0 );
+ // todo
+ // if (pr->cmp[0] >= 0.) { // two sample groups
+ // int i, q[3];
+ // for (i = 1; i < 3; ++i) {
+ // double x = pr->cmp[i] + pr->cmp[0]/2.;
+ // q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499);
+ // if (q[i] > 255) q[i] = 255;
+ // }
+ // if (pr->perm_rank >= 0) ksprintf(&s, "PR=%d;", pr->perm_rank);
+ //
+ // ksprintf(&s, "PCHI2=%.3g;PC2=%d,%d;", q[1], q[2], pr->p_chi2);
+ // }
+
+ if (has_I16 && a.is_tested)
+ {
+ int i;
+ for (i=0; i<4; i++) tmpf[i] = a.p[i];
+ bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4);
+ }
+ bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag
+ bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
+
+ rec->qual = r < 1e-100? 999 : -4.343 * log(r);
+ if (rec->qual > 999) rec->qual = 999;
+
+ // Remove unused alleles
+ int nals_ori = rec->n_allele, nals = !is_var && !(call->flag & CALL_KEEPALT) ? 1 : pr->rank0 < 2? 2 : pr->rank0+1;
+ if ( call->flag & CALL_KEEPALT && call->unseen>0 )
+ {
+ assert( call->unseen==nals-1 );
+ nals--;
+ }
+
+ if ( nals<rec->n_allele )
+ {
+ bcf_update_alleles(call->hdr, rec, (const char**)rec->d.allele, nals);
+
+ // Update PLs
+ int npls_src = call->nPLs / rec->n_sample, npls_dst = nals*(nals+1)/2;
+ int *pls_src = call->PLs - npls_src, *pls_dst = call->PLs - npls_dst;
+ int isample, i;
+ for (isample = 0; isample < rec->n_sample; isample++)
+ {
+ pls_src += npls_src;
+ pls_dst += npls_dst;
+ if ( !call->ploidy || call->ploidy[isample]==2 )
+ {
+ for (i=0; i<npls_dst; i++)
+ pls_dst[i] = pls_src[i];
+ }
+ else
+ {
+ for (i=0; i<nals; i++)
+ {
+ int isrc = (i+1)*(i+2)/2-1;
+ pls_dst[i] = pls_src[isrc];
+ }
+ if (i<npls_dst) pls_dst[i] = bcf_int32_vector_end;
+ }
+ }
+ bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*rec->n_sample);
+ }
+
+ // Call genotypes
+ int i;
+ for (i=0; i<rec->n_sample; i++)
+ {
+ int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i) : 2;
+ int gt = x&3;
+ if ( !call->ploidy || call->ploidy[i]==2 )
+ {
+ if ( gt==1 )
+ {
+ call->gts[2*i] = bcf_gt_unphased(0);
+ call->gts[2*i+1] = bcf_gt_unphased(1);
+ }
+ else if ( gt==0 )
+ {
+ call->gts[2*i] = bcf_gt_unphased(1);
+ call->gts[2*i+1] = bcf_gt_unphased(1);
+ }
+ else
+ {
+ call->gts[2*i] = bcf_gt_unphased(0);
+ call->gts[2*i+1] = bcf_gt_unphased(0);
+ }
+ if ( call->output_tags & CALL_FMT_GQ ) call->GQs[i] = x>>2;
+ }
+ else
+ {
+ if ( gt==0 ) call->gts[2*i] = bcf_gt_unphased(1);
+ else call->gts[2*i] = bcf_gt_unphased(0);
+ call->gts[2*i+1] = bcf_int32_vector_end;
+ if ( call->output_tags & CALL_FMT_GQ ) call->GQs[i] = bcf_int32_missing;
+ }
+ }
+ bcf_update_genotypes(call->hdr, rec, call->gts, rec->n_sample*2);
+ if ( call->output_tags & CALL_FMT_GQ )
+ bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, rec->n_sample);
+
+ // trim Number=R tags
+ int out_als = 0;
+ for (i=0; i<nals; i++) out_als |= 1<<i;
+ init_allele_trimming_maps(call, out_als, nals_ori);
+ mcall_trim_numberR(call, rec, nals_ori, nals, out_als);
+
+ return is_var;
+}
+
+
+int ccall(call_t *call, bcf1_t *rec)
+{
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+
+ // Get the genotype likelihoods
+ int nals = rec->n_allele;
+ call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
+ if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals ) // diploid+haploid or haploid only
+ error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs);
+
+ // Convert PLs to probabilities, only first two alleles are considered
+ int ngts = nals*(nals+1)/2;
+ hts_expand(double, 3*nsmpl, call->npdg, call->pdg);
+ set_pdg3(call->pl2p, call->PLs, call->pdg, nsmpl, ngts);
+
+ double em[10] = {-1.,-1.,-1.,-1.,-1.,-1.,-1.,-1.,-1.,-1.};
+ int ret = bcf_em1(call, rec, call->ngrp1_samples, 0x1ff, em);
+
+ bcf_p1rst_t pr;
+ int do_contrast = (em[7] >= 0 && em[7] < call->min_lrt) ? 1 : 0;
+ ret = bcf_p1_cal(call, rec, do_contrast, call->cdat->p1, &pr);
+ if (pr.p_ref >= call->pref && (call->flag & CALL_VARONLY)) return 0;
+ if (ret >= 0) ret = update_bcf1(call, rec, &pr, em);
+ return ret;
+}
+
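A side note on the -c calling code above: set_pdg3 turns phred-scaled PLs back into likelihoods via the pl2p lookup table, and the constant 4.343 is 10/ln(10), the factor that converts a natural logarithm back to the phred scale when FQ and the record QUAL are filled in. A minimal standalone sketch of the two conversions (not part of the imported sources; the numbers are made up):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        int PL = 30;                           /* phred-scaled genotype likelihood */
        double lik  = pow(10.0, -PL / 10.0);   /* what the pl2p table stores: P(D|G) */
        double pref = 1e-6;                    /* toy probability that the site is REF */
        double qual = -4.342945 * log(pref);   /* -10*log10(pref); capped at 999 above */
        if (qual > 999) qual = 999;
        printf("P(D|G) for PL=%d: %g   QUAL for p_ref=%g: %.1f\n", PL, lik, pref, qual);
        return 0;
    }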
diff --git a/bcftools/ccall.c.pysam.c b/bcftools/ccall.c.pysam.c
new file mode 100644
index 0000000..d4ceb01
--- /dev/null
+++ b/bcftools/ccall.c.pysam.c
@@ -0,0 +1,345 @@
+#include "pysam.h"
+
+/* ccall.c -- consensus variant calling.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+ Portions copyright (C) 2010 Broad Institute.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <math.h>
+#include <htslib/kfunc.h>
+#include "call.h"
+#include "kmin.h"
+#include "prob1.h"
+
+// Most of the original -c calling was moved to bcftools as it was
+// and its data structures were wrapped into ccall_t to make it
+// functional quickly. This is not the desired state.
+struct _ccall_t
+{
+ bcf_p1aux_t *p1;
+};
+
+void ccall_init(call_t *call)
+{
+ call->cdat = (ccall_t*) calloc(1,sizeof(ccall_t));
+ call_init_pl2p(call);
+ call->cdat->p1 = bcf_p1_init(bcf_hdr_nsamples(call->hdr), call->ploidy);
+ call->gts = (int*) calloc(bcf_hdr_nsamples(call->hdr)*2,sizeof(int)); // assuming at most diploid everywhere
+ call->nals_map = 5;
+ call->als_map = (int*) malloc(sizeof(int)*call->nals_map);
+
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+ if ( call->output_tags & CALL_FMT_GQ )
+ {
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">");
+ call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr));
+ }
+ if ( call->output_tags & CALL_FMT_GP )
+ error("Sorry, -f GP is not supported with -c\n");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele frequency (assuming HWE)\">");
+ // Todo: groups not migrated to 'bcftools call' yet
+ bcf_hdr_append(call->hdr,"##INFO=<ID=AF2,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first and second group ALT allele frequency (assuming HWE)\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=AC1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele count (no HWE assumption)\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root-mean-square mapping quality of covering reads\">\n");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test P-value based on G3\">\n");
+ // bcf_hdr_append(call->hdr,);
+ // bcf_hdr_append(call->hdr,);
+ bcf_hdr_append(call->hdr,"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">");
+
+ return;
+}
+void ccall_destroy(call_t *call)
+{
+ free(call->itmp);
+ free(call->als_map);
+ free(call->gts);
+ free(call->anno16);
+ free(call->PLs);
+ free(call->GQs);
+ free(call->pdg);
+ bcf_p1_destroy(call->cdat->p1);
+ free(call->cdat);
+ return;
+}
+
+// Inits P(D|G): convert PLs from log space, only two alleles (three PLs) are used.
+// NB: The original samtools calling code uses pdgs in reverse order (AA comes
+// first, RR last), while the -m calling model uses the canonical order.
+static void set_pdg3(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt)
+{
+ int i;
+ for (i=0; i<n_smpl; i++)
+ {
+ pdg[2] = pl2p[ PLs[0] ];
+ pdg[1] = pl2p[ PLs[1] ];
+ pdg[0] = pl2p[ PLs[2] ];
+ PLs += n_gt;
+ pdg += 3;
+ }
+}
+
+static double ttest(int n1, int n2, float a[4])
+{
+ extern double kf_betai(double a, double b, double x);
+ double t, v, u1, u2;
+ if (n1 == 0 || n2 == 0 || n1 + n2 < 3) return 1.0;
+ u1 = (double)a[0] / n1; u2 = (double)a[2] / n2;
+ if (u1 <= u2) return 1.;
+ t = (u1 - u2) / sqrt(((a[1] - n1 * u1 * u1) + (a[3] - n2 * u2 * u2)) / (n1 + n2 - 2) * (1./n1 + 1./n2));
+ v = n1 + n2 - 2;
+ return t < 0.? 1. : .5 * kf_betai(.5*v, .5, v/(v+t*t));
+}
+
+static int test16_core(float anno[16], anno16_t *a)
+{
+ extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
+ double left, right;
+ int i;
+ a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
+ for (i=0; i<4; i++) a->d[i] = anno[i];
+ a->depth = anno[0] + anno[1] + anno[2] + anno[3];
+ a->is_tested = (anno[0] + anno[1] > 0 && anno[2] + anno[3] > 0);
+ if (a->depth == 0) return -1;
+ a->mq = (int)(sqrt((anno[9] + anno[11]) / a->depth) + .499);
+ kt_fisher_exact(anno[0], anno[1], anno[2], anno[3], &left, &right, &a->p[0]);
+ for (i = 1; i < 4; ++i)
+ a->p[i] = ttest(anno[0] + anno[1], anno[2] + anno[3], anno+4*i);
+ return 0;
+}
+
+int test16(float *anno16, anno16_t *a)
+{
+ a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
+ a->d[0] = a->d[1] = a->d[2] = a->d[3] = 0.;
+ a->mq = a->depth = a->is_tested = 0;
+ return test16_core(anno16, a);
+}
+static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double em[10])
+{
+ int has_I16, is_var;
+ float fq, r;
+ anno16_t a;
+ float tmpf[4]; int32_t tmpi; // tmpi is passed to bcf_update_info_int32 for CLR, so it must be an int32_t
+
+ bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16);
+
+ has_I16 = test16(call->anno16, &a) >= 0? 1 : 0;
+
+ // print EM
+ if (em[0] >= 0)
+ {
+ tmpf[0] = 1 - em[0];
+ bcf_update_info_float(call->hdr, rec, "AF1", tmpf, 1);
+ }
+ if (em[4] >= 0 && em[4] <= 0.05)
+ {
+ tmpf[0] = em[3]; tmpf[1] = em[2]; tmpf[2] = em[1]; tmpf[3] = em[4];
+ bcf_update_info_float(call->hdr, rec, "G3", tmpf, 3);
+ bcf_update_info_float(call->hdr, rec, "HWE", &tmpf[3], 1);
+ }
+ if (em[5] >= 0 && em[6] >= 0)
+ {
+ tmpf[0] = 1 - em[5]; tmpf[1] = 1 - em[6];
+ bcf_update_info_float(call->hdr, rec, "AF2", tmpf, 2);
+ }
+ if (em[7] >= 0)
+ {
+ tmpf[0] = em[7];
+ bcf_update_info_float(call->hdr, rec, "LRT", tmpf, 1);
+ }
+ if (em[8] >= 0)
+ {
+ tmpf[0] = em[8];
+ bcf_update_info_float(call->hdr, rec, "LRT2", tmpf, 1);
+ }
+
+ bcf_p1aux_t *p1 = call->cdat->p1;
+ if (p1->cons_llr > 0)
+ {
+ tmpi = p1->cons_llr;
+ bcf_update_info_int32(call->hdr, rec, "CLR", &tmpi, 1);
+ // todo: trio calling with -c
+ if (p1->cons_gt > 0)
+ {
+ char tmp[4];
+ tmp[0] = p1->cons_gt&0xff; tmp[1] = p1->cons_gt>>8&0xff; tmp[2] = p1->cons_gt>>16&0xff; tmp[3] = 0;
+ bcf_update_info_string(call->hdr, rec, "UGT", tmp);
+ tmp[0] = p1->cons_gt>>32&0xff; tmp[1] = p1->cons_gt>>40&0xff; tmp[2] = p1->cons_gt>>48&0xff;
+ bcf_update_info_string(call->hdr, rec, "CGT", tmp);
+ }
+ }
+ if (pr == 0) return 1;
+
+ is_var = (pr->p_ref < call->pref);
+ r = is_var? pr->p_ref : pr->p_var;
+
+ bcf_update_info_int32(call->hdr, rec, "AC1", &pr->ac, 1);
+ int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3];
+ bcf_update_info_int32(call->hdr, rec, "DP4", dp, 4);
+ bcf_update_info_int32(call->hdr, rec, "MQ", &a.mq, 1);
+
+ fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
+ if (fq < -999) fq = -999;
+ if (fq > 999) fq = 999;
+ bcf_update_info_float(call->hdr, rec, "FQ", &fq, 1);
+
+ assert( pr->cmp[0]<0 );
+ // todo
+ // if (pr->cmp[0] >= 0.) { // two sample groups
+ // int i, q[3];
+ // for (i = 1; i < 3; ++i) {
+ // double x = pr->cmp[i] + pr->cmp[0]/2.;
+ // q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499);
+ // if (q[i] > 255) q[i] = 255;
+ // }
+ // if (pr->perm_rank >= 0) ksprintf(&s, "PR=%d;", pr->perm_rank);
+ //
+ // ksprintf(&s, "PCHI2=%.3g;PC2=%d,%d;", q[1], q[2], pr->p_chi2);
+ // }
+
+ if (has_I16 && a.is_tested)
+ {
+ int i;
+ for (i=0; i<4; i++) tmpf[i] = a.p[i];
+ bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4);
+ }
+ bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag
+ bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
+
+ rec->qual = r < 1e-100? 999 : -4.343 * log(r);
+ if (rec->qual > 999) rec->qual = 999;
+
+ // Remove unused alleles
+ int nals_ori = rec->n_allele, nals = !is_var && !(call->flag & CALL_KEEPALT) ? 1 : pr->rank0 < 2? 2 : pr->rank0+1;
+ if ( call->flag & CALL_KEEPALT && call->unseen>0 )
+ {
+ assert( call->unseen==nals-1 );
+ nals--;
+ }
+
+ if ( nals<rec->n_allele )
+ {
+ bcf_update_alleles(call->hdr, rec, (const char**)rec->d.allele, nals);
+
+ // Update PLs
+ int npls_src = call->nPLs / rec->n_sample, npls_dst = nals*(nals+1)/2;
+ int *pls_src = call->PLs - npls_src, *pls_dst = call->PLs - npls_dst;
+ int isample, i;
+ for (isample = 0; isample < rec->n_sample; isample++)
+ {
+ pls_src += npls_src;
+ pls_dst += npls_dst;
+ if ( !call->ploidy || call->ploidy[isample]==2 )
+ {
+ for (i=0; i<npls_dst; i++)
+ pls_dst[i] = pls_src[i];
+ }
+ else
+ {
+ for (i=0; i<nals; i++)
+ {
+ int isrc = (i+1)*(i+2)/2-1;
+ pls_dst[i] = pls_src[isrc];
+ }
+ if (i<npls_dst) pls_dst[i] = bcf_int32_vector_end;
+ }
+ }
+ bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*rec->n_sample);
+ }
+
+ // Call genotypes
+ int i;
+ for (i=0; i<rec->n_sample; i++)
+ {
+ int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i) : 2;
+ int gt = x&3;
+ if ( !call->ploidy || call->ploidy[i]==2 )
+ {
+ if ( gt==1 )
+ {
+ call->gts[2*i] = bcf_gt_unphased(0);
+ call->gts[2*i+1] = bcf_gt_unphased(1);
+ }
+ else if ( gt==0 )
+ {
+ call->gts[2*i] = bcf_gt_unphased(1);
+ call->gts[2*i+1] = bcf_gt_unphased(1);
+ }
+ else
+ {
+ call->gts[2*i] = bcf_gt_unphased(0);
+ call->gts[2*i+1] = bcf_gt_unphased(0);
+ }
+ if ( call->output_tags & CALL_FMT_GQ ) call->GQs[i] = x>>2;
+ }
+ else
+ {
+ if ( gt==0 ) call->gts[2*i] = bcf_gt_unphased(1);
+ else call->gts[2*i] = bcf_gt_unphased(0);
+ call->gts[2*i+1] = bcf_int32_vector_end;
+ if ( call->output_tags & CALL_FMT_GQ ) call->GQs[i] = bcf_int32_missing;
+ }
+ }
+ bcf_update_genotypes(call->hdr, rec, call->gts, rec->n_sample*2);
+ if ( call->output_tags & CALL_FMT_GQ )
+ bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, rec->n_sample);
+
+ // trim Number=R tags
+ int out_als = 0;
+ for (i=0; i<nals; i++) out_als |= 1<<i;
+ init_allele_trimming_maps(call, out_als, nals_ori);
+ mcall_trim_numberR(call, rec, nals_ori, nals, out_als);
+
+ return is_var;
+}
+
+
+int ccall(call_t *call, bcf1_t *rec)
+{
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+
+ // Get the genotype likelihoods
+ int nals = rec->n_allele;
+ call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
+ if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals ) // diploid+haploid or haploid only
+ error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs);
+
+ // Convert PLs to probabilities, only first two alleles are considered
+ int ngts = nals*(nals+1)/2;
+ hts_expand(double, 3*nsmpl, call->npdg, call->pdg);
+ set_pdg3(call->pl2p, call->PLs, call->pdg, nsmpl, ngts);
+
+ double em[10] = {-1.,-1.,-1.,-1.,-1.,-1.,-1.,-1.,-1.,-1.};
+ int ret = bcf_em1(call, rec, call->ngrp1_samples, 0x1ff, em);
+
+ bcf_p1rst_t pr;
+ int do_contrast = (em[7] >= 0 && em[7] < call->min_lrt) ? 1 : 0;
+ ret = bcf_p1_cal(call, rec, do_contrast, call->cdat->p1, &pr);
+ if (pr.p_ref >= call->pref && (call->flag & CALL_VARONLY)) return 0;
+ if (ret >= 0) ret = update_bcf1(call, rec, &pr, em);
+ return ret;
+}
+
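Both copies of update_bcf1 above trim the PL array when unused ALT alleles are dropped: a diploid sample keeps the first nals*(nals+1)/2 values, and a haploid sample keeps only the homozygous entries at index (i+1)*(i+2)/2-1. This follows from the VCF ordering of genotype likelihoods, where genotype (j,k) with j<=k is stored at index k*(k+1)/2+j. A small standalone sketch of that index arithmetic (not part of the imported sources; nals is just an example value):

    #include <stdio.h>

    int main(void)
    {
        int nals = 2;                                   /* alleles kept after trimming */
        printf("diploid PLs kept: %d\n", nals * (nals + 1) / 2);
        for (int k = 0; k < nals; k++)                  /* homozygote (k,k) -> k*(k+1)/2 + k */
            printf("haploid allele %d <- PL index %d\n", k, (k + 1) * (k + 2) / 2 - 1);
        return 0;
    }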
diff --git a/bcftools/consensus.c b/bcftools/consensus.c
new file mode 100644
index 0000000..7a615fe
--- /dev/null
+++ b/bcftools/consensus.c
@@ -0,0 +1,658 @@
+/* The MIT License
+
+ Copyright (c) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <htslib/vcf.h>
+#include <htslib/kstring.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/kseq.h>
+#include <htslib/regidx.h>
+#include "bcftools.h"
+#include "rbuf.h"
+
+typedef struct
+{
+ int num; // number of ungapped blocks in this chain
+ int *block_lengths; // length of the ungapped blocks in this chain
+ int *ref_gaps; // length of the gaps on the reference sequence between blocks
+ int *alt_gaps; // length of the gaps on the alternative sequence between blocks
+ int ori_pos;
+ int ref_last_block_ori; // start position on the reference sequence of the following ungapped block (0-based)
+ int alt_last_block_ori; // start position on the alternative sequence of the following ungapped block (0-based)
+}
+chain_t;
+
+
+typedef struct
+{
+ kstring_t fa_buf; // buffered reference sequence
+ int fa_ori_pos; // start position of the fa_buffer (wrt original sequence)
+ int fa_frz_pos; // protected position to avoid conflicting variants (last pos for SNPs/ins)
+ int fa_mod_off; // position difference of fa_frz_pos in the ori and modified sequence (ins positive)
+ int fa_end_pos; // region's end position in the original sequence
+ int fa_length; // region's length in the original sequence (in case end_pos not provided in the FASTA header)
+ int fa_case; // output upper case or lower case?
+ int fa_src_pos; // last genomic coordinate read from the input fasta (0-based)
+
+ rbuf_t vcf_rbuf;
+ bcf1_t **vcf_buf;
+ int nvcf_buf, rid;
+
+ regidx_t *mask;
+
+ int chain_id; // chain_id, to provide a unique ID to each chain in the chain output
+ chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
+ // Note that the chain is re-initialised for each chromosome/seq_region
+
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr;
+ FILE *fp_out;
+ FILE *fp_chain;
+ char **argv;
+ int argc, output_iupac, haplotype, isample;
+ char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname;
+}
+args_t;
+
+static chain_t* init_chain(chain_t *chain, int ref_ori_pos)
+{
+// fprintf(stderr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos);
+ chain = (chain_t*) calloc(1,sizeof(chain_t));
+ chain->num = 0;
+ chain->block_lengths = NULL;
+ chain->ref_gaps = NULL;
+ chain->alt_gaps = NULL;
+ chain->ori_pos = ref_ori_pos;
+ chain->ref_last_block_ori = ref_ori_pos;
+ chain->alt_last_block_ori = ref_ori_pos;
+ return chain;
+}
+
+static void destroy_chain(args_t *args)
+{
+ chain_t *chain = args->chain;
+ free(chain->ref_gaps);
+ free(chain->alt_gaps);
+ free(chain->block_lengths);
+ free(chain);
+ chain = NULL;
+}
+
+static void print_chain(args_t *args)
+{
+ /*
+ Example chain format (see: https://genome.ucsc.edu/goldenPath/help/chain.html):
+ chain 1 500 + 480 500 1 501 + 480 501 1
+ 12 3 1
+ 1 0 3
+ 484
+
+ chain line is:
+ - chain
+ - score (sum of the lengths of the ungapped blocks in this case)
+ - ref_seqname (from the fasta header, parsed by htslib)
+ - ref_seqlength (from the fasta header)
+ - ref_strand (+ or -; always + for bcf-consensus)
+ - ref_start (as defined in the fasta header)
+ - ref_end (as defined in the fasta header)
+ - alt_seqname (same as ref_seqname as bcf-consensus only considers SNPs and indels)
+ - alt_seqlength (adjusted to match the length of the alt sequence)
+ - alt_strand (+ or -; always + for bcf-consensus)
+ - alt_start (same as ref_start, as no edits are recorded/applied before that position)
+ - alt_end (adjusted to match the length of the alt sequence)
+ - chain_num (just an auto-increment id)
+
+ the other (sorted) lines are:
+ - length of the ungapped alignment block
+ - gap on the ref sequence between this and the next block (all but the last line)
+ - gap on the alt sequence between this and the next block (all but the last line)
+ */
+ chain_t *chain = args->chain;
+ int n = chain->num;
+ int ref_end_pos = args->fa_length + chain->ori_pos;
+ int last_block_size = ref_end_pos - chain->ref_last_block_ori;
+ int alt_end_pos = chain->alt_last_block_ori + last_block_size;
+ int score = 0;
+ for (n=0; n<chain->num; n++) {
+ score += chain->block_lengths[n];
+ }
+ score += last_block_size;
+ fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
+ for (n=0; n<chain->num; n++) {
+ fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
+ }
+ fprintf(args->fp_chain, "%d\n\n", last_block_size);
+}
+
+static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_start, int alt_len)
+{
+// fprintf(stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
+ int num = chain->num;
+
+ if (ref_start <= chain->ref_last_block_ori) {
+ // In case this variant is back-to-back with the previous one
+ chain->ref_last_block_ori = ref_start + ref_len;
+ chain->alt_last_block_ori = alt_start + alt_len;
+ chain->ref_gaps[num-1] += ref_len;
+ chain->alt_gaps[num-1] += alt_len;
+
+ } else {
+ // Extend the ungapped blocks, store the gap length
+ chain->block_lengths = (int*) realloc(chain->block_lengths, (num + 1) * sizeof(int));
+ chain->ref_gaps = (int*) realloc(chain->ref_gaps, (num + 1) * sizeof(int));
+ chain->alt_gaps = (int*) realloc(chain->alt_gaps, (num + 1) * sizeof(int));
+ chain->block_lengths[num] = ref_start - chain->ref_last_block_ori;
+ chain->ref_gaps[num] = ref_len;
+ chain->alt_gaps[num] = alt_len;
+ // Update the start positions of the next block
+ chain->ref_last_block_ori = ref_start + ref_len;
+ chain->alt_last_block_ori = alt_start + alt_len;
+ // Increment the number of ungapped blocks
+ chain->num++;
+ }
+}
+
+static void init_data(args_t *args)
+{
+ args->files = bcf_sr_init();
+ args->files->require_index = 1;
+ if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to open %s: %s\n", args->fname, bcf_sr_strerror(args->files->errnum));
+ args->hdr = args->files->readers[0].header;
+ args->isample = -1;
+ if ( args->sample )
+ {
+ args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
+ if ( args->isample<0 ) error("No such sample: %s\n", args->sample);
+ }
+ if ( args->haplotype && args->isample<0 )
+ {
+ if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
+ args->isample = 0;
+ }
+ if ( args->mask_fname )
+ {
+ args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
+ if ( !args->mask ) error("Failed to initialize mask regions\n");
+ }
+ // In case we want to store the chains
+ if ( args->chain_fname )
+ {
+ args->fp_chain = fopen(args->chain_fname,"w");
+ if ( ! args->fp_chain ) error("Failed to create %s: %s\n", args->chain_fname, strerror(errno));
+ args->chain_id = 0;
+ }
+ rbuf_init(&args->vcf_rbuf, 100);
+ args->vcf_buf = (bcf1_t**) calloc(args->vcf_rbuf.m, sizeof(bcf1_t*));
+ if ( args->output_fname ) {
+ args->fp_out = fopen(args->output_fname,"w");
+ if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
+ }
+ else args->fp_out = stdout;
+}
+
+static void destroy_data(args_t *args)
+{
+ bcf_sr_destroy(args->files);
+ int i;
+ for (i=0; i<args->vcf_rbuf.m; i++)
+ if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]);
+ free(args->vcf_buf);
+ free(args->fa_buf.s);
+ if ( args->mask ) regidx_destroy(args->mask);
+ if ( args->chain_fname )
+ if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
+ if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
+}
+
+static void init_region(args_t *args, char *line)
+{
+ char *ss, *se = line;
+ while ( *se && !isspace(*se) && *se!=':' ) se++;
+ int from = 0, to = 0;
+ char tmp, *tmp_ptr = NULL;
+ if ( *se )
+ {
+ tmp = *se; *se = 0; tmp_ptr = se;
+ ss = ++se;
+ from = strtol(ss,&se,10);
+ if ( ss==se || !*se || *se!='-' ) from = 0;
+ else
+ {
+ from--;
+ ss = ++se;
+ to = strtol(ss,&se,10);
+ if ( ss==se || (*se && !isspace(*se)) ) { from = 0; to = 0; }
+ else to--;
+ }
+ }
+ args->rid = bcf_hdr_name2id(args->hdr,line);
+ if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
+ args->fa_buf.l = 0;
+ args->fa_length = 0;
+ args->fa_end_pos = to;
+ args->fa_ori_pos = from;
+ args->fa_src_pos = from;
+ args->fa_mod_off = 0;
+ args->fa_frz_pos = -1;
+ args->fa_case = -1;
+ args->vcf_rbuf.n = 0;
+ bcf_sr_seek(args->files,line,args->fa_ori_pos);
+ if ( tmp_ptr ) *tmp_ptr = tmp;
+ fprintf(args->fp_out,">%s\n",line);
+ if (args->chain_fname )
+ {
+ args->chain = init_chain(args->chain, args->fa_ori_pos);
+ } else {
+ args->chain = NULL;
+ }
+}
+
+static bcf1_t **next_vcf_line(args_t *args)
+{
+ if ( args->vcf_rbuf.n )
+ {
+ int i = rbuf_shift(&args->vcf_rbuf);
+ return &args->vcf_buf[i];
+ }
+ else if ( bcf_sr_next_line(args->files) )
+ return &args->files->readers[0].buffer[0];
+
+ return NULL;
+}
+static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
+{
+ bcf1_t *rec = *rec_ptr;
+ if ( args->vcf_rbuf.n >= args->vcf_rbuf.m )
+ error("FIXME: too many overlapping records near %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+
+ // Insert the new record in the buffer. The line would be overwritten in
+ // the next bcf_sr_next_line call, therefore we need to swap it with an
+ // unused one
+ int i = rbuf_append(&args->vcf_rbuf);
+ if ( !args->vcf_buf[i] ) args->vcf_buf[i] = bcf_init1();
+ bcf1_t *tmp = rec; *rec_ptr = args->vcf_buf[i]; args->vcf_buf[i] = tmp;
+}
+static void flush_fa_buffer(args_t *args, int len)
+{
+ if ( !args->fa_buf.l ) return;
+
+ int nwr = 0;
+ while ( nwr + 60 <= args->fa_buf.l )
+ {
+ if ( fwrite(args->fa_buf.s+nwr,1,60,args->fp_out) != 60 ) error("Could not write: %s\n", args->output_fname);
+ if ( fwrite("\n",1,1,args->fp_out) != 1 ) error("Could not write: %s\n", args->output_fname);
+ nwr += 60;
+ }
+ if ( nwr )
+ args->fa_ori_pos += nwr;
+
+ if ( len )
+ {
+ // not finished on this chr yet and the buffer cannot be emptied completely
+ if ( nwr && nwr < args->fa_buf.l )
+ memmove(args->fa_buf.s,args->fa_buf.s+nwr,args->fa_buf.l-nwr);
+ args->fa_buf.l -= nwr;
+ return;
+ }
+
+ // empty the whole buffer
+ if ( nwr == args->fa_buf.l ) { args->fa_buf.l = 0; return; }
+
+ if ( fwrite(args->fa_buf.s+nwr,1,args->fa_buf.l - nwr,args->fp_out) != args->fa_buf.l - nwr ) error("Could not write: %s\n", args->output_fname);
+ if ( fwrite("\n",1,1,args->fp_out) != 1 ) error("Could not write: %s\n", args->output_fname);
+
+ args->fa_ori_pos += args->fa_buf.l - nwr - args->fa_mod_off;
+ args->fa_mod_off = 0;
+ args->fa_buf.l = 0;
+}
+static void apply_variant(args_t *args, bcf1_t *rec)
+{
+ if ( rec->n_allele==1 ) return;
+
+ if ( rec->pos <= args->fa_frz_pos )
+ {
+ fprintf(stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ return;
+ }
+ if ( args->mask )
+ {
+ char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
+ int start = rec->pos;
+ int end = rec->pos + rec->rlen - 1;
+ if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
+ }
+
+ int i, ialt = 1;
+ if ( args->isample >= 0 )
+ {
+ bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
+ if ( !fmt ) return;
+ if ( args->haplotype )
+ {
+ if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
+ uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1;
+ ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
+ ialt = bcf_gt_allele(ialt);
+ }
+ else if ( args->output_iupac )
+ {
+ uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample;
+ ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
+ ialt = bcf_gt_allele(ialt);
+
+ int jalt;
+ if ( fmt->n>1 )
+ {
+ ptr = fmt->p + fmt->size*args->isample + 1;
+ jalt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
+ else jalt = bcf_gt_allele(jalt);
+ }
+ else jalt = ialt;
+ if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
+ {
+ char ial = rec->d.allele[ialt][0];
+ char jal = rec->d.allele[jalt][0];
+ rec->d.allele[ialt][0] = gt2iupac(ial,jal);
+ }
+ }
+ else
+ {
+ for (i=0; i<fmt->n; i++)
+ {
+ uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i;
+ ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
+ ialt = bcf_gt_allele(ialt);
+ if ( ialt ) break;
+ }
+ }
+ if ( !ialt ) return; // ref allele
+ if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ }
+ else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] )
+ {
+ char ial = rec->d.allele[0][0];
+ char jal = rec->d.allele[1][0];
+ rec->d.allele[1][0] = gt2iupac(ial,jal);
+ }
+
+ int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
+ if ( idx<0 || idx>=args->fa_buf.l )
+ error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off);
+
+ // sanity check the reference base
+ int len_diff = 0, alen = 0;
+ if ( rec->d.allele[ialt][0]=='<' )
+ {
+ if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
+ error("Symbolic alleles other than <DEL> are currently not supported: %s at %s:%d\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),rec->pos+1);
+ assert( rec->d.allele[0][1]==0 ); // todo: for now expecting strlen(REF) = 1
+ len_diff = 1-rec->rlen;
+ rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, REF must precede the event
+ alen = strlen(rec->d.allele[ialt]);
+ }
+ else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
+ {
+ // fprintf(stderr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off);
+ char tmp = 0;
+ if ( args->fa_buf.l - idx > rec->rlen )
+ {
+ tmp = args->fa_buf.s[idx+rec->rlen];
+ args->fa_buf.s[idx+rec->rlen] = 0;
+ }
+ error(
+ "The fasta sequence does not match the REF allele at %s:%d:\n"
+ " .vcf: [%s]\n"
+ " .vcf: [%s] <- (ALT)\n"
+ " .fa: [%s]%c%s\n",
+ bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
+ tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
+ );
+ }
+ else
+ {
+ alen = strlen(rec->d.allele[ialt]);
+ len_diff = alen - rec->rlen;
+ }
+
+ if ( args->fa_case )
+ for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper(rec->d.allele[ialt][i]);
+ else
+ for (i=0; i<alen; i++) rec->d.allele[ialt][i] = tolower(rec->d.allele[ialt][i]);
+
+ if ( len_diff <= 0 )
+ {
+ // deletion or same size event
+ for (i=0; i<alen; i++)
+ args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
+ if ( len_diff )
+ memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
+ }
+ else
+ {
+ // insertion
+ ks_resize(&args->fa_buf, args->fa_buf.l + len_diff);
+ memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen);
+ for (i=0; i<alen; i++)
+ args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
+ }
+ if (args->chain && len_diff != 0)
+ {
+ // If the first nucleotides of REF and ALT are the same... (indels typically include the nucleotide before the variant)
+ if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0)
+ {
+ // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
+ push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
+ }
+ else
+ {
+ // otherwise, just the coordinates of the variant as given
+ push_chain_gap(args->chain, rec->pos, rec->rlen, rec->pos + args->fa_mod_off, alen);
+ }
+ }
+ args->fa_buf.l += len_diff;
+ args->fa_mod_off += len_diff;
+ args->fa_frz_pos = rec->pos + rec->rlen - 1;
+}
+
+
+static void mask_region(args_t *args, char *seq, int len)
+{
+ char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
+ int start = args->fa_src_pos - len;
+ int end = args->fa_src_pos;
+
+ regitr_t itr;
+ if ( !regidx_overlap(args->mask, chr,start,end, &itr) ) return;
+
+ int idx_start, idx_end, i;
+ while ( REGITR_OVERLAP(itr,start,end) )
+ {
+ idx_start = REGITR_START(itr) - start;
+ idx_end = REGITR_END(itr) - start;
+ if ( idx_start < 0 ) idx_start = 0;
+ if ( idx_end >= len ) idx_end = len - 1;
+ for (i=idx_start; i<=idx_end; i++) seq[i] = 'N';
+ itr.i++;
+ }
+}
+
+static void consensus(args_t *args)
+{
+ htsFile *fasta = hts_open(args->ref_fname, "rb");
+ if ( !fasta ) error("Error reading %s\n", args->ref_fname);
+ kstring_t str = {0,0,0};
+ while ( hts_getline(fasta, KS_SEP_LINE, &str) > 0 )
+ {
+ if ( str.s[0]=='>' )
+ {
+ // new sequence encountered, apply all cached variants
+ while ( args->vcf_rbuf.n )
+ {
+ if (args->chain) {
+ print_chain(args);
+ destroy_chain(args);
+ }
+ bcf1_t *rec = args->vcf_buf[args->vcf_rbuf.f];
+ if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break;
+ int i = rbuf_shift(&args->vcf_rbuf);
+ apply_variant(args, args->vcf_buf[i]);
+ }
+ flush_fa_buffer(args, 0);
+ init_region(args, str.s+1);
+ continue;
+ }
+ args->fa_length += str.l;
+ args->fa_src_pos += str.l;
+
+ // determine if uppercase or lowercase is used in this fasta file
+ if ( args->fa_case==-1 ) args->fa_case = toupper(str.s[0])==str.s[0] ? 1 : 0;
+
+ if ( args->mask && args->rid>=0) mask_region(args, str.s, str.l);
+ kputs(str.s, &args->fa_buf);
+
+ bcf1_t **rec_ptr = NULL;
+ while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
+ {
+ bcf1_t *rec = *rec_ptr;
+
+ // still the same chr and the same region? if not, fasta buf can be flushed
+ if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) )
+ {
+ // save the vcf record until next time and flush
+ unread_vcf_line(args, rec_ptr);
+ rec_ptr = NULL;
+ break;
+ }
+
+ // is the vcf record well beyond cached fasta buffer? if yes, the buf can be flushed
+ if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos )
+ {
+ unread_vcf_line(args, rec_ptr);
+ rec_ptr = NULL;
+ break;
+ }
+
+ // is the cached fasta buffer full enough? if not, read more fasta, no flushing
+ if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off < rec->pos + rec->rlen )
+ {
+ unread_vcf_line(args, rec_ptr);
+ break;
+ }
+ apply_variant(args, rec);
+ }
+ if ( !rec_ptr ) flush_fa_buffer(args, 60);
+ }
+ if (args->chain) {
+ print_chain(args);
+ destroy_chain(args);
+ }
+ flush_fa_buffer(args, 0);
+ hts_close(fasta);
+ free(str.s);
+}
+
+static void usage(args_t *args)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Create consensus sequence by applying VCF variants to a reference\n");
+ fprintf(stderr, " fasta file.\n");
+ fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
+ fprintf(stderr, " -H, --haplotype <1|2> apply variants for the given haplotype\n");
+ fprintf(stderr, " -i, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
+ fprintf(stderr, " -m, --mask <file> replace regions with N\n");
+ fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(stderr, " -c, --chain <file> write a chain file for liftover\n");
+ fprintf(stderr, " -s, --sample <name> apply variants of the given sample\n");
+ fprintf(stderr, "Examples:\n");
+ fprintf(stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
+ fprintf(stderr, " # in the form \">chr:from-to\".\n");
+ fprintf(stderr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+int main_consensus(int argc, char *argv[])
+{
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+
+ static struct option loptions[] =
+ {
+ {"sample",1,0,'s'},
+ {"iupac-codes",0,0,'i'},
+ {"haplotype",1,0,'H'},
+ {"output",1,0,'o'},
+ {"fasta-ref",1,0,'f'},
+ {"mask",1,0,'m'},
+ {"chain",1,0,'c'},
+ {0,0,0,0}
+ };
+ char c;
+ while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 's': args->sample = optarg; break;
+ case 'o': args->output_fname = optarg; break;
+ case 'i': args->output_iupac = 1; break;
+ case 'f': args->ref_fname = optarg; break;
+ case 'm': args->mask_fname = optarg; break;
+ case 'c': args->chain_fname = optarg; break;
+ case 'H':
+ args->haplotype = optarg[0] - '0';
+ if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n");
+ break;
+ default: usage(args); break;
+ }
+ }
+ if ( optind>=argc ) usage(args);
+ args->fname = argv[optind];
+
+ if ( !args->ref_fname && !isatty(fileno((FILE *)stdin)) ) args->ref_fname = "-";
+ if ( !args->ref_fname ) usage(args);
+
+ init_data(args);
+ consensus(args);
+ destroy_data(args);
+ free(args);
+
+ return 0;
+}
+
+
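In consensus.c above, fa_mod_off accumulates len(ALT)-len(REF) over the edits applied so far, which is what lets apply_variant translate a reference coordinate into a position inside the edited buffer; the same per-edit differences become the gap lengths written to the liftover chain file. A minimal standalone sketch of that coordinate bookkeeping (not part of the imported sources; the list of edits is made up):

    #include <stdio.h>

    int main(void)
    {
        /* toy edits applied upstream of the query: {ref_pos, ref_len, alt_len} */
        int edits[][3] = { {100, 1, 3}, {200, 5, 1} };  /* a 2 bp insertion, a 4 bp deletion */
        int offset = 0, query = 250;                    /* reference position to lift over */
        for (int i = 0; i < 2; i++)
            if (edits[i][0] < query)
                offset += edits[i][2] - edits[i][1];    /* len(ALT) - len(REF) */
        printf("reference %d -> consensus %d (offset %d)\n", query, query + offset, offset);
        return 0;
    }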
diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c
new file mode 100644
index 0000000..7765d6b
--- /dev/null
+++ b/bcftools/consensus.c.pysam.c
@@ -0,0 +1,660 @@
+#include "pysam.h"
+
+/* The MIT License
+
+ Copyright (c) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <htslib/vcf.h>
+#include <htslib/kstring.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/kseq.h>
+#include <htslib/regidx.h>
+#include "bcftools.h"
+#include "rbuf.h"
+
+typedef struct
+{
+ int num; // number of ungapped blocks in this chain
+ int *block_lengths; // length of the ungapped blocks in this chain
+ int *ref_gaps; // length of the gaps on the reference sequence between blocks
+ int *alt_gaps; // length of the gaps on the alternative sequence between blocks
+ int ori_pos;
+ int ref_last_block_ori; // start position on the reference sequence of the following ungapped block (0-based)
+ int alt_last_block_ori; // start position on the alternative sequence of the following ungapped block (0-based)
+}
+chain_t;
+
+
+typedef struct
+{
+ kstring_t fa_buf; // buffered reference sequence
+ int fa_ori_pos; // start position of the fa_buffer (wrt original sequence)
+ int fa_frz_pos; // protected position to avoid conflicting variants (last pos for SNPs/ins)
+ int fa_mod_off; // position difference of fa_frz_pos in the ori and modified sequence (ins positive)
+ int fa_end_pos; // region's end position in the original sequence
+ int fa_length; // region's length in the original sequence (in case end_pos not provided in the FASTA header)
+ int fa_case; // output upper case or lower case?
+ int fa_src_pos; // last genomic coordinate read from the input fasta (0-based)
+
+ rbuf_t vcf_rbuf;
+ bcf1_t **vcf_buf;
+ int nvcf_buf, rid;
+
+ regidx_t *mask;
+
+ int chain_id; // chain_id, to provide a unique ID to each chain in the chain output
+ chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
+ // Note that the chain is re-initialised for each chromosome/seq_region
+
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr;
+ FILE *fp_out;
+ FILE *fp_chain;
+ char **argv;
+ int argc, output_iupac, haplotype, isample;
+ char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname;
+}
+args_t;
+
+static chain_t* init_chain(chain_t *chain, int ref_ori_pos)
+{
+// fprintf(pysamerr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos);
+ chain = (chain_t*) calloc(1,sizeof(chain_t));
+ chain->num = 0;
+ chain->block_lengths = NULL;
+ chain->ref_gaps = NULL;
+ chain->alt_gaps = NULL;
+ chain->ori_pos = ref_ori_pos;
+ chain->ref_last_block_ori = ref_ori_pos;
+ chain->alt_last_block_ori = ref_ori_pos;
+ return chain;
+}
+
+static void destroy_chain(args_t *args)
+{
+ chain_t *chain = args->chain;
+ free(chain->ref_gaps);
+ free(chain->alt_gaps);
+ free(chain->block_lengths);
+ free(chain);
+ chain = NULL;
+}
+
+static void print_chain(args_t *args)
+{
+ /*
+ Example chain format (see: https://genome.ucsc.edu/goldenPath/help/chain.html):
+ chain 1 500 + 480 500 1 501 + 480 501 1
+ 12 3 1
+ 1 0 3
+ 484
+
+ chain line is:
+ - chain
+ - score (sum of the lengths of the ungapped blocks in this case)
+ - ref_seqname (from the fasta header, parsed by htslib)
+ - ref_seqlength (from the fasta header)
+ - ref_strand (+ or -; always + for bcf-consensus)
+ - ref_start (as defined in the fasta header)
+ - ref_end (as defined in the fasta header)
+ - alt_seqname (same as ref_seqname as bcf-consensus only considers SNPs and indels)
+ - alt_seqlength (adjusted to match the length of the alt sequence)
+ - alt_strand (+ or -; always + for bcf-consensus)
+ - alt_start (same as ref_start, as no edits are recorded/applied before that position)
+ - alt_end (adjusted to match the length of the alt sequence)
+ - chain_num (just an auto-increment id)
+
+ the other (sorted) lines are:
+ - length of the ungapped alignment block
+ - gap on the ref sequence between this and the next block (all but the last line)
+ - gap on the alt sequence between this and the next block (all but the last line)
+ */
+ chain_t *chain = args->chain;
+ int n = chain->num;
+ int ref_end_pos = args->fa_length + chain->ori_pos;
+ int last_block_size = ref_end_pos - chain->ref_last_block_ori;
+ int alt_end_pos = chain->alt_last_block_ori + last_block_size;
+ int score = 0;
+ for (n=0; n<chain->num; n++) {
+ score += chain->block_lengths[n];
+ }
+ score += last_block_size;
+ fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
+ for (n=0; n<chain->num; n++) {
+ fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
+ }
+ fprintf(args->fp_chain, "%d\n\n", last_block_size);
+}
+
+static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_start, int alt_len)
+{
+// fprintf(pysamerr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
+ int num = chain->num;
+
+ if (ref_start <= chain->ref_last_block_ori) {
+ // In case this variant is back-to-back with the previous one
+ chain->ref_last_block_ori = ref_start + ref_len;
+ chain->alt_last_block_ori = alt_start + alt_len;
+ chain->ref_gaps[num-1] += ref_len;
+ chain->alt_gaps[num-1] += alt_len;
+
+ } else {
+ // Extend the ungapped blocks, store the gap length
+ chain->block_lengths = (int*) realloc(chain->block_lengths, (num + 1) * sizeof(int));
+ chain->ref_gaps = (int*) realloc(chain->ref_gaps, (num + 1) * sizeof(int));
+ chain->alt_gaps = (int*) realloc(chain->alt_gaps, (num + 1) * sizeof(int));
+ chain->block_lengths[num] = ref_start - chain->ref_last_block_ori;
+ chain->ref_gaps[num] = ref_len;
+ chain->alt_gaps[num] = alt_len;
+ // Update the start positions of the next block
+ chain->ref_last_block_ori = ref_start + ref_len;
+ chain->alt_last_block_ori = alt_start + alt_len;
+ // Increment the number of ungapped blocks
+ chain->num++;
+ }
+}
+
+static void init_data(args_t *args)
+{
+ args->files = bcf_sr_init();
+ args->files->require_index = 1;
+ if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to open %s: %s\n", args->fname, bcf_sr_strerror(args->files->errnum));
+ args->hdr = args->files->readers[0].header;
+ args->isample = -1;
+ if ( args->sample )
+ {
+ args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
+ if ( args->isample<0 ) error("No such sample: %s\n", args->sample);
+ }
+ if ( args->haplotype && args->isample<0 )
+ {
+ if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
+ args->isample = 0;
+ }
+ if ( args->mask_fname )
+ {
+ args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
+ if ( !args->mask ) error("Failed to initialize mask regions\n");
+ }
+ // In case we want to store the chains
+ if ( args->chain_fname )
+ {
+ args->fp_chain = fopen(args->chain_fname,"w");
+ if ( ! args->fp_chain ) error("Failed to create %s: %s\n", args->chain_fname, strerror(errno));
+ args->chain_id = 0;
+ }
+ rbuf_init(&args->vcf_rbuf, 100);
+ args->vcf_buf = (bcf1_t**) calloc(args->vcf_rbuf.m, sizeof(bcf1_t*));
+ if ( args->output_fname ) {
+ args->fp_out = fopen(args->output_fname,"w");
+ if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
+ }
+ else args->fp_out = stdout;
+}
+
+static void destroy_data(args_t *args)
+{
+ bcf_sr_destroy(args->files);
+ int i;
+ for (i=0; i<args->vcf_rbuf.m; i++)
+ if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]);
+ free(args->vcf_buf);
+ free(args->fa_buf.s);
+ if ( args->mask ) regidx_destroy(args->mask);
+ if ( args->chain_fname )
+ if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
+ if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
+}
+
+static void init_region(args_t *args, char *line)
+{
+ char *ss, *se = line;
+ while ( *se && !isspace(*se) && *se!=':' ) se++;
+ int from = 0, to = 0;
+ char tmp, *tmp_ptr = NULL;
+ if ( *se )
+ {
+ tmp = *se; *se = 0; tmp_ptr = se;
+ ss = ++se;
+ from = strtol(ss,&se,10);
+ if ( ss==se || !*se || *se!='-' ) from = 0;
+ else
+ {
+ from--;
+ ss = ++se;
+ to = strtol(ss,&se,10);
+ if ( ss==se || (*se && !isspace(*se)) ) { from = 0; to = 0; }
+ else to--;
+ }
+ }
+ args->rid = bcf_hdr_name2id(args->hdr,line);
+ if ( args->rid<0 ) fprintf(pysamerr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
+ args->fa_buf.l = 0;
+ args->fa_length = 0;
+ args->fa_end_pos = to;
+ args->fa_ori_pos = from;
+ args->fa_src_pos = from;
+ args->fa_mod_off = 0;
+ args->fa_frz_pos = -1;
+ args->fa_case = -1;
+ args->vcf_rbuf.n = 0;
+ bcf_sr_seek(args->files,line,args->fa_ori_pos);
+ if ( tmp_ptr ) *tmp_ptr = tmp;
+ fprintf(args->fp_out,">%s\n",line);
+ if (args->chain_fname )
+ {
+ args->chain = init_chain(args->chain, args->fa_ori_pos);
+ } else {
+ args->chain = NULL;
+ }
+}
+
+static bcf1_t **next_vcf_line(args_t *args)
+{
+ if ( args->vcf_rbuf.n )
+ {
+ int i = rbuf_shift(&args->vcf_rbuf);
+ return &args->vcf_buf[i];
+ }
+ else if ( bcf_sr_next_line(args->files) )
+ return &args->files->readers[0].buffer[0];
+
+ return NULL;
+}
+static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
+{
+ bcf1_t *rec = *rec_ptr;
+ if ( args->vcf_rbuf.n >= args->vcf_rbuf.m )
+ error("FIXME: too many overlapping records near %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+
+ // Insert the new record in the buffer. The line would be overwritten in
+ // the next bcf_sr_next_line call, therefore we need to swap it with an
+ // unused one
+ int i = rbuf_append(&args->vcf_rbuf);
+ if ( !args->vcf_buf[i] ) args->vcf_buf[i] = bcf_init1();
+ bcf1_t *tmp = rec; *rec_ptr = args->vcf_buf[i]; args->vcf_buf[i] = tmp;
+}
+static void flush_fa_buffer(args_t *args, int len)
+{
+ if ( !args->fa_buf.l ) return;
+
+ int nwr = 0;
+ while ( nwr + 60 <= args->fa_buf.l )
+ {
+ if ( fwrite(args->fa_buf.s+nwr,1,60,args->fp_out) != 60 ) error("Could not write: %s\n", args->output_fname);
+ if ( fwrite("\n",1,1,args->fp_out) != 1 ) error("Could not write: %s\n", args->output_fname);
+ nwr += 60;
+ }
+ if ( nwr )
+ args->fa_ori_pos += nwr;
+
+ if ( len )
+ {
+ // not finished on this chr yet and the buffer cannot be emptied completely
+ if ( nwr && nwr < args->fa_buf.l )
+ memmove(args->fa_buf.s,args->fa_buf.s+nwr,args->fa_buf.l-nwr);
+ args->fa_buf.l -= nwr;
+ return;
+ }
+
+ // empty the whole buffer
+ if ( nwr == args->fa_buf.l ) { args->fa_buf.l = 0; return; }
+
+ if ( fwrite(args->fa_buf.s+nwr,1,args->fa_buf.l - nwr,args->fp_out) != args->fa_buf.l - nwr ) error("Could not write: %s\n", args->output_fname);
+ if ( fwrite("\n",1,1,args->fp_out) != 1 ) error("Could not write: %s\n", args->output_fname);
+
+ args->fa_ori_pos += args->fa_buf.l - nwr - args->fa_mod_off;
+ args->fa_mod_off = 0;
+ args->fa_buf.l = 0;
+}
+static void apply_variant(args_t *args, bcf1_t *rec)
+{
+ if ( rec->n_allele==1 ) return;
+
+ if ( rec->pos <= args->fa_frz_pos )
+ {
+ fprintf(pysamerr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ return;
+ }
+ if ( args->mask )
+ {
+ char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
+ int start = rec->pos;
+ int end = rec->pos + rec->rlen - 1;
+ if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
+ }
+
+ int i, ialt = 1;
+ if ( args->isample >= 0 )
+ {
+ bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
+ if ( !fmt ) return;
+ if ( args->haplotype )
+ {
+ if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
+ uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1;
+ ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
+ ialt = bcf_gt_allele(ialt);
+ }
+ else if ( args->output_iupac )
+ {
+ uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample;
+ ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
+ ialt = bcf_gt_allele(ialt);
+
+ int jalt;
+ if ( fmt->n>1 )
+ {
+ ptr = fmt->p + fmt->size*args->isample + 1;
+ jalt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
+ else jalt = bcf_gt_allele(jalt);
+ }
+ else jalt = ialt;
+ if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
+ {
+ char ial = rec->d.allele[ialt][0];
+ char jal = rec->d.allele[jalt][0];
+ rec->d.allele[ialt][0] = gt2iupac(ial,jal);
+ }
+ }
+ else
+ {
+ for (i=0; i<fmt->n; i++)
+ {
+ uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i;
+ ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
+ ialt = bcf_gt_allele(ialt);
+ if ( ialt ) break;
+ }
+ }
+ if ( !ialt ) return; // ref allele
+ if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ }
+ else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] )
+ {
+ char ial = rec->d.allele[0][0];
+ char jal = rec->d.allele[1][0];
+ rec->d.allele[1][0] = gt2iupac(ial,jal);
+ }
+
+ int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
+ if ( idx<0 || idx>=args->fa_buf.l )
+ error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off);
+
+ // sanity check the reference base
+ int len_diff = 0, alen = 0;
+ if ( rec->d.allele[ialt][0]=='<' )
+ {
+ if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
+ error("Symbolic alleles other than <DEL> are currently not supported: %s at %s:%d\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),rec->pos+1);
+ assert( rec->d.allele[0][1]==0 ); // todo: for now expecting strlen(REF) = 1
+ len_diff = 1-rec->rlen;
+ rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, REF must precede the event
+ alen = strlen(rec->d.allele[ialt]);
+ }
+ else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
+ {
+ // fprintf(pysamerr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off);
+ char tmp = 0;
+ if ( args->fa_buf.l - idx > rec->rlen )
+ {
+ tmp = args->fa_buf.s[idx+rec->rlen];
+ args->fa_buf.s[idx+rec->rlen] = 0;
+ }
+ error(
+ "The fasta sequence does not match the REF allele at %s:%d:\n"
+ " .vcf: [%s]\n"
+ " .vcf: [%s] <- (ALT)\n"
+ " .fa: [%s]%c%s\n",
+ bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
+ tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
+ );
+ }
+ else
+ {
+ alen = strlen(rec->d.allele[ialt]);
+ len_diff = alen - rec->rlen;
+ }
+
+ if ( args->fa_case )
+ for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper(rec->d.allele[ialt][i]);
+ else
+ for (i=0; i<alen; i++) rec->d.allele[ialt][i] = tolower(rec->d.allele[ialt][i]);
+
+ if ( len_diff <= 0 )
+ {
+ // deletion or same size event
+ for (i=0; i<alen; i++)
+ args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
+ if ( len_diff )
+ memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
+ }
+ else
+ {
+ // insertion
+ ks_resize(&args->fa_buf, args->fa_buf.l + len_diff);
+ memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen);
+ for (i=0; i<alen; i++)
+ args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
+ }
+ if (args->chain && len_diff != 0)
+ {
+ // If the first nucleotides of REF and ALT are the same... (indels typically include the nucleotide before the variant)
+ if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0)
+ {
+ // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
+ push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
+ }
+ else
+ {
+ // otherwise, just the coordinates of the variant as given
+ push_chain_gap(args->chain, rec->pos, rec->rlen, rec->pos + args->fa_mod_off, alen);
+ }
+ }
+ args->fa_buf.l += len_diff;
+ args->fa_mod_off += len_diff;
+ args->fa_frz_pos = rec->pos + rec->rlen - 1;
+}
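/*
 * Illustrative sketch (not part of the upstream file; all values hypothetical):
 * how apply_variant() maps a VCF position onto the cached fasta buffer.
 * fa_ori_pos is the reference position of fa_buf.s[0] and fa_mod_off is the
 * running length difference introduced by earlier indels.
 */
#include <assert.h>

static void offset_example(void)
{
    int fa_ori_pos = 100;   /* reference position (0-based) of the first cached base */
    int fa_mod_off = -3;    /* an earlier 4bp->1bp deletion shrank the buffer by 3    */
    int rec_pos    = 150;   /* position of the next VCF record                        */

    /* index of the record's REF base inside the already-edited buffer */
    int idx = rec_pos - fa_ori_pos + fa_mod_off;
    assert(idx == 47);
}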
+
+
+static void mask_region(args_t *args, char *seq, int len)
+{
+ char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
+ int start = args->fa_src_pos - len;
+ int end = args->fa_src_pos;
+
+ regitr_t itr;
+ if ( !regidx_overlap(args->mask, chr,start,end, &itr) ) return;
+
+ int idx_start, idx_end, i;
+ while ( REGITR_OVERLAP(itr,start,end) )
+ {
+ idx_start = REGITR_START(itr) - start;
+ idx_end = REGITR_END(itr) - start;
+ if ( idx_start < 0 ) idx_start = 0;
+ if ( idx_end >= len ) idx_end = len - 1;
+ for (i=idx_start; i<=idx_end; i++) seq[i] = 'N';
+ itr.i++;
+ }
+}
+
+static void consensus(args_t *args)
+{
+ htsFile *fasta = hts_open(args->ref_fname, "rb");
+ if ( !fasta ) error("Error reading %s\n", args->ref_fname);
+ kstring_t str = {0,0,0};
+ while ( hts_getline(fasta, KS_SEP_LINE, &str) > 0 )
+ {
+ if ( str.s[0]=='>' )
+ {
+ // new sequence encountered, apply all cached variants
+ while ( args->vcf_rbuf.n )
+ {
+ if (args->chain) {
+ print_chain(args);
+ destroy_chain(args);
+ }
+ bcf1_t *rec = args->vcf_buf[args->vcf_rbuf.f];
+ if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break;
+ int i = rbuf_shift(&args->vcf_rbuf);
+ apply_variant(args, args->vcf_buf[i]);
+ }
+ flush_fa_buffer(args, 0);
+ init_region(args, str.s+1);
+ continue;
+ }
+ args->fa_length += str.l;
+ args->fa_src_pos += str.l;
+
+ // determine if uppercase or lowercase is used in this fasta file
+ if ( args->fa_case==-1 ) args->fa_case = toupper(str.s[0])==str.s[0] ? 1 : 0;
+
+ if ( args->mask && args->rid>=0) mask_region(args, str.s, str.l);
+ kputs(str.s, &args->fa_buf);
+
+ bcf1_t **rec_ptr = NULL;
+ while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
+ {
+ bcf1_t *rec = *rec_ptr;
+
+ // still the same chr and the same region? if not, fasta buf can be flushed
+ if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) )
+ {
+ // save the vcf record until next time and flush
+ unread_vcf_line(args, rec_ptr);
+ rec_ptr = NULL;
+ break;
+ }
+
+ // is the vcf record well beyond cached fasta buffer? if yes, the buf can be flushed
+ if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos )
+ {
+ unread_vcf_line(args, rec_ptr);
+ rec_ptr = NULL;
+ break;
+ }
+
+ // is the cached fasta buffer full enough? if not, read more fasta, no flushing
+ if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off < rec->pos + rec->rlen )
+ {
+ unread_vcf_line(args, rec_ptr);
+ break;
+ }
+ apply_variant(args, rec);
+ }
+ if ( !rec_ptr ) flush_fa_buffer(args, 60);
+ }
+ if (args->chain) {
+ print_chain(args);
+ destroy_chain(args);
+ }
+ flush_fa_buffer(args, 0);
+ hts_close(fasta);
+ free(str.s);
+}
+
+static void usage(args_t *args)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Create consensus sequence by applying VCF variants to a reference\n");
+ fprintf(pysamerr, " fasta file.\n");
+ fprintf(pysamerr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
+ fprintf(pysamerr, "Options:\n");
+ fprintf(pysamerr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
+ fprintf(pysamerr, " -H, --haplotype <1|2> apply variants for the given haplotype\n");
+ fprintf(pysamerr, " -i, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
+ fprintf(pysamerr, " -m, --mask <file> replace regions with N\n");
+ fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysamerr, " -c, --chain <file> write a chain file for liftover\n");
+ fprintf(pysamerr, " -s, --sample <name> apply variants of the given sample\n");
+ fprintf(pysamerr, "Examples:\n");
+ fprintf(pysamerr, " # Get the consensus for one region. The fasta header lines are then expected\n");
+ fprintf(pysamerr, " # in the form \">chr:from-to\".\n");
+ fprintf(pysamerr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_consensus(int argc, char *argv[])
+{
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+
+ static struct option loptions[] =
+ {
+ {"sample",1,0,'s'},
+ {"iupac-codes",0,0,'i'},
+ {"haplotype",1,0,'H'},
+ {"output",1,0,'o'},
+ {"fasta-ref",1,0,'f'},
+ {"mask",1,0,'m'},
+ {"chain",1,0,'c'},
+ {0,0,0,0}
+ };
+ int c;
+ while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 's': args->sample = optarg; break;
+ case 'o': args->output_fname = optarg; break;
+ case 'i': args->output_iupac = 1; break;
+ case 'f': args->ref_fname = optarg; break;
+ case 'm': args->mask_fname = optarg; break;
+ case 'c': args->chain_fname = optarg; break;
+ case 'H':
+ args->haplotype = optarg[0] - '0';
+ if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n");
+ break;
+ default: usage(args); break;
+ }
+ }
+ if ( optind>=argc ) usage(args);
+ args->fname = argv[optind];
+
+ if ( !args->ref_fname && !isatty(fileno((FILE *)stdin)) ) args->ref_fname = "-";
+ if ( !args->ref_fname ) usage(args);
+
+ init_data(args);
+ consensus(args);
+ destroy_data(args);
+ free(args);
+
+ return 0;
+}
+
+
diff --git a/bcftools/convert.c b/bcftools/convert.c
new file mode 100644
index 0000000..3e289f0
--- /dev/null
+++ b/bcftools/convert.c
@@ -0,0 +1,1056 @@
+/* convert.c -- functions for converting between VCF/BCF and related formats.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+#include "convert.h"
+
+#define T_CHROM 1
+#define T_POS 2
+#define T_ID 3
+#define T_REF 4
+#define T_ALT 5
+#define T_QUAL 6
+#define T_FILTER 7
+#define T_INFO 8
+#define T_FORMAT 9
+#define T_SAMPLE 10
+#define T_SEP 11
+#define T_IS_TS 12
+#define T_TYPE 13
+#define T_MASK 14
+#define T_GT 15
+#define T_TGT 16
+#define T_LINE 17
+#define T_CHROM_POS_ID 18 // not publicly advertised
+#define T_GT_TO_PROB3 19 // not publicly advertised
+#define T_PL_TO_PROB3 20 // not publicly advertised
+#define T_GP_TO_PROB3 21 // not publicly advertised
+#define T_FIRST_ALT 22 // not publicly advertised
+#define T_IUPAC_GT 23
+#define T_GT_TO_HAP 24 // not publicly advertised
+#define T_GT_TO_HAP2 25 // not publicly advertised
+
+typedef struct _fmt_t
+{
+ int type, id, is_gt_field, ready, subscript;
+ char *key;
+ bcf_fmt_t *fmt;
+ void (*handler)(convert_t *, bcf1_t *, struct _fmt_t *, int, kstring_t *);
+}
+fmt_t;
+
+struct _convert_t
+{
+ fmt_t *fmt;
+ int nfmt, mfmt;
+ int nsamples, *samples;
+ bcf_hdr_t *header;
+ int max_unpack;
+ char *format_str;
+ bcf_srs_t *readers; // required only for %MASK
+ int nreaders;
+ void *dat;
+ int ndat;
+ char *undef_info_tag;
+ int allow_undef_tags;
+};
+
+
+static void process_chrom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str); }
+static void process_pos(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+1, str); }
+static void process_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.id, str); }
+static void process_ref(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.allele[0], str); }
+static void process_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int i;
+ if ( line->n_allele==1 )
+ {
+ kputc('.', str);
+ return;
+ }
+ if ( fmt->subscript>=0 )
+ {
+ if ( line->n_allele > fmt->subscript+1 )
+ kputs(line->d.allele[fmt->subscript+1], str);
+ else
+ kputc('.', str);
+ return;
+ }
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( i>1 ) kputc(',', str);
+ kputs(line->d.allele[i], str);
+ }
+}
+static void process_first_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( line->n_allele==1 )
+ kputc('.', str);
+ else
+ kputs(line->d.allele[1], str);
+}
+static void process_qual(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( bcf_float_is_missing(line->qual) ) kputc('.', str);
+ else ksprintf(str, "%g", line->qual);
+}
+static void process_filter(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int i;
+ if ( line->d.n_flt )
+ {
+ for (i=0; i<line->d.n_flt; i++)
+ {
+ if (i) kputc(';', str);
+ kputs(convert->header->id[BCF_DT_ID][line->d.flt[i]].key, str);
+ }
+ }
+ else kputc('.', str);
+}
+static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx)
+{
+ if ( type==BCF_BT_INT8 )
+ {
+ int8_t val = ((int8_t*)bcf_array)[idx];
+ if ( val==bcf_int8_missing ) return bcf_int32_missing;
+ if ( val==bcf_int8_vector_end ) return bcf_int32_vector_end;
+ return val;
+ }
+ if ( type==BCF_BT_INT16 )
+ {
+ int16_t val = ((int16_t*)bcf_array)[idx];
+ if ( val==bcf_int16_missing ) return bcf_int32_missing;
+ if ( val==bcf_int16_vector_end ) return bcf_int32_vector_end;
+ return val;
+ }
+ return ((int32_t*)bcf_array)[idx];
+}
+static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( fmt->id<0 )
+ {
+ kputc('.', str);
+ return;
+ }
+
+ int i;
+ for (i=0; i<line->n_info; i++)
+ if ( line->d.info[i].key == fmt->id ) break;
+
+ // output "." if the tag is not present
+ if ( i==line->n_info )
+ {
+ kputc('.', str);
+ return;
+ }
+
+ bcf_info_t *info = &line->d.info[i];
+
+ // if this is a flag, output 1
+ if ( info->len <=0 )
+ {
+ kputc('1', str);
+ return;
+ }
+
+ if ( info->len == 1 )
+ {
+ switch (info->type)
+ {
+ case BCF_BT_INT8: if ( info->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
+ case BCF_BT_INT16: if ( info->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
+ case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
+ case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else ksprintf(str, "%g", info->v1.f); break;
+ case BCF_BT_CHAR: kputc(info->v1.i, str); break;
+ default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
+ }
+ }
+ else if ( fmt->subscript >=0 )
+ {
+ if ( info->len <= fmt->subscript )
+ {
+ kputc('.', str);
+ return;
+ }
+ #define BRANCH(type_t, is_missing, is_vector_end, kprint) { \
+ type_t val = ((type_t *) info->vptr)[fmt->subscript]; \
+ if ( is_missing || is_vector_end ) kputc('.',str); \
+ else kprint; \
+ }
+ switch (info->type)
+ {
+ case BCF_BT_INT8: BRANCH(int8_t, val==bcf_int8_missing, val==bcf_int8_vector_end, kputw(val, str)); break;
+ case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
+ case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), ksprintf(str, "%g", val)); break;
+ default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
+ }
+ #undef BRANCH
+ }
+ else
+ bcf_fmt_array(str, info->len, info->type, info->vptr);
+}
+static void init_format(convert_t *convert, bcf1_t *line, fmt_t *fmt)
+{
+ fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key);
+ fmt->fmt = NULL;
+ if ( fmt->id >= 0 )
+ {
+ int i;
+ for (i=0; i<(int)line->n_fmt; i++)
+ if ( line->d.fmt[i].id==fmt->id ) { fmt->fmt = &line->d.fmt[i]; break; }
+ }
+ else if ( !convert->allow_undef_tags )
+ error("Error: no such tag defined in the VCF header: FORMAT/%s\n", fmt->key);
+
+ fmt->ready = 1;
+}
+static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( !fmt->ready )
+ init_format(convert, line, fmt);
+
+ if ( fmt->fmt==NULL )
+ {
+ kputc('.', str);
+ return;
+ }
+ else if ( fmt->subscript >=0 )
+ {
+ if ( fmt->fmt->n <= fmt->subscript )
+ {
+ kputc('.', str);
+ return;
+ }
+ if ( fmt->fmt->type == BCF_BT_FLOAT )
+ {
+ float *ptr = (float*)(fmt->fmt->p + isample*fmt->fmt->size);
+ if ( bcf_float_is_missing(ptr[fmt->subscript]) || bcf_float_is_vector_end(ptr[fmt->subscript]) )
+ kputc('.', str);
+ else
+ ksprintf(str, "%g", ptr[fmt->subscript]);
+ }
+ else if ( fmt->fmt->type != BCF_BT_CHAR )
+ {
+ int32_t ival = bcf_array_ivalue(fmt->fmt->p+isample*fmt->fmt->size,fmt->fmt->type,fmt->subscript);
+ if ( ival==bcf_int32_missing || ival==bcf_int32_vector_end )
+ kputc('.', str);
+ else
+ kputw(ival, str);
+ }
+ else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type);
+ }
+ else
+ bcf_fmt_array(str, fmt->fmt->n, fmt->fmt->type, fmt->fmt->p + isample*fmt->fmt->size);
+}
+static void process_gt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( !fmt->ready )
+ init_format(convert, line, fmt);
+
+ if ( fmt->fmt==NULL )
+ {
+ kputc('.', str);
+ return;
+ }
+ bcf_format_gt(fmt->fmt, isample, str);
+}
+static void process_tgt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( !fmt->ready )
+ init_format(convert, line, fmt);
+
+ if ( fmt->fmt==NULL )
+ {
+ kputc('.', str);
+ return;
+ }
+
+ assert( fmt->fmt->type==BCF_BT_INT8 );
+
+ int l;
+ int8_t *x = (int8_t*)(fmt->fmt->p + isample*fmt->fmt->size); // FIXME: does not work with n_alt >= 64
+ for (l = 0; l < fmt->fmt->n && x[l] != bcf_int8_vector_end; ++l)
+ {
+ if (l) kputc("/|"[x[l]&1], str);
+ if (x[l]>>1)
+ {
+ int ial = (x[l]>>1) - 1;
+ kputs(line->d.allele[ial], str);
+ }
+ else
+ kputc('.', str);
+ }
+ if (l == 0) kputc('.', str);
+}
+static void init_format_iupac(convert_t *convert, bcf1_t *line, fmt_t *fmt)
+{
+ init_format(convert, line, fmt);
+ if ( fmt->fmt==NULL ) return;
+
+ // Init mapping between alleles and IUPAC table
+ hts_expand(uint8_t, line->n_allele, convert->ndat, convert->dat);
+ int8_t *dat = (int8_t*)convert->dat;
+ int i;
+ for (i=0; i<line->n_allele; i++)
+ {
+ if ( line->d.allele[i][1] ) dat[i] = -1;
+ else
+ {
+ switch (line->d.allele[i][0])
+ {
+ case 'A': dat[i] = 0; break;
+ case 'C': dat[i] = 1; break;
+ case 'G': dat[i] = 2; break;
+ case 'T': dat[i] = 3; break;
+ case 'a': dat[i] = 0; break;
+ case 'c': dat[i] = 1; break;
+ case 'g': dat[i] = 2; break;
+ case 't': dat[i] = 3; break;
+ default: dat[i] = -1;
+ }
+ }
+ }
+}
+static void process_iupac_gt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( !fmt->ready )
+ init_format_iupac(convert, line, fmt);
+
+ if ( fmt->fmt==NULL )
+ {
+ kputc('.', str);
+ return;
+ }
+
+ assert( fmt->fmt->type==BCF_BT_INT8 );
+
+ static const char iupac[4][4] = { {'A','M','R','W'},{'M','C','S','Y'},{'R','S','G','K'},{'W','Y','K','T'} };
+ int8_t *dat = (int8_t*)convert->dat;
+
+ int8_t *x = (int8_t*)(fmt->fmt->p + isample*fmt->fmt->size); // FIXME: does not work with n_alt >= 64
+ int l = 0;
+ while ( l<fmt->fmt->n && x[l]!=bcf_int8_vector_end && x[l]!=bcf_int8_missing ) l++;
+
+ if ( l==2 )
+ {
+ // diploid
+ int ia = (x[0]>>1) - 1, ib = (x[1]>>1) - 1;
+ if ( ia>=0 && ia<line->n_allele && ib>=0 && ib<line->n_allele && dat[ia]>=0 && dat[ib]>=0 )
+ {
+ kputc(iupac[dat[ia]][dat[ib]], str);
+ return;
+ }
+ }
+ for (l = 0; l < fmt->fmt->n && x[l] != bcf_int8_vector_end; ++l)
+ {
+ if (l) kputc("/|"[x[l]&1], str);
+ if (x[l]>>1)
+ {
+ int ial = (x[l]>>1) - 1;
+ kputs(line->d.allele[ial], str);
+ }
+ else
+ kputc('.', str);
+ }
+ if (l == 0) kputc('.', str);
+}
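/*
 * Illustrative sketch (not from the upstream source): the lookup table above
 * collapses a biallelic SNP het into one IUPAC ambiguity code, e.g. C/T -> 'Y'.
 */
static char iupac_example(void)
{
    static const char iupac[4][4] = { {'A','M','R','W'},{'M','C','S','Y'},
                                      {'R','S','G','K'},{'W','Y','K','T'} };
    int ia = 1;             /* 'C' in the A,C,G,T ordering used by init_format_iupac() */
    int ib = 3;             /* 'T'                                                     */
    return iupac[ia][ib];   /* 'Y' */
}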
+static void process_sample(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ kputs(convert->header->samples[isample], str);
+}
+static void process_sep(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { if (fmt->key) kputs(fmt->key, str); }
+static void process_is_ts(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int is_ts = 0;
+ if ( bcf_get_variant_types(line) & (VCF_SNP|VCF_MNP) )
+ is_ts = abs(bcf_acgt2int(*line->d.allele[0])-bcf_acgt2int(*line->d.allele[1])) == 2 ? 1 : 0;
+ kputc(is_ts ? '1' : '0', str);
+}
+static void process_type(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int line_type = bcf_get_variant_types(line);
+ int i = 0;
+ if ( line_type == VCF_REF ) { kputs("REF", str); i++; }
+ if ( line_type & VCF_SNP ) { if (i) kputc(',',str); kputs("SNP", str); i++; }
+ if ( line_type & VCF_MNP ) { if (i) kputc(',',str); kputs("MNP", str); i++; }
+ if ( line_type & VCF_INDEL ) { if (i) kputc(',',str); kputs("INDEL", str); i++; }
+ if ( line_type & VCF_OTHER ) { if (i) kputc(',',str); kputs("OTHER", str); i++; }
+}
+static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ vcf_format1(convert->header, line, str);
+}
+static void process_chrom_pos_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( line->d.id[0]!='.' || line->d.id[1] )
+ {
+ // ID is present
+ kputs(line->d.id, str);
+ }
+ else
+ {
+ // use CHROM:POS instead of ID
+ kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str);
+ kputc(':', str);
+ kputw(line->pos+1, str);
+ }
+}
+static void process_gt_to_prob3(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int m,n,i;
+
+ m = convert->ndat / sizeof(int32_t);
+ n = bcf_get_genotypes(convert->header,line,&convert->dat,&m);
+ convert->ndat = m * sizeof(int32_t);
+
+ if ( n<=0 )
+ {
+ // Throw an error or silently proceed?
+ //
+ // for (i=0; i<convert->nsamples; i++) kputs(" 0.33 0.33 0.33", str);
+ // return;
+
+ error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1);
+ }
+
+ n /= convert->nsamples;
+ for (i=0; i<convert->nsamples; i++)
+ {
+ int32_t *ptr = (int32_t*)convert->dat + i*n;
+ int j;
+ for (j=0; j<n; j++)
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+
+ if ( j==2 )
+ {
+ // diploid
+ if ( bcf_gt_is_missing(ptr[0]) )
+ kputs(" 0.33 0.33 0.33", str);
+ else if ( bcf_gt_allele(ptr[0])!=bcf_gt_allele(ptr[1]) )
+ kputs(" 0 1 0", str); // HET
+ else if ( bcf_gt_allele(ptr[0])==1 )
+ kputs(" 0 0 1", str); // ALT HOM, first ALT allele
+ else
+ kputs(" 1 0 0", str); // REF HOM or something else than first ALT
+ }
+ else if ( j==1 )
+ {
+ // haploid
+ if ( bcf_gt_is_missing(ptr[0]) )
+ kputs(" 0.5 0.0 0.5", str);
+ else if ( bcf_gt_allele(ptr[0])==1 )
+ kputs(" 0 0 1", str); // first ALT allele
+ else
+ kputs(" 1 0 0", str); // REF or something else than first ALT
+ }
+ else error("FIXME: not ready for ploidy %d\n", j);
+ }
+}
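/*
 * For reference (paraphrasing the branches above): a diploid GT becomes a
 * probability triple over {REF hom, het, first-ALT hom}, e.g.
 *   0/0 -> " 1 0 0",  0/1 -> " 0 1 0",  1/1 -> " 0 0 1",  ./. -> " 0.33 0.33 0.33".
 */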
+static void process_pl_to_prob3(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int m,n,i;
+
+ m = convert->ndat / sizeof(int32_t);
+ n = bcf_get_format_int32(convert->header,line,"PL",&convert->dat,&m);
+ convert->ndat = m * sizeof(int32_t);
+
+ if ( n<=0 )
+ {
+ // Throw an error or silently proceed?
+ //
+ // for (i=0; i<convert->nsamples; i++) kputs(" 0.33 0.33 0.33", str);
+ // return;
+
+ error("Error parsing PL tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1);
+ }
+
+ n /= convert->nsamples;
+ for (i=0; i<convert->nsamples; i++)
+ {
+ int32_t *ptr = (int32_t*)convert->dat + i*n;
+ int j;
+ float sum = 0;
+ for (j=0; j<n; j++)
+ {
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+ sum += pow(10,-0.1*ptr[j]);
+ }
+ if ( j==line->n_allele )
+ {
+ // haploid
+ kputc(' ',str);
+ ksprintf(str,"%f",pow(10,-0.1*ptr[0])/sum);
+ kputs(" 0 ", str);
+ ksprintf(str,"%f",pow(10,-0.1*ptr[1])/sum);
+ }
+ else
+ {
+ // diploid
+ kputc(' ',str);
+ ksprintf(str,"%f",pow(10,-0.1*ptr[0])/sum);
+ kputc(' ',str);
+ ksprintf(str,"%f",pow(10,-0.1*ptr[1])/sum);
+ kputc(' ',str);
+ ksprintf(str,"%f",pow(10,-0.1*ptr[2])/sum);
+ }
+ }
+}
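/*
 * Worked example (hypothetical PL values, not part of the upstream file):
 * PL is phred-scaled, so each genotype likelihood is 10^(-PL/10) and the
 * triple is normalised by the sum, exactly as in the loop above.
 */
#include <math.h>
#include <stdio.h>

static void pl_example(void)
{
    int pl[3] = { 0, 30, 200 };         /* PL for 0/0, 0/1, 1/1 */
    double sum = 0, p[3];
    int j;
    for (j=0; j<3; j++) sum += pow(10, -0.1*pl[j]);
    for (j=0; j<3; j++) p[j] = pow(10, -0.1*pl[j]) / sum;
    printf("%f %f %f\n", p[0], p[1], p[2]);   /* ~0.999001 0.000999 ~0 */
}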
+static void process_gp_to_prob3(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int m,n,i;
+
+ m = convert->ndat / sizeof(float);
+ n = bcf_get_format_float(convert->header,line,"GP",&convert->dat,&m);
+ convert->ndat = m * sizeof(float);
+
+ if ( n<=0 )
+ {
+ // Throw an error or silently proceed?
+ //
+ // for (i=0; i<convert->nsamples; i++) kputs(" 0.33 0.33 0.33", str);
+ // return;
+
+ error("Error parsing GP tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1);
+ }
+
+ n /= convert->nsamples;
+ for (i=0; i<convert->nsamples; i++)
+ {
+ float sum = 0, *ptr = (float*)convert->dat + i*n;
+ int j;
+ for (j=0; j<n; j++)
+ {
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+ if ( ptr[j]==bcf_int32_missing ) { ptr[j]=0; continue; }
+ if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%d:%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),line->pos+1,ptr[j]);
+ sum+=ptr[j];
+ }
+ if ( j==line->n_allele )
+ ksprintf(str," %f %f %f",ptr[0],0.,ptr[1]); // haploid
+ else
+ ksprintf(str," %f %f %f",ptr[0],ptr[1],ptr[2]); // diploid
+ }
+}
+
+static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ // https://mathgen.stats.ox.ac.uk/impute/impute_v2.html#-known_haps_g
+
+ // File containing known haplotypes for the study cohort. The format
+ // is the same as the output format from IMPUTE2's -phase option:
+ // five header columns (as in the -g file) followed by two columns
+ // (haplotypes) per individual. Allowed values in the haplotype
+ // columns are 0, 1, and ?.
+
+ // If your study dataset is fully phased, you can replace the -g file
+ // with a -known_haps_g file. This will cause IMPUTE2 to perform
+ // haploid imputation, although it will still report diploid imputation
+ // probabilities in the main output file. If any genotypes are missing,
+ // they can be marked as '? ?' (two question marks separated by one
+ // space) in the input file. (The program does not allow just one
+ // allele from a diploid genotype to be missing.) If the reference
+ // panels are also phased, IMPUTE2 will perform a single, fast
+ // imputation step rather than its standard MCMC module; this is how
+ // the program imputes into pre-phased GWAS haplotypes.
+
+ // The -known_haps_g file can also be used to specify study
+ // genotypes that are "partially" phased, in the sense that some
+ // genotypes are phased relative to a fixed reference point while
+ // others are not. We anticipate that this will be most useful when
+ // trying to phase resequencing data onto a scaffold of known
+ // haplotypes. To mark a known genotype as unphased, place an
+ // asterisk immediately after each allele, with no space between
+ // the allele (0/1) and the asterisk (*); e.g., "0* 1*" for a
+ // heterozygous genotype of unknown phase.
+
+ int m, n, i;
+
+ m = convert->ndat / sizeof(int32_t);
+ n = bcf_get_genotypes(convert->header, line, &convert->dat, &m);
+ convert->ndat = m * sizeof(int32_t);
+
+ if ( n<=0 )
+ {
+ // Throw an error or silently proceed?
+ //
+ // for (i=0; i<convert->nsamples; i++) kputs(" ...", str);
+ // return;
+
+ error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+ }
+
+ n /= convert->nsamples;
+ for (i=0; i<convert->nsamples; i++)
+ {
+ int32_t *ptr = (int32_t*)convert->dat + i*n;
+ int j;
+ for (j=0; j<n; j++)
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+
+ if (i>0) kputs(" ", str); // no space separation for first column
+ if ( j==2 )
+ {
+ // diploid
+ if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) {
+ kputs("? ?", str);
+ }
+ else if ( bcf_gt_is_phased(ptr[1])) {
+ ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ }
+ else {
+ ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ }
+ }
+ else if ( j==1 )
+ {
+ // haploid
+ if ( bcf_gt_is_missing(ptr[0]) )
+ kputs("? -", str);
+ else if ( bcf_gt_allele(ptr[0])==1 )
+ kputs("1 -", str); // first ALT allele
+ else
+ kputs("0 -", str); // REF or something else than first ALT
+ }
+ else error("FIXME: not ready for ploidy %d\n", j);
+ }
+}
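/*
 * For reference (hypothetical genotypes): with three samples carrying 0|1,
 * ./. and 0/1, the loop above emits the fragment
 *   "0 1 ? ? 0* 1*"
 * i.e. phased alleles as plain 0/1, missing genotypes as "? ?", and unphased
 * alleles marked with a trailing asterisk, as described in the IMPUTE2 notes.
 */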
+static void process_gt_to_hap2(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ // same as process_gt_to_hap but converts haploid genotypes into diploid
+ int m, n, i;
+
+ m = convert->ndat / sizeof(int32_t);
+ n = bcf_get_genotypes(convert->header, line, &convert->dat, &m);
+ convert->ndat = m * sizeof(int32_t);
+
+ if ( n<=0 )
+ error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+
+ n /= convert->nsamples;
+ for (i=0; i<convert->nsamples; i++)
+ {
+ int32_t *ptr = (int32_t*)convert->dat + i*n;
+ int j;
+ for (j=0; j<n; j++)
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+
+ if (i>0) kputs(" ", str); // no space separation for first column
+ if ( j==2 )
+ {
+ // diploid
+ if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) {
+ kputs("? ?", str);
+ }
+ else if ( bcf_gt_is_phased(ptr[1])) {
+ ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ }
+ else {
+ ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ }
+ }
+ else if ( j==1 )
+ {
+ // haploid
+ if ( bcf_gt_is_missing(ptr[0]) )
+ kputs("? ?", str);
+ else if ( bcf_gt_allele(ptr[0])==1 )
+ kputs("1 1", str); // first ALT allele
+ else
+ kputs("0 0", str); // REF or something else than first ALT
+ }
+ else error("FIXME: not ready for ploidy %d\n", j);
+ }
+}
+
+static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
+{
+ convert->nfmt++;
+ if ( convert->nfmt > convert->mfmt )
+ {
+ convert->mfmt += 10;
+ convert->fmt = (fmt_t*) realloc(convert->fmt, convert->mfmt*sizeof(fmt_t));
+ }
+ fmt_t *fmt = &convert->fmt[ convert->nfmt-1 ];
+ fmt->type = type;
+ fmt->key = key ? strdup(key) : NULL;
+ fmt->is_gt_field = is_gtf;
+ fmt->subscript = -1;
+
+ // Allow non-format tags, such as CHROM, INFO, etc., to appear amongst the format tags.
+ if ( key )
+ {
+ int id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
+ if ( fmt->type==T_FORMAT && !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,id) )
+ {
+ if ( !strcmp("CHROM",key) ) { fmt->type = T_CHROM; }
+ else if ( !strcmp("POS",key) ) { fmt->type = T_POS; }
+ else if ( !strcmp("ID",key) ) { fmt->type = T_ID; }
+ else if ( !strcmp("REF",key) ) { fmt->type = T_REF; }
+ else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
+ else if ( !strcmp("FIRST_ALT",key) ) { fmt->type = T_FIRST_ALT; }
+ else if ( !strcmp("QUAL",key) ) { fmt->type = T_QUAL; }
+ else if ( !strcmp("FILTER",key) ) { fmt->type = T_FILTER; }
+ else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; }
+ else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) )
+ {
+ fmt->type = T_INFO;
+ fprintf(stderr,"Warning: Assuming INFO/%s\n", key);
+ }
+ }
+ }
+
+ switch (fmt->type)
+ {
+ case T_FIRST_ALT: fmt->handler = &process_first_alt; break;
+ case T_CHROM_POS_ID: fmt->handler = &process_chrom_pos_id; break;
+ case T_GT_TO_PROB3: fmt->handler = &process_gt_to_prob3; break;
+ case T_PL_TO_PROB3: fmt->handler = &process_pl_to_prob3; break;
+ case T_GP_TO_PROB3: fmt->handler = &process_gp_to_prob3; break;
+ case T_CHROM: fmt->handler = &process_chrom; break;
+ case T_POS: fmt->handler = &process_pos; break;
+ case T_ID: fmt->handler = &process_id; break;
+ case T_REF: fmt->handler = &process_ref; break;
+ case T_ALT: fmt->handler = &process_alt; break;
+ case T_QUAL: fmt->handler = &process_qual; break;
+ case T_FILTER: fmt->handler = &process_filter; convert->max_unpack |= BCF_UN_FLT; break;
+ case T_INFO: fmt->handler = &process_info; convert->max_unpack |= BCF_UN_INFO; break;
+ case T_FORMAT: fmt->handler = &process_format; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_SAMPLE: fmt->handler = &process_sample; break;
+ case T_SEP: fmt->handler = &process_sep; break;
+ case T_IS_TS: fmt->handler = &process_is_ts; break;
+ case T_TYPE: fmt->handler = &process_type; break;
+ case T_MASK: fmt->handler = NULL; break;
+ case T_GT: fmt->handler = &process_gt; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_TGT: fmt->handler = &process_tgt; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_IUPAC_GT: fmt->handler = &process_iupac_gt; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_GT_TO_HAP: fmt->handler = &process_gt_to_hap; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_GT_TO_HAP2: fmt->handler = &process_gt_to_hap2; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_LINE: fmt->handler = &process_line; break;
+ default: error("TODO: handler for type %d\n", fmt->type);
+ }
+ if ( key )
+ {
+ if ( fmt->type==T_INFO )
+ {
+ fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
+ if ( fmt->id==-1 ) convert->undef_info_tag = strdup(key);
+ }
+ }
+ return fmt;
+}
+
+static int parse_subscript(char **p)
+{
+ char *q = *p;
+ if ( *q!='{' ) return -1;
+ q++;
+ while ( *q && *q!='}' && isdigit(*q) ) q++;
+ if ( *q!='}' ) return -1;
+ int idx = atoi((*p)+1);
+ *p = q+1;
+ return idx;
+}
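/*
 * Illustrative sketch (hypothetical input): parse_subscript() consumes an
 * optional "{N}" suffix, so a tag such as "%INFO/AF{0}" selects the first
 * value of a multi-valued field.
 */
static void subscript_example(void)
{
    char buf[] = "{2}\t";
    char *p = buf;
    int idx = parse_subscript(&p);   /* idx == 2, p now points at the tab */
    (void)idx;
}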
+
+static char *parse_tag(convert_t *convert, char *p, int is_gtf)
+{
+ char *q = ++p;
+ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++;
+ kstring_t str = {0,0,0};
+ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
+ kputsn(p, q-p, &str);
+ if ( is_gtf )
+ {
+ if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf);
+ else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf);
+ else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf);
+ else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf);
+ else if ( !strcmp(str.s, "INFO") )
+ {
+ if ( *q!='/' ) error("Could not parse format string: %s\n", convert->format_str);
+ p = ++q;
+ str.l = 0;
+ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++;
+ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
+ kputsn(p, q-p, &str);
+ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ }
+ else
+ {
+ fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ }
+ }
+ else
+ {
+ if ( !strcmp(str.s, "CHROM") ) register_tag(convert, T_CHROM, str.s, is_gtf);
+ else if ( !strcmp(str.s, "POS") ) register_tag(convert, T_POS, str.s, is_gtf);
+ else if ( !strcmp(str.s, "ID") ) register_tag(convert, T_ID, str.s, is_gtf);
+ else if ( !strcmp(str.s, "REF") ) register_tag(convert, T_REF, str.s, is_gtf);
+ else if ( !strcmp(str.s, "ALT") )
+ {
+ fmt_t *fmt = register_tag(convert, T_ALT, str.s, is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ }
+ else if ( !strcmp(str.s, "FIRST_ALT") ) register_tag(convert, T_FIRST_ALT, str.s, is_gtf);
+ else if ( !strcmp(str.s, "QUAL") ) register_tag(convert, T_QUAL, str.s, is_gtf);
+ else if ( !strcmp(str.s, "FILTER") ) register_tag(convert, T_FILTER, str.s, is_gtf);
+ else if ( !strcmp(str.s, "QUAL") ) register_tag(convert, T_QUAL, str.s, is_gtf);
+ else if ( !strcmp(str.s, "IS_TS") ) register_tag(convert, T_IS_TS, str.s, is_gtf);
+ else if ( !strcmp(str.s, "TYPE") ) register_tag(convert, T_TYPE, str.s, is_gtf);
+ else if ( !strcmp(str.s, "MASK") ) register_tag(convert, T_MASK, str.s, is_gtf);
+ else if ( !strcmp(str.s, "LINE") ) register_tag(convert, T_LINE, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_CHROM_POS_ID") ) register_tag(convert, T_CHROM_POS_ID, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_GT_TO_PROB3") ) register_tag(convert, T_GT_TO_PROB3, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_PL_TO_PROB3") ) register_tag(convert, T_PL_TO_PROB3, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, T_GP_TO_PROB3, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_GT_TO_HAP") ) register_tag(convert, T_GT_TO_HAP, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf);
+ else if ( !strcmp(str.s, "INFO") )
+ {
+ if ( *q!='/' ) error("Could not parse format string: %s\n", convert->format_str);
+ p = ++q;
+ str.l = 0;
+ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++;
+ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
+ kputsn(p, q-p, &str);
+ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ }
+ else
+ {
+ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ }
+ }
+ free(str.s);
+ return q;
+}
+
+static char *parse_sep(convert_t *convert, char *p, int is_gtf)
+{
+ char *q = p;
+ kstring_t str = {0,0,0};
+ while ( *q && *q!='[' && *q!=']' && *q!='%' )
+ {
+ if ( *q=='\\' )
+ {
+ q++;
+ if ( *q=='n' ) kputc('\n', &str);
+ else if ( *q=='t' ) kputc('\t', &str);
+ else kputc(*q, &str);
+ }
+ else kputc(*q, &str);
+ q++;
+ }
+ if ( !str.l ) error("Could not parse format string: %s\n", convert->format_str);
+ register_tag(convert, T_SEP, str.s, is_gtf);
+ free(str.s);
+ return q;
+}
+
+convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *format_str)
+{
+ convert_t *convert = (convert_t*) calloc(1,sizeof(convert_t));
+ convert->header = hdr;
+ convert->format_str = strdup(format_str);
+ convert->max_unpack = BCF_UN_STR;
+
+ int i, is_gtf = 0;
+ char *p = convert->format_str;
+ while ( *p )
+ {
+ //fprintf(stderr,"<%s>\n", p);
+ switch (*p)
+ {
+ case '[': is_gtf = 1; p++; break;
+ case ']': is_gtf = 0; register_tag(convert, T_SEP, NULL, 0); p++; break;
+ case '%': p = parse_tag(convert, p, is_gtf); break;
+ default: p = parse_sep(convert, p, is_gtf); break;
+ }
+ }
+
+ if ( nsamples )
+ {
+ convert->nsamples = nsamples;
+ convert->samples = (int*) malloc(sizeof(int)*nsamples);
+ for (i=0; i<convert->nsamples; i++) convert->samples[i] = samples[i];
+ }
+ else
+ {
+ convert->nsamples = bcf_hdr_nsamples(convert->header);
+ convert->samples = (int*) malloc(sizeof(int)*convert->nsamples);
+ for (i=0; i<convert->nsamples; i++) convert->samples[i] = i;
+ }
+ return convert;
+}
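/*
 * Minimal usage sketch (not part of the upstream file; the file name and the
 * format string are hypothetical, and the htslib calls are assumed to be the
 * ones shipped with this pysam version): the API in this file is driven as
 * convert_init() -> convert_header() -> one convert_line() per record ->
 * convert_destroy().
 */
#include <stdio.h>
#include <stdlib.h>
#include <htslib/vcf.h>
#include "convert.h"

static void convert_usage_example(void)
{
    htsFile *fp = hts_open("in.vcf.gz", "r");
    bcf_hdr_t *hdr = bcf_hdr_read(fp);
    bcf1_t *rec = bcf_init();
    kstring_t str = {0,0,0};

    convert_t *cnv = convert_init(hdr, NULL, 0, "%CHROM\t%POS[\t%SAMPLE=%GT]\n");
    if ( convert_header(cnv, &str) > 0 ) fputs(str.s, stdout);   /* "# [1]CHROM ..." */

    while ( bcf_read(fp, hdr, rec)==0 )
    {
        convert_line(cnv, rec, &str);    /* refills str with one formatted line */
        fputs(str.s, stdout);
    }

    convert_destroy(cnv);
    free(str.s);
    bcf_destroy(rec);
    bcf_hdr_destroy(hdr);
    hts_close(fp);
}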
+
+void convert_destroy(convert_t *convert)
+{
+ int i;
+ for (i=0; i<convert->nfmt; i++)
+ free(convert->fmt[i].key);
+ free(convert->fmt);
+ free(convert->undef_info_tag);
+ free(convert->dat);
+ free(convert->samples);
+ free(convert->format_str);
+ free(convert);
+}
+
+
+int convert_header(convert_t *convert, kstring_t *str)
+{
+ int i, icol = 0, l_ori = str->l;
+ bcf_hdr_t *hdr = convert->header;
+
+ // Suppress the header output if LINE is present
+ for (i=0; i<convert->nfmt; i++)
+ if ( convert->fmt[i].type == T_LINE ) break;
+ if ( i!=convert->nfmt )
+ return str->l - l_ori;
+
+ kputs("# ", str);
+ for (i=0; i<convert->nfmt; i++)
+ {
+ // Genotype fields
+ if ( convert->fmt[i].is_gt_field )
+ {
+ int j = i, js, k;
+ while ( convert->fmt[j].is_gt_field ) j++;
+ for (js=0; js<convert->nsamples; js++)
+ {
+ int ks = convert->samples[js];
+ for (k=i; k<j; k++)
+ {
+ if ( convert->fmt[k].type == T_SEP )
+ {
+ if ( convert->fmt[k].key ) kputs(convert->fmt[k].key, str);
+ }
+ else if ( convert->fmt[k].type == T_SAMPLE )
+ ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
+ else
+ ksprintf(str, "[%d]%s:%s", ++icol, hdr->samples[ks], convert->fmt[k].key);
+ }
+ }
+ i = j-1;
+ continue;
+ }
+ // Fixed fields
+ if ( convert->fmt[i].type == T_SEP )
+ {
+ if ( convert->fmt[i].key ) kputs(convert->fmt[i].key, str);
+ continue;
+ }
+ ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key);
+ }
+ return str->l - l_ori;
+}
+
+int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
+{
+ if ( !convert->allow_undef_tags && convert->undef_info_tag )
+ error("Error: no such tag defined in the VCF header: INFO/%s\n", convert->undef_info_tag);
+
+ int l_ori = str->l;
+ bcf_unpack(line, convert->max_unpack);
+
+ int i, ir;
+ str->l = 0;
+ for (i=0; i<convert->nfmt; i++)
+ {
+ // Genotype fields
+ if ( convert->fmt[i].is_gt_field )
+ {
+ int j = i, js, k;
+ while ( convert->fmt[j].is_gt_field )
+ {
+ convert->fmt[j].ready = 0;
+ j++;
+ }
+ for (js=0; js<convert->nsamples; js++)
+ {
+ int ks = convert->samples[js];
+ for (k=i; k<j; k++)
+ {
+ if ( convert->fmt[k].type == T_MASK )
+ {
+ for (ir=0; ir<convert->nreaders; ir++)
+ kputc(bcf_sr_has_line(convert->readers,ir)?'1':'0', str);
+ }
+ else if ( convert->fmt[k].handler )
+ convert->fmt[k].handler(convert, line, &convert->fmt[k], ks, str);
+ }
+ }
+ i = j-1;
+ continue;
+ }
+ // Fixed fields
+ if ( convert->fmt[i].type == T_MASK )
+ {
+ for (ir=0; ir<convert->nreaders; ir++)
+ kputc(bcf_sr_has_line(convert->readers,ir)?'1':'0', str);
+ }
+ else if ( convert->fmt[i].handler )
+ convert->fmt[i].handler(convert, line, &convert->fmt[i], -1, str);
+ }
+ return str->l - l_ori;
+}
+
+int convert_set_option(convert_t *convert, enum convert_option opt, ...)
+{
+ int ret = 0;
+ va_list args;
+
+ va_start(args, opt);
+ switch (opt)
+ {
+ case allow_undef_tags:
+ convert->allow_undef_tags = va_arg(args, int);
+ break;
+ default:
+ ret = -1;
+ }
+ va_end(args);
+ return ret;
+}
+
+int convert_max_unpack(convert_t *convert)
+{
+ return convert->max_unpack;
+}
+
diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c
new file mode 100644
index 0000000..ee27882
--- /dev/null
+++ b/bcftools/convert.c.pysam.c
@@ -0,0 +1,1058 @@
+#include "pysam.h"
+
+/* convert.c -- functions for converting between VCF/BCF and related formats.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+#include "convert.h"
+
+#define T_CHROM 1
+#define T_POS 2
+#define T_ID 3
+#define T_REF 4
+#define T_ALT 5
+#define T_QUAL 6
+#define T_FILTER 7
+#define T_INFO 8
+#define T_FORMAT 9
+#define T_SAMPLE 10
+#define T_SEP 11
+#define T_IS_TS 12
+#define T_TYPE 13
+#define T_MASK 14
+#define T_GT 15
+#define T_TGT 16
+#define T_LINE 17
+#define T_CHROM_POS_ID 18 // not publicly advertised
+#define T_GT_TO_PROB3 19 // not publicly advertised
+#define T_PL_TO_PROB3 20 // not publicly advertised
+#define T_GP_TO_PROB3 21 // not publicly advertised
+#define T_FIRST_ALT 22 // not publicly advertised
+#define T_IUPAC_GT 23
+#define T_GT_TO_HAP 24 // not publicly advertised
+#define T_GT_TO_HAP2 25 // not publicly advertised
+
+typedef struct _fmt_t
+{
+ int type, id, is_gt_field, ready, subscript;
+ char *key;
+ bcf_fmt_t *fmt;
+ void (*handler)(convert_t *, bcf1_t *, struct _fmt_t *, int, kstring_t *);
+}
+fmt_t;
+
+struct _convert_t
+{
+ fmt_t *fmt;
+ int nfmt, mfmt;
+ int nsamples, *samples;
+ bcf_hdr_t *header;
+ int max_unpack;
+ char *format_str;
+ bcf_srs_t *readers; // required only for %MASK
+ int nreaders;
+ void *dat;
+ int ndat;
+ char *undef_info_tag;
+ int allow_undef_tags;
+};
+
+
+static void process_chrom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str); }
+static void process_pos(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+1, str); }
+static void process_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.id, str); }
+static void process_ref(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.allele[0], str); }
+static void process_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int i;
+ if ( line->n_allele==1 )
+ {
+ kputc('.', str);
+ return;
+ }
+ if ( fmt->subscript>=0 )
+ {
+ if ( line->n_allele > fmt->subscript+1 )
+ kputs(line->d.allele[fmt->subscript+1], str);
+ else
+ kputc('.', str);
+ return;
+ }
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( i>1 ) kputc(',', str);
+ kputs(line->d.allele[i], str);
+ }
+}
+static void process_first_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( line->n_allele==1 )
+ kputc('.', str);
+ else
+ kputs(line->d.allele[1], str);
+}
+static void process_qual(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( bcf_float_is_missing(line->qual) ) kputc('.', str);
+ else ksprintf(str, "%g", line->qual);
+}
+static void process_filter(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int i;
+ if ( line->d.n_flt )
+ {
+ for (i=0; i<line->d.n_flt; i++)
+ {
+ if (i) kputc(';', str);
+ kputs(convert->header->id[BCF_DT_ID][line->d.flt[i]].key, str);
+ }
+ }
+ else kputc('.', str);
+}
+static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx)
+{
+ if ( type==BCF_BT_INT8 )
+ {
+ int8_t val = ((int8_t*)bcf_array)[idx];
+ if ( val==bcf_int8_missing ) return bcf_int32_missing;
+ if ( val==bcf_int8_vector_end ) return bcf_int32_vector_end;
+ return val;
+ }
+ if ( type==BCF_BT_INT16 )
+ {
+ int16_t val = ((int16_t*)bcf_array)[idx];
+ if ( val==bcf_int16_missing ) return bcf_int32_missing;
+ if ( val==bcf_int16_vector_end ) return bcf_int32_vector_end;
+ return val;
+ }
+ return ((int32_t*)bcf_array)[idx];
+}
+static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( fmt->id<0 )
+ {
+ kputc('.', str);
+ return;
+ }
+
+ int i;
+ for (i=0; i<line->n_info; i++)
+ if ( line->d.info[i].key == fmt->id ) break;
+
+ // output "." if the tag is not present
+ if ( i==line->n_info )
+ {
+ kputc('.', str);
+ return;
+ }
+
+ bcf_info_t *info = &line->d.info[i];
+
+ // if this is a flag, output 1
+ if ( info->len <=0 )
+ {
+ kputc('1', str);
+ return;
+ }
+
+ if ( info->len == 1 )
+ {
+ switch (info->type)
+ {
+ case BCF_BT_INT8: if ( info->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
+ case BCF_BT_INT16: if ( info->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
+ case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
+ case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else ksprintf(str, "%g", info->v1.f); break;
+ case BCF_BT_CHAR: kputc(info->v1.i, str); break;
+ default: fprintf(pysamerr,"todo: type %d\n", info->type); exit(1); break;
+ }
+ }
+ else if ( fmt->subscript >=0 )
+ {
+ if ( info->len <= fmt->subscript )
+ {
+ kputc('.', str);
+ return;
+ }
+ #define BRANCH(type_t, is_missing, is_vector_end, kprint) { \
+ type_t val = ((type_t *) info->vptr)[fmt->subscript]; \
+ if ( is_missing || is_vector_end ) kputc('.',str); \
+ else kprint; \
+ }
+ switch (info->type)
+ {
+ case BCF_BT_INT8: BRANCH(int8_t, val==bcf_int8_missing, val==bcf_int8_vector_end, kputw(val, str)); break;
+ case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
+ case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), ksprintf(str, "%g", val)); break;
+ default: fprintf(pysamerr,"todo: type %d\n", info->type); exit(1); break;
+ }
+ #undef BRANCH
+ }
+ else
+ bcf_fmt_array(str, info->len, info->type, info->vptr);
+}
+static void init_format(convert_t *convert, bcf1_t *line, fmt_t *fmt)
+{
+ fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key);
+ fmt->fmt = NULL;
+ if ( fmt->id >= 0 )
+ {
+ int i;
+ for (i=0; i<(int)line->n_fmt; i++)
+ if ( line->d.fmt[i].id==fmt->id ) { fmt->fmt = &line->d.fmt[i]; break; }
+ }
+ else if ( !convert->allow_undef_tags )
+ error("Error: no such tag defined in the VCF header: FORMAT/%s\n", fmt->key);
+
+ fmt->ready = 1;
+}
+static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( !fmt->ready )
+ init_format(convert, line, fmt);
+
+ if ( fmt->fmt==NULL )
+ {
+ kputc('.', str);
+ return;
+ }
+ else if ( fmt->subscript >=0 )
+ {
+ if ( fmt->fmt->n <= fmt->subscript )
+ {
+ kputc('.', str);
+ return;
+ }
+ if ( fmt->fmt->type == BCF_BT_FLOAT )
+ {
+ float *ptr = (float*)(fmt->fmt->p + isample*fmt->fmt->size);
+ if ( bcf_float_is_missing(ptr[fmt->subscript]) || bcf_float_is_vector_end(ptr[fmt->subscript]) )
+ kputc('.', str);
+ else
+ ksprintf(str, "%g", ptr[fmt->subscript]);
+ }
+ else if ( fmt->fmt->type != BCF_BT_CHAR )
+ {
+ int32_t ival = bcf_array_ivalue(fmt->fmt->p+isample*fmt->fmt->size,fmt->fmt->type,fmt->subscript);
+ if ( ival==bcf_int32_missing || ival==bcf_int32_vector_end )
+ kputc('.', str);
+ else
+ kputw(ival, str);
+ }
+ else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type);
+ }
+ else
+ bcf_fmt_array(str, fmt->fmt->n, fmt->fmt->type, fmt->fmt->p + isample*fmt->fmt->size);
+}
+static void process_gt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( !fmt->ready )
+ init_format(convert, line, fmt);
+
+ if ( fmt->fmt==NULL )
+ {
+ kputc('.', str);
+ return;
+ }
+ bcf_format_gt(fmt->fmt, isample, str);
+}
+static void process_tgt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( !fmt->ready )
+ init_format(convert, line, fmt);
+
+ if ( fmt->fmt==NULL )
+ {
+ kputc('.', str);
+ return;
+ }
+
+ assert( fmt->fmt->type==BCF_BT_INT8 );
+
+ int l;
+ int8_t *x = (int8_t*)(fmt->fmt->p + isample*fmt->fmt->size); // FIXME: does not work with n_alt >= 64
+ for (l = 0; l < fmt->fmt->n && x[l] != bcf_int8_vector_end; ++l)
+ {
+ if (l) kputc("/|"[x[l]&1], str);
+ if (x[l]>>1)
+ {
+ int ial = (x[l]>>1) - 1;
+ kputs(line->d.allele[ial], str);
+ }
+ else
+ kputc('.', str);
+ }
+ if (l == 0) kputc('.', str);
+}
+static void init_format_iupac(convert_t *convert, bcf1_t *line, fmt_t *fmt)
+{
+ init_format(convert, line, fmt);
+ if ( fmt->fmt==NULL ) return;
+
+ // Init mapping between alleles and IUPAC table
+ hts_expand(uint8_t, line->n_allele, convert->ndat, convert->dat);
+ int8_t *dat = (int8_t*)convert->dat;
+ int i;
+ for (i=0; i<line->n_allele; i++)
+ {
+ if ( line->d.allele[i][1] ) dat[i] = -1;
+ else
+ {
+ switch (line->d.allele[i][0])
+ {
+ case 'A': dat[i] = 0; break;
+ case 'C': dat[i] = 1; break;
+ case 'G': dat[i] = 2; break;
+ case 'T': dat[i] = 3; break;
+ case 'a': dat[i] = 0; break;
+ case 'c': dat[i] = 1; break;
+ case 'g': dat[i] = 2; break;
+ case 't': dat[i] = 3; break;
+ default: dat[i] = -1;
+ }
+ }
+ }
+}
+static void process_iupac_gt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( !fmt->ready )
+ init_format_iupac(convert, line, fmt);
+
+ if ( fmt->fmt==NULL )
+ {
+ kputc('.', str);
+ return;
+ }
+
+ assert( fmt->fmt->type==BCF_BT_INT8 );
+
+ static const char iupac[4][4] = { {'A','M','R','W'},{'M','C','S','Y'},{'R','S','G','K'},{'W','Y','K','T'} };
+ int8_t *dat = (int8_t*)convert->dat;
+
+ int8_t *x = (int8_t*)(fmt->fmt->p + isample*fmt->fmt->size); // FIXME: does not work with n_alt >= 64
+ int l = 0;
+ while ( l<fmt->fmt->n && x[l]!=bcf_int8_vector_end && x[l]!=bcf_int8_missing ) l++;
+
+ if ( l==2 )
+ {
+ // diploid
+ int ia = (x[0]>>1) - 1, ib = (x[1]>>1) - 1;
+ if ( ia>=0 && ia<line->n_allele && ib>=0 && ib<line->n_allele && dat[ia]>=0 && dat[ib]>=0 )
+ {
+ kputc(iupac[dat[ia]][dat[ib]], str);
+ return;
+ }
+ }
+ for (l = 0; l < fmt->fmt->n && x[l] != bcf_int8_vector_end; ++l)
+ {
+ if (l) kputc("/|"[x[l]&1], str);
+ if (x[l]>>1)
+ {
+ int ial = (x[l]>>1) - 1;
+ kputs(line->d.allele[ial], str);
+ }
+ else
+ kputc('.', str);
+ }
+ if (l == 0) kputc('.', str);
+}
+static void process_sample(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ kputs(convert->header->samples[isample], str);
+}
+static void process_sep(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { if (fmt->key) kputs(fmt->key, str); }
+static void process_is_ts(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int is_ts = 0;
+ if ( bcf_get_variant_types(line) & (VCF_SNP|VCF_MNP) )
+ is_ts = abs(bcf_acgt2int(*line->d.allele[0])-bcf_acgt2int(*line->d.allele[1])) == 2 ? 1 : 0;
+ kputc(is_ts ? '1' : '0', str);
+}
+static void process_type(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int line_type = bcf_get_variant_types(line);
+ int i = 0;
+ if ( line_type == VCF_REF ) { kputs("REF", str); i++; }
+ if ( line_type & VCF_SNP ) { if (i) kputc(',',str); kputs("SNP", str); i++; }
+ if ( line_type & VCF_MNP ) { if (i) kputc(',',str); kputs("MNP", str); i++; }
+ if ( line_type & VCF_INDEL ) { if (i) kputc(',',str); kputs("INDEL", str); i++; }
+ if ( line_type & VCF_OTHER ) { if (i) kputc(',',str); kputs("OTHER", str); i++; }
+}
+static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ vcf_format1(convert->header, line, str);
+}
+static void process_chrom_pos_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( line->d.id[0]!='.' || line->d.id[1] )
+ {
+ // ID is present
+ kputs(line->d.id, str);
+ }
+ else
+ {
+ // use CHROM:POS instead of ID
+ kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str);
+ kputc(':', str);
+ kputw(line->pos+1, str);
+ }
+}
+static void process_gt_to_prob3(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int m,n,i;
+
+ m = convert->ndat / sizeof(int32_t);
+ n = bcf_get_genotypes(convert->header,line,&convert->dat,&m);
+ convert->ndat = m * sizeof(int32_t);
+
+ if ( n<=0 )
+ {
+ // Throw an error or silently proceed?
+ //
+ // for (i=0; i<convert->nsamples; i++) kputs(" 0.33 0.33 0.33", str);
+ // return;
+
+ error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1);
+ }
+
+ n /= convert->nsamples;
+ for (i=0; i<convert->nsamples; i++)
+ {
+ int32_t *ptr = (int32_t*)convert->dat + i*n;
+ int j;
+ for (j=0; j<n; j++)
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+
+ if ( j==2 )
+ {
+ // diploid
+ if ( bcf_gt_is_missing(ptr[0]) )
+ kputs(" 0.33 0.33 0.33", str);
+ else if ( bcf_gt_allele(ptr[0])!=bcf_gt_allele(ptr[1]) )
+ kputs(" 0 1 0", str); // HET
+ else if ( bcf_gt_allele(ptr[0])==1 )
+ kputs(" 0 0 1", str); // ALT HOM, first ALT allele
+ else
+ kputs(" 1 0 0", str); // REF HOM or something else than first ALT
+ }
+ else if ( j==1 )
+ {
+ // haploid
+ if ( bcf_gt_is_missing(ptr[0]) )
+ kputs(" 0.5 0.0 0.5", str);
+ else if ( bcf_gt_allele(ptr[0])==1 )
+ kputs(" 0 0 1", str); // first ALT allele
+ else
+ kputs(" 1 0 0", str); // REF or something else than first ALT
+ }
+ else error("FIXME: not ready for ploidy %d\n", j);
+ }
+}
+static void process_pl_to_prob3(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int m,n,i;
+
+ m = convert->ndat / sizeof(int32_t);
+ n = bcf_get_format_int32(convert->header,line,"PL",&convert->dat,&m);
+ convert->ndat = m * sizeof(int32_t);
+
+ if ( n<=0 )
+ {
+ // Throw an error or silently proceed?
+ //
+ // for (i=0; i<convert->nsamples; i++) kputs(" 0.33 0.33 0.33", str);
+ // return;
+
+ error("Error parsing PL tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1);
+ }
+
+ n /= convert->nsamples;
+ for (i=0; i<convert->nsamples; i++)
+ {
+ int32_t *ptr = (int32_t*)convert->dat + i*n;
+ int j;
+ float sum = 0;
+ for (j=0; j<n; j++)
+ {
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+ sum += pow(10,-0.1*ptr[j]);
+ }
+ if ( j==line->n_allele )
+ {
+ // haploid
+ kputc(' ',str);
+ ksprintf(str,"%f",pow(10,-0.1*ptr[0])/sum);
+ kputs(" 0 ", str);
+ ksprintf(str,"%f",pow(10,-0.1*ptr[1])/sum);
+ }
+ else
+ {
+ // diploid
+ kputc(' ',str);
+ ksprintf(str,"%f",pow(10,-0.1*ptr[0])/sum);
+ kputc(' ',str);
+ ksprintf(str,"%f",pow(10,-0.1*ptr[1])/sum);
+ kputc(' ',str);
+ ksprintf(str,"%f",pow(10,-0.1*ptr[2])/sum);
+ }
+ }
+}
+static void process_gp_to_prob3(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ int m,n,i;
+
+ m = convert->ndat / sizeof(float);
+ n = bcf_get_format_float(convert->header,line,"GP",&convert->dat,&m);
+ convert->ndat = m * sizeof(float);
+
+ if ( n<=0 )
+ {
+ // Throw an error or silently proceed?
+ //
+ // for (i=0; i<convert->nsamples; i++) kputs(" 0.33 0.33 0.33", str);
+ // return;
+
+ error("Error parsing GP tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1);
+ }
+
+ n /= convert->nsamples;
+ for (i=0; i<convert->nsamples; i++)
+ {
+ float sum = 0, *ptr = (float*)convert->dat + i*n;
+ int j;
+ for (j=0; j<n; j++)
+ {
+            if ( bcf_float_is_vector_end(ptr[j]) ) break;
+            if ( bcf_float_is_missing(ptr[j]) ) { ptr[j]=0; continue; }
+            if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%d:%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities\n", bcf_seqname(convert->header,line),line->pos+1,ptr[j]);
+ sum+=ptr[j];
+ }
+ if ( j==line->n_allele )
+ ksprintf(str," %f %f %f",ptr[0],0.,ptr[1]); // haploid
+ else
+ ksprintf(str," %f %f %f",ptr[0],ptr[1],ptr[2]); // diploid
+ }
+}
+
+static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ // https://mathgen.stats.ox.ac.uk/impute/impute_v2.html#-known_haps_g
+
+ // File containing known haplotypes for the study cohort. The format
+ // is the same as the output format from IMPUTE2's -phase option:
+ // five header columns (as in the -g file) followed by two columns
+ // (haplotypes) per individual. Allowed values in the haplotype
+ // columns are 0, 1, and ?.
+
+ // If your study dataset is fully phased, you can replace the -g file
+ // with a -known_haps_g file. This will cause IMPUTE2 to perform
+ // haploid imputation, although it will still report diploid imputation
+ // probabilities in the main output file. If any genotypes are missing,
+ // they can be marked as '? ?' (two question marks separated by one
+ // space) in the input file. (The program does not allow just one
+ // allele from a diploid genotype to be missing.) If the reference
+ // panels are also phased, IMPUTE2 will perform a single, fast
+    //  imputation step rather than its standard MCMC module; this is how
+ // the program imputes into pre-phased GWAS haplotypes.
+
+ // The -known_haps_g file can also be used to specify study
+ // genotypes that are "partially" phased, in the sense that some
+ // genotypes are phased relative to a fixed reference point while
+ // others are not. We anticipate that this will be most useful when
+ // trying to phase resequencing data onto a scaffold of known
+ // haplotypes. To mark a known genotype as unphased, place an
+ // asterisk immediately after each allele, with no space between
+ // the allele (0/1) and the asterisk (*); e.g., "0* 1*" for a
+ // heterozygous genotype of unknown phase.
+
+ int m, n, i;
+
+ m = convert->ndat / sizeof(int32_t);
+ n = bcf_get_genotypes(convert->header, line, &convert->dat, &m);
+ convert->ndat = m * sizeof(int32_t);
+
+ if ( n<=0 )
+ {
+ // Throw an error or silently proceed?
+ //
+ // for (i=0; i<convert->nsamples; i++) kputs(" ...", str);
+ // return;
+
+ error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+ }
+
+ n /= convert->nsamples;
+ for (i=0; i<convert->nsamples; i++)
+ {
+ int32_t *ptr = (int32_t*)convert->dat + i*n;
+ int j;
+ for (j=0; j<n; j++)
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+
+        if (i>0) kputs(" ", str); // space-separate columns, but not before the first one
+ if ( j==2 )
+ {
+ // diploid
+ if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) {
+ kputs("? ?", str);
+ }
+ else if ( bcf_gt_is_phased(ptr[1])) {
+ ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ }
+ else {
+ ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ }
+ }
+ else if ( j==1 )
+ {
+ // haploid
+ if ( bcf_gt_is_missing(ptr[0]) )
+ kputs("? -", str);
+ else if ( bcf_gt_allele(ptr[0])==1 )
+ kputs("1 -", str); // first ALT allele
+ else
+                kputs("0 -", str); // REF or something other than the first ALT
+ }
+ else error("FIXME: not ready for ploidy %d\n", j);
+ }
+}
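+// Example of the columns written per sample: a phased diploid het prints "0 1",
+// an unphased het prints "0* 1*", a missing diploid genotype prints "? ?",
+// and a haploid call prints "1 -" (ALT), "0 -" (REF) or "? -" (missing).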
+static void process_gt_to_hap2(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ // same as process_gt_to_hap but converts haploid genotypes into diploid
+ int m, n, i;
+
+ m = convert->ndat / sizeof(int32_t);
+ n = bcf_get_genotypes(convert->header, line, &convert->dat, &m);
+ convert->ndat = m * sizeof(int32_t);
+
+ if ( n<=0 )
+ error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+
+ n /= convert->nsamples;
+ for (i=0; i<convert->nsamples; i++)
+ {
+ int32_t *ptr = (int32_t*)convert->dat + i*n;
+ int j;
+ for (j=0; j<n; j++)
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+
+        if (i>0) kputs(" ", str); // space-separate columns, but not before the first one
+ if ( j==2 )
+ {
+ // diploid
+ if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) {
+ kputs("? ?", str);
+ }
+ else if ( bcf_gt_is_phased(ptr[1])) {
+ ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ }
+ else {
+ ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ }
+ }
+ else if ( j==1 )
+ {
+ // haploid
+ if ( bcf_gt_is_missing(ptr[0]) )
+ kputs("? ?", str);
+ else if ( bcf_gt_allele(ptr[0])==1 )
+ kputs("1 1", str); // first ALT allele
+ else
+                kputs("0 0", str); // REF or something other than the first ALT
+ }
+ else error("FIXME: not ready for ploidy %d\n", j);
+ }
+}
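+// For example, a haploid ALT call that process_gt_to_hap() would write as "1 -"
+// is written here as "1 1", and a missing haploid genotype as "? ?".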
+
+static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
+{
+ convert->nfmt++;
+ if ( convert->nfmt > convert->mfmt )
+ {
+ convert->mfmt += 10;
+ convert->fmt = (fmt_t*) realloc(convert->fmt, convert->mfmt*sizeof(fmt_t));
+ }
+ fmt_t *fmt = &convert->fmt[ convert->nfmt-1 ];
+ fmt->type = type;
+ fmt->key = key ? strdup(key) : NULL;
+ fmt->is_gt_field = is_gtf;
+ fmt->subscript = -1;
+
+ // Allow non-format tags, such as CHROM, INFO, etc., to appear amongst the format tags.
+ if ( key )
+ {
+ int id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
+ if ( fmt->type==T_FORMAT && !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,id) )
+ {
+ if ( !strcmp("CHROM",key) ) { fmt->type = T_CHROM; }
+ else if ( !strcmp("POS",key) ) { fmt->type = T_POS; }
+ else if ( !strcmp("ID",key) ) { fmt->type = T_ID; }
+ else if ( !strcmp("REF",key) ) { fmt->type = T_REF; }
+ else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
+ else if ( !strcmp("FIRST_ALT",key) ) { fmt->type = T_FIRST_ALT; }
+ else if ( !strcmp("QUAL",key) ) { fmt->type = T_QUAL; }
+ else if ( !strcmp("FILTER",key) ) { fmt->type = T_FILTER; }
+ else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; }
+ else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) )
+ {
+ fmt->type = T_INFO;
+ fprintf(pysamerr,"Warning: Assuming INFO/%s\n", key);
+ }
+ }
+ }
+
+ switch (fmt->type)
+ {
+ case T_FIRST_ALT: fmt->handler = &process_first_alt; break;
+ case T_CHROM_POS_ID: fmt->handler = &process_chrom_pos_id; break;
+ case T_GT_TO_PROB3: fmt->handler = &process_gt_to_prob3; break;
+ case T_PL_TO_PROB3: fmt->handler = &process_pl_to_prob3; break;
+ case T_GP_TO_PROB3: fmt->handler = &process_gp_to_prob3; break;
+ case T_CHROM: fmt->handler = &process_chrom; break;
+ case T_POS: fmt->handler = &process_pos; break;
+ case T_ID: fmt->handler = &process_id; break;
+ case T_REF: fmt->handler = &process_ref; break;
+ case T_ALT: fmt->handler = &process_alt; break;
+ case T_QUAL: fmt->handler = &process_qual; break;
+ case T_FILTER: fmt->handler = &process_filter; convert->max_unpack |= BCF_UN_FLT; break;
+ case T_INFO: fmt->handler = &process_info; convert->max_unpack |= BCF_UN_INFO; break;
+ case T_FORMAT: fmt->handler = &process_format; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_SAMPLE: fmt->handler = &process_sample; break;
+ case T_SEP: fmt->handler = &process_sep; break;
+ case T_IS_TS: fmt->handler = &process_is_ts; break;
+ case T_TYPE: fmt->handler = &process_type; break;
+ case T_MASK: fmt->handler = NULL; break;
+ case T_GT: fmt->handler = &process_gt; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_TGT: fmt->handler = &process_tgt; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_IUPAC_GT: fmt->handler = &process_iupac_gt; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_GT_TO_HAP: fmt->handler = &process_gt_to_hap; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_GT_TO_HAP2: fmt->handler = &process_gt_to_hap2; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_LINE: fmt->handler = &process_line; break;
+ default: error("TODO: handler for type %d\n", fmt->type);
+ }
+ if ( key )
+ {
+ if ( fmt->type==T_INFO )
+ {
+ fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
+ if ( fmt->id==-1 ) convert->undef_info_tag = strdup(key);
+ }
+ }
+ return fmt;
+}
+
+static int parse_subscript(char **p)
+{
+ char *q = *p;
+ if ( *q!='{' ) return -1;
+ q++;
+ while ( *q && *q!='}' && isdigit(*q) ) q++;
+ if ( *q!='}' ) return -1;
+ int idx = atoi((*p)+1);
+ *p = q+1;
+ return idx;
+}
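+// Example: for a pointer into the string "{2}rest", parse_subscript() returns 2
+// and advances the pointer past the closing brace to "rest"; if the string does
+// not start with '{' or no closing '}' follows the digits, it returns -1 and
+// leaves the pointer untouched.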
+
+static char *parse_tag(convert_t *convert, char *p, int is_gtf)
+{
+ char *q = ++p;
+ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++;
+ kstring_t str = {0,0,0};
+ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
+ kputsn(p, q-p, &str);
+ if ( is_gtf )
+ {
+ if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf);
+ else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf);
+ else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf);
+ else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf);
+ else if ( !strcmp(str.s, "INFO") )
+ {
+ if ( *q!='/' ) error("Could not parse format string: %s\n", convert->format_str);
+ p = ++q;
+ str.l = 0;
+ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++;
+ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
+ kputsn(p, q-p, &str);
+ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ }
+ else
+ {
+ fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ }
+ }
+ else
+ {
+ if ( !strcmp(str.s, "CHROM") ) register_tag(convert, T_CHROM, str.s, is_gtf);
+ else if ( !strcmp(str.s, "POS") ) register_tag(convert, T_POS, str.s, is_gtf);
+ else if ( !strcmp(str.s, "ID") ) register_tag(convert, T_ID, str.s, is_gtf);
+ else if ( !strcmp(str.s, "REF") ) register_tag(convert, T_REF, str.s, is_gtf);
+ else if ( !strcmp(str.s, "ALT") )
+ {
+ fmt_t *fmt = register_tag(convert, T_ALT, str.s, is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ }
+ else if ( !strcmp(str.s, "FIRST_ALT") ) register_tag(convert, T_FIRST_ALT, str.s, is_gtf);
+ else if ( !strcmp(str.s, "QUAL") ) register_tag(convert, T_QUAL, str.s, is_gtf);
+ else if ( !strcmp(str.s, "FILTER") ) register_tag(convert, T_FILTER, str.s, is_gtf);
+ else if ( !strcmp(str.s, "IS_TS") ) register_tag(convert, T_IS_TS, str.s, is_gtf);
+ else if ( !strcmp(str.s, "TYPE") ) register_tag(convert, T_TYPE, str.s, is_gtf);
+ else if ( !strcmp(str.s, "MASK") ) register_tag(convert, T_MASK, str.s, is_gtf);
+ else if ( !strcmp(str.s, "LINE") ) register_tag(convert, T_LINE, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_CHROM_POS_ID") ) register_tag(convert, T_CHROM_POS_ID, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_GT_TO_PROB3") ) register_tag(convert, T_GT_TO_PROB3, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_PL_TO_PROB3") ) register_tag(convert, T_PL_TO_PROB3, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, T_GP_TO_PROB3, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_GT_TO_HAP") ) register_tag(convert, T_GT_TO_HAP, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf);
+ else if ( !strcmp(str.s, "INFO") )
+ {
+ if ( *q!='/' ) error("Could not parse format string: %s\n", convert->format_str);
+ p = ++q;
+ str.l = 0;
+ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++;
+ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
+ kputsn(p, q-p, &str);
+ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ }
+ else
+ {
+ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ }
+ }
+ free(str.s);
+ return q;
+}
+
+static char *parse_sep(convert_t *convert, char *p, int is_gtf)
+{
+ char *q = p;
+ kstring_t str = {0,0,0};
+ while ( *q && *q!='[' && *q!=']' && *q!='%' )
+ {
+ if ( *q=='\\' )
+ {
+ q++;
+ if ( *q=='n' ) kputc('\n', &str);
+ else if ( *q=='t' ) kputc('\t', &str);
+ else kputc(*q, &str);
+ }
+ else kputc(*q, &str);
+ q++;
+ }
+ if ( !str.l ) error("Could not parse format string: %s\n", convert->format_str);
+ register_tag(convert, T_SEP, str.s, is_gtf);
+ free(str.s);
+ return q;
+}
+
+convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *format_str)
+{
+ convert_t *convert = (convert_t*) calloc(1,sizeof(convert_t));
+ convert->header = hdr;
+ convert->format_str = strdup(format_str);
+ convert->max_unpack = BCF_UN_STR;
+
+ int i, is_gtf = 0;
+ char *p = convert->format_str;
+ while ( *p )
+ {
+ //fprintf(pysamerr,"<%s>\n", p);
+ switch (*p)
+ {
+ case '[': is_gtf = 1; p++; break;
+ case ']': is_gtf = 0; register_tag(convert, T_SEP, NULL, 0); p++; break;
+ case '%': p = parse_tag(convert, p, is_gtf); break;
+ default: p = parse_sep(convert, p, is_gtf); break;
+ }
+ }
+
+ if ( nsamples )
+ {
+ convert->nsamples = nsamples;
+ convert->samples = (int*) malloc(sizeof(int)*nsamples);
+ for (i=0; i<convert->nsamples; i++) convert->samples[i] = samples[i];
+ }
+ else
+ {
+ convert->nsamples = bcf_hdr_nsamples(convert->header);
+ convert->samples = (int*) malloc(sizeof(int)*convert->nsamples);
+ for (i=0; i<convert->nsamples; i++) convert->samples[i] = i;
+ }
+ return convert;
+}
+
+void convert_destroy(convert_t *convert)
+{
+ int i;
+ for (i=0; i<convert->nfmt; i++)
+ free(convert->fmt[i].key);
+ free(convert->fmt);
+ free(convert->undef_info_tag);
+ free(convert->dat);
+ free(convert->samples);
+ free(convert->format_str);
+ free(convert);
+}
+
+
+int convert_header(convert_t *convert, kstring_t *str)
+{
+ int i, icol = 0, l_ori = str->l;
+ bcf_hdr_t *hdr = convert->header;
+
+    // Suppress the header output if LINE is present
+ for (i=0; i<convert->nfmt; i++)
+ if ( convert->fmt[i].type == T_LINE ) break;
+ if ( i!=convert->nfmt )
+ return str->l - l_ori;
+
+ kputs("# ", str);
+ for (i=0; i<convert->nfmt; i++)
+ {
+ // Genotype fields
+ if ( convert->fmt[i].is_gt_field )
+ {
+ int j = i, js, k;
+ while ( convert->fmt[j].is_gt_field ) j++;
+ for (js=0; js<convert->nsamples; js++)
+ {
+ int ks = convert->samples[js];
+ for (k=i; k<j; k++)
+ {
+ if ( convert->fmt[k].type == T_SEP )
+ {
+ if ( convert->fmt[k].key ) kputs(convert->fmt[k].key, str);
+ }
+ else if ( convert->fmt[k].type == T_SAMPLE )
+ ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
+ else
+ ksprintf(str, "[%d]%s:%s", ++icol, hdr->samples[ks], convert->fmt[k].key);
+ }
+ }
+ i = j-1;
+ continue;
+ }
+ // Fixed fields
+ if ( convert->fmt[i].type == T_SEP )
+ {
+ if ( convert->fmt[i].key ) kputs(convert->fmt[i].key, str);
+ continue;
+ }
+ ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key);
+ }
+ return str->l - l_ori;
+}
+
+int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
+{
+ if ( !convert->allow_undef_tags && convert->undef_info_tag )
+ error("Error: no such tag defined in the VCF header: INFO/%s\n", convert->undef_info_tag);
+
+ int l_ori = str->l;
+ bcf_unpack(line, convert->max_unpack);
+
+ int i, ir;
+ str->l = 0;
+ for (i=0; i<convert->nfmt; i++)
+ {
+ // Genotype fields
+ if ( convert->fmt[i].is_gt_field )
+ {
+ int j = i, js, k;
+ while ( convert->fmt[j].is_gt_field )
+ {
+ convert->fmt[j].ready = 0;
+ j++;
+ }
+ for (js=0; js<convert->nsamples; js++)
+ {
+ int ks = convert->samples[js];
+ for (k=i; k<j; k++)
+ {
+ if ( convert->fmt[k].type == T_MASK )
+ {
+ for (ir=0; ir<convert->nreaders; ir++)
+ kputc(bcf_sr_has_line(convert->readers,ir)?'1':'0', str);
+ }
+ else if ( convert->fmt[k].handler )
+ convert->fmt[k].handler(convert, line, &convert->fmt[k], ks, str);
+ }
+ }
+ i = j-1;
+ continue;
+ }
+ // Fixed fields
+ if ( convert->fmt[i].type == T_MASK )
+ {
+ for (ir=0; ir<convert->nreaders; ir++)
+ kputc(bcf_sr_has_line(convert->readers,ir)?'1':'0', str);
+ }
+ else if ( convert->fmt[i].handler )
+ convert->fmt[i].handler(convert, line, &convert->fmt[i], -1, str);
+ }
+ return str->l - l_ori;
+}
+
+int convert_set_option(convert_t *convert, enum convert_option opt, ...)
+{
+ int ret = 0;
+ va_list args;
+
+ va_start(args, opt);
+ switch (opt)
+ {
+ case allow_undef_tags:
+ convert->allow_undef_tags = va_arg(args, int);
+ break;
+ default:
+ ret = -1;
+ }
+ va_end(args);
+ return ret;
+}
+
+int convert_max_unpack(convert_t *convert)
+{
+ return convert->max_unpack;
+}
+
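+/*
+    Illustrative usage sketch of the conversion API implemented above (not part
+    of the upstream sources): the input file name and the query format string
+    below are assumptions chosen for the example only.
+*/
+#if 0
+#include <stdio.h>
+#include <stdlib.h>
+#include <htslib/hts.h>
+#include <htslib/vcf.h>
+#include <htslib/kstring.h>
+#include "convert.h"
+
+static int example(void)
+{
+    htsFile *fp = hts_open("in.vcf.gz", "r");               // any VCF or BCF
+    bcf_hdr_t *hdr = bcf_hdr_read(fp);
+    convert_t *conv = convert_init(hdr, NULL, 0, "%CHROM\t%POS\t%ID[\t%SAMPLE=%GT]\n");
+
+    kstring_t str = {0,0,0};
+    if ( convert_header(conv, &str) > 0 ) fputs(str.s, stdout);  // "# ..." column header
+
+    bcf1_t *rec = bcf_init();
+    while ( bcf_read(fp, hdr, rec) == 0 )
+    {
+        str.l = 0;
+        if ( convert_line(conv, rec, &str) > 0 ) fputs(str.s, stdout);
+    }
+
+    free(str.s);
+    bcf_destroy(rec);
+    convert_destroy(conv);
+    bcf_hdr_destroy(hdr);
+    hts_close(fp);
+    return 0;
+}
+#endif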
diff --git a/samtools/samtools.h b/bcftools/convert.h
similarity index 51%
copy from samtools/samtools.h
copy to bcftools/convert.h
index 3161822..3712338 100644
--- a/samtools/samtools.h
+++ b/bcftools/convert.h
@@ -1,6 +1,6 @@
-/* samtools.h -- utility routines.
+/* convert.h -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2014 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -16,18 +16,29 @@ all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#ifndef SAMTOOLS_H
-#define SAMTOOLS_H
-
-const char *samtools_version(void);
-
-void print_error(const char *format, ...);
-void print_error_errno(const char *format, ...);
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#ifndef __CONVERT_H__
+#define __CONVERT_H__
+
+#include <htslib/vcf.h>
+
+typedef struct _convert_t convert_t;
+enum convert_option
+{
+ allow_undef_tags
+};
+
+convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str);
+void convert_destroy(convert_t *convert);
+int convert_set_option(convert_t *convert, enum convert_option opt, ...);
+int convert_header(convert_t *convert, kstring_t *str);
+int convert_line(convert_t *convert, bcf1_t *rec, kstring_t *str);
+int convert_max_unpack(convert_t *convert);
#endif
+
diff --git a/bcftools/em.c b/bcftools/em.c
new file mode 100644
index 0000000..a976f22
--- /dev/null
+++ b/bcftools/em.c
@@ -0,0 +1,259 @@
+/* em.c -- mathematical functions.
+
+ Copyright (C) 2010, 2011 Broad Institute.
+ Portions copyright (C) 2013 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at live.co.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include "kmin.h"
+#include "call.h"
+
+#define ITER_MAX 50
+#define ITER_TRY 10
+#define EPS 1e-5
+
+extern double kf_gammaq(double, double);
+
+/*
+ Generic routines
+ */
+
+// estimate site allele frequency in a very naive and inaccurate way
+static double est_freq(int n, const double *pdg)
+{
+ int i, gcnt[3], tmp1;
+ // get a rough estimate of the genotype frequency
+ gcnt[0] = gcnt[1] = gcnt[2] = 0;
+ for (i = 0; i < n; ++i) {
+ const double *p = pdg + i * 3;
+ if (p[0] != 1. || p[1] != 1. || p[2] != 1.) {
+ int which = p[0] > p[1]? 0 : 1;
+ which = p[which] > p[2]? which : 2;
+ ++gcnt[which];
+ }
+ }
+ tmp1 = gcnt[0] + gcnt[1] + gcnt[2];
+ return (tmp1 == 0)? -1.0 : (.5 * gcnt[1] + gcnt[2]) / tmp1;
+}
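+// Worked example: with 100 samples whose most likely genotype classes have
+// counts gcnt = {60, 30, 10}, the returned estimate is (0.5*30 + 10) / 100 = 0.25.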
+
+/*
+ Single-locus EM
+ */
+
+typedef struct {
+ int beg, end;
+ const double *pdg;
+} minaux1_t;
+
+static double prob1(double f, void *data)
+{
+ minaux1_t *a = (minaux1_t*)data;
+ double p = 1., l = 0., f3[3];
+ int i;
+// printf("brent %lg\n", f);
+ if (f < 0 || f > 1) return 1e300;
+ f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f;
+ for (i = a->beg; i < a->end; ++i) {
+ const double *pdg = a->pdg + i * 3;
+ p *= pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2];
+ if (p < 1e-200) l -= log(p), p = 1.;
+ }
+ return l - log(p);
+}
+
+// one EM iteration for allele frequency estimate
+static double freq_iter(double *f, const double *_pdg, int beg, int end)
+{
+ double f0 = *f, f3[3], err;
+ int i;
+// printf("em %lg\n", *f);
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ for (i = beg, f0 = 0.; i < end; ++i) {
+ const double *pdg = _pdg + i * 3;
+ f0 += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2])
+ / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]);
+ }
+ f0 /= (end - beg) * 2;
+ err = fabs(f0 - *f);
+ *f = f0;
+ return err;
+}
+
+/* The following function combines EM and Brent's method. When the signal from
+ * the data is strong, EM is faster but sometimes, EM may converge very slowly.
+ * When this happens, we switch to Brent's method. The idea is learned from
+ * Rasmus Nielsen.
+ */
+static double freqml(double f0, int beg, int end, const double *pdg)
+{
+ int i;
+ double f;
+ for (i = 0, f = f0; i < ITER_TRY; ++i)
+ if (freq_iter(&f, pdg, beg, end) < EPS) break;
+ if (i == ITER_TRY) { // haven't converged yet; try Brent's method
+ minaux1_t a;
+ a.beg = beg; a.end = end; a.pdg = pdg;
+ kmin_brent(prob1, f0 == f? .5*f0 : f0, f, (void*)&a, EPS, &f);
+ }
+ return f;
+}
+
+// one EM iteration for genotype frequency estimate
+static double g3_iter(double g[3], const double *_pdg, int beg, int end)
+{
+ double err, gg[3];
+ int i;
+ gg[0] = gg[1] = gg[2] = 0.;
+// printf("%lg,%lg,%lg\n", g[0], g[1], g[2]);
+ for (i = beg; i < end; ++i) {
+ double sum, tmp[3];
+ const double *pdg = _pdg + i * 3;
+ tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2];
+ sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg);
+ gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum;
+ }
+ err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]);
+ err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]);
+ g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2];
+ return err;
+}
+
+// perform likelihood ratio test
+static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3])
+{
+ double r;
+ int i;
+ for (i = 0, r = 1.; i < n1; ++i) {
+ const double *p = pdg + i * 3;
+ r *= (p[0] * f3[1][0] + p[1] * f3[1][1] + p[2] * f3[1][2])
+ / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]);
+ }
+ for (; i < n; ++i) {
+ const double *p = pdg + i * 3;
+ r *= (p[0] * f3[2][0] + p[1] * f3[2][1] + p[2] * f3[2][2])
+ / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]);
+ }
+ return r;
+}
+
+// x[0]: ref frequency
+// x[1..3]: alt-alt, alt-ref, ref-ref frequencies
+// x[4]: HWE P-value
+// x[5..6]: group1 freq, group2 freq
+// x[7]: 1-degree P-value
+// x[8]: 2-degree P-value
+int bcf_em1(call_t *call, const bcf1_t *rec, int n1, int flag, double x[10])
+{
+ double *pdg;
+ int i, n; //, n2;
+ if (rec->n_allele < 2) return -1; // one allele only
+ // initialization
+ if (n1 < 0 || n1 > rec->n_sample) n1 = 0;
+ if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required
+ if (flag & 0xf<<1) flag |= 0xf<<1;
+ n = rec->n_sample; //n2 = n - n1;
+ pdg = call->pdg;
+ if (pdg == 0) return -1;
+ for (i = 0; i < 10; ++i) x[i] = -1.; // set to negative
+ {
+ if ((x[0] = est_freq(n, pdg)) < 0.) return -1; // no data
+ x[0] = freqml(x[0], 0, n, pdg);
+ }
+ if (flag & (0xf<<1|3<<8)) { // estimate the genotype frequency and test HWE
+ double *g = x + 1, f3[3], r;
+ f3[0] = g[0] = (1 - x[0]) * (1 - x[0]);
+ f3[1] = g[1] = 2 * x[0] * (1 - x[0]);
+ f3[2] = g[2] = x[0] * x[0];
+ for (i = 0; i < ITER_MAX; ++i)
+ if (g3_iter(g, pdg, 0, n) < EPS) break;
+ // Hardy-Weinberg equilibrium (HWE)
+ for (i = 0, r = 1.; i < n; ++i) {
+ double *p = pdg + i * 3;
+ r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]);
+ }
+ x[4] = kf_gammaq(.5, log(r));
+ }
+ if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency
+ x[5] = freqml(x[0], 0, n1, pdg);
+ x[6] = freqml(x[0], n1, n, pdg);
+ }
+ if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value
+ double f[3], f3[3][3], tmp;
+ f[0] = x[0]; f[1] = x[5]; f[2] = x[6];
+ for (i = 0; i < 3; ++i)
+ f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i];
+ tmp = log(lk_ratio_test(n, n1, pdg, f3));
+ if (tmp < 0) tmp = 0;
+ x[7] = kf_gammaq(.5, tmp);
+ }
+ if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value
+ double g[3][3], tmp;
+ for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double));
+ for (i = 0; i < ITER_MAX; ++i)
+ if (g3_iter(g[1], pdg, 0, n1) < EPS) break;
+ for (i = 0; i < ITER_MAX; ++i)
+ if (g3_iter(g[2], pdg, n1, n) < EPS) break;
+ tmp = log(lk_ratio_test(n, n1, pdg, g));
+ if (tmp < 0) tmp = 0;
+ x[8] = kf_gammaq(1., tmp);
+ }
+ return 0;
+}
+
+/*
+ Two-locus EM (LD)
+ */
+
+#define _G1(h, k) ((h>>1&1) + (k>>1&1))
+#define _G2(h, k) ((h&1) + (k&1))
+
+#if 0
+// 0: the previous site; 1: the current site
+static int pair_freq_iter(int n, double *pdg[2], double f[4])
+{
+ double ff[4];
+ int i, k, h;
+// printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]);
+ memset(ff, 0, 4 * sizeof(double));
+ for (i = 0; i < n; ++i) {
+ double *p[2], sum, tmp;
+ p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3;
+ for (k = 0, sum = 0.; k < 4; ++k)
+ for (h = 0; h < 4; ++h)
+ sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)];
+ for (k = 0; k < 4; ++k) {
+ tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)])
+ + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)])
+ + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)])
+ + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]);
+ ff[k] += f[k] * tmp / sum;
+ }
+ }
+ for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n);
+ return 0;
+}
+#endif
+
+
diff --git a/bcftools/em.c.pysam.c b/bcftools/em.c.pysam.c
new file mode 100644
index 0000000..758d919
--- /dev/null
+++ b/bcftools/em.c.pysam.c
@@ -0,0 +1,261 @@
+#include "pysam.h"
+
+/* em.c -- mathematical functions.
+
+ Copyright (C) 2010, 2011 Broad Institute.
+ Portions copyright (C) 2013 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at live.co.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include "kmin.h"
+#include "call.h"
+
+#define ITER_MAX 50
+#define ITER_TRY 10
+#define EPS 1e-5
+
+extern double kf_gammaq(double, double);
+
+/*
+ Generic routines
+ */
+
+// estimate site allele frequency in a very naive and inaccurate way
+static double est_freq(int n, const double *pdg)
+{
+ int i, gcnt[3], tmp1;
+ // get a rough estimate of the genotype frequency
+ gcnt[0] = gcnt[1] = gcnt[2] = 0;
+ for (i = 0; i < n; ++i) {
+ const double *p = pdg + i * 3;
+ if (p[0] != 1. || p[1] != 1. || p[2] != 1.) {
+ int which = p[0] > p[1]? 0 : 1;
+ which = p[which] > p[2]? which : 2;
+ ++gcnt[which];
+ }
+ }
+ tmp1 = gcnt[0] + gcnt[1] + gcnt[2];
+ return (tmp1 == 0)? -1.0 : (.5 * gcnt[1] + gcnt[2]) / tmp1;
+}
+
+/*
+ Single-locus EM
+ */
+
+typedef struct {
+ int beg, end;
+ const double *pdg;
+} minaux1_t;
+
+static double prob1(double f, void *data)
+{
+ minaux1_t *a = (minaux1_t*)data;
+ double p = 1., l = 0., f3[3];
+ int i;
+// printf("brent %lg\n", f);
+ if (f < 0 || f > 1) return 1e300;
+ f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f;
+ for (i = a->beg; i < a->end; ++i) {
+ const double *pdg = a->pdg + i * 3;
+ p *= pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2];
+ if (p < 1e-200) l -= log(p), p = 1.;
+ }
+ return l - log(p);
+}
+
+// one EM iteration for allele frequency estimate
+static double freq_iter(double *f, const double *_pdg, int beg, int end)
+{
+ double f0 = *f, f3[3], err;
+ int i;
+// printf("em %lg\n", *f);
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ for (i = beg, f0 = 0.; i < end; ++i) {
+ const double *pdg = _pdg + i * 3;
+ f0 += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2])
+ / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]);
+ }
+ f0 /= (end - beg) * 2;
+ err = fabs(f0 - *f);
+ *f = f0;
+ return err;
+}
+
+/* The following function combines EM and Brent's method. When the signal from
+ * the data is strong, EM is faster but sometimes, EM may converge very slowly.
+ * When this happens, we switch to Brent's method. The idea is learned from
+ * Rasmus Nielsen.
+ */
+static double freqml(double f0, int beg, int end, const double *pdg)
+{
+ int i;
+ double f;
+ for (i = 0, f = f0; i < ITER_TRY; ++i)
+ if (freq_iter(&f, pdg, beg, end) < EPS) break;
+ if (i == ITER_TRY) { // haven't converged yet; try Brent's method
+ minaux1_t a;
+ a.beg = beg; a.end = end; a.pdg = pdg;
+ kmin_brent(prob1, f0 == f? .5*f0 : f0, f, (void*)&a, EPS, &f);
+ }
+ return f;
+}
+
+// one EM iteration for genotype frequency estimate
+static double g3_iter(double g[3], const double *_pdg, int beg, int end)
+{
+ double err, gg[3];
+ int i;
+ gg[0] = gg[1] = gg[2] = 0.;
+// printf("%lg,%lg,%lg\n", g[0], g[1], g[2]);
+ for (i = beg; i < end; ++i) {
+ double sum, tmp[3];
+ const double *pdg = _pdg + i * 3;
+ tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2];
+ sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg);
+ gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum;
+ }
+ err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]);
+ err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]);
+ g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2];
+ return err;
+}
+
+// perform likelihood ratio test
+static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3])
+{
+ double r;
+ int i;
+ for (i = 0, r = 1.; i < n1; ++i) {
+ const double *p = pdg + i * 3;
+ r *= (p[0] * f3[1][0] + p[1] * f3[1][1] + p[2] * f3[1][2])
+ / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]);
+ }
+ for (; i < n; ++i) {
+ const double *p = pdg + i * 3;
+ r *= (p[0] * f3[2][0] + p[1] * f3[2][1] + p[2] * f3[2][2])
+ / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]);
+ }
+ return r;
+}
+
+// x[0]: ref frequency
+// x[1..3]: alt-alt, alt-ref, ref-ref frequencies
+// x[4]: HWE P-value
+// x[5..6]: group1 freq, group2 freq
+// x[7]: 1-degree P-value
+// x[8]: 2-degree P-value
+int bcf_em1(call_t *call, const bcf1_t *rec, int n1, int flag, double x[10])
+{
+ double *pdg;
+ int i, n; //, n2;
+ if (rec->n_allele < 2) return -1; // one allele only
+ // initialization
+ if (n1 < 0 || n1 > rec->n_sample) n1 = 0;
+ if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required
+ if (flag & 0xf<<1) flag |= 0xf<<1;
+ n = rec->n_sample; //n2 = n - n1;
+ pdg = call->pdg;
+ if (pdg == 0) return -1;
+ for (i = 0; i < 10; ++i) x[i] = -1.; // set to negative
+ {
+ if ((x[0] = est_freq(n, pdg)) < 0.) return -1; // no data
+ x[0] = freqml(x[0], 0, n, pdg);
+ }
+ if (flag & (0xf<<1|3<<8)) { // estimate the genotype frequency and test HWE
+ double *g = x + 1, f3[3], r;
+ f3[0] = g[0] = (1 - x[0]) * (1 - x[0]);
+ f3[1] = g[1] = 2 * x[0] * (1 - x[0]);
+ f3[2] = g[2] = x[0] * x[0];
+ for (i = 0; i < ITER_MAX; ++i)
+ if (g3_iter(g, pdg, 0, n) < EPS) break;
+ // Hardy-Weinberg equilibrium (HWE)
+ for (i = 0, r = 1.; i < n; ++i) {
+ double *p = pdg + i * 3;
+ r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]);
+ }
+ x[4] = kf_gammaq(.5, log(r));
+ }
+ if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency
+ x[5] = freqml(x[0], 0, n1, pdg);
+ x[6] = freqml(x[0], n1, n, pdg);
+ }
+ if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value
+ double f[3], f3[3][3], tmp;
+ f[0] = x[0]; f[1] = x[5]; f[2] = x[6];
+ for (i = 0; i < 3; ++i)
+ f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i];
+ tmp = log(lk_ratio_test(n, n1, pdg, f3));
+ if (tmp < 0) tmp = 0;
+ x[7] = kf_gammaq(.5, tmp);
+ }
+ if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value
+ double g[3][3], tmp;
+ for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double));
+ for (i = 0; i < ITER_MAX; ++i)
+ if (g3_iter(g[1], pdg, 0, n1) < EPS) break;
+ for (i = 0; i < ITER_MAX; ++i)
+ if (g3_iter(g[2], pdg, n1, n) < EPS) break;
+ tmp = log(lk_ratio_test(n, n1, pdg, g));
+ if (tmp < 0) tmp = 0;
+ x[8] = kf_gammaq(1., tmp);
+ }
+ return 0;
+}
+
+/*
+ Two-locus EM (LD)
+ */
+
+#define _G1(h, k) ((h>>1&1) + (k>>1&1))
+#define _G2(h, k) ((h&1) + (k&1))
+
+#if 0
+// 0: the previous site; 1: the current site
+static int pair_freq_iter(int n, double *pdg[2], double f[4])
+{
+ double ff[4];
+ int i, k, h;
+// printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]);
+ memset(ff, 0, 4 * sizeof(double));
+ for (i = 0; i < n; ++i) {
+ double *p[2], sum, tmp;
+ p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3;
+ for (k = 0, sum = 0.; k < 4; ++k)
+ for (h = 0; h < 4; ++h)
+ sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)];
+ for (k = 0; k < 4; ++k) {
+ tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)])
+ + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)])
+ + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)])
+ + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]);
+ ff[k] += f[k] * tmp / sum;
+ }
+ }
+ for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n);
+ return 0;
+}
+#endif
+
+
diff --git a/bcftools/filter.c b/bcftools/filter.c
new file mode 100644
index 0000000..c56ae6d
--- /dev/null
+++ b/bcftools/filter.c
@@ -0,0 +1,1788 @@
+/* filter.c -- filter expressions.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <ctype.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <math.h>
+#include <wordexp.h>
+#include <regex.h>
+#include <htslib/khash_str2int.h>
+#include "filter.h"
+#include "bcftools.h"
+#include <htslib/hts_defs.h>
+#include <htslib/vcfutils.h>
+
+typedef struct _token_t
+{
+ // read-only values, same for all VCF lines
+ int tok_type; // one of the TOK_* keys below
+ char *key; // set only for string constants, otherwise NULL
+ char *tag; // for debugging and printout only, VCF tag name
+ float threshold; // filtering threshold
+ int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types
+ int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*])
+ void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
+ int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *);
+ void *hash; // test presence of str value in the hash via comparator
+ regex_t *regex; // precompiled regex for string comparison
+
+ // modified on filter evaluation at each VCF line
+ float *values; // In case str_value is set, values[0] is one sample's string length
+ char *str_value; // and values[0]*nsamples gives the total length;
+    int is_str, is_missing;   // is_missing is set only for constants; variables are controlled via nvalues
+ int pass_site; // -1 not applicable, 0 fails, >0 pass
+ uint8_t *pass_samples; // status of individual samples
+ int nsamples; // number of samples
+ int nvalues, mvalues; // number of used values, n=0 for missing values, n=1 for scalars
+ // for strings, total length of str_value
+}
+token_t;
+
+struct _filter_t
+{
+ bcf_hdr_t *hdr;
+ char *str;
+ int nfilters;
+ token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack
+ int32_t *tmpi;
+ int max_unpack, mtmpi, nsamples;
+};
+
+
+#define TOK_VAL 0
+#define TOK_LFT 1 // (
+#define TOK_RGT 2 // )
+#define TOK_LE 3 // less or equal
+#define TOK_LT 4 // less than
+#define TOK_EQ 5 // equal
+#define TOK_BT 6 // bigger than
+#define TOK_BE 7 // bigger or equal
+#define TOK_NE 8 // not equal
+#define TOK_OR 9 // |
+#define TOK_AND 10 // &
+#define TOK_ADD 11 // +
+#define TOK_SUB 12 // -
+#define TOK_MULT 13 // *
+#define TOK_DIV 14 // /
+#define TOK_MAX 15
+#define TOK_MIN 16
+#define TOK_AVG 17
+#define TOK_AND_VEC 18 // && (operator applied in samples)
+#define TOK_OR_VEC 19 // || (operator applied in samples)
+#define TOK_LIKE 20 // ~ regular expression
+#define TOK_NLIKE 21 // !~ regular expression
+#define TOK_SUM 22
+#define TOK_ABS 23
+#define TOK_LEN 24
+#define TOK_FUNC 25
+
+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
+// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8};
+#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^f"
+
+static int filters_next_token(char **str, int *len)
+{
+ char *tmp = *str;
+ while ( *tmp && isspace(*tmp) ) tmp++;
+ *str = tmp;
+ *len = 0;
+
+ // test for doubles: d.ddde[+-]dd
+ if ( isdigit(*str[0]) || *str[0]=='.' ) // strtod would eat +/-
+ {
+ double HTS_UNUSED v = strtod(*str, &tmp);
+ if ( *str!=tmp && (!tmp[0] || !isalnum(tmp[0])) )
+ {
+ *len = tmp - (*str);
+ return TOK_VAL;
+ }
+ tmp = *str;
+ }
+
+ if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; }
+ if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; }
+ if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; }
+ if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; }
+ if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; }
+ if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; }
+ if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility
+ if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility
+ if ( !strncasecmp(tmp,"%AVG(",5) ) { (*str) += 4; return TOK_AVG; } // for backward compatibility
+ if ( !strncasecmp(tmp,"%SUM(",5) ) { (*str) += 4; return TOK_SUM; } // for backward compatibility
+ if ( !strncasecmp(tmp,"INFO/",5) ) tmp += 5;
+ if ( !strncasecmp(tmp,"FORMAT/",7) ) tmp += 7;
+ if ( !strncasecmp(tmp,"FMT/",4) ) tmp += 4;
+
+ if ( tmp[0]=='@' ) // file name
+ {
+ while ( *tmp && !isspace(*tmp) && *tmp!='=' && *tmp!='!' ) tmp++;
+ *len = tmp - (*str);
+ return TOK_VAL;
+ }
+
+ while ( tmp[0] )
+ {
+ if ( tmp[0]=='"' ) break;
+ if ( tmp[0]=='\'' ) break;
+ if ( isspace(tmp[0]) ) break;
+ if ( tmp[0]=='<' ) break;
+ if ( tmp[0]=='>' ) break;
+ if ( tmp[0]=='=' ) break;
+ if ( tmp[0]=='!' ) break;
+ if ( tmp[0]=='&' ) break;
+ if ( tmp[0]=='|' ) break;
+ if ( tmp[0]=='(' ) break;
+ if ( tmp[0]==')' ) break;
+ if ( tmp[0]=='+' ) break;
+ // hacky: so that [*] is not split, the tokenizer does not recognise square brackets []
+ if ( tmp[0]=='*' && (tmp==*str || tmp[-1]!='[') ) break;
+ if ( tmp[0]=='-' ) break;
+ if ( tmp[0]=='/' ) break;
+ if ( tmp[0]=='~' ) break;
+ tmp++;
+ }
+ if ( tmp > *str )
+ {
+ *len = tmp - (*str);
+ return TOK_VAL;
+ }
+ if ( tmp[0]=='"' || tmp[0]=='\'' )
+ {
+ int quote = tmp[0];
+ tmp++;
+ while ( *tmp && tmp[0]!=quote ) tmp++;
+ if ( !*tmp ) return -1; // missing quotes
+ *len = tmp - (*str) + 1;
+ return TOK_VAL;
+ }
+ if ( tmp[0]=='!' )
+ {
+ if ( tmp[1]=='=' ) { (*str) += 2; return TOK_NE; }
+ if ( tmp[1]=='~' ) { (*str) += 2; return TOK_NLIKE; }
+ }
+ if ( tmp[0]=='<' )
+ {
+ if ( tmp[1]=='=' ) { (*str) += 2; return TOK_LE; }
+ (*str) += 1; return TOK_LT;
+ }
+ if ( tmp[0]=='>' )
+ {
+ if ( tmp[1]=='=' ) { (*str) += 2; return TOK_BE; }
+ (*str) += 1; return TOK_BT;
+ }
+ if ( tmp[0]=='=' )
+ {
+ if ( tmp[1]=='=' ) { (*str) += 2; return TOK_EQ; }
+ (*str) += 1; return TOK_EQ;
+ }
+ if ( tmp[0]=='(' ) { (*str) += 1; return TOK_LFT; }
+ if ( tmp[0]==')' ) { (*str) += 1; return TOK_RGT; }
+ if ( tmp[0]=='&' && tmp[1]=='&' ) { (*str) += 2; return TOK_AND_VEC; }
+ if ( tmp[0]=='|' && tmp[1]=='|' ) { (*str) += 2; return TOK_OR_VEC; }
+ if ( tmp[0]=='&' ) { (*str) += 1; return TOK_AND; }
+ if ( tmp[0]=='|' ) { (*str) += 1; return TOK_OR; }
+ if ( tmp[0]=='+' ) { (*str) += 1; return TOK_ADD; }
+ if ( tmp[0]=='-' ) { (*str) += 1; return TOK_SUB; }
+ if ( tmp[0]=='*' ) { (*str) += 1; return TOK_MULT; }
+ if ( tmp[0]=='/' ) { (*str) += 1; return TOK_DIV; }
+ if ( tmp[0]=='~' ) { (*str) += 1; return TOK_LIKE; }
+
+ *len = tmp - (*str);
+ return TOK_VAL;
+}
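+// Example: scanning the expression "QUAL>20 && FMT/DP<10" token by token yields
+// TOK_VAL("QUAL"), TOK_BT, TOK_VAL("20"), TOK_AND_VEC, TOK_VAL("FMT/DP"),
+// TOK_LT, TOK_VAL("10").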
+
+static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ float *ptr = &line->qual;
+ if ( bcf_float_is_missing(*ptr) )
+ tok->nvalues = 0;
+ else
+ {
+ tok->values[0] = line->qual;
+ tok->nvalues = 1;
+ }
+}
+static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ tok->values[0] = bcf_get_variant_types(line);
+ tok->nvalues = 1;
+}
+static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ assert( tok->hdr_id >=0 );
+ int i;
+ for (i=0; i<line->n_info; i++)
+ if ( line->d.info[i].key == tok->hdr_id ) break;
+
+ if ( i==line->n_info )
+ tok->nvalues = 0;
+ else if ( line->d.info[i].type==BCF_BT_CHAR )
+ {
+ int n = line->d.info[i].len;
+ int m = (int)tok->values[0];
+ hts_expand(char,n+1,m,tok->str_value);
+ memcpy(tok->str_value,line->d.info[i].vptr,n);
+ tok->str_value[n] = 0;
+ tok->values[0] = m;
+ tok->nvalues = n;
+ }
+ else if ( line->d.info[i].type==BCF_BT_FLOAT )
+ {
+ if ( bcf_float_is_missing(line->d.info[i].v1.f) ) tok->nvalues = 0;
+ else
+ {
+ tok->values[0] = line->d.info[i].v1.f;
+ tok->nvalues = 1;
+ }
+ tok->str_value = NULL;
+ }
+ else
+ {
+ if ( line->d.info[i].type==BCF_BT_INT8 && line->d.info[i].v1.i==bcf_int8_missing ) tok->nvalues = 0;
+ else if ( line->d.info[i].type==BCF_BT_INT16 && line->d.info[i].v1.i==bcf_int16_missing ) tok->nvalues = 0;
+ else if ( line->d.info[i].type==BCF_BT_INT32 && line->d.info[i].v1.i==bcf_int32_missing ) tok->nvalues = 0;
+ else
+ {
+ tok->values[0] = line->d.info[i].v1.i;
+ tok->nvalues = 1;
+ }
+ tok->str_value = NULL;
+ }
+}
+static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+{
+ int i;
+ if ( op_type==TOK_NE ) // AND logic: none of the filters can match
+ {
+ if ( !line->d.n_flt )
+ {
+ if ( atok->hdr_id==-1 ) return 0; // missing value
+ return 1; // no filter present, eval to true
+ }
+ for (i=0; i<line->d.n_flt; i++)
+ if ( atok->hdr_id==line->d.flt[i] ) return 0;
+ return 1;
+ }
+ // TOK_EQ with OR logic: at least one of the filters must match
+ if ( !line->d.n_flt )
+ {
+ if ( atok->hdr_id==-1 ) return 1;
+ return 0; // no filter present, eval to false
+ }
+ for (i=0; i<line->d.n_flt; i++)
+ if ( atok->hdr_id==line->d.flt[i] ) return 1;
+ return 0;
+}
+static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+{
+ // multiple IDs not supported yet (easy to add though)
+
+ if ( btok->hash )
+ {
+ token_t *tmp = atok; atok = btok; btok = tmp;
+ }
+ if ( atok->hash )
+ {
+ int ret = khash_str2int_has_key(atok->hash, line->d.id);
+ if ( op_type==TOK_EQ ) return ret;
+ return ret ? 0 : 1;
+ }
+
+ if ( op_type==TOK_EQ ) return strcmp(btok->str_value,line->d.id) ? 0 : 1;
+ return strcmp(btok->str_value,line->d.id) ? 1 : 0;
+}
+
+/**
+ * bcf_get_info_value() - get single INFO value, int or float
+ * @line: BCF line
+ * @info_id: tag ID, as returned by bcf_hdr_id2int
+ * @ivec: 0-based index to retrieve, -1 when single value is expected
+ * @value: pointer to a memory location of sufficient size to accommodate
+ * info_id's type
+ *
+ * The returned value is -1 if tag is not present, 0 if present but
+ * the value is missing or ivec is out of range, and 1 on success.
+ */
+static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value)
+{
+ int j;
+ for (j=0; j<line->n_info; j++)
+ if ( line->d.info[j].key == info_id ) break;
+ if ( j==line->n_info ) return -1;
+
+ bcf_info_t *info = &line->d.info[j];
+ if ( info->len == 1 )
+ {
+ if ( info->type==BCF_BT_FLOAT ) *((float*)value) = info->v1.f;
+ else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int*)value) = info->v1.i;
+ return 1;
+ }
+
+ if ( ivec<0 ) ivec = 0;
+
+ #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
+ type_t *p = (type_t *) info->vptr; \
+ for (j=0; j<ivec && j<info->len; j++) \
+ { \
+ if ( is_vector_end ) return 0; \
+ } \
+ if ( is_missing ) return 0; \
+ *((out_type_t*)value) = p[j]; \
+ return 1; \
+ }
+ switch (info->type) {
+ case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int); break;
+ case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int); break;
+ case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), float); break;
+ default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
+ }
+ #undef BRANCH
+ return -1; // this shouldn't happen
+}
+
+static void filters_set_pos(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ tok->values[0] = line->pos+1;
+ tok->nvalues = 1;
+}
+
+static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ if ( tok->idx==-2 )
+ {
+ int i, n = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
+ tok->nvalues = n;
+ hts_expand(float,n,tok->mvalues,tok->values);
+ for (i=0; i<n; i++) tok->values[i] = flt->tmpi[i];
+ }
+ else
+ {
+ int32_t value;
+ if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
+ tok->nvalues = 0;
+ else
+ {
+ tok->values[0] = value;
+ tok->nvalues = 1;
+ }
+ }
+}
+
+static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ if ( tok->idx==-2 )
+ {
+ tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues);
+ if ( tok->nvalues<0 ) tok->nvalues = 0;
+ }
+ else
+ {
+ float value;
+ if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
+ tok->nvalues = 0;
+ else
+ {
+ tok->values[0] = value;
+ tok->nvalues = 1;
+ }
+ }
+}
+
+static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ int m = (int)tok->values[0];
+ int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value,&m);
+ if ( n<0 ) { tok->nvalues = 0; return; }
+ tok->values[0] = m; // allocated length
+
+ if ( tok->idx>=0 )
+ {
+ // get ith field (i=tok->idx)
+ int i = 0;
+ char *ss = tok->str_value, *se = tok->str_value + n;
+ while ( ss<se && i<tok->idx )
+ {
+ if ( *ss==',' ) i++;
+ ss++;
+ }
+ if ( ss==se || i!=tok->idx ) { tok->nvalues = 0; return; }
+ se = ss;
+ while ( se-tok->str_value<n && *se!=',' ) se++;
+ if ( ss==tok->str_value ) *se = 0;
+ else
+ {
+ memmove(tok->str_value,ss,se-ss);
+ tok->str_value[se-ss] = 0;
+ }
+ tok->nvalues = se-ss;
+ }
+ else if ( tok->idx==-2 ) tok->nvalues = n;
+}
+
+static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ int j;
+ for (j=0; j<line->n_info; j++)
+ if ( line->d.info[j].key == tok->hdr_id ) break;
+ tok->values[0] = j==line->n_info ? 0 : 1;
+ tok->nvalues = 1;
+}
+
+static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ int i;
+ if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
+ tok->nvalues = 0;
+ else
+ {
+ int is_missing = 1;
+ hts_expand(float,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end )
+ bcf_float_set_missing(tok->values[i]);
+ else
+ {
+ tok->values[i] = flt->tmpi[i];
+ is_missing = 0;
+ }
+ }
+ if ( is_missing ) tok->nvalues = 0;
+ else if ( tok->idx >= 0 )
+ {
+ int nsmpl = bcf_hdr_nsamples(flt->hdr);
+ int nvals = tok->nvalues / nsmpl;
+ if ( tok->idx >= nvals )
+ tok->nvalues = 0; // the index is too big
+ else
+ {
+ for (i=0; i<nsmpl; i++)
+ tok->values[i] = tok->values[i*nvals+tok->idx];
+ tok->nvalues = nsmpl;
+ }
+ }
+ }
+ tok->nsamples = tok->nvalues;
+}
+static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues))<=0 )
+ tok->nvalues = tok->nsamples = 0; // missing values
+ else if ( tok->idx >= 0 )
+ {
+ int i, nsmpl, nvals;
+ nsmpl = bcf_hdr_nsamples(flt->hdr);
+ nvals = tok->nvalues / nsmpl;
+ if ( tok->idx >= nvals )
+ tok->nsamples = tok->nvalues = 0; // the index is too big
+ else
+ {
+ for (i=0; i<nsmpl; i++)
+ tok->values[i] = tok->values[i*nvals+tok->idx];
+ tok->nsamples = tok->nvalues = nsmpl;
+ }
+ }
+ tok->nsamples = tok->nvalues;
+}
+static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ int ndim = tok->nsamples * (int)tok->values[0];
+ int ret = bcf_get_format_char(flt->hdr,line,tok->tag,&tok->str_value,&ndim);
+
+ int nsmpl = bcf_hdr_nsamples(flt->hdr);
+ ndim /= nsmpl;
+ tok->values[0] = ndim;
+
+ if ( ret<=0 )
+ {
+ tok->nvalues = 0;
+ return;
+ }
+
+ if ( tok->idx < 0 ) // scalar
+ {
+ tok->nvalues = tok->nsamples = nsmpl;
+ return;
+ }
+
+ // vector
+ int i;
+ for (i=0; i<nsmpl; i++)
+ {
+ char *ss = tok->str_value + i*ndim;
+ int is = 0, ivec = 0;
+ while ( ivec<tok->idx && is<ndim && ss[is] )
+ {
+ if ( ss[is]==',' ) ivec++;
+ is++;
+ }
+ if ( ivec!=tok->idx || is==ndim || !ss[is] )
+ {
+ ss[0] = '.';
+ ss[1] = 0;
+ continue;
+ }
+ int ie = is;
+ while ( ie<ndim && ss[ie] && ss[ie]!=',' ) ie++;
+ if ( is ) memmove(ss,&ss[is],ie-is);
+ if ( ndim-(ie-is) ) memset(ss+ie-is,0,ndim-(ie-is));
+ }
+ if ( !ndim )
+ {
+ tok->nvalues = 0;
+ return;
+ }
+ tok->nvalues = ret;
+ tok->nsamples = nsmpl;
+}
+static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
+ if ( !fmt )
+ {
+ tok->nvalues = tok->nsamples = 0;
+ return;
+ }
+ int i, blen = 3, nsmpl = bcf_hdr_nsamples(flt->hdr);
+ kstring_t str;
+
+gt_length_too_big:
+ str.s = tok->str_value; str.m = tok->values[0] * nsmpl; str.l = 0;
+ for (i=0; i<nsmpl; i++)
+ {
+ int plen = str.l;
+
+ #define BRANCH(type_t) { \
+ type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
+ if ( !(ptr[0]>>1) ) kputc('.',&str); \
+ }
+ switch (fmt->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break;
+ }
+ #undef BRANCH
+
+ if ( plen==str.l )
+ {
+ bcf_format_gt(fmt, i, &str);
+ if ( str.l - plen > blen )
+ {
+                // too many alternate alleles or the ploidy is too large for the genotype
+                // to fit in three characters ("0/0" vs "10/10"); double the width and retry
+ tok->str_value = str.s;
+ blen *= 2;
+ goto gt_length_too_big;
+ }
+ }
+
+ plen = str.l - plen;
+ while ( plen<blen )
+ {
+ kputc_(0, &str);
+ plen++;
+ }
+ }
+ tok->nvalues = str.l;
+ tok->nsamples = nsmpl;
+ tok->values[0] = blen;
+ tok->str_value = str.s;
+}
+static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0;
+ kputs(line->d.allele[0], &str);
+ tok->nvalues = str.l;
+ tok->values[0] = str.m;
+ tok->str_value = str.s;
+}
+static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0;
+ if ( tok->idx>=0 )
+ {
+ if ( line->n_allele >= tok->idx )
+ kputs(line->d.allele[tok->idx], &str);
+ else
+ kputc('.', &str);
+ }
+ else if ( line->n_allele>1 )
+ {
+ kputs(line->d.allele[1], &str);
+ int i;
+ for (i=2; i<line->n_allele; i++)
+ {
+ kputc(',', &str);
+ kputs(line->d.allele[i], &str);
+ }
+ }
+ else if ( line->n_allele==1 )
+ kputc('.', &str);
+ tok->nvalues = str.l;
+ tok->values[0] = str.m;
+ tok->str_value = str.s;
+}
+static void filters_set_nalt(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ tok->nvalues = 1;
+ tok->values[0] = line->n_allele - 1;
+}
+static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ hts_expand(int32_t, line->n_allele, flt->mtmpi, flt->tmpi);
+ if ( !bcf_calc_ac(flt->hdr, line, flt->tmpi, BCF_UN_INFO|BCF_UN_FMT) )
+ {
+ tok->nvalues = 0;
+ return;
+ }
+ int i, an = flt->tmpi[0];
+ for (i=1; i<line->n_allele; i++) an += flt->tmpi[i];
+ if ( !an )
+ {
+ tok->nvalues = 0;
+ return;
+ }
+ flt->tmpi[0] = an; // for filters_set_[mac|af|maf]
+ if ( tok->idx>=0 )
+ {
+ tok->nvalues = 1;
+ tok->values[0] = tok->idx+1<line->n_allele ? flt->tmpi[tok->idx+1] : 0;
+ }
+ else if ( line->n_allele==1 ) // no ALT
+ {
+ tok->nvalues = 1;
+ tok->values[0] = 0;
+ }
+ else
+ {
+ hts_expand(float,line->n_allele,tok->mvalues,tok->values);
+ for (i=1; i<line->n_allele; i++)
+ tok->values[i-1] = flt->tmpi[i];
+ tok->nvalues = line->n_allele - 1;
+ }
+}
+static void filters_set_an(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ filters_set_ac(flt,line,tok);
+ tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0;
+ tok->nvalues = 1;
+}
+static void filters_set_mac(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ filters_set_ac(flt,line,tok);
+ if ( !tok->nvalues ) return;
+ int i, an = flt->tmpi[0];
+ for (i=0; i<tok->nvalues; i++)
+ if ( tok->values[i] > an*0.5 ) tok->values[i] = an - tok->values[i];
+}
+static void filters_set_af(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ filters_set_ac(flt,line,tok);
+ if ( !tok->nvalues ) return;
+ int i, an = flt->tmpi[0];
+ for (i=0; i<tok->nvalues; i++)
+ tok->values[i] /= (float)an;
+}
+static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ filters_set_ac(flt,line,tok);
+ if ( !tok->nvalues ) return;
+ int i, an = flt->tmpi[0];
+ for (i=0; i<tok->nvalues; i++)
+ {
+ tok->values[i] /= (float)an;
+ if ( tok->values[i] > 0.5 ) tok->values[i] = 1 - tok->values[i];
+ }
+}
+
+static void set_max(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ float val = -HUGE_VAL;
+ int i;
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( !bcf_float_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
+ }
+ tok->values[0] = val;
+ tok->nvalues = 1;
+ tok->nsamples = 0;
+}
+static void set_min(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ float val = HUGE_VAL;
+ int i;
+ for (i=0; i<tok->nvalues; i++)
+ if ( !bcf_float_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
+ tok->values[0] = val;
+ tok->nvalues = 1;
+ tok->nsamples = 0;
+}
+static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ float val = 0;
+ int i, n = 0;
+ for (i=0; i<tok->nvalues; i++)
+ if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
+ tok->values[0] = n ? val / n : 0;
+ tok->nvalues = 1;
+ tok->nsamples = 0;
+}
+static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ float val = 0;
+ int i, n = 0;
+ for (i=0; i<tok->nvalues; i++)
+ if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
+ tok->values[0] = val;
+ tok->nvalues = 1;
+ tok->nsamples = 0;
+}
+static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ if ( tok->is_str ) error("ABS() can be applied only on numeric values\n");
+ int i;
+ for (i=0; i<tok->nvalues; i++)
+ tok->values[i] = fabs(tok->values[i]);
+}
+static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ tok->is_str = 0;
+ if ( !tok->nvalues ) return;
+ if ( tok->idx==-2 )
+ {
+ int i = 0;
+ char *ss = tok->str_value;
+ while ( *ss )
+ {
+ char *se = ss;
+ while ( *se && *se!=',' ) se++;
+ if ( !*se ) tok->values[i] = strlen(ss);
+ else
+ {
+ *se = 0;
+ tok->values[i] = strlen(ss);
+ *se = ',';
+ }
+ ss = *se ? se + 1 : se;
+ i++;
+ }
+ tok->nvalues = i;
+ }
+ else
+ {
+ tok->values[0] = strlen(tok->str_value);
+ tok->nvalues = 1;
+ }
+}
+#define VECTOR_ARITHMETICS(atok,btok,AOP) \
+{ \
+ int i, has_values = 0; \
+ if ( !(atok)->nvalues || !(btok)->nvalues ) /* missing values */ \
+ { \
+ (atok)->nvalues = 0; (atok)->nsamples = 0; \
+ } \
+ else \
+ { \
+ if ( ((atok)->nsamples && (btok)->nsamples) || (!(atok)->nsamples && !(btok)->nsamples)) \
+ { \
+ for (i=0; i<(atok)->nvalues; i++) \
+ { \
+ if ( bcf_float_is_missing((atok)->values[i]) ) continue; \
+ if ( bcf_float_is_missing((btok)->values[i]) ) { bcf_float_set_missing((atok)->values[i]); continue; } \
+ has_values = 1; \
+ (atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \
+ } \
+ } \
+ else if ( (btok)->nsamples ) \
+ { \
+ hts_expand(float,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
+ for (i=0; i<(btok)->nvalues; i++) \
+ { \
+ if ( bcf_float_is_missing((atok)->values[0]) || bcf_float_is_missing((btok)->values[i]) ) \
+ { \
+ bcf_float_set_missing((atok)->values[i]); \
+ continue; \
+ } \
+ has_values = 1; \
+ (atok)->values[i] = (atok)->values[0] AOP (btok)->values[i]; \
+ } \
+ (atok)->nvalues = (btok)->nvalues; \
+ (atok)->nsamples = (btok)->nsamples; \
+ } \
+ else if ( (atok)->nsamples ) \
+ { \
+ for (i=0; i<(atok)->nvalues; i++) \
+ { \
+ if ( bcf_float_is_missing((atok)->values[i]) || bcf_float_is_missing((btok)->values[0]) ) \
+ { \
+ bcf_float_set_missing((atok)->values[i]); \
+ continue; \
+ } \
+ has_values = 1; \
+ (atok)->values[i] = (atok)->values[i] AOP (btok)->values[0]; \
+ } \
+ } \
+ } \
+ if ( !has_values ) { (atok)->nvalues = 0; (atok)->nsamples = 0; } \
+}
+
+static int vector_logic_and(token_t *atok, token_t *btok, int and_type)
+{
+ // We are comparing either two scalars (result of INFO tag vs a threshold), two vectors (two FORMAT fields),
+ // or a vector and a scalar (FORMAT field vs threshold)
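+ // Illustrative sketch (GQ and QUAL stand for any FORMAT/site tags): in 'FMT/GQ>20 && QUAL>10' the
+ // GQ term carries one pass flag per sample (nsamples>0) while the QUAL term is a site-level scalar
+ // (nsamples==0); the scalar's pass_site is then combined with each per-sample flag below.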
+ int i, pass_site = 0;
+ if ( !atok->nvalues || !btok->nvalues )
+ {
+ atok->nvalues = atok->nsamples = 0;
+ return 0;
+ }
+ if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site && btok->pass_site;
+ if ( atok->nsamples && btok->nsamples )
+ {
+ if ( and_type==TOK_AND )
+ {
+ // perform AND within a sample
+ for (i=0; i<atok->nsamples; i++)
+ {
+ atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i];
+ if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ }
+ }
+ else
+ {
+ // perform AND across samples
+ int pass_a = 0, pass_b = 0;
+ for (i=0; i<atok->nsamples; i++)
+ {
+ if ( atok->pass_samples[i] ) pass_a = 1;
+ atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i];
+ }
+ for (i=0; i<btok->nsamples; i++)
+ {
+ if ( btok->pass_samples[i] ) { pass_b = 1; break; }
+ }
+ pass_site = pass_a && pass_b;
+ }
+ return pass_site;
+ }
+ if ( btok->nsamples )
+ {
+ for (i=0; i<btok->nsamples; i++)
+ {
+ atok->pass_samples[i] = atok->pass_site && btok->pass_samples[i];
+ if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ }
+ atok->nsamples = btok->nsamples;
+ return pass_site;
+ }
+ /* atok->nsamples!=0 */
+ for (i=0; i<atok->nsamples; i++)
+ {
+ atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_site;
+ if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ }
+ return pass_site;
+}
+static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
+{
+ int i, pass_site = 0;
+ if ( !atok->nvalues && !btok->nvalues ) // missing sites in both
+ {
+ atok->nvalues = atok->nsamples = 0;
+ return 0;
+ }
+ if ( !atok->nvalues ) // missing value in a
+ {
+ for (i=0; i<btok->nsamples; i++)
+ atok->pass_samples[i] = btok->pass_samples[i];
+ atok->nsamples = btok->nsamples;
+ return btok->pass_site;
+ }
+ if ( !btok->nvalues ) // missing value in b
+ return atok->pass_site;
+
+ if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site;
+ if ( !atok->nsamples )
+ {
+ if ( or_type==TOK_OR )
+ {
+ for (i=0; i<btok->nsamples; i++)
+ {
+ atok->pass_samples[i] = btok->pass_samples[i];
+ if ( atok->pass_site || atok->pass_samples[i] ) pass_site = 1;
+ }
+ }
+ else
+ {
+ for (i=0; i<btok->nsamples; i++)
+ {
+ atok->pass_samples[i] = atok->pass_site || btok->pass_samples[i];
+ if ( atok->pass_samples[i] ) pass_site = 1;
+ }
+ }
+ atok->nsamples = btok->nsamples;
+ return pass_site;
+ }
+ if ( !btok->nsamples ) // vector vs site
+ {
+ if ( or_type==TOK_OR )
+ {
+ for (i=0; i<atok->nsamples; i++)
+ if ( btok->pass_site || atok->pass_samples[i] ) pass_site = 1;
+ }
+ else
+ {
+ for (i=0; i<atok->nsamples; i++)
+ {
+ atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_site;
+ if ( atok->pass_samples[i] ) pass_site = 1;
+ }
+ }
+ return pass_site;
+ }
+ for (i=0; i<atok->nsamples; i++)
+ {
+ atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_samples[i];
+ if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ }
+ return pass_site;
+}
+
+#define CMP_MISSING(atok,btok,CMP_OP,ret) \
+{ \
+ if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \
+ token_t *tok = (atok)->is_missing ? (btok) : (atok); \
+ (ret) = ( tok->nvalues CMP_OP 1 ) ? 0 : 1; \
+}
+
+#define CMP_VECTORS(atok,btok,CMP_OP,ret) \
+{ \
+ int i, j, has_values = 0, pass_site = 0; \
+ if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
+ else \
+ { \
+ if ( (atok)->nsamples && (btok)->nsamples ) \
+ { \
+ for (i=0; i<(atok)->nsamples; i++) \
+ { \
+ if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
+ if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
+ has_values = 1; \
+ if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
+ } \
+ if ( !has_values ) (atok)->nvalues = 0; \
+ } \
+ else if ( (atok)->nsamples ) \
+ { \
+ if ( bcf_float_is_missing((btok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
+ else \
+ { \
+ for (i=0; i<(atok)->nsamples; i++) \
+ { \
+ if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
+ has_values = 1; \
+ if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
+ } \
+ } \
+ if ( !has_values ) (atok)->nvalues = 0; \
+ } \
+ else if ( (btok)->nsamples ) \
+ { \
+ if ( bcf_float_is_missing((atok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
+ else \
+ { \
+ for (i=0; i<(btok)->nsamples; i++) \
+ { \
+ if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
+ has_values = 1; \
+ if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
+ } \
+ (atok)->nvalues = (btok)->nvalues; \
+ (atok)->nsamples = (btok)->nsamples; \
+ } \
+ if ( !has_values ) (atok)->nvalues = 0; \
+ } \
+ else if ( (atok)->idx==-2 || (btok)->idx==-2 ) \
+ { \
+ /* any field can match: [*] */ \
+ for (i=0; i<(atok)->nvalues; i++) \
+ { \
+ for (j=0; j<(btok)->nvalues; j++) \
+ if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
+ } \
+ } \
+ else \
+ { \
+ if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \
+ } \
+ /*fprintf(stderr,"pass=%d\n", pass_site);*/ \
+ (ret) = pass_site; \
+ } \
+}
+static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // logic: TOK_EQ or TOK_NE
+{
+ if ( !atok->nvalues ) { return 0; }
+ if ( !btok->nvalues ) { atok->nvalues = 0; return 0; }
+ int i, pass_site = 0;
+ if ( atok->nsamples && atok->nsamples==btok->nsamples )
+ {
+ for (i=0; i<atok->nsamples; i++)
+ {
+ char *astr = atok->str_value + i*(int)atok->values[0];
+ char *bstr = btok->str_value + i*(int)btok->values[0];
+ char *aend = astr + (int)atok->values[0], *a = astr;
+ while ( a<aend && *a ) a++;
+ char *bend = bstr + (int)btok->values[0], *b = bstr;
+ while ( b<bend && *b ) b++;
+ if ( a-astr != b-bstr ) atok->pass_samples[i] = 0;
+ else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0;
+ if ( logic!=TOK_EQ )
+ atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
+ pass_site |= atok->pass_samples[i];
+ }
+ if ( !atok->nsamples ) atok->nsamples = btok->nsamples;
+ }
+ else if ( !atok->nsamples && !btok->nsamples )
+ {
+ if ( atok->idx==-2 || btok->idx==-2 )
+ {
+ // any field can match: [*]
+ if ( atok->idx==-2 && btok->idx==-2 )
+ error("fixme: Expected at least one scalar value [%s %s %s]\n", atok->tag ? atok->tag : btok->tag, atok->str_value,btok->str_value);
+ token_t *xtok, *ytok; // xtok is scalar, ytok array
+ if ( btok->idx==-2 ) { xtok = atok; ytok = btok; }
+ else { xtok = btok; ytok = atok; }
+ char *xstr = xtok->str_value, *xend = xstr + xtok->nvalues;
+ char *ystr = ytok->str_value, *yend = ystr + ytok->nvalues, *y = ystr;
+ while ( y<=yend )
+ {
+ if ( y==yend || *y==',' )
+ {
+ if ( y-ystr==xend-xstr && !strncmp(xstr,ystr,xend-xstr) )
+ {
+ pass_site = 1;
+ break;
+ }
+ ystr = y+1;
+ }
+ y++;
+ }
+ }
+ else
+ pass_site = strcmp(atok->str_value,btok->str_value) ? 0 : 1;
+ if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
+ }
+ else
+ {
+ token_t *xtok, *ytok;
+ if ( !atok->nsamples ) { xtok = atok; ytok = btok; }
+ else { xtok = btok; ytok = atok; }
+ char *xstr = xtok->str_value;
+ char *xend = xstr + (int)xtok->values[0], *x = xstr;
+ while ( x<xend && *x ) x++;
+ for (i=0; i<ytok->nsamples; i++)
+ {
+ char *ystr = ytok->str_value + i*(int)ytok->values[0];
+ char *yend = ystr + (int)ytok->values[0], *y = ystr;
+ while ( y<yend && *y ) y++;
+ if ( x-xstr != y-ystr ) atok->pass_samples[i] = 0;
+ else atok->pass_samples[i] = strncmp(xstr,ystr,x-xstr)==0 ? 1 : 0;
+ if ( logic!=TOK_EQ )
+ atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
+ pass_site |= atok->pass_samples[i];
+ }
+ if ( !atok->nsamples )
+ atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set
+ }
+ return pass_site;
+}
+static int regex_vector_strings(token_t *atok, token_t *btok)
+{
+ int ret = regexec(btok->regex, atok->str_value, 0,NULL,0);
+ return ret==0 ? 1 : 0;
+}
+
+static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
+{
+ tok->tok_type = TOK_VAL;
+ tok->hdr_id = -1;
+ tok->pass_site = -1;
+ tok->idx = -1;
+
+ // is this a string constant?
+ if ( str[0]=='"' || str[0]=='\'' )
+ {
+ int quote = str[0];
+ if ( str[len-1] != quote ) error("TODO: [%s]\n", filter->str);
+ tok->key = (char*) calloc(len-1,sizeof(char));
+ hts_expand(float,1,tok->mvalues,tok->values);
+ tok->values[0] = len-2;
+ memcpy(tok->key,str+1,len-2);
+ tok->key[len-2] = 0;
+ tok->is_str = 1;
+ tok->nvalues = len-2;
+ if ( !strcmp(".",tok->key) ) tok->is_missing = 1;
+ return 0;
+ }
+
+ // is it a file?
+ if ( str[0]=='@' )
+ {
+ tok->tag = (char*) calloc(len+1,sizeof(char));
+ memcpy(tok->tag,str,len);
+ tok->tag[len] = 0;
+ wordexp_t wexp;
+ wordexp(tok->tag+1, &wexp, 0);
+ if ( !wexp.we_wordc ) error("No such file: %s\n", tok->tag+1);
+ int i, n;
+ char **list = hts_readlist(wexp.we_wordv[0], 1, &n);
+ if ( !list ) error("Could not read: %s\n", wexp.we_wordv[0]);
+ wordfree(&wexp);
+ tok->hash = khash_str2int_init();
+ for (i=0; i<n; i++)
+ {
+ char *se = list[i];
+ while ( *se && !isspace(*se) ) se++;
+ *se = 0;
+ if ( !khash_str2int_has_key(tok->hash,list[i]) )
+ khash_str2int_inc(tok->hash,list[i]);
+ else
+ free(list[i]);
+ }
+ free(list);
+ return 0;
+ }
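+ // The block above implements @file lookups. A minimal sketch with a hypothetical file name:
+ // 'ID=@sites.txt' reads sites.txt via hts_readlist(), keeps the first whitespace-delimited word of
+ // each line as a key in tok->hash, and the ID comparator then tests membership of the record's ID.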
+
+ int is_fmt = -1;
+ if ( !strncasecmp(str,"FMT/",4) ) { str += 4; len -= 4; is_fmt = 1; }
+ else if ( !strncasecmp(str,"FORMAT/",7) ) { str += 7; len -= 7; is_fmt = 1; }
+ else
+ {
+ if ( !strncasecmp(str,"INFO/",5) ) { is_fmt = 0; str += 5; len -= 5; }
+ else if ( !strncasecmp(str,"QUAL",len) || !strncmp(str,"%QUAL",len) /* for backward compatibility */ )
+ {
+ tok->setter = filters_set_qual;
+ tok->tag = strdup("QUAL");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"TYPE",len) || !strncmp(str,"%TYPE",len) /* for backward compatibility */ )
+ {
+ tok->setter = filters_set_type;
+ tok->tag = strdup("TYPE");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"FILTER",len) || !strncmp(str,"%FILTER",len) /* for backward compatibility */ )
+ {
+ tok->comparator = filters_cmp_filter;
+ tok->tag = strdup("FILTER");
+ filter->max_unpack |= BCF_UN_FLT;
+ return 0;
+ }
+ else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ )
+ {
+ tok->comparator = filters_cmp_id;
+ tok->tag = strdup("ID");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"POS",len) )
+ {
+ tok->setter = &filters_set_pos;
+ tok->tag = strdup("POS");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"REF",len) )
+ {
+ tok->setter = &filters_set_ref_string;
+ tok->is_str = 1;
+ tok->tag = strdup("REF");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"ALT",len) )
+ {
+ tok->setter = &filters_set_alt_string;
+ tok->is_str = 1;
+ tok->tag = strdup("ALT");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"N_ALT",len) )
+ {
+ tok->setter = &filters_set_nalt;
+ tok->tag = strdup("N_ALT");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"N_SAMPLES",len) )
+ {
+ tok->tok_type = TOK_VAL;
+ tok->threshold = bcf_hdr_nsamples(filter->hdr);
+ return 0;
+ }
+ }
+
+ // does it have array subscript?
+ int is_array = 0;
+ kstring_t tmp = {0,0,0};
+ kputsn(str, len, &tmp);
+ if ( tmp.s[tmp.l-1] == ']' )
+ {
+ int i;
+ for (i=0; i<tmp.l; i++)
+ if ( tmp.s[i]=='[' ) { tmp.s[i] = 0; is_array = i+1; break; }
+ if ( is_array )
+ {
+ if ( tmp.s[is_array]=='*' )
+ tok->idx = -2; // tag[*] .. any field
+ else
+ {
+ char *end;
+ tok->idx = strtol(tmp.s+is_array, &end, 10);
+ if ( *end!=']' ) error("Could not parse the index: %s[%s\n", tmp.s,tmp.s+is_array);
+ }
+ }
+ }
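+ // Subscript examples (illustrative tags): "DP4[2]" parses to tag "DP4" with idx=2 (0-based),
+ // while "AD[*]" sets idx=-2, meaning any field of the vector may match.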
+ tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s);
+ if ( is_fmt==-1 )
+ {
+ if ( tok->hdr_id >=0 )
+ {
+ if ( bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) ) is_fmt = 0;
+ else if ( bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ) is_fmt = 1;
+ }
+ if ( is_fmt==-1 ) is_fmt = 0;
+ }
+ tok->type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
+ if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT;
+ if ( tok->hdr_id>=0 )
+ {
+ if ( is_fmt && !strcmp("GT",tmp.s) )
+ {
+ tok->setter = &filters_set_genotype_string; tok->is_str = 1;
+ }
+ else if ( is_fmt )
+ {
+ if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
+ error("No such FORMAT field: %s\n", tmp.s);
+ if ( bcf_hdr_id2number(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=1 && !is_array )
+ error("Error: FORMAT vectors must be subscripted, e.g. %s[0] or %s[*]\n", tmp.s, tmp.s);
+ switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
+ {
+ case BCF_HT_INT: tok->setter = &filters_set_format_int; break;
+ case BCF_HT_REAL: tok->setter = &filters_set_format_float; break;
+ case BCF_HT_STR: tok->setter = &filters_set_format_string; tok->is_str = 1; break;
+ default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
+ }
+ }
+ else if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) )
+ error("No such INFO field: %s\n", tmp.s);
+ else
+ {
+ if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_FLAG )
+ tok->setter = filters_set_info_flag;
+ else
+ {
+ if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_STR ) tok->is_str = 1;
+ if ( bcf_hdr_id2number(filter->hdr,BCF_HL_INFO,tok->hdr_id)==1 )
+ tok->setter = filters_set_info;
+ else
+ {
+ switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) )
+ {
+ case BCF_HT_INT: tok->setter = &filters_set_info_int; break;
+ case BCF_HT_REAL: tok->setter = &filters_set_info_float; break;
+ case BCF_HT_STR: tok->setter = &filters_set_info_string; tok->is_str = 1; break;
+ default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
+ }
+ if(!is_array) tok->idx = -2;
+ }
+ }
+ filter->max_unpack |= BCF_UN_INFO;
+ }
+ tok->tag = strdup(tmp.s);
+ if ( tmp.s ) free(tmp.s);
+ return 0;
+ }
+ else if ( !strcasecmp(tmp.s,"ALT") )
+ {
+ tok->setter = &filters_set_alt_string;
+ tok->is_str = 1;
+ tok->tag = strdup(tmp.s);
+ free(tmp.s);
+ return 0;
+ }
+ else if ( !strcasecmp(tmp.s,"AN") )
+ {
+ tok->setter = &filters_set_an;
+ tok->tag = strdup("AN");
+ free(tmp.s);
+ return 0;
+ }
+ else if ( !strcasecmp(tmp.s,"AC") )
+ {
+ tok->setter = &filters_set_ac;
+ tok->tag = strdup("AC");
+ free(tmp.s);
+ return 0;
+ }
+ else if ( !strcasecmp(tmp.s,"MAC") )
+ {
+ tok->setter = &filters_set_mac;
+ tok->tag = strdup("MAC");
+ free(tmp.s);
+ return 0;
+ }
+ else if ( !strcasecmp(tmp.s,"AF") )
+ {
+ tok->setter = &filters_set_af;
+ tok->tag = strdup("AF");
+ free(tmp.s);
+ return 0;
+ }
+ else if ( !strcasecmp(tmp.s,"MAF") )
+ {
+ tok->setter = &filters_set_maf;
+ tok->tag = strdup("MAF");
+ free(tmp.s);
+ return 0;
+ }
+
+ // is it a value?
+ char *end;
+ errno = 0;
+ tok->threshold = strtod(tmp.s, &end);
+ if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+
+ if ( tmp.s ) free(tmp.s);
+ return 0;
+}
+
+
+static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks)
+{
+ int i;
+ for (i=0; i<ntoks; i++)
+ {
+ token_t *tok = toks ? &toks[i] : tok_ptrs[i];
+ if ( tok->tok_type==TOK_VAL )
+ {
+ if ( tok->key )
+ fprintf(stderr,"%s", tok->key);
+ else if ( tok->tag )
+ fprintf(stderr,"%s", tok->tag);
+ else
+ fprintf(stderr,"%e", tok->threshold);
+ }
+ else
+ fprintf(stderr,"%c", TOKEN_STRING[tok->tok_type]);
+ if ( tok->setter ) fprintf(stderr,"\t[setter %p]", tok->setter);
+ fprintf(stderr,"\n");
+ }
+}
+
+
+// Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
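+// For example, the expression 'QUAL>10 && DP<100' (DP standing for any numeric tag) is stored in RPN
+// as: QUAL 10 > DP 100 < &&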
+filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
+{
+ filter_t *filter = (filter_t *) calloc(1,sizeof(filter_t));
+ filter->str = strdup(str);
+ filter->hdr = hdr;
+ filter->max_unpack |= BCF_UN_STR;
+
+ int nops = 0, mops = 0, *ops = NULL; // operators stack
+ int nout = 0, mout = 0; // filter tokens, RPN
+ token_t *out = NULL;
+ char *tmp = filter->str;
+ int last_op = -1;
+ while ( *tmp )
+ {
+ int len, ret;
+ ret = filters_next_token(&tmp, &len);
+ if ( ret==-1 ) error("Missing quotes in: %s\n", str);
+
+ //fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
+ //int i; for (i=0; i<nops; i++) fprintf(stderr," .%c.", TOKEN_STRING[ops[i]]); fprintf(stderr,"\n");
+
+ if ( ret==TOK_LFT ) // left bracket
+ {
+ nops++;
+ hts_expand(int, nops, mops, ops);
+ ops[nops-1] = ret;
+ }
+ else if ( ret==TOK_RGT ) // right bracket
+ {
+ while ( nops>0 && ops[nops-1]!=TOK_LFT )
+ {
+ nout++;
+ hts_expand0(token_t, nout, mout, out);
+ out[nout-1].tok_type = ops[nops-1];
+ nops--;
+ }
+ if ( nops<=0 ) error("Could not parse: %s\n", str);
+ nops--;
+ }
+ else if ( ret!=TOK_VAL ) // one of the operators
+ {
+ // detect unary minus: replace -value with -1*(value)
+ if ( ret==TOK_SUB && last_op!=TOK_VAL && last_op!=TOK_RGT )
+ {
+ nout++;
+ hts_expand0(token_t, nout, mout, out);
+ token_t *tok = &out[nout-1];
+ tok->tok_type = TOK_VAL;
+ tok->hdr_id = -1;
+ tok->pass_site = -1;
+ tok->threshold = -1.0;
+ ret = TOK_MULT;
+ }
+ else
+ {
+ while ( nops>0 && op_prec[ret] < op_prec[ops[nops-1]] )
+ {
+ nout++;
+ hts_expand0(token_t, nout, mout, out);
+ out[nout-1].tok_type = ops[nops-1];
+ nops--;
+ }
+ }
+ nops++;
+ hts_expand(int, nops, mops, ops);
+ ops[nops-1] = ret;
+ }
+ else if ( !len )
+ {
+ if ( *tmp && !isspace(*tmp) ) error("Could not parse the expression: [%s]\n", str);
+ break; // all tokens read
+ }
+ else // annotation name or filtering value
+ {
+ nout++;
+ hts_expand0(token_t, nout, mout, out);
+ filters_init1(filter, tmp, len, &out[nout-1]);
+ tmp += len;
+ }
+ last_op = ret;
+ }
+ while ( nops>0 )
+ {
+ if ( ops[nops-1]==TOK_LFT || ops[nops-1]==TOK_RGT ) error("Could not parse the expression: [%s]\n", filter->str);
+ nout++;
+ hts_expand0(token_t, nout, mout, out);
+ out[nout-1].tok_type = ops[nops-1];
+ nops--;
+ }
+
+ // In the special cases of TYPE and FILTER the BCF header IDs are not yet known. Walk through the
+ // list of operators and convert the strings (e.g. "PASS") to BCF IDs. The string value token must be
+ // just before or after the TYPE/FILTER token and the pair must be followed by a comparison operator.
+ // At this point we also initialize regex expressions which, in RPN, must precede the LIKE/NLIKE operator.
+ // Additionally, treat "." as missing value rather than a string in numeric equalities.
+ // This code is fragile: improve me.
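+ // For example, in 'TYPE="snp"' the string constant "snp" is replaced by the numeric threshold VCF_SNP,
+ // and in 'FILTER="PASS"' (assuming PASS is defined in the header) the constant "PASS" is replaced by
+ // its BCF header ID so that filters_cmp_filter() can compare integer filter IDs.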
+ int i;
+ for (i=0; i<nout; i++)
+ {
+ if ( out[i].tok_type==TOK_EQ || out[i].tok_type==TOK_NE )
+ {
+ // Look for j="." and k numeric type
+ int j = i-1, k = i-2;
+ if ( !out[j].is_str ) { k = i-1, j = i-2; }
+ if ( out[k].hdr_id>0 && out[j].is_str && !strcmp(".",out[j].key) )
+ {
+ int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
+ if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); }
+ if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); }
+ }
+ }
+ if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE )
+ {
+ int j = i-1;
+ if ( !out[j].key )
+ error("Could not parse the expression, wrong value for regex operator: %s\n", filter->str);
+ out[j].regex = (regex_t *) malloc(sizeof(regex_t));
+ if ( regcomp(out[j].regex, out[j].key, REG_ICASE|REG_NOSUB) )
+ error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str);
+ }
+ if ( out[i].tok_type!=TOK_VAL ) continue;
+ if ( !out[i].tag ) continue;
+ if ( !strcmp(out[i].tag,"TYPE") )
+ {
+ if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
+ int j = i+1;
+ if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1;
+ if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( !strcasecmp(out[j].key,"snp") || !strcasecmp(out[j].key,"snps") ) { out[j].threshold = VCF_SNP; out[j].is_str = 0; }
+ else if ( !strcasecmp(out[j].key,"indel") || !strcasecmp(out[j].key,"indels") ) { out[j].threshold = VCF_INDEL; out[j].is_str = 0; }
+ else if ( !strcasecmp(out[j].key,"mnp") || !strcasecmp(out[j].key,"mnps") ) { out[j].threshold = VCF_MNP; out[j].is_str = 0; }
+ else if ( !strcasecmp(out[j].key,"other") ) { out[j].threshold = VCF_OTHER; out[j].is_str = 0; }
+ else if ( !strcasecmp(out[j].key,"ref") ) { out[j].threshold = VCF_REF; out[j].is_str = 0; }
+ else error("The type \"%s\" not recognised: %s\n", out[j].key, filter->str);
+ out[j].tag = out[j].key; out[j].key = NULL;
+ i = j;
+ continue;
+ }
+ if ( !strcmp(out[i].tag,"FILTER") )
+ {
+ if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
+ int j = i+1;
+ if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; // the expression has "value"=FILTER rather than FILTER="value"
+ if ( out[j].tok_type==TOK_LIKE ) out[j].tok_type = TOK_EQ; // for FILTER, ~ and !~ work the same way as = and !=
+ if ( out[j].tok_type==TOK_NLIKE ) out[j].tok_type = TOK_NE;
+ if ( out[j+1].tok_type==TOK_LIKE ) out[j+1].tok_type = TOK_EQ;
+ if ( out[j+1].tok_type==TOK_NLIKE ) out[j+1].tok_type = TOK_NE;
+ if ( out[j].tok_type!=TOK_VAL || !out[j].key )
+ error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( strcmp(".",out[j].key) )
+ {
+ out[j].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[j].key);
+ if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[j].hdr_id) )
+ error("The filter \"%s\" not present in the VCF header\n", out[j].key);
+ }
+ else
+ out[j].hdr_id = -1;
+ out[j].tag = out[j].key; out[j].key = NULL;
+ out[i].hdr_id = out[j].hdr_id;
+ i = j;
+ continue;
+ }
+ }
+ filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0;
+ for (i=0; i<nout; i++)
+ {
+ if ( out[i].tok_type==TOK_MAX ) { out[i].setter = set_max; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_MIN ) { out[i].setter = set_min; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_AVG ) { out[i].setter = set_avg; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_SUM ) { out[i].setter = set_sum; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_ABS ) { out[i].setter = set_abs; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_LEN ) { out[i].setter = set_strlen; out[i].tok_type = TOK_FUNC; }
+ hts_expand0(float,1,out[i].mvalues,out[i].values);
+ if ( filter->nsamples )
+ {
+ out[i].pass_samples = (uint8_t*)malloc(filter->nsamples);
+ int j;
+ for (j=0; j<filter->nsamples; j++) out[i].pass_samples[j] = 1;
+ }
+ }
+
+ if (0) filter_debug_print(out, NULL, nout);
+
+ if ( mops ) free(ops);
+ filter->filters = out;
+ filter->nfilters = nout;
+ filter->flt_stack = (token_t **)malloc(sizeof(token_t*)*nout);
+ return filter;
+}
+
+void filter_destroy(filter_t *filter)
+{
+ int i;
+ for (i=0; i<filter->nfilters; i++)
+ {
+ //if ( filter->filters[i].key ) free(filter->filters[i].key);
+ free(filter->filters[i].str_value);
+ free(filter->filters[i].tag);
+ free(filter->filters[i].values);
+ free(filter->filters[i].pass_samples);
+ if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash);
+ if (filter->filters[i].regex)
+ {
+ regfree(filter->filters[i].regex);
+ free(filter->filters[i].regex);
+ }
+ }
+ free(filter->filters);
+ free(filter->flt_stack);
+ free(filter->str);
+ free(filter->tmpi);
+ free(filter);
+}
+
+int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
+{
+ bcf_unpack(line, filter->max_unpack);
+
+ int i, nstack = 0;
+ for (i=0; i<filter->nfilters; i++)
+ {
+ filter->filters[i].nsamples = 0;
+ filter->filters[i].nvalues = 0;
+ filter->filters[i].pass_site = -1;
+
+ if ( filter->filters[i].tok_type == TOK_VAL )
+ {
+ if ( filter->filters[i].setter ) // variable, query the VCF line
+ filter->filters[i].setter(filter, line, &filter->filters[i]);
+ else if ( filter->filters[i].key ) // string constant
+ {
+ filter->filters[i].str_value = filter->filters[i].key;
+ filter->filters[i].values[0] = filter->filters[i].values[0];
+ filter->filters[i].nvalues = strlen(filter->filters[i].key);
+ }
+ else // numeric constant
+ {
+ filter->filters[i].values[0] = filter->filters[i].threshold;
+ filter->filters[i].nvalues = 1;
+ }
+
+ filter->flt_stack[nstack++] = &filter->filters[i];
+ continue;
+ }
+ else if ( filter->filters[i].tok_type == TOK_FUNC ) // all functions take only one argument
+ {
+ filter->filters[i].setter(filter, line, filter->flt_stack[nstack-1]);
+ continue;
+ }
+ if ( nstack<2 )
+ error("Error occurred while processing the filter \"%s\" (1:%d)\n", filter->str,nstack); // too few values left on the stack
+
+ int is_str = filter->flt_stack[nstack-1]->is_str + filter->flt_stack[nstack-2]->is_str;
+
+ if ( filter->filters[i].tok_type == TOK_OR || filter->filters[i].tok_type == TOK_OR_VEC )
+ {
+ if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 )
+ error("Error occurred while processing the filter \"%s\" (%d %d OR)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site);
+ filter->flt_stack[nstack-2]->pass_site = vector_logic_or(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type);
+ nstack--;
+ continue;
+ }
+ if ( filter->filters[i].tok_type == TOK_AND || filter->filters[i].tok_type == TOK_AND_VEC )
+ {
+ if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 )
+ error("Error occurred while processing the filter \"%s\" (%d %d AND)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site);
+ filter->flt_stack[nstack-2]->pass_site = vector_logic_and(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type);
+ nstack--;
+ continue;
+ }
+
+ if ( filter->filters[i].tok_type == TOK_ADD )
+ {
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],+);
+ nstack--;
+ continue;
+ }
+ else if ( filter->filters[i].tok_type == TOK_SUB )
+ {
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],-);
+ nstack--;
+ continue;
+ }
+ else if ( filter->filters[i].tok_type == TOK_MULT )
+ {
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],*);
+ nstack--;
+ continue;
+ }
+ else if ( filter->filters[i].tok_type == TOK_DIV )
+ {
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],/);
+ nstack--;
+ continue;
+ }
+
+ int is_true = 0;
+ if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
+ {
+ int skip = 0;
+ if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1;
+ if ( filter->filters[i].tok_type != TOK_EQ && filter->filters[i].tok_type != TOK_NE ) skip = 1;
+
+ if ( skip )
+ filter->flt_stack[nstack-2]->nvalues = filter->flt_stack[nstack-2]->nsamples = 0;
+ else if ( filter->filters[i].tok_type == TOK_EQ )
+ CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true)
+ else if ( filter->filters[i].tok_type == TOK_NE )
+ CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true)
+ }
+ else if ( filter->filters[i].tok_type == TOK_EQ )
+ {
+ if ( filter->flt_stack[nstack-1]->comparator )
+ is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_EQ,line);
+ else if ( filter->flt_stack[nstack-2]->comparator )
+ is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ,line);
+ else if ( is_str==2 ) // both are strings
+ is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ);
+ else if ( is_str==1 )
+ error("Comparing string to numeric value: %s\n", filter->str);
+ else
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true);
+ }
+ else if ( filter->filters[i].tok_type == TOK_NE )
+ {
+ if ( filter->flt_stack[nstack-1]->comparator )
+ is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_NE,line);
+ else if ( filter->flt_stack[nstack-2]->comparator )
+ is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE,line);
+ else if ( is_str==2 )
+ is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE);
+ else if ( is_str==1 )
+ error("Comparing string to numeric value: %s\n", filter->str);
+ else
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true);
+ }
+ else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE )
+ {
+ if ( is_str==2 )
+ {
+ is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1]);
+ if ( filter->filters[i].tok_type == TOK_NLIKE ) is_true = is_true ? 0 : 1;
+ }
+ else
+ error("The regex operator can be used on strings only: %s\n", filter->str);
+ }
+ else if ( is_str>0 )
+ error("Wrong operator in string comparison: %s [%s,%s]\n", filter->str, filter->flt_stack[nstack-1]->str_value, filter->flt_stack[nstack-2]->str_value);
+ else if ( filter->filters[i].tok_type == TOK_LE )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<=,is_true)
+ else if ( filter->filters[i].tok_type == TOK_LT )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<,is_true)
+ else if ( filter->filters[i].tok_type == TOK_BT )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>,is_true)
+ else if ( filter->filters[i].tok_type == TOK_BE )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>=,is_true)
+ else
+ error("FIXME: did not expect this .. tok_type %d = %d\n", i, filter->filters[i].tok_type);
+
+ filter->flt_stack[nstack-2]->pass_site = is_true;
+ nstack--;
+ }
+ if ( nstack>1 ) error("Error occurred while processing the filter \"%s\" (2:%d)\n", filter->str,nstack); // too many values left on the stack
+ if ( samples )
+ {
+ *samples = filter->max_unpack&BCF_UN_FMT ? filter->flt_stack[0]->pass_samples : NULL;
+ if ( *samples && !filter->flt_stack[0]->nsamples )
+ {
+ for (i=0; i<filter->nsamples; i++)
+ filter->flt_stack[0]->pass_samples[i] = filter->flt_stack[0]->pass_site;
+ }
+ }
+ return filter->flt_stack[0]->pass_site;
+}
+
+int filter_max_unpack(filter_t *flt)
+{
+ return flt->max_unpack;
+}
diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c
new file mode 100644
index 0000000..7520106
--- /dev/null
+++ b/bcftools/filter.c.pysam.c
@@ -0,0 +1,1790 @@
+#include "pysam.h"
+
+/* filter.c -- filter expressions.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <ctype.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <math.h>
+#include <wordexp.h>
+#include <regex.h>
+#include <htslib/khash_str2int.h>
+#include "filter.h"
+#include "bcftools.h"
+#include <htslib/hts_defs.h>
+#include <htslib/vcfutils.h>
+
+typedef struct _token_t
+{
+ // read-only values, same for all VCF lines
+ int tok_type; // one of the TOK_* keys below
+ char *key; // set only for string constants, otherwise NULL
+ char *tag; // for debugging and printout only, VCF tag name
+ float threshold; // filtering threshold
+ int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types
+ int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*])
+ void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
+ int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *);
+ void *hash; // test presence of str value in the hash via comparator
+ regex_t *regex; // precompiled regex for string comparison
+
+ // modified on filter evaluation at each VCF line
+ float *values; // In case str_value is set, values[0] is one sample's string length
+ char *str_value; // and values[0]*nsamples gives the total length;
+ int is_str, is_missing; // is_missing is set only for constants, variables are controlled via nvalues
+ int pass_site; // -1 not applicable, 0 fails, >0 pass
+ uint8_t *pass_samples; // status of individual samples
+ int nsamples; // number of samples
+ int nvalues, mvalues; // number of used values, n=0 for missing values, n=1 for scalars
+ // for strings, total length of str_value
+}
+token_t;
+
+struct _filter_t
+{
+ bcf_hdr_t *hdr;
+ char *str;
+ int nfilters;
+ token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack
+ int32_t *tmpi;
+ int max_unpack, mtmpi, nsamples;
+};
+
+
+#define TOK_VAL 0
+#define TOK_LFT 1 // (
+#define TOK_RGT 2 // )
+#define TOK_LE 3 // less or equal
+#define TOK_LT 4 // less than
+#define TOK_EQ 5 // equal
+#define TOK_BT 6 // bigger than
+#define TOK_BE 7 // bigger or equal
+#define TOK_NE 8 // not equal
+#define TOK_OR 9 // |
+#define TOK_AND 10 // &
+#define TOK_ADD 11 // +
+#define TOK_SUB 12 // -
+#define TOK_MULT 13 // *
+#define TOK_DIV 14 // /
+#define TOK_MAX 15
+#define TOK_MIN 16
+#define TOK_AVG 17
+#define TOK_AND_VEC 18 // && (operator applied in samples)
+#define TOK_OR_VEC 19 // || (operator applied in samples)
+#define TOK_LIKE 20 // ~ regular expression
+#define TOK_NLIKE 21 // !~ regular expression
+#define TOK_SUM 22
+#define TOK_ABS 23
+#define TOK_LEN 24
+#define TOK_FUNC 25
+
+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
+// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8};
+#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^f"
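+// Higher op_prec binds tighter: e.g. '1+2*3' parses as '1+(2*3)' because '*' (prec 7) outranks '+'
+// (prec 6), and in 'DP>10 & AF<0.5' (illustrative tags) both comparisons (prec 5) are applied before
+// '&' (prec 3).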
+
+static int filters_next_token(char **str, int *len)
+{
+ char *tmp = *str;
+ while ( *tmp && isspace(*tmp) ) tmp++;
+ *str = tmp;
+ *len = 0;
+
+ // test for doubles: d.ddde[+-]dd
+ if ( isdigit(*str[0]) || *str[0]=='.' ) // strtod would eat +/-
+ {
+ double HTS_UNUSED v = strtod(*str, &tmp);
+ if ( *str!=tmp && (!tmp[0] || !isalnum(tmp[0])) )
+ {
+ *len = tmp - (*str);
+ return TOK_VAL;
+ }
+ tmp = *str;
+ }
+
+ if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; }
+ if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; }
+ if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; }
+ if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; }
+ if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; }
+ if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; }
+ if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility
+ if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility
+ if ( !strncasecmp(tmp,"%AVG(",5) ) { (*str) += 4; return TOK_AVG; } // for backward compatibility
+ if ( !strncasecmp(tmp,"%SUM(",5) ) { (*str) += 4; return TOK_SUM; } // for backward compatibility
+ if ( !strncasecmp(tmp,"INFO/",5) ) tmp += 5;
+ if ( !strncasecmp(tmp,"FORMAT/",7) ) tmp += 7;
+ if ( !strncasecmp(tmp,"FMT/",4) ) tmp += 4;
+
+ if ( tmp[0]=='@' ) // file name
+ {
+ while ( *tmp && !isspace(*tmp) && *tmp!='=' && *tmp!='!' ) tmp++;
+ *len = tmp - (*str);
+ return TOK_VAL;
+ }
+
+ while ( tmp[0] )
+ {
+ if ( tmp[0]=='"' ) break;
+ if ( tmp[0]=='\'' ) break;
+ if ( isspace(tmp[0]) ) break;
+ if ( tmp[0]=='<' ) break;
+ if ( tmp[0]=='>' ) break;
+ if ( tmp[0]=='=' ) break;
+ if ( tmp[0]=='!' ) break;
+ if ( tmp[0]=='&' ) break;
+ if ( tmp[0]=='|' ) break;
+ if ( tmp[0]=='(' ) break;
+ if ( tmp[0]==')' ) break;
+ if ( tmp[0]=='+' ) break;
+ // hacky: so that [*] is not split, the tokenizer does not recognise square brackets []
+ if ( tmp[0]=='*' && (tmp==*str || tmp[-1]!='[') ) break;
+ if ( tmp[0]=='-' ) break;
+ if ( tmp[0]=='/' ) break;
+ if ( tmp[0]=='~' ) break;
+ tmp++;
+ }
+ if ( tmp > *str )
+ {
+ *len = tmp - (*str);
+ return TOK_VAL;
+ }
+ if ( tmp[0]=='"' || tmp[0]=='\'' )
+ {
+ int quote = tmp[0];
+ tmp++;
+ while ( *tmp && tmp[0]!=quote ) tmp++;
+ if ( !*tmp ) return -1; // missing quotes
+ *len = tmp - (*str) + 1;
+ return TOK_VAL;
+ }
+ if ( tmp[0]=='!' )
+ {
+ if ( tmp[1]=='=' ) { (*str) += 2; return TOK_NE; }
+ if ( tmp[1]=='~' ) { (*str) += 2; return TOK_NLIKE; }
+ }
+ if ( tmp[0]=='<' )
+ {
+ if ( tmp[1]=='=' ) { (*str) += 2; return TOK_LE; }
+ (*str) += 1; return TOK_LT;
+ }
+ if ( tmp[0]=='>' )
+ {
+ if ( tmp[1]=='=' ) { (*str) += 2; return TOK_BE; }
+ (*str) += 1; return TOK_BT;
+ }
+ if ( tmp[0]=='=' )
+ {
+ if ( tmp[1]=='=' ) { (*str) += 2; return TOK_EQ; }
+ (*str) += 1; return TOK_EQ;
+ }
+ if ( tmp[0]=='(' ) { (*str) += 1; return TOK_LFT; }
+ if ( tmp[0]==')' ) { (*str) += 1; return TOK_RGT; }
+ if ( tmp[0]=='&' && tmp[1]=='&' ) { (*str) += 2; return TOK_AND_VEC; }
+ if ( tmp[0]=='|' && tmp[1]=='|' ) { (*str) += 2; return TOK_OR_VEC; }
+ if ( tmp[0]=='&' ) { (*str) += 1; return TOK_AND; }
+ if ( tmp[0]=='|' ) { (*str) += 1; return TOK_OR; }
+ if ( tmp[0]=='+' ) { (*str) += 1; return TOK_ADD; }
+ if ( tmp[0]=='-' ) { (*str) += 1; return TOK_SUB; }
+ if ( tmp[0]=='*' ) { (*str) += 1; return TOK_MULT; }
+ if ( tmp[0]=='/' ) { (*str) += 1; return TOK_DIV; }
+ if ( tmp[0]=='~' ) { (*str) += 1; return TOK_LIKE; }
+
+ *len = tmp - (*str);
+ return TOK_VAL;
+}
+
+static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ float *ptr = &line->qual;
+ if ( bcf_float_is_missing(*ptr) )
+ tok->nvalues = 0;
+ else
+ {
+ tok->values[0] = line->qual;
+ tok->nvalues = 1;
+ }
+}
+static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ tok->values[0] = bcf_get_variant_types(line);
+ tok->nvalues = 1;
+}
+static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ assert( tok->hdr_id >=0 );
+ int i;
+ for (i=0; i<line->n_info; i++)
+ if ( line->d.info[i].key == tok->hdr_id ) break;
+
+ if ( i==line->n_info )
+ tok->nvalues = 0;
+ else if ( line->d.info[i].type==BCF_BT_CHAR )
+ {
+ int n = line->d.info[i].len;
+ int m = (int)tok->values[0];
+ hts_expand(char,n+1,m,tok->str_value);
+ memcpy(tok->str_value,line->d.info[i].vptr,n);
+ tok->str_value[n] = 0;
+ tok->values[0] = m;
+ tok->nvalues = n;
+ }
+ else if ( line->d.info[i].type==BCF_BT_FLOAT )
+ {
+ if ( bcf_float_is_missing(line->d.info[i].v1.f) ) tok->nvalues = 0;
+ else
+ {
+ tok->values[0] = line->d.info[i].v1.f;
+ tok->nvalues = 1;
+ }
+ tok->str_value = NULL;
+ }
+ else
+ {
+ if ( line->d.info[i].type==BCF_BT_INT8 && line->d.info[i].v1.i==bcf_int8_missing ) tok->nvalues = 0;
+ else if ( line->d.info[i].type==BCF_BT_INT16 && line->d.info[i].v1.i==bcf_int16_missing ) tok->nvalues = 0;
+ else if ( line->d.info[i].type==BCF_BT_INT32 && line->d.info[i].v1.i==bcf_int32_missing ) tok->nvalues = 0;
+ else
+ {
+ tok->values[0] = line->d.info[i].v1.i;
+ tok->nvalues = 1;
+ }
+ tok->str_value = NULL;
+ }
+}
+static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+{
+ int i;
+ if ( op_type==TOK_NE ) // AND logic: none of the filters can match
+ {
+ if ( !line->d.n_flt )
+ {
+ if ( atok->hdr_id==-1 ) return 0; // missing value
+ return 1; // no filter present, eval to true
+ }
+ for (i=0; i<line->d.n_flt; i++)
+ if ( atok->hdr_id==line->d.flt[i] ) return 0;
+ return 1;
+ }
+ // TOK_EQ with OR logic: at least one of the filters must match
+ if ( !line->d.n_flt )
+ {
+ if ( atok->hdr_id==-1 ) return 1;
+ return 0; // no filter present, eval to false
+ }
+ for (i=0; i<line->d.n_flt; i++)
+ if ( atok->hdr_id==line->d.flt[i] ) return 1;
+ return 0;
+}
+static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+{
+ // multiple IDs not supported yet (easy to add though)
+
+ if ( btok->hash )
+ {
+ token_t *tmp = atok; atok = btok; btok = tmp;
+ }
+ if ( atok->hash )
+ {
+ int ret = khash_str2int_has_key(atok->hash, line->d.id);
+ if ( op_type==TOK_EQ ) return ret;
+ return ret ? 0 : 1;
+ }
+
+ if ( op_type==TOK_EQ ) return strcmp(btok->str_value,line->d.id) ? 0 : 1;
+ return strcmp(btok->str_value,line->d.id) ? 1 : 0;
+}
+
+/**
+ * bcf_get_info_value() - get single INFO value, int or float
+ * @line: BCF line
+ * @info_id: tag ID, as returned by bcf_hdr_id2int
+ * @ivec: 0-based index to retrieve, -1 when single value is expected
+ * @value: pointer to memory location of sufficient size to accommodate
+ * info_id's type
+ *
+ * The returned value is -1 if the tag is not present, 0 if present but
+ * the value is missing or ivec is out of range, and 1 on success.
+ */
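+/*
+ * Minimal usage sketch ("DP" is only an example, assumed to be declared as a single-value integer INFO tag):
+ *     int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "DP");
+ *     int dp;
+ *     int ret = bcf_get_info_value(line, id, -1, &dp);   // 1 ok, 0 value missing, -1 tag absent
+ */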
+static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value)
+{
+ int j;
+ for (j=0; j<line->n_info; j++)
+ if ( line->d.info[j].key == info_id ) break;
+ if ( j==line->n_info ) return -1;
+
+ bcf_info_t *info = &line->d.info[j];
+ if ( info->len == 1 )
+ {
+ if ( info->type==BCF_BT_FLOAT ) *((float*)value) = info->v1.f;
+ else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int*)value) = info->v1.i;
+ return 1;
+ }
+
+ if ( ivec<0 ) ivec = 0;
+
+ #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
+ type_t *p = (type_t *) info->vptr; \
+ for (j=0; j<ivec && j<info->len; j++) \
+ { \
+ if ( is_vector_end ) return 0; \
+ } \
+ if ( is_missing ) return 0; \
+ *((out_type_t*)value) = p[j]; \
+ return 1; \
+ }
+ switch (info->type) {
+ case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int); break;
+ case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int); break;
+ case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), float); break;
+ default: fprintf(pysamerr,"todo: type %d\n", info->type); exit(1); break;
+ }
+ #undef BRANCH
+ return -1; // this shouldn't happen
+}
+
+static void filters_set_pos(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ tok->values[0] = line->pos+1;
+ tok->nvalues = 1;
+}
+
+static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ if ( tok->idx==-2 )
+ {
+ int i, n = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
+ tok->nvalues = n;
+ hts_expand(float,n,tok->mvalues,tok->values);
+ for (i=0; i<n; i++) tok->values[i] = flt->tmpi[i];
+ }
+ else
+ {
+ int32_t value;
+ if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
+ tok->nvalues = 0;
+ else
+ {
+ tok->values[0] = value;
+ tok->nvalues = 1;
+ }
+ }
+}
+
+static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ if ( tok->idx==-2 )
+ {
+ tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues);
+ if ( tok->nvalues<0 ) tok->nvalues = 0;
+ }
+ else
+ {
+ float value;
+ if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
+ tok->nvalues = 0;
+ else
+ {
+ tok->values[0] = value;
+ tok->nvalues = 1;
+ }
+ }
+}
+
+static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ int m = (int)tok->values[0];
+ int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value,&m);
+ if ( n<0 ) { tok->nvalues = 0; return; }
+ tok->values[0] = m; // allocated length
+
+ if ( tok->idx>=0 )
+ {
+ // get ith field (i=tok->idx)
+ int i = 0;
+ char *ss = tok->str_value, *se = tok->str_value + n;
+ while ( ss<se && i<tok->idx )
+ {
+ if ( *ss==',' ) i++;
+ ss++;
+ }
+ if ( ss==se || i!=tok->idx ) { tok->nvalues = 0; return; }
+ se = ss;
+ while ( se-tok->str_value<n && *se!=',' ) se++;
+ if ( ss==tok->str_value ) *se = 0;
+ else
+ {
+ memmove(tok->str_value,ss,se-ss);
+ tok->str_value[se-ss] = 0;
+ }
+ tok->nvalues = se-ss;
+ }
+ else if ( tok->idx==-2 ) tok->nvalues = n;
+}
+
+static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ int j;
+ for (j=0; j<line->n_info; j++)
+ if ( line->d.info[j].key == tok->hdr_id ) break;
+ tok->values[0] = j==line->n_info ? 0 : 1;
+ tok->nvalues = 1;
+}
+
+static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ int i;
+ if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
+ tok->nvalues = 0;
+ else
+ {
+ int is_missing = 1;
+ hts_expand(float,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end )
+ bcf_float_set_missing(tok->values[i]);
+ else
+ {
+ tok->values[i] = flt->tmpi[i];
+ is_missing = 0;
+ }
+ }
+ if ( is_missing ) tok->nvalues = 0;
+ else if ( tok->idx >= 0 )
+ {
+ int nsmpl = bcf_hdr_nsamples(flt->hdr);
+ int nvals = tok->nvalues / nsmpl;
+ if ( tok->idx >= nvals )
+ tok->nvalues = 0; // the index is too big
+ else
+ {
+ for (i=0; i<nsmpl; i++)
+ tok->values[i] = tok->values[i*nvals+tok->idx];
+ tok->nvalues = nsmpl;
+ }
+ }
+ }
+ tok->nsamples = tok->nvalues;
+}
+static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues))<=0 )
+ tok->nvalues = tok->nsamples = 0; // missing values
+ else if ( tok->idx >= 0 )
+ {
+ int i, nsmpl, nvals;
+ nsmpl = bcf_hdr_nsamples(flt->hdr);
+ nvals = tok->nvalues / nsmpl;
+ if ( tok->idx >= nvals )
+ tok->nsamples = tok->nvalues = 0; // the index is too big
+ else
+ {
+ for (i=0; i<nsmpl; i++)
+ tok->values[i] = tok->values[i*nvals+tok->idx];
+ tok->nsamples = tok->nvalues = nsmpl;
+ }
+ }
+ tok->nsamples = tok->nvalues;
+}
+static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ int ndim = tok->nsamples * (int)tok->values[0];
+ int ret = bcf_get_format_char(flt->hdr,line,tok->tag,&tok->str_value,&ndim);
+
+ int nsmpl = bcf_hdr_nsamples(flt->hdr);
+ ndim /= nsmpl;
+ tok->values[0] = ndim;
+
+ if ( ret<=0 )
+ {
+ tok->nvalues = 0;
+ return;
+ }
+
+ if ( tok->idx < 0 ) // scalar
+ {
+ tok->nvalues = tok->nsamples = nsmpl;
+ return;
+ }
+
+ // vector
+ int i;
+ for (i=0; i<nsmpl; i++)
+ {
+ char *ss = tok->str_value + i*ndim;
+ int is = 0, ivec = 0;
+ while ( ivec<tok->idx && is<ndim && ss[is] )
+ {
+ if ( ss[is]==',' ) ivec++;
+ is++;
+ }
+ if ( ivec!=tok->idx || is==ndim || !ss[is] )
+ {
+ ss[0] = '.';
+ ss[1] = 0;
+ continue;
+ }
+ int ie = is;
+ while ( ie<ndim && ss[ie] && ss[ie]!=',' ) ie++;
+ if ( is ) memmove(ss,&ss[is],ie-is);
+ if ( ndim-(ie-is) ) memset(ss+ie-is,0,ndim-(ie-is));
+ }
+ if ( !ndim )
+ {
+ tok->nvalues = 0;
+ return;
+ }
+ tok->nvalues = ret;
+ tok->nsamples = nsmpl;
+}
+static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
+ if ( !fmt )
+ {
+ tok->nvalues = tok->nsamples = 0;
+ return;
+ }
+ int i, blen = 3, nsmpl = bcf_hdr_nsamples(flt->hdr);
+ kstring_t str;
+
+gt_length_too_big:
+ str.s = tok->str_value; str.m = tok->values[0] * nsmpl; str.l = 0;
+ for (i=0; i<nsmpl; i++)
+ {
+ int plen = str.l;
+
+ #define BRANCH(type_t) { \
+ type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
+ if ( !(ptr[0]>>1) ) kputc('.',&str); \
+ }
+ switch (fmt->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(pysamerr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break;
+ }
+ #undef BRANCH
+
+ if ( plen==str.l )
+ {
+ bcf_format_gt(fmt, i, &str);
+ if ( str.l - plen > blen )
+ {
+ // too many alternate alleles or ploidy is too large, the genotype does not fit
+ // three characters ("0/0" vs "10/10").
+ tok->str_value = str.s;
+ blen *= 2;
+ goto gt_length_too_big;
+ }
+ }
+
+ plen = str.l - plen;
+ while ( plen<blen )
+ {
+ kputc_(0, &str);
+ plen++;
+ }
+ }
+ tok->nvalues = str.l;
+ tok->nsamples = nsmpl;
+ tok->values[0] = blen;
+ tok->str_value = str.s;
+}
+static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0;
+ kputs(line->d.allele[0], &str);
+ tok->nvalues = str.l;
+ tok->values[0] = str.m;
+ tok->str_value = str.s;
+}
+static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0;
+ if ( tok->idx>=0 )
+ {
+ if ( line->n_allele >= tok->idx )
+ kputs(line->d.allele[tok->idx], &str);
+ else
+ kputc('.', &str);
+ }
+ else if ( line->n_allele>1 )
+ {
+ kputs(line->d.allele[1], &str);
+ int i;
+ for (i=2; i<line->n_allele; i++)
+ {
+ kputc(',', &str);
+ kputs(line->d.allele[i], &str);
+ }
+ }
+ else if ( line->n_allele==1 )
+ kputc('.', &str);
+ tok->nvalues = str.l;
+ tok->values[0] = str.m;
+ tok->str_value = str.s;
+}
+static void filters_set_nalt(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ tok->nvalues = 1;
+ tok->values[0] = line->n_allele - 1;
+}
+static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ hts_expand(int32_t, line->n_allele, flt->mtmpi, flt->tmpi);
+ if ( !bcf_calc_ac(flt->hdr, line, flt->tmpi, BCF_UN_INFO|BCF_UN_FMT) )
+ {
+ tok->nvalues = 0;
+ return;
+ }
+ int i, an = flt->tmpi[0];
+ for (i=1; i<line->n_allele; i++) an += flt->tmpi[i];
+ if ( !an )
+ {
+ tok->nvalues = 0;
+ return;
+ }
+ flt->tmpi[0] = an; // for filters_set_[mac|af|maf]
+ if ( tok->idx>=0 )
+ {
+ tok->nvalues = 1;
+ tok->values[0] = tok->idx+1<line->n_allele ? flt->tmpi[tok->idx+1] : 0;
+ }
+ else if ( line->n_allele==1 ) // no ALT
+ {
+ tok->nvalues = 1;
+ tok->values[0] = 0;
+ }
+ else
+ {
+ hts_expand(float,line->n_allele,tok->mvalues,tok->values);
+ for (i=1; i<line->n_allele; i++)
+ tok->values[i-1] = flt->tmpi[i];
+ tok->nvalues = line->n_allele - 1;
+ }
+}
+static void filters_set_an(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ filters_set_ac(flt,line,tok);
+ tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0;
+ tok->nvalues = 1;
+}
+static void filters_set_mac(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ filters_set_ac(flt,line,tok);
+ if ( !tok->nvalues ) return;
+ int i, an = flt->tmpi[0];
+ for (i=0; i<tok->nvalues; i++)
+ if ( tok->values[i] > an*0.5 ) tok->values[i] = an - tok->values[i];
+}
+static void filters_set_af(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ filters_set_ac(flt,line,tok);
+ if ( !tok->nvalues ) return;
+ int i, an = flt->tmpi[0];
+ for (i=0; i<tok->nvalues; i++)
+ tok->values[i] /= (float)an;
+}
+static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ filters_set_ac(flt,line,tok);
+ if ( !tok->nvalues ) return;
+ int i, an = flt->tmpi[0];
+ for (i=0; i<tok->nvalues; i++)
+ {
+ tok->values[i] /= (float)an;
+ if ( tok->values[i] > 0.5 ) tok->values[i] = 1 - tok->values[i];
+ }
+}
+
+static void set_max(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ float val = -HUGE_VAL;
+ int i;
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( !bcf_float_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
+ }
+ tok->values[0] = val;
+ tok->nvalues = 1;
+ tok->nsamples = 0;
+}
+static void set_min(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ float val = HUGE_VAL;
+ int i;
+ for (i=0; i<tok->nvalues; i++)
+ if ( !bcf_float_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
+ tok->values[0] = val;
+ tok->nvalues = 1;
+ tok->nsamples = 0;
+}
+static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ float val = 0;
+ int i, n = 0;
+ for (i=0; i<tok->nvalues; i++)
+ if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
+ tok->values[0] = n ? val / n : 0;
+ tok->nvalues = 1;
+ tok->nsamples = 0;
+}
+static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ float val = 0;
+ int i, n = 0;
+ for (i=0; i<tok->nvalues; i++)
+ if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
+ tok->values[0] = val;
+ tok->nvalues = 1;
+ tok->nsamples = 0;
+}
+static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ if ( tok->is_str ) error("ABS() can be applied only on numeric values\n");
+ int i;
+ for (i=0; i<tok->nvalues; i++)
+ tok->values[i] = fabs(tok->values[i]);
+}
+static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+ tok->is_str = 0;
+ if ( !tok->nvalues ) return;
+ if ( tok->idx==-2 )
+ {
+ int i = 0;
+ char *ss = tok->str_value;
+ while ( *ss )
+ {
+ char *se = ss;
+ while ( *se && *se!=',' ) se++;
+ if ( !*se ) tok->values[i] = strlen(ss);
+ else
+ {
+ *se = 0;
+ tok->values[i] = strlen(ss);
+ *se = ',';
+ }
+ ss = *se ? se + 1 : se;
+ i++;
+ }
+ tok->nvalues = i;
+ }
+ else
+ {
+ tok->values[0] = strlen(tok->str_value);
+ tok->nvalues = 1;
+ }
+}
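+
+/*
+ * Illustrative example: for a comma-separated string tag queried with the
+ * [*] subscript, e.g. the value "A,CT,GGG", set_strlen() above produces the
+ * numeric vector [1,2,3]; for a scalar string it produces a single length.
+ */
+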
+#define VECTOR_ARITHMETICS(atok,btok,AOP) \
+{ \
+ int i, has_values = 0; \
+ if ( !(atok)->nvalues || !(btok)->nvalues ) /* missing values */ \
+ { \
+ (atok)->nvalues = 0; (atok)->nsamples = 0; \
+ } \
+ else \
+ { \
+ if ( ((atok)->nsamples && (btok)->nsamples) || (!(atok)->nsamples && !(btok)->nsamples)) \
+ { \
+ for (i=0; i<(atok)->nvalues; i++) \
+ { \
+ if ( bcf_float_is_missing((atok)->values[i]) ) continue; \
+ if ( bcf_float_is_missing((btok)->values[i]) ) { bcf_float_set_missing((atok)->values[i]); continue; } \
+ has_values = 1; \
+ (atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \
+ } \
+ } \
+ else if ( (btok)->nsamples ) \
+ { \
+ hts_expand(float,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
+ for (i=0; i<(btok)->nvalues; i++) \
+ { \
+ if ( bcf_float_is_missing((atok)->values[0]) || bcf_float_is_missing((btok)->values[i]) ) \
+ { \
+ bcf_float_set_missing((atok)->values[i]); \
+ continue; \
+ } \
+ has_values = 1; \
+ (atok)->values[i] = (atok)->values[0] AOP (btok)->values[i]; \
+ } \
+ (atok)->nvalues = (btok)->nvalues; \
+ (atok)->nsamples = (btok)->nsamples; \
+ } \
+ else if ( (atok)->nsamples ) \
+ { \
+ for (i=0; i<(atok)->nvalues; i++) \
+ { \
+ if ( bcf_float_is_missing((atok)->values[i]) || bcf_float_is_missing((btok)->values[0]) ) \
+ { \
+ bcf_float_set_missing((atok)->values[i]); \
+ continue; \
+ } \
+ has_values = 1; \
+ (atok)->values[i] = (atok)->values[i] AOP (btok)->values[0]; \
+ } \
+ } \
+ } \
+ if ( !has_values ) { (atok)->nvalues = 0; (atok)->nsamples = 0; } \
+}
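+
+/*
+ * Illustrative example of the broadcasting done by VECTOR_ARITHMETICS: adding
+ * a site-level scalar 1 to the per-sample vector [2,4,.] ('.' = missing)
+ * gives [3,5,.]; combining a scalar with a per-sample vector promotes the
+ * result to a per-sample vector, and if either operand has no values at all
+ * the result is empty.
+ */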
+
+static int vector_logic_and(token_t *atok, token_t *btok, int and_type)
+{
+ // We are comparing either two scalars (result of INFO tag vs a threshold), two vectors (two FORMAT fields),
+ // or a vector and a scalar (FORMAT field vs threshold)
+ int i, pass_site = 0;
+ if ( !atok->nvalues || !btok->nvalues )
+ {
+ atok->nvalues = atok->nsamples = 0;
+ return 0;
+ }
+ if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site && btok->pass_site;
+ if ( atok->nsamples && btok->nsamples )
+ {
+ if ( and_type==TOK_AND )
+ {
+ // perform AND within a sample
+ for (i=0; i<atok->nsamples; i++)
+ {
+ atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i];
+ if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ }
+ }
+ else
+ {
+ // perform AND across samples
+ int pass_a = 0, pass_b = 0;
+ for (i=0; i<atok->nsamples; i++)
+ {
+ if ( atok->pass_samples[i] ) pass_a = 1;
+ atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i];
+ }
+ for (i=0; i<btok->nsamples; i++)
+ {
+ if ( btok->pass_samples[i] ) { pass_b = 1; break; }
+ }
+ pass_site = pass_a && pass_b;
+ }
+ return pass_site;
+ }
+ if ( btok->nsamples )
+ {
+ for (i=0; i<btok->nsamples; i++)
+ {
+ atok->pass_samples[i] = atok->pass_site && btok->pass_samples[i];
+ if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ }
+ atok->nsamples = btok->nsamples;
+ return pass_site;
+ }
+ /* atok->nsamples!=0 */
+ for (i=0; i<atok->nsamples; i++)
+ {
+ atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_site;
+ if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ }
+ return pass_site;
+}
+static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
+{
+ int i, pass_site = 0;
+ if ( !atok->nvalues && !btok->nvalues ) // missing sites in both
+ {
+ atok->nvalues = atok->nsamples = 0;
+ return 0;
+ }
+ if ( !atok->nvalues ) // missing value in a
+ {
+ for (i=0; i<btok->nsamples; i++)
+ atok->pass_samples[i] = btok->pass_samples[i];
+ atok->nsamples = btok->nsamples;
+ return btok->pass_site;
+ }
+ if ( !btok->nvalues ) // missing value in b
+ return atok->pass_site;
+
+ if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site;
+ if ( !atok->nsamples )
+ {
+ if ( or_type==TOK_OR )
+ {
+ for (i=0; i<btok->nsamples; i++)
+ {
+ atok->pass_samples[i] = btok->pass_samples[i];
+ if ( atok->pass_site || atok->pass_samples[i] ) pass_site = 1;
+ }
+ }
+ else
+ {
+ for (i=0; i<btok->nsamples; i++)
+ {
+ atok->pass_samples[i] = atok->pass_site || btok->pass_samples[i];
+ if ( atok->pass_samples[i] ) pass_site = 1;
+ }
+ }
+ atok->nsamples = btok->nsamples;
+ return pass_site;
+ }
+ if ( !btok->nsamples ) // vector vs site
+ {
+ if ( or_type==TOK_OR )
+ {
+ for (i=0; i<atok->nsamples; i++)
+ if ( btok->pass_site || atok->pass_samples[i] ) pass_site = 1;
+ }
+ else
+ {
+ for (i=0; i<atok->nsamples; i++)
+ {
+ atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_site;
+ if ( atok->pass_samples[i] ) pass_site = 1;
+ }
+ }
+ return pass_site;
+ }
+ for (i=0; i<atok->nsamples; i++)
+ {
+ atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_samples[i];
+ if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ }
+ return pass_site;
+}
+
+#define CMP_MISSING(atok,btok,CMP_OP,ret) \
+{ \
+ if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \
+ token_t *tok = (atok)->is_missing ? (btok) : (atok); \
+ (ret) = ( tok->nvalues CMP_OP 1 ) ? 0 : 1; \
+}
+
+#define CMP_VECTORS(atok,btok,CMP_OP,ret) \
+{ \
+ int i, j, has_values = 0, pass_site = 0; \
+ if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
+ else \
+ { \
+ if ( (atok)->nsamples && (btok)->nsamples ) \
+ { \
+ for (i=0; i<(atok)->nsamples; i++) \
+ { \
+ if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
+ if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
+ has_values = 1; \
+ if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
+ } \
+ if ( !has_values ) (atok)->nvalues = 0; \
+ } \
+ else if ( (atok)->nsamples ) \
+ { \
+ if ( bcf_float_is_missing((btok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
+ else \
+ { \
+ for (i=0; i<(atok)->nsamples; i++) \
+ { \
+ if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
+ has_values = 1; \
+ if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
+ } \
+ } \
+ if ( !has_values ) (atok)->nvalues = 0; \
+ } \
+ else if ( (btok)->nsamples ) \
+ { \
+ if ( bcf_float_is_missing((atok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
+ else \
+ { \
+ for (i=0; i<(btok)->nsamples; i++) \
+ { \
+ if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
+ has_values = 1; \
+ if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
+ } \
+ (atok)->nvalues = (btok)->nvalues; \
+ (atok)->nsamples = (btok)->nsamples; \
+ } \
+ if ( !has_values ) (atok)->nvalues = 0; \
+ } \
+ else if ( (atok)->idx==-2 || (btok)->idx==-2 ) \
+ { \
+ /* any field can match: [*] */ \
+ for (i=0; i<(atok)->nvalues; i++) \
+ { \
+ for (j=0; j<(btok)->nvalues; j++) \
+ if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
+ } \
+ } \
+ else \
+ { \
+ if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \
+ } \
+ /*fprintf(pysamerr,"pass=%d\n", pass_site);*/ \
+ (ret) = pass_site; \
+ } \
+}
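+
+/*
+ * Illustrative example: comparing the per-sample vector [10,.,30] against the
+ * scalar 20 with the > operator sets pass_samples to [0,0,1] (missing values
+ * never pass) and the site passes because at least one sample does; for two
+ * site-level multi-value tags where one side uses the [*] subscript, a single
+ * matching pair is enough to pass the site.
+ */
+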
+static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // logic: TOK_EQ or TOK_NE
+{
+ if ( !atok->nvalues ) { return 0; }
+ if ( !btok->nvalues ) { atok->nvalues = 0; return 0; }
+ int i, pass_site = 0;
+ if ( atok->nsamples && atok->nsamples==btok->nsamples )
+ {
+ for (i=0; i<atok->nsamples; i++)
+ {
+ char *astr = atok->str_value + i*(int)atok->values[0];
+ char *bstr = btok->str_value + i*(int)btok->values[0];
+ char *aend = astr + (int)atok->values[0], *a = astr;
+ while ( a<aend && *a ) a++;
+ char *bend = bstr + (int)btok->values[0], *b = bstr;
+ while ( b<bend && *b ) b++;
+ if ( a-astr != b-bstr ) atok->pass_samples[i] = 0;
+ else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0;
+ if ( logic!=TOK_EQ )
+ atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
+ pass_site |= atok->pass_samples[i];
+ }
+ if ( !atok->nsamples ) atok->nsamples = btok->nsamples;
+ }
+ else if ( !atok->nsamples && !btok->nsamples )
+ {
+ if ( atok->idx==-2 || btok->idx==-2 )
+ {
+ // any field can match: [*]
+ if ( atok->idx==-2 && btok->idx==-2 )
+ error("fixme: Expected at least one scalar value [%s %s %s]\n", atok->tag ? atok->tag : btok->tag, atok->str_value,btok->str_value);
+ token_t *xtok, *ytok; // xtok is scalar, ytok array
+ if ( btok->idx==-2 ) { xtok = atok; ytok = btok; }
+ else { xtok = btok; ytok = atok; }
+ char *xstr = xtok->str_value, *xend = xstr + xtok->nvalues;
+ char *ystr = ytok->str_value, *yend = ystr + ytok->nvalues, *y = ystr;
+ while ( y<=yend )
+ {
+ if ( y==yend || *y==',' )
+ {
+ if ( y-ystr==xend-xstr && !strncmp(xstr,ystr,xend-xstr) )
+ {
+ pass_site = 1;
+ break;
+ }
+ ystr = y+1;
+ }
+ y++;
+ }
+ }
+ else
+ pass_site = strcmp(atok->str_value,btok->str_value) ? 0 : 1;
+ if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
+ }
+ else
+ {
+ token_t *xtok, *ytok;
+ if ( !atok->nsamples ) { xtok = atok; ytok = btok; }
+ else { xtok = btok; ytok = atok; }
+ char *xstr = xtok->str_value;
+ char *xend = xstr + (int)xtok->values[0], *x = xstr;
+ while ( x<xend && *x ) x++;
+ for (i=0; i<ytok->nsamples; i++)
+ {
+ char *ystr = ytok->str_value + i*(int)ytok->values[0];
+ char *yend = ystr + (int)ytok->values[0], *y = ystr;
+ while ( y<yend && *y ) y++;
+ if ( x-xstr != y-ystr ) atok->pass_samples[i] = 0;
+ else atok->pass_samples[i] = strncmp(xstr,ystr,x-xstr)==0 ? 1 : 0;
+ if ( logic!=TOK_EQ )
+ atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
+ pass_site |= atok->pass_samples[i];
+ }
+ if ( !atok->nsamples )
+ atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set
+ }
+ return pass_site;
+}
+static int regex_vector_strings(token_t *atok, token_t *btok)
+{
+ int ret = regexec(btok->regex, atok->str_value, 0,NULL,0);
+ return ret==0 ? 1 : 0;
+}
+
+static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
+{
+ tok->tok_type = TOK_VAL;
+ tok->hdr_id = -1;
+ tok->pass_site = -1;
+ tok->idx = -1;
+
+ // is this a string constant?
+ if ( str[0]=='"' || str[0]=='\'' )
+ {
+ int quote = str[0];
+ if ( str[len-1] != quote ) error("TODO: [%s]\n", filter->str);
+ tok->key = (char*) calloc(len-1,sizeof(char));
+ hts_expand(float,1,tok->mvalues,tok->values);
+ tok->values[0] = len-2;
+ memcpy(tok->key,str+1,len-2);
+ tok->key[len-2] = 0;
+ tok->is_str = 1;
+ tok->nvalues = len-2;
+ if ( !strcmp(".",tok->key) ) tok->is_missing = 1;
+ return 0;
+ }
+
+ // is it a file?
+ if ( str[0]=='@' )
+ {
+ tok->tag = (char*) calloc(len+1,sizeof(char));
+ memcpy(tok->tag,str,len);
+ tok->tag[len] = 0;
+ wordexp_t wexp;
+ wordexp(tok->tag+1, &wexp, 0);
+ if ( !wexp.we_wordc ) error("No such file: %s\n", tok->tag+1);
+ int i, n;
+ char **list = hts_readlist(wexp.we_wordv[0], 1, &n);
+ if ( !list ) error("Could not read: %s\n", wexp.we_wordv[0]);
+ wordfree(&wexp);
+ tok->hash = khash_str2int_init();
+ for (i=0; i<n; i++)
+ {
+ char *se = list[i];
+ while ( *se && !isspace(*se) ) se++;
+ *se = 0;
+ if ( !khash_str2int_has_key(tok->hash,list[i]) )
+ khash_str2int_inc(tok->hash,list[i]);
+ else
+ free(list[i]);
+ }
+ free(list);
+ return 0;
+ }
+
+ int is_fmt = -1;
+ if ( !strncasecmp(str,"FMT/",4) ) { str += 4; len -= 4; is_fmt = 1; }
+ else if ( !strncasecmp(str,"FORMAT/",7) ) { str += 7; len -= 7; is_fmt = 1; }
+ else
+ {
+ if ( !strncasecmp(str,"INFO/",5) ) { is_fmt = 0; str += 5; len -= 5; }
+ else if ( !strncasecmp(str,"QUAL",len) || !strncmp(str,"%QUAL",len) /* for backward compatibility */ )
+ {
+ tok->setter = filters_set_qual;
+ tok->tag = strdup("QUAL");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"TYPE",len) || !strncmp(str,"%TYPE",len) /* for backward compatibility */ )
+ {
+ tok->setter = filters_set_type;
+ tok->tag = strdup("TYPE");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"FILTER",len) || !strncmp(str,"%FILTER",len) /* for backward compatibility */ )
+ {
+ tok->comparator = filters_cmp_filter;
+ tok->tag = strdup("FILTER");
+ filter->max_unpack |= BCF_UN_FLT;
+ return 0;
+ }
+ else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ )
+ {
+ tok->comparator = filters_cmp_id;
+ tok->tag = strdup("ID");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"POS",len) )
+ {
+ tok->setter = &filters_set_pos;
+ tok->tag = strdup("POS");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"REF",len) )
+ {
+ tok->setter = &filters_set_ref_string;
+ tok->is_str = 1;
+ tok->tag = strdup("REF");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"ALT",len) )
+ {
+ tok->setter = &filters_set_alt_string;
+ tok->is_str = 1;
+ tok->tag = strdup("ALT");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"N_ALT",len) )
+ {
+ tok->setter = &filters_set_nalt;
+ tok->tag = strdup("N_ALT");
+ return 0;
+ }
+ else if ( !strncasecmp(str,"N_SAMPLES",len) )
+ {
+ tok->tok_type = TOK_VAL;
+ tok->threshold = bcf_hdr_nsamples(filter->hdr);
+ return 0;
+ }
+ }
+
+ // does it have array subscript?
+ int is_array = 0;
+ kstring_t tmp = {0,0,0};
+ kputsn(str, len, &tmp);
+ if ( tmp.s[tmp.l-1] == ']' )
+ {
+ int i;
+ for (i=0; i<tmp.l; i++)
+ if ( tmp.s[i]=='[' ) { tmp.s[i] = 0; is_array = i+1; break; }
+ if ( is_array )
+ {
+ if ( tmp.s[is_array]=='*' )
+ tok->idx = -2; // tag[*] .. any field
+ else
+ {
+ char *end;
+ tok->idx = strtol(tmp.s+is_array, &end, 10);
+ if ( *end!=']' ) error("Could not parse the index: %s[%s\n", tmp.s,tmp.s+is_array);
+ }
+ }
+ }
+ tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s);
+ if ( is_fmt==-1 )
+ {
+ if ( tok->hdr_id >=0 )
+ {
+ if ( bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) ) is_fmt = 0;
+ else if ( bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ) is_fmt = 1;
+ }
+ if ( is_fmt==-1 ) is_fmt = 0;
+ }
+ tok->type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
+ if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT;
+ if ( tok->hdr_id>=0 )
+ {
+ if ( is_fmt && !strcmp("GT",tmp.s) )
+ {
+ tok->setter = &filters_set_genotype_string; tok->is_str = 1;
+ }
+ else if ( is_fmt )
+ {
+ if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
+ error("No such FORMAT field: %s\n", tmp.s);
+ if ( bcf_hdr_id2number(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=1 && !is_array )
+ error("Error: FORMAT vectors must be subscripted, e.g. %s[0] or %s[*]\n", tmp.s, tmp.s);
+ switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
+ {
+ case BCF_HT_INT: tok->setter = &filters_set_format_int; break;
+ case BCF_HT_REAL: tok->setter = &filters_set_format_float; break;
+ case BCF_HT_STR: tok->setter = &filters_set_format_string; tok->is_str = 1; break;
+ default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
+ }
+ }
+ else if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) )
+ error("No such INFO field: %s\n", tmp.s);
+ else
+ {
+ if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_FLAG )
+ tok->setter = filters_set_info_flag;
+ else
+ {
+ if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_STR ) tok->is_str = 1;
+ if ( bcf_hdr_id2number(filter->hdr,BCF_HL_INFO,tok->hdr_id)==1 )
+ tok->setter = filters_set_info;
+ else
+ {
+ switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) )
+ {
+ case BCF_HT_INT: tok->setter = &filters_set_info_int; break;
+ case BCF_HT_REAL: tok->setter = &filters_set_info_float; break;
+ case BCF_HT_STR: tok->setter = &filters_set_info_string; tok->is_str = 1; break;
+ default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
+ }
+ if(!is_array) tok->idx = -2;
+ }
+ }
+ filter->max_unpack |= BCF_UN_INFO;
+ }
+ tok->tag = strdup(tmp.s);
+ if ( tmp.s ) free(tmp.s);
+ return 0;
+ }
+ else if ( !strcasecmp(tmp.s,"ALT") )
+ {
+ tok->setter = &filters_set_alt_string;
+ tok->is_str = 1;
+ tok->tag = strdup(tmp.s);
+ free(tmp.s);
+ return 0;
+ }
+ else if ( !strcasecmp(tmp.s,"AN") )
+ {
+ tok->setter = &filters_set_an;
+ tok->tag = strdup("AN");
+ free(tmp.s);
+ return 0;
+ }
+ else if ( !strcasecmp(tmp.s,"AC") )
+ {
+ tok->setter = &filters_set_ac;
+ tok->tag = strdup("AC");
+ free(tmp.s);
+ return 0;
+ }
+ else if ( !strcasecmp(tmp.s,"MAC") )
+ {
+ tok->setter = &filters_set_mac;
+ tok->tag = strdup("MAC");
+ free(tmp.s);
+ return 0;
+ }
+ else if ( !strcasecmp(tmp.s,"AF") )
+ {
+ tok->setter = &filters_set_af;
+ tok->tag = strdup("AF");
+ free(tmp.s);
+ return 0;
+ }
+ else if ( !strcasecmp(tmp.s,"MAF") )
+ {
+ tok->setter = &filters_set_maf;
+ tok->tag = strdup("MAF");
+ free(tmp.s);
+ return 0;
+ }
+
+ // is it a value?
+ char *end;
+ errno = 0;
+ tok->threshold = strtod(tmp.s, &end);
+ if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+
+ if ( tmp.s ) free(tmp.s);
+ return 0;
+}
+
+
+static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks)
+{
+ int i;
+ for (i=0; i<ntoks; i++)
+ {
+ token_t *tok = toks ? &toks[i] : tok_ptrs[i];
+ if ( tok->tok_type==TOK_VAL )
+ {
+ if ( tok->key )
+ fprintf(pysamerr,"%s", tok->key);
+ else if ( tok->tag )
+ fprintf(pysamerr,"%s", tok->tag);
+ else
+ fprintf(pysamerr,"%e", tok->threshold);
+ }
+ else
+ fprintf(pysamerr,"%c", TOKEN_STRING[tok->tok_type]);
+ if ( tok->setter ) fprintf(pysamerr,"\t[setter %p]", tok->setter);
+ fprintf(pysamerr,"\n");
+ }
+}
+
+
+// Parse the filter expression and convert it to reverse Polish notation using Dijkstra's shunting-yard algorithm.
+filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
+{
+ filter_t *filter = (filter_t *) calloc(1,sizeof(filter_t));
+ filter->str = strdup(str);
+ filter->hdr = hdr;
+ filter->max_unpack |= BCF_UN_STR;
+
+ int nops = 0, mops = 0, *ops = NULL; // operators stack
+ int nout = 0, mout = 0; // filter tokens, RPN
+ token_t *out = NULL;
+ char *tmp = filter->str;
+ int last_op = -1;
+ while ( *tmp )
+ {
+ int len, ret;
+ ret = filters_next_token(&tmp, &len);
+ if ( ret==-1 ) error("Missing quotes in: %s\n", str);
+
+ //fprintf(pysamerr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
+ //int i; for (i=0; i<nops; i++) fprintf(pysamerr," .%c.", TOKEN_STRING[ops[i]]); fprintf(pysamerr,"\n");
+
+ if ( ret==TOK_LFT ) // left bracket
+ {
+ nops++;
+ hts_expand(int, nops, mops, ops);
+ ops[nops-1] = ret;
+ }
+ else if ( ret==TOK_RGT ) // right bracket
+ {
+ while ( nops>0 && ops[nops-1]!=TOK_LFT )
+ {
+ nout++;
+ hts_expand0(token_t, nout, mout, out);
+ out[nout-1].tok_type = ops[nops-1];
+ nops--;
+ }
+ if ( nops<=0 ) error("Could not parse: %s\n", str);
+ nops--;
+ }
+ else if ( ret!=TOK_VAL ) // one of the operators
+ {
+ // detect unary minus: replace -value with -1*(value)
+ if ( ret==TOK_SUB && last_op!=TOK_VAL && last_op!=TOK_RGT )
+ {
+ nout++;
+ hts_expand0(token_t, nout, mout, out);
+ token_t *tok = &out[nout-1];
+ tok->tok_type = TOK_VAL;
+ tok->hdr_id = -1;
+ tok->pass_site = -1;
+ tok->threshold = -1.0;
+ ret = TOK_MULT;
+ }
+ else
+ {
+ while ( nops>0 && op_prec[ret] < op_prec[ops[nops-1]] )
+ {
+ nout++;
+ hts_expand0(token_t, nout, mout, out);
+ out[nout-1].tok_type = ops[nops-1];
+ nops--;
+ }
+ }
+ nops++;
+ hts_expand(int, nops, mops, ops);
+ ops[nops-1] = ret;
+ }
+ else if ( !len )
+ {
+ if ( *tmp && !isspace(*tmp) ) error("Could not parse the expression: [%s]\n", str);
+ break; // all tokens read
+ }
+ else // annotation name or filtering value
+ {
+ nout++;
+ hts_expand0(token_t, nout, mout, out);
+ filters_init1(filter, tmp, len, &out[nout-1]);
+ tmp += len;
+ }
+ last_op = ret;
+ }
+ while ( nops>0 )
+ {
+ if ( ops[nops-1]==TOK_LFT || ops[nops-1]==TOK_RGT ) error("Could not parse the expression: [%s]\n", filter->str);
+ nout++;
+ hts_expand0(token_t, nout, mout, out);
+ out[nout-1].tok_type = ops[nops-1];
+ nops--;
+ }
+
+ // In the special cases of TYPE and FILTER the BCF header IDs are not yet known. Walk through the
+ // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be
+ // immediately before or after the TYPE/FILTER token and must be followed by a comparison operator.
+ // At this point we also initialize the regular expressions which, in RPN, must precede the LIKE/NLIKE operator.
+ // Additionally, treat "." as a missing value rather than a string in numeric equalities.
+ // This code is fragile: improve me.
+ int i;
+ for (i=0; i<nout; i++)
+ {
+ if ( out[i].tok_type==TOK_EQ || out[i].tok_type==TOK_NE )
+ {
+ // Look for j="." and k numeric type
+ int j = i-1, k = i-2;
+ if ( !out[j].is_str ) { k = i-1, j = i-2; }
+ if ( out[k].hdr_id>0 && out[j].is_str && !strcmp(".",out[j].key) )
+ {
+ int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
+ if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); }
+ if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); }
+ }
+ }
+ if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE )
+ {
+ int j = i-1;
+ if ( !out[j].key )
+ error("Could not parse the expression, wrong value for regex operator: %s\n", filter->str);
+ out[j].regex = (regex_t *) malloc(sizeof(regex_t));
+ if ( regcomp(out[j].regex, out[j].key, REG_ICASE|REG_NOSUB) )
+ error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str);
+ }
+ if ( out[i].tok_type!=TOK_VAL ) continue;
+ if ( !out[i].tag ) continue;
+ if ( !strcmp(out[i].tag,"TYPE") )
+ {
+ if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
+ int j = i+1;
+ if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1;
+ if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( !strcasecmp(out[j].key,"snp") || !strcasecmp(out[j].key,"snps") ) { out[j].threshold = VCF_SNP; out[j].is_str = 0; }
+ else if ( !strcasecmp(out[j].key,"indel") || !strcasecmp(out[j].key,"indels") ) { out[j].threshold = VCF_INDEL; out[j].is_str = 0; }
+ else if ( !strcasecmp(out[j].key,"mnp") || !strcasecmp(out[j].key,"mnps") ) { out[j].threshold = VCF_MNP; out[j].is_str = 0; }
+ else if ( !strcasecmp(out[j].key,"other") ) { out[j].threshold = VCF_OTHER; out[j].is_str = 0; }
+ else if ( !strcasecmp(out[j].key,"ref") ) { out[j].threshold = VCF_REF; out[j].is_str = 0; }
+ else error("The type \"%s\" not recognised: %s\n", out[j].key, filter->str);
+ out[j].tag = out[j].key; out[j].key = NULL;
+ i = j;
+ continue;
+ }
+ if ( !strcmp(out[i].tag,"FILTER") )
+ {
+ if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
+ int j = i+1;
+ if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; // the expression has "value"=FILTER rather than FILTER="value"
+ if ( out[j].tok_type==TOK_LIKE ) out[j].tok_type = TOK_EQ; // for FILTER, ~ and !~ work the same way as = and !=
+ if ( out[j].tok_type==TOK_NLIKE ) out[j].tok_type = TOK_NE;
+ if ( out[j+1].tok_type==TOK_LIKE ) out[j+1].tok_type = TOK_EQ;
+ if ( out[j+1].tok_type==TOK_NLIKE ) out[j+1].tok_type = TOK_NE;
+ if ( out[j].tok_type!=TOK_VAL || !out[j].key )
+ error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( strcmp(".",out[j].key) )
+ {
+ out[j].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[j].key);
+ if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[j].hdr_id) )
+ error("The filter \"%s\" not present in the VCF header\n", out[j].key);
+ }
+ else
+ out[j].hdr_id = -1;
+ out[j].tag = out[j].key; out[j].key = NULL;
+ out[i].hdr_id = out[j].hdr_id;
+ i = j;
+ continue;
+ }
+ }
+ filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0;
+ for (i=0; i<nout; i++)
+ {
+ if ( out[i].tok_type==TOK_MAX ) { out[i].setter = set_max; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_MIN ) { out[i].setter = set_min; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_AVG ) { out[i].setter = set_avg; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_SUM ) { out[i].setter = set_sum; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_ABS ) { out[i].setter = set_abs; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_LEN ) { out[i].setter = set_strlen; out[i].tok_type = TOK_FUNC; }
+ hts_expand0(float,1,out[i].mvalues,out[i].values);
+ if ( filter->nsamples )
+ {
+ out[i].pass_samples = (uint8_t*)malloc(filter->nsamples);
+ int j;
+ for (j=0; j<filter->nsamples; j++) out[i].pass_samples[j] = 1;
+ }
+ }
+
+ if (0) filter_debug_print(out, NULL, nout);
+
+ if ( mops ) free(ops);
+ filter->filters = out;
+ filter->nfilters = nout;
+ filter->flt_stack = (token_t **)malloc(sizeof(token_t*)*nout);
+ return filter;
+}
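+
+/*
+ * Illustrative example (assuming the usual precedence where comparisons bind
+ * tighter than the logical operators): filter_init() turns the expression
+ * "QUAL>10 && DP>8" into the RPN token stream  QUAL 10 > DP 8 > && ,
+ * which filter_test() below evaluates with a small value stack.
+ */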
+
+void filter_destroy(filter_t *filter)
+{
+ int i;
+ for (i=0; i<filter->nfilters; i++)
+ {
+ //if ( filter->filters[i].key ) free(filter->filters[i].key);
+ free(filter->filters[i].str_value);
+ free(filter->filters[i].tag);
+ free(filter->filters[i].values);
+ free(filter->filters[i].pass_samples);
+ if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash);
+ if (filter->filters[i].regex)
+ {
+ regfree(filter->filters[i].regex);
+ free(filter->filters[i].regex);
+ }
+ }
+ free(filter->filters);
+ free(filter->flt_stack);
+ free(filter->str);
+ free(filter->tmpi);
+ free(filter);
+}
+
+int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
+{
+ bcf_unpack(line, filter->max_unpack);
+
+ int i, nstack = 0;
+ for (i=0; i<filter->nfilters; i++)
+ {
+ filter->filters[i].nsamples = 0;
+ filter->filters[i].nvalues = 0;
+ filter->filters[i].pass_site = -1;
+
+ if ( filter->filters[i].tok_type == TOK_VAL )
+ {
+ if ( filter->filters[i].setter ) // variable, query the VCF line
+ filter->filters[i].setter(filter, line, &filter->filters[i]);
+ else if ( filter->filters[i].key ) // string constant
+ {
+ filter->filters[i].str_value = filter->filters[i].key;
+ filter->filters[i].values[0] = filter->filters[i].values[0];
+ filter->filters[i].nvalues = strlen(filter->filters[i].key);
+ }
+ else // numeric constant
+ {
+ filter->filters[i].values[0] = filter->filters[i].threshold;
+ filter->filters[i].nvalues = 1;
+ }
+
+ filter->flt_stack[nstack++] = &filter->filters[i];
+ continue;
+ }
+ else if ( filter->filters[i].tok_type == TOK_FUNC ) // all functions take only one argument
+ {
+ filter->filters[i].setter(filter, line, filter->flt_stack[nstack-1]);
+ continue;
+ }
+ if ( nstack<2 )
+ error("Error occurred while processing the filter \"%s\" (1:%d)\n", filter->str,nstack); // too few values left on the stack
+
+ int is_str = filter->flt_stack[nstack-1]->is_str + filter->flt_stack[nstack-2]->is_str;
+
+ if ( filter->filters[i].tok_type == TOK_OR || filter->filters[i].tok_type == TOK_OR_VEC )
+ {
+ if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 )
+ error("Error occurred while processing the filter \"%s\" (%d %d OR)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site);
+ filter->flt_stack[nstack-2]->pass_site = vector_logic_or(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type);
+ nstack--;
+ continue;
+ }
+ if ( filter->filters[i].tok_type == TOK_AND || filter->filters[i].tok_type == TOK_AND_VEC )
+ {
+ if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 )
+ error("Error occurred while processing the filter \"%s\" (%d %d AND)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site);
+ filter->flt_stack[nstack-2]->pass_site = vector_logic_and(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type);
+ nstack--;
+ continue;
+ }
+
+ if ( filter->filters[i].tok_type == TOK_ADD )
+ {
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],+);
+ nstack--;
+ continue;
+ }
+ else if ( filter->filters[i].tok_type == TOK_SUB )
+ {
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],-);
+ nstack--;
+ continue;
+ }
+ else if ( filter->filters[i].tok_type == TOK_MULT )
+ {
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],*);
+ nstack--;
+ continue;
+ }
+ else if ( filter->filters[i].tok_type == TOK_DIV )
+ {
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],/);
+ nstack--;
+ continue;
+ }
+
+ int is_true = 0;
+ if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
+ {
+ int skip = 0;
+ if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1;
+ if ( filter->filters[i].tok_type != TOK_EQ && filter->filters[i].tok_type != TOK_NE ) skip = 1;
+
+ if ( skip )
+ filter->flt_stack[nstack-2]->nvalues = filter->flt_stack[nstack-2]->nsamples = 0;
+ else if ( filter->filters[i].tok_type == TOK_EQ )
+ CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true)
+ else if ( filter->filters[i].tok_type == TOK_NE )
+ CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true)
+ }
+ else if ( filter->filters[i].tok_type == TOK_EQ )
+ {
+ if ( filter->flt_stack[nstack-1]->comparator )
+ is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_EQ,line);
+ else if ( filter->flt_stack[nstack-2]->comparator )
+ is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ,line);
+ else if ( is_str==2 ) // both are strings
+ is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ);
+ else if ( is_str==1 )
+ error("Comparing string to numeric value: %s\n", filter->str);
+ else
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true);
+ }
+ else if ( filter->filters[i].tok_type == TOK_NE )
+ {
+ if ( filter->flt_stack[nstack-1]->comparator )
+ is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_NE,line);
+ else if ( filter->flt_stack[nstack-2]->comparator )
+ is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE,line);
+ else if ( is_str==2 )
+ is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE);
+ else if ( is_str==1 )
+ error("Comparing string to numeric value: %s\n", filter->str);
+ else
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true);
+ }
+ else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE )
+ {
+ if ( is_str==2 )
+ {
+ is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1]);
+ if ( filter->filters[i].tok_type == TOK_NLIKE ) is_true = is_true ? 0 : 1;
+ }
+ else
+ error("The regex operator can be used on strings only: %s\n", filter->str);
+ }
+ else if ( is_str>0 )
+ error("Wrong operator in string comparison: %s [%s,%s]\n", filter->str, filter->flt_stack[nstack-1]->str_value, filter->flt_stack[nstack-2]->str_value);
+ else if ( filter->filters[i].tok_type == TOK_LE )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<=,is_true)
+ else if ( filter->filters[i].tok_type == TOK_LT )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<,is_true)
+ else if ( filter->filters[i].tok_type == TOK_BT )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>,is_true)
+ else if ( filter->filters[i].tok_type == TOK_BE )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>=,is_true)
+ else
+ error("FIXME: did not expect this .. tok_type %d = %d\n", i, filter->filters[i].tok_type);
+
+ filter->flt_stack[nstack-2]->pass_site = is_true;
+ nstack--;
+ }
+ if ( nstack>1 ) error("Error occurred while processing the filter \"%s\" (2:%d)\n", filter->str,nstack); // too many values left on the stack
+ if ( samples )
+ {
+ *samples = filter->max_unpack&BCF_UN_FMT ? filter->flt_stack[0]->pass_samples : NULL;
+ if ( *samples && !filter->flt_stack[0]->nsamples )
+ {
+ for (i=0; i<filter->nsamples; i++)
+ filter->flt_stack[0]->pass_samples[i] = filter->flt_stack[0]->pass_site;
+ }
+ }
+ return filter->flt_stack[0]->pass_site;
+}
+
+int filter_max_unpack(filter_t *flt)
+{
+ return flt->max_unpack;
+}
diff --git a/bcftools/filter.h b/bcftools/filter.h
new file mode 100644
index 0000000..ccd3fe3
--- /dev/null
+++ b/bcftools/filter.h
@@ -0,0 +1,52 @@
+/* filter.h -- filter expressions.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#ifndef __FILTER_H__
+#define __FILTER_H__
+
+#include <htslib/vcf.h>
+
+typedef struct _filter_t filter_t;
+
+/**
+ * @hdr: BCF header file
+ * @str: see the bcftools filter command help for description
+ */
+filter_t *filter_init(bcf_hdr_t *hdr, const char *str);
+
+void filter_destroy(filter_t *filter);
+
+/**
+ * filter_test() - test whether the BCF record passes the test
+ * @samples: if not NULL, a pointer to an array with per-sample statuses is
+ * stored in the location referenced by @samples. The pointer
+ * will be set to NULL if the FORMAT fields were not queried.
+ * Returns 1 if the expression is true and 0 if false.
+ */
+int filter_test(filter_t *filter, bcf1_t *rec, const uint8_t **samples);
+
+void filter_expression_info(FILE *fp);
+int filter_max_unpack(filter_t *filter);
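+
+/*
+ * Example usage (an illustrative sketch; the expression, file name and error
+ * handling are assumptions, not taken from the upstream callers):
+ *
+ *     #include <stdio.h>
+ *     #include <htslib/hts.h>
+ *     #include <htslib/vcf.h>
+ *     #include "filter.h"
+ *
+ *     htsFile *fp = hts_open("in.bcf", "r");
+ *     bcf_hdr_t *hdr = bcf_hdr_read(fp);
+ *     bcf1_t *rec = bcf_init();
+ *     filter_t *flt = filter_init(hdr, "QUAL>20 && DP>10");
+ *     while ( bcf_read(fp, hdr, rec)==0 )
+ *     {
+ *         const uint8_t *smpl = NULL;
+ *         if ( filter_test(flt, rec, &smpl) )
+ *             printf("pass\t%s\t%d\n", bcf_hdr_id2name(hdr,rec->rid), rec->pos+1);
+ *     }
+ *     filter_destroy(flt);
+ *     bcf_destroy(rec); bcf_hdr_destroy(hdr); hts_close(fp);
+ */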
+
+#endif
diff --git a/bcftools/gvcf.c b/bcftools/gvcf.c
new file mode 100644
index 0000000..b82d658
--- /dev/null
+++ b/bcftools/gvcf.c
@@ -0,0 +1,227 @@
+/* gvcf.c -- support for gVCF files.
+
+ Copyright (C) 2014-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include "gvcf.h"
+#include "bcftools.h"
+
+struct _gvcf_t
+{
+ int *dp_range, ndp_range; // per-sample DP ranges
+ int prev_range; // 0 if not in a block
+ int32_t *dp, mdp, *pl, mpl, npl;
+ int32_t *tmp, mtmp, *gts, ngts,mgts, nqsum,mqsum;
+ float *qsum;
+ int32_t rid, start, end, min_dp;
+ kstring_t als;
+ bcf1_t *line;
+};
+
+void gvcf_update_header(gvcf_t *gvcf, bcf_hdr_t *hdr)
+{
+ bcf_hdr_append(hdr,"##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
+ bcf_hdr_append(hdr,"##INFO=<ID=MinDP,Number=1,Type=Integer,Description=\"Minimum per-sample depth in this gVCF block\">");
+}
+
+gvcf_t *gvcf_init(const char *dp_ranges)
+{
+ gvcf_t *gvcf = (gvcf_t*) calloc(1,sizeof(gvcf_t));
+ gvcf->line = bcf_init();
+
+ int n = 1;
+ const char *ss = dp_ranges;
+ while ( *ss )
+ {
+ if ( *ss==',' ) n++;
+ ss++;
+ }
+ gvcf->ndp_range = n;
+ gvcf->dp_range = (int*) malloc(sizeof(int)*gvcf->ndp_range);
+
+ n = 0;
+ ss = dp_ranges;
+ while ( *ss )
+ {
+ char *se = (char*) ss;
+ gvcf->dp_range[n++] = strtol(ss,&se,10);
+ if ( se==ss ) return NULL;
+ if ( *se==',' && se[1] ) { ss = se+1; continue; }
+ else if ( !*se ) break;
+ return NULL;
+ }
+ return gvcf;
+}
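+
+/*
+ * Illustrative example (the thresholds are arbitrary): gvcf_init("5,15,30")
+ * defines three DP bins; records whose minimum per-sample DP is below 5 are
+ * left uncollapsed, DPs in [5,15), [15,30) and >=30 fall into separate bins,
+ * and gvcf_write() below flushes a block whenever consecutive records fall
+ * into different bins.
+ */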
+
+void gvcf_destroy(gvcf_t *gvcf)
+{
+ free(gvcf->dp_range);
+ free(gvcf->dp);
+ free(gvcf->pl);
+ free(gvcf->tmp);
+ free(gvcf->qsum);
+ free(gvcf->gts);
+ free(gvcf->als.s);
+ if ( gvcf->line ) bcf_destroy(gvcf->line);
+ free(gvcf);
+}
+
+bcf1_t *gvcf_write(gvcf_t *gvcf, htsFile *fh, bcf_hdr_t *hdr, bcf1_t *rec, int is_ref)
+{
+ int i, ret, nsmpl = bcf_hdr_nsamples(hdr);
+ int can_collapse = is_ref ? 1 : 0;
+ int32_t dp_range = 0, min_dp = 0;
+
+ // No record and nothing to flush?
+ if ( !rec && !gvcf->prev_range ) return NULL;
+
+ // Flush gVCF block if there are no more records, chr changed, a gap
+ // encountered, or other conditions not met (block broken by a non-ref or DP too low).
+ int needs_flush = can_collapse ? 0 : 1;
+
+
+ // Can the record be included in a gVCF block? That is, is this a ref-only site?
+ if ( rec && can_collapse )
+ {
+ bcf_unpack(rec, BCF_UN_ALL);
+
+ // per-sample depth
+ ret = bcf_get_format_int32(hdr, rec, "DP", &gvcf->tmp, &gvcf->mtmp);
+ if ( ret==nsmpl )
+ {
+ min_dp = gvcf->tmp[0];
+ for (i=1; i<nsmpl; i++)
+ if ( min_dp > gvcf->tmp[i] ) min_dp = gvcf->tmp[i];
+
+ for (i=0; i<gvcf->ndp_range; i++)
+ if ( min_dp < gvcf->dp_range[i] ) break;
+
+ dp_range = i;
+ if ( !dp_range )
+ {
+ // leave the record unchanged, DP is too small. Alternatively, return NULL here
+ // to skip these sites
+ needs_flush = 1;
+ can_collapse = 0;
+ }
+ }
+ else
+ needs_flush = 1; // DP field not present
+ }
+
+ if ( gvcf->prev_range && gvcf->prev_range!=dp_range ) needs_flush = 1;
+ if ( !rec || gvcf->rid!=rec->rid || rec->pos > gvcf->end+1 ) needs_flush = 1;
+
+ // If prev_range is set, something can be flushed
+ if ( gvcf->prev_range && needs_flush )
+ {
+ // mpileup can output two records with the same position, SNP and
+ // indel. Make sure the end position does not include the non-variant
+ // SNP position just before the indel.
+ if ( rec && rec->rid==gvcf->rid && rec->pos==gvcf->end ) gvcf->end--;
+
+ gvcf->end++; // from 0-based to 1-based coordinate
+
+ bcf_clear1(gvcf->line);
+ gvcf->line->rid = gvcf->rid;
+ gvcf->line->pos = gvcf->start;
+ gvcf->line->rlen = gvcf->end - gvcf->start;
+ bcf_update_alleles_str(hdr, gvcf->line, gvcf->als.s);
+ if ( gvcf->start+1 < gvcf->end ) // create gVCF record only if it spans at least two sites
+ bcf_update_info_int32(hdr, gvcf->line, "END", &gvcf->end, 1);
+ bcf_update_info_int32(hdr, gvcf->line, "MinDP", &gvcf->min_dp, 1);
+ if ( gvcf->nqsum>0 )
+ bcf_update_info_float(hdr, gvcf->line, "QS", gvcf->qsum, gvcf->nqsum);
+ if ( gvcf->ngts )
+ bcf_update_genotypes(hdr,gvcf->line,gvcf->gts,gvcf->ngts);
+ if ( gvcf->npl>0 )
+ bcf_update_format_int32(hdr, gvcf->line, "PL", gvcf->pl, gvcf->npl);
+ bcf_update_format_int32(hdr, gvcf->line, "DP", gvcf->dp, nsmpl);
+ bcf_write1(fh, hdr, gvcf->line);
+ gvcf->prev_range = 0;
+ gvcf->rid = -1;
+ gvcf->npl = 0;
+ gvcf->nqsum = 0;
+ gvcf->ngts = 0;
+
+ if ( !rec ) return NULL; // just flushing the buffer, this was last record
+ }
+
+ if ( can_collapse )
+ {
+ if ( !gvcf->prev_range )
+ {
+ hts_expand(int32_t,nsmpl,gvcf->mdp,gvcf->dp);
+ memcpy(gvcf->dp,gvcf->tmp,nsmpl*sizeof(int32_t)); // tmp still contains DP from rec
+ gvcf->npl = bcf_get_format_int32(hdr, rec, "PL", &gvcf->pl, &gvcf->mpl);
+
+ gvcf->nqsum = bcf_get_info_float(hdr,rec,"QS",&gvcf->qsum,&gvcf->mqsum);
+ gvcf->ngts = bcf_get_genotypes(hdr,rec,&gvcf->gts,&gvcf->mgts);
+
+ gvcf->rid = rec->rid;
+ gvcf->start = rec->pos;
+ gvcf->als.l = 0;
+ kputs(rec->d.allele[0],&gvcf->als);
+ for (i=1; i<rec->n_allele; i++)
+ {
+ kputc(',',&gvcf->als);
+ kputs(rec->d.allele[i],&gvcf->als);
+ }
+ gvcf->min_dp = min_dp;
+ }
+ else
+ {
+ if ( gvcf->min_dp > min_dp ) gvcf->min_dp = min_dp;
+ for (i=0; i<nsmpl; i++)
+ if ( gvcf->dp[i] > gvcf->tmp[i] ) gvcf->dp[i] = gvcf->tmp[i];
+ ret = bcf_get_format_int32(hdr, rec, "PL", &gvcf->tmp, &gvcf->mtmp);
+ if ( ret>=0 )
+ {
+ if ( ret!=nsmpl*3 ) error("Unexpected number of PL fields\n");
+ for (i=0; i<nsmpl; i++)
+ {
+ if ( gvcf->pl[3*i+1] > gvcf->tmp[3*i+1] )
+ {
+ gvcf->pl[3*i+1] = gvcf->tmp[3*i+1];
+ gvcf->pl[3*i+2] = gvcf->tmp[3*i+2];
+ }
+ else if ( gvcf->pl[3*i+1]==gvcf->tmp[3*i+1] && gvcf->pl[3*i+2] > gvcf->tmp[3*i+2] )
+ gvcf->pl[3*i+2] = gvcf->tmp[3*i+2];
+ }
+ }
+ else
+ gvcf->npl = 0;
+ }
+ gvcf->prev_range = dp_range;
+ if ( bcf_get_info_int32(hdr,rec,"END",&gvcf->tmp,&gvcf->mtmp)==1 )
+ gvcf->end = gvcf->tmp[0] - 1; // from 1-based to 0-based
+ else
+ gvcf->end = rec->pos;
+ return NULL;
+ }
+
+ if ( is_ref && min_dp )
+ bcf_update_info_int32(hdr, rec, "MinDP", &min_dp, 1);
+
+ return rec;
+}
+
diff --git a/bcftools/gvcf.c.pysam.c b/bcftools/gvcf.c.pysam.c
new file mode 100644
index 0000000..b269b21
--- /dev/null
+++ b/bcftools/gvcf.c.pysam.c
@@ -0,0 +1,229 @@
+#include "pysam.h"
+
+/* gvcf.c -- support for gVCF files.
+
+ Copyright (C) 2014-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include "gvcf.h"
+#include "bcftools.h"
+
+struct _gvcf_t
+{
+ int *dp_range, ndp_range; // per-sample DP ranges
+ int prev_range; // 0 if not in a block
+ int32_t *dp, mdp, *pl, mpl, npl;
+ int32_t *tmp, mtmp, *gts, ngts,mgts, nqsum,mqsum;
+ float *qsum;
+ int32_t rid, start, end, min_dp;
+ kstring_t als;
+ bcf1_t *line;
+};
+
+void gvcf_update_header(gvcf_t *gvcf, bcf_hdr_t *hdr)
+{
+ bcf_hdr_append(hdr,"##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
+ bcf_hdr_append(hdr,"##INFO=<ID=MinDP,Number=1,Type=Integer,Description=\"Minimum per-sample depth in this gVCF block\">");
+}
+
+gvcf_t *gvcf_init(const char *dp_ranges)
+{
+ gvcf_t *gvcf = (gvcf_t*) calloc(1,sizeof(gvcf_t));
+ gvcf->line = bcf_init();
+
+ int n = 1;
+ const char *ss = dp_ranges;
+ while ( *ss )
+ {
+ if ( *ss==',' ) n++;
+ ss++;
+ }
+ gvcf->ndp_range = n;
+ gvcf->dp_range = (int*) malloc(sizeof(int)*gvcf->ndp_range);
+
+ n = 0;
+ ss = dp_ranges;
+ while ( *ss )
+ {
+ char *se = (char*) ss;
+ gvcf->dp_range[n++] = strtol(ss,&se,10);
+ if ( se==ss ) return NULL;
+ if ( *se==',' && se[1] ) { ss = se+1; continue; }
+ else if ( !*se ) break;
+ return NULL;
+ }
+ return gvcf;
+}
+
+void gvcf_destroy(gvcf_t *gvcf)
+{
+ free(gvcf->dp_range);
+ free(gvcf->dp);
+ free(gvcf->pl);
+ free(gvcf->tmp);
+ free(gvcf->qsum);
+ free(gvcf->gts);
+ free(gvcf->als.s);
+ if ( gvcf->line ) bcf_destroy(gvcf->line);
+ free(gvcf);
+}
+
+bcf1_t *gvcf_write(gvcf_t *gvcf, htsFile *fh, bcf_hdr_t *hdr, bcf1_t *rec, int is_ref)
+{
+ int i, ret, nsmpl = bcf_hdr_nsamples(hdr);
+ int can_collapse = is_ref ? 1 : 0;
+ int32_t dp_range = 0, min_dp = 0;
+
+ // No record and nothing to flush?
+ if ( !rec && !gvcf->prev_range ) return NULL;
+
+ // Flush gVCF block if there are no more records, chr changed, a gap
+ // encountered, or other conditions not met (block broken by a non-ref or DP too low).
+ int needs_flush = can_collapse ? 0 : 1;
+
+
+ // Can the record be included in a gVCF block? That is, is this a ref-only site?
+ if ( rec && can_collapse )
+ {
+ bcf_unpack(rec, BCF_UN_ALL);
+
+ // per-sample depth
+ ret = bcf_get_format_int32(hdr, rec, "DP", &gvcf->tmp, &gvcf->mtmp);
+ if ( ret==nsmpl )
+ {
+ min_dp = gvcf->tmp[0];
+ for (i=1; i<nsmpl; i++)
+ if ( min_dp > gvcf->tmp[i] ) min_dp = gvcf->tmp[i];
+
+ for (i=0; i<gvcf->ndp_range; i++)
+ if ( min_dp < gvcf->dp_range[i] ) break;
+
+ dp_range = i;
+ if ( !dp_range )
+ {
+ // leave the record unchanged, DP is too small. Alternatively, return NULL here
+ // to skip these sites
+ needs_flush = 1;
+ can_collapse = 0;
+ }
+ }
+ else
+ needs_flush = 1; // DP field not present
+ }
+
+ if ( gvcf->prev_range && gvcf->prev_range!=dp_range ) needs_flush = 1;
+ if ( !rec || gvcf->rid!=rec->rid || rec->pos > gvcf->end+1 ) needs_flush = 1;
+
+ // If prev_range is set, something can be flushed
+ if ( gvcf->prev_range && needs_flush )
+ {
+ // mpileup can output two records with the same position, SNP and
+ // indel. Make sure the end position does not include the non-variant
+ // SNP position just before the indel.
+ if ( rec && rec->rid==gvcf->rid && rec->pos==gvcf->end ) gvcf->end--;
+
+ gvcf->end++; // from 0-based to 1-based coordinate
+
+ bcf_clear1(gvcf->line);
+ gvcf->line->rid = gvcf->rid;
+ gvcf->line->pos = gvcf->start;
+ gvcf->line->rlen = gvcf->end - gvcf->start;
+ bcf_update_alleles_str(hdr, gvcf->line, gvcf->als.s);
+ if ( gvcf->start+1 < gvcf->end ) // create gVCF record only if it spans at least two sites
+ bcf_update_info_int32(hdr, gvcf->line, "END", &gvcf->end, 1);
+ bcf_update_info_int32(hdr, gvcf->line, "MinDP", &gvcf->min_dp, 1);
+ if ( gvcf->nqsum>0 )
+ bcf_update_info_float(hdr, gvcf->line, "QS", gvcf->qsum, gvcf->nqsum);
+ if ( gvcf->ngts )
+ bcf_update_genotypes(hdr,gvcf->line,gvcf->gts,gvcf->ngts);
+ if ( gvcf->npl>0 )
+ bcf_update_format_int32(hdr, gvcf->line, "PL", gvcf->pl, gvcf->npl);
+ bcf_update_format_int32(hdr, gvcf->line, "DP", gvcf->dp, nsmpl);
+ bcf_write1(fh, hdr, gvcf->line);
+ gvcf->prev_range = 0;
+ gvcf->rid = -1;
+ gvcf->npl = 0;
+ gvcf->nqsum = 0;
+ gvcf->ngts = 0;
+
+ if ( !rec ) return NULL; // just flushing the buffer, this was last record
+ }
+
+ if ( can_collapse )
+ {
+ if ( !gvcf->prev_range )
+ {
+ hts_expand(int32_t,nsmpl,gvcf->mdp,gvcf->dp);
+ memcpy(gvcf->dp,gvcf->tmp,nsmpl*sizeof(int32_t)); // tmp still contains DP from rec
+ gvcf->npl = bcf_get_format_int32(hdr, rec, "PL", &gvcf->pl, &gvcf->mpl);
+
+ gvcf->nqsum = bcf_get_info_float(hdr,rec,"QS",&gvcf->qsum,&gvcf->mqsum);
+ gvcf->ngts = bcf_get_genotypes(hdr,rec,&gvcf->gts,&gvcf->mgts);
+
+ gvcf->rid = rec->rid;
+ gvcf->start = rec->pos;
+ gvcf->als.l = 0;
+ kputs(rec->d.allele[0],&gvcf->als);
+ for (i=1; i<rec->n_allele; i++)
+ {
+ kputc(',',&gvcf->als);
+ kputs(rec->d.allele[i],&gvcf->als);
+ }
+ gvcf->min_dp = min_dp;
+ }
+ else
+ {
+ if ( gvcf->min_dp > min_dp ) gvcf->min_dp = min_dp;
+ for (i=0; i<nsmpl; i++)
+ if ( gvcf->dp[i] > gvcf->tmp[i] ) gvcf->dp[i] = gvcf->tmp[i];
+ ret = bcf_get_format_int32(hdr, rec, "PL", &gvcf->tmp, &gvcf->mtmp);
+ if ( ret>=0 )
+ {
+ if ( ret!=nsmpl*3 ) error("Unexpected number of PL fields\n");
+ for (i=0; i<nsmpl; i++)
+ {
+ if ( gvcf->pl[3*i+1] > gvcf->tmp[3*i+1] )
+ {
+ gvcf->pl[3*i+1] = gvcf->tmp[3*i+1];
+ gvcf->pl[3*i+2] = gvcf->tmp[3*i+2];
+ }
+ else if ( gvcf->pl[3*i+1]==gvcf->tmp[3*i+1] && gvcf->pl[3*i+2] > gvcf->tmp[3*i+2] )
+ gvcf->pl[3*i+2] = gvcf->tmp[3*i+2];
+ }
+ }
+ else
+ gvcf->npl = 0;
+ }
+ gvcf->prev_range = dp_range;
+ if ( bcf_get_info_int32(hdr,rec,"END",&gvcf->tmp,&gvcf->mtmp)==1 )
+ gvcf->end = gvcf->tmp[0] - 1; // from 1-based to 0-based
+ else
+ gvcf->end = rec->pos;
+ return NULL;
+ }
+
+ if ( is_ref && min_dp )
+ bcf_update_info_int32(hdr, rec, "MinDP", &min_dp, 1);
+
+ return rec;
+}
+
diff --git a/bcftools/gvcf.h b/bcftools/gvcf.h
new file mode 100644
index 0000000..784e1f6
--- /dev/null
+++ b/bcftools/gvcf.h
@@ -0,0 +1,41 @@
+/* gvcf.[ch] - Helper functions for gVCF support
+
+ The MIT License
+
+ Copyright (c) 2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#ifndef __GVCF_H__
+#define __GVCF_H__
+
+#include "bcftools.h"
+
+typedef struct _gvcf_t gvcf_t;
+
+gvcf_t *gvcf_init(const char *dp_ranges);
+void gvcf_update_header(gvcf_t *gvcf, bcf_hdr_t *hdr);
+bcf1_t *gvcf_write(gvcf_t *gvcf, htsFile *fh, bcf_hdr_t *hdr, bcf1_t *rec, int is_ref);
+void gvcf_destroy(gvcf_t *gvcf);
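+
+/*
+ * Example usage (an illustrative sketch; the DP ranges and the surrounding
+ * reading/writing loop are assumptions, not taken from the upstream callers):
+ *
+ *     gvcf_t *gvcf = gvcf_init("5,15,30");          // DP bin boundaries
+ *     gvcf_update_header(gvcf, hdr);                // adds END and MinDP
+ *     bcf_hdr_write(fh, hdr);
+ *     while ( next_record(&rec, &is_ref) )          // hypothetical producer
+ *     {
+ *         bcf1_t *out = gvcf_write(gvcf, fh, hdr, rec, is_ref);
+ *         if ( out ) bcf_write1(fh, hdr, out);      // not collapsible, write as is
+ *     }
+ *     gvcf_write(gvcf, fh, hdr, NULL, 0);           // flush the last open block
+ *     gvcf_destroy(gvcf);
+ */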
+
+#endif
diff --git a/bcftools/khash_str2str.h b/bcftools/khash_str2str.h
new file mode 100644
index 0000000..ecf4e0b
--- /dev/null
+++ b/bcftools/khash_str2str.h
@@ -0,0 +1,89 @@
+/* khash_str2str.h -- C-string to C-string hash table.
+
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef KHASH_STR2STR_H
+#define KHASH_STR2STR_H
+
+#include <htslib/khash.h>
+
+KHASH_MAP_INIT_STR(str2str, const char*)
+
+/*
+ * Wrappers for khash dictionaries used by mpileup.
+ */
+
+static inline void *khash_str2str_init(void)
+{
+ return kh_init(str2str);
+}
+
+/*
+ * Destroy the hash structure, but not the keys
+ */
+static inline void khash_str2str_destroy(void *_hash)
+{
+ khash_t(str2str) *hash = (khash_t(str2str)*)_hash;
+ if (hash) kh_destroy(str2str, hash); // Note that strings are not freed.
+}
+
+/*
+ * Destroys both the hash structure and the keys
+ */
+static inline void khash_str2str_destroy_free(void *_hash)
+{
+ khash_t(str2str) *hash = (khash_t(str2str)*)_hash;
+ khint_t k;
+ if (hash == 0) return;
+ for (k = 0; k < kh_end(hash); ++k)
+ if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
+ kh_destroy(str2str, hash);
+}
+
+/*
+ * Returns value if key exists or NULL if not
+ */
+static inline char *khash_str2str_get(void *_hash, const char *str)
+{
+ khash_t(str2str) *hash = (khash_t(str2str)*)_hash;
+ khint_t k = kh_get(str2str, hash, str);
+ if ( k == kh_end(hash) ) return NULL;
+ return (char*)kh_val(hash, k);
+}
+
+/*
+ * Set a new key/value pair. On success the bin index is returned; on
+ * error -1 is returned.
+ */
+static inline int khash_str2str_set(void *_hash, const char *str, const char *value)
+{
+ khint_t k;
+ int ret;
+ khash_t(str2str) *hash = (khash_t(str2str)*)_hash;
+ if ( !hash ) return -1;
+ k = kh_put(str2str, hash, str, &ret);
+ kh_val(hash,k) = value;
+ return k;
+}
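+
+/*
+ * Example usage (an illustrative sketch, not taken from the upstream callers):
+ *
+ *     void *hash = khash_str2str_init();
+ *     khash_str2str_set(hash, strdup("sample1"), "sample_A");
+ *     const char *val = khash_str2str_get(hash, "sample1");   // -> "sample_A"
+ *     khash_str2str_destroy_free(hash);   // frees the keys, not the values
+ */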
+
+#endif
diff --git a/bcftools/kmin.c b/bcftools/kmin.c
new file mode 100644
index 0000000..5b8193b
--- /dev/null
+++ b/bcftools/kmin.c
@@ -0,0 +1,209 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2010 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Hooke-Jeeves algorithm for nonlinear minimization
+
+ Based on the pseudocodes by Bell and Pike (CACM 9(9):684-685), and
+ the revision by Tomlin and Smith (CACM 12(11):637-638). Both of the
+ papers are comments on Kaupe's Algorithm 178 "Direct Search" (ACM
+ 6(6):313-314). The original algorithm was designed by Hooke and
+ Jeeves (ACM 8:212-229). This program is further revised according to
+ Johnson's implementation at Netlib (opt/hooke.c).
+
+ The Hooke-Jeeves algorithm is very simple and works quite well on a
+ few examples. However, it might fail to converge due to its heuristic
+ nature. A possible improvement, as suggested by Johnson, may be to
+ choose a small r at the beginning to approach the minimum quickly and
+ a larger r at later steps to hit the minimum.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "kmin.h"
+
+static double __kmin_hj_aux(kmin_f func, int n, double *x1, void *data, double fx1, double *dx, int *n_calls)
+{
+ int k, j = *n_calls;
+ double ftmp;
+ for (k = 0; k != n; ++k) {
+ x1[k] += dx[k];
+ ftmp = func(n, x1, data); ++j;
+ if (ftmp < fx1) fx1 = ftmp;
+ else { /* search the opposite direction */
+ dx[k] = 0.0 - dx[k];
+ x1[k] += dx[k] + dx[k];
+ ftmp = func(n, x1, data); ++j;
+ if (ftmp < fx1) fx1 = ftmp;
+ else x1[k] -= dx[k]; /* back to the original x[k] */
+ }
+ }
+ *n_calls = j;
+ return fx1; /* here: fx1=f(n,x1) */
+}
+
+double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls)
+{
+ double fx, fx1, *x1, *dx, radius;
+ int k, n_calls = 0;
+ x1 = (double*)calloc(n, sizeof(double));
+ dx = (double*)calloc(n, sizeof(double));
+ for (k = 0; k != n; ++k) { /* initial directions, based on MGJ */
+ dx[k] = fabs(x[k]) * r;
+ if (dx[k] == 0) dx[k] = r;
+ }
+ radius = r;
+ fx1 = fx = func(n, x, data); ++n_calls;
+ for (;;) {
+ memcpy(x1, x, n * sizeof(double)); /* x1 = x */
+ fx1 = __kmin_hj_aux(func, n, x1, data, fx, dx, &n_calls);
+ while (fx1 < fx) {
+ for (k = 0; k != n; ++k) {
+ double t = x[k];
+ dx[k] = x1[k] > x[k]? fabs(dx[k]) : 0.0 - fabs(dx[k]);
+ x[k] = x1[k];
+ x1[k] = x1[k] + x1[k] - t;
+ }
+ fx = fx1;
+ if (n_calls >= max_calls) break;
+ fx1 = func(n, x1, data); ++n_calls;
+ fx1 = __kmin_hj_aux(func, n, x1, data, fx1, dx, &n_calls);
+ if (fx1 >= fx) break;
+ for (k = 0; k != n; ++k)
+ if (fabs(x1[k] - x[k]) > .5 * fabs(dx[k])) break;
+ if (k == n) break;
+ }
+ if (radius >= eps) {
+ if (n_calls >= max_calls) break;
+ radius *= r;
+ for (k = 0; k != n; ++k) dx[k] *= r;
+ } else break; /* converge */
+ }
+ free(x1); free(dx);
+ return fx1;
+}
+
+// I copied this function from somewhere several years ago and made some modifications, but I have forgotten the source.
+double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin)
+{
+ double bound, u, r, q, fu, tmp, fa, fb, fc, c;
+ const double gold1 = 1.6180339887;
+ const double gold2 = 0.3819660113;
+ const double tiny = 1e-20;
+ const int max_iter = 100;
+
+ double e, d, w, v, mid, tol1, tol2, p, eold, fv, fw;
+ int iter;
+
+ fa = func(a, data); fb = func(b, data);
+ if (fb > fa) { // swap, such that f(a) > f(b)
+ tmp = a; a = b; b = tmp;
+ tmp = fa; fa = fb; fb = tmp;
+ }
+ c = b + gold1 * (b - a), fc = func(c, data); // golden section extrapolation
+ while (fb > fc) {
+ bound = b + 100.0 * (c - b); // the farthest point where we want to go
+ r = (b - a) * (fb - fc);
+ q = (b - c) * (fb - fa);
+ if (fabs(q - r) < tiny) { // avoid 0 denominator
+ tmp = q > r? tiny : 0.0 - tiny;
+ } else tmp = q - r;
+ u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp); // u is the parabolic extrapolation point
+ if ((b > u && u > c) || (b < u && u < c)) { // u lies between b and c
+ fu = func(u, data);
+ if (fu < fc) { // (b,u,c) bracket the minimum
+ a = b; b = u; fa = fb; fb = fu;
+ break;
+ } else if (fu > fb) { // (a,b,u) bracket the minimum
+ c = u; fc = fu;
+ break;
+ }
+ u = c + gold1 * (c - b); fu = func(u, data); // golden section extrapolation
+ } else if ((c > u && u > bound) || (c < u && u < bound)) { // u lies between c and bound
+ fu = func(u, data);
+ if (fu < fc) { // fb > fc > fu
+ b = c; c = u; u = c + gold1 * (c - b);
+ fb = fc; fc = fu; fu = func(u, data);
+ } else { // (b,c,u) bracket the minimum
+ a = b; b = c; c = u;
+ fa = fb; fb = fc; fc = fu;
+ break;
+ }
+ } else if ((u > bound && bound > c) || (u < bound && bound < c)) { // u goes beyond the bound
+ u = bound; fu = func(u, data);
+ } else { // u goes the other way around, use golden section extrapolation
+ u = c + gold1 * (c - b); fu = func(u, data);
+ }
+ a = b; b = c; c = u;
+ fa = fb; fb = fc; fc = fu;
+ }
+ if (a > c) u = a, a = c, c = u; // swap
+
+ // now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm
+ e = d = 0.0;
+ w = v = b; fv = fw = fb;
+ for (iter = 0; iter != max_iter; ++iter) {
+ mid = 0.5 * (a + c);
+ tol2 = 2.0 * (tol1 = tol * fabs(b) + tiny);
+ if (fabs(b - mid) <= (tol2 - 0.5 * (c - a))) {
+ *xmin = b; return fb; // found
+ }
+ if (fabs(e) > tol1) {
+ // related to parabolic interpolation
+ r = (b - w) * (fb - fv);
+ q = (b - v) * (fb - fw);
+ p = (b - v) * q - (b - w) * r;
+ q = 2.0 * (q - r);
+ if (q > 0.0) p = 0.0 - p;
+ else q = 0.0 - q;
+ eold = e; e = d;
+ if (fabs(p) >= fabs(0.5 * q * eold) || p <= q * (a - b) || p >= q * (c - b)) {
+ d = gold2 * (e = (b >= mid ? a - b : c - b));
+ } else {
+ d = p / q; u = b + d; // actual parabolic interpolation happens here
+ if (u - a < tol2 || c - u < tol2)
+ d = (mid > b)? tol1 : 0.0 - tol1;
+ }
+ } else d = gold2 * (e = (b >= mid ? a - b : c - b)); // golden section interpolation
+ u = fabs(d) >= tol1 ? b + d : b + (d > 0.0? tol1 : -tol1);
+ fu = func(u, data);
+ if (fu <= fb) { // u is the minimum point so far
+ if (u >= b) a = b;
+ else c = b;
+ v = w; w = b; b = u; fv = fw; fw = fb; fb = fu;
+ } else { // adjust (a,c) and (u,v,w)
+ if (u < b) a = u;
+ else c = u;
+ if (fu <= fw || w == b) {
+ v = w; w = u;
+ fv = fw; fw = fu;
+ } else if (fu <= fv || v == b || v == w) {
+ v = u; fv = fu;
+ }
+ }
+ }
+ *xmin = b;
+ return fb;
+}
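+
+/*
+ * Editor's illustrative sketch, not part of the upstream source: a tiny driver
+ * showing how kmin_hj() and kmin_brent() are meant to be called. It is only
+ * compiled when KMIN_EXAMPLE_MAIN (a hypothetical guard) is defined, so it
+ * does not affect the normal build.
+ */
+#ifdef KMIN_EXAMPLE_MAIN
+#include <stdio.h>
+
+/* f(x0,x1) = (x0-1)^2 + (x1+2)^2, minimum 0 at (1,-2) */
+static double example_nd(int n, double *x, void *data)
+{
+ (void)n; (void)data;
+ return (x[0] - 1) * (x[0] - 1) + (x[1] + 2) * (x[1] + 2);
+}
+
+/* g(x) = (x-3)^2 + 5, minimum 5 at x=3 */
+static double example_1d(double x, void *data)
+{
+ (void)data;
+ return (x - 3) * (x - 3) + 5;
+}
+
+int main(void)
+{
+ double x[2] = { 0, 0 }, xmin = 0, fmin;
+ fmin = kmin_hj(example_nd, 2, x, NULL, KMIN_RADIUS, KMIN_EPS, KMIN_MAXCALL);
+ printf("Hooke-Jeeves: f=%g at (%g,%g)\n", fmin, x[0], x[1]);
+ fmin = kmin_brent(example_1d, 0, 1, NULL, KMIN_EPS, &xmin);
+ printf("Brent: f=%g at x=%g\n", fmin, xmin);
+ return 0;
+}
+#endif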
diff --git a/bcftools/kmin.c.pysam.c b/bcftools/kmin.c.pysam.c
new file mode 100644
index 0000000..ee7b512
--- /dev/null
+++ b/bcftools/kmin.c.pysam.c
@@ -0,0 +1,211 @@
+#include "pysam.h"
+
+/* The MIT License
+
+ Copyright (c) 2008, 2010 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Hooke-Jeeves algorithm for nonlinear minimization
+
+ Based on the pseudocodes by Bell and Pike (CACM 9(9):684-685), and
+ the revision by Tomlin and Smith (CACM 12(11):637-638). Both of the
+ papers are comments on Kaupe's Algorithm 178 "Direct Search" (ACM
+ 6(6):313-314). The original algorithm was designed by Hooke and
+ Jeeves (ACM 8:212-229). This program is further revised according to
+ Johnson's implementation at Netlib (opt/hooke.c).
+
+ The Hooke-Jeeves algorithm is very simple and works quite well on a
+ few examples. However, it might fail to converge due to its heuristic
+ nature. A possible improvement, as suggested by Johnson, may be to
+ choose a small r at the beginning to approach the minimum quickly,
+ and a larger r at later steps to hit the minimum.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "kmin.h"
+
+static double __kmin_hj_aux(kmin_f func, int n, double *x1, void *data, double fx1, double *dx, int *n_calls)
+{
+ int k, j = *n_calls;
+ double ftmp;
+ for (k = 0; k != n; ++k) {
+ x1[k] += dx[k];
+ ftmp = func(n, x1, data); ++j;
+ if (ftmp < fx1) fx1 = ftmp;
+ else { /* search the opposite direction */
+ dx[k] = 0.0 - dx[k];
+ x1[k] += dx[k] + dx[k];
+ ftmp = func(n, x1, data); ++j;
+ if (ftmp < fx1) fx1 = ftmp;
+ else x1[k] -= dx[k]; /* back to the original x[k] */
+ }
+ }
+ *n_calls = j;
+ return fx1; /* here: fx1=f(n,x1) */
+}
+
+double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls)
+{
+ double fx, fx1, *x1, *dx, radius;
+ int k, n_calls = 0;
+ x1 = (double*)calloc(n, sizeof(double));
+ dx = (double*)calloc(n, sizeof(double));
+ for (k = 0; k != n; ++k) { /* initial directions, based on MGJ */
+ dx[k] = fabs(x[k]) * r;
+ if (dx[k] == 0) dx[k] = r;
+ }
+ radius = r;
+ fx1 = fx = func(n, x, data); ++n_calls;
+ for (;;) {
+ memcpy(x1, x, n * sizeof(double)); /* x1 = x */
+ fx1 = __kmin_hj_aux(func, n, x1, data, fx, dx, &n_calls);
+ while (fx1 < fx) {
+ for (k = 0; k != n; ++k) {
+ double t = x[k];
+ dx[k] = x1[k] > x[k]? fabs(dx[k]) : 0.0 - fabs(dx[k]);
+ x[k] = x1[k];
+ x1[k] = x1[k] + x1[k] - t;
+ }
+ fx = fx1;
+ if (n_calls >= max_calls) break;
+ fx1 = func(n, x1, data); ++n_calls;
+ fx1 = __kmin_hj_aux(func, n, x1, data, fx1, dx, &n_calls);
+ if (fx1 >= fx) break;
+ for (k = 0; k != n; ++k)
+ if (fabs(x1[k] - x[k]) > .5 * fabs(dx[k])) break;
+ if (k == n) break;
+ }
+ if (radius >= eps) {
+ if (n_calls >= max_calls) break;
+ radius *= r;
+ for (k = 0; k != n; ++k) dx[k] *= r;
+ } else break; /* converge */
+ }
+ free(x1); free(dx);
+ return fx1;
+}
+
+// I copied this function from somewhere several years ago and made some modifications, but I have forgotten the source.
+double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin)
+{
+ double bound, u, r, q, fu, tmp, fa, fb, fc, c;
+ const double gold1 = 1.6180339887;
+ const double gold2 = 0.3819660113;
+ const double tiny = 1e-20;
+ const int max_iter = 100;
+
+ double e, d, w, v, mid, tol1, tol2, p, eold, fv, fw;
+ int iter;
+
+ fa = func(a, data); fb = func(b, data);
+ if (fb > fa) { // swap, such that f(a) > f(b)
+ tmp = a; a = b; b = tmp;
+ tmp = fa; fa = fb; fb = tmp;
+ }
+ c = b + gold1 * (b - a), fc = func(c, data); // golden section extrapolation
+ while (fb > fc) {
+ bound = b + 100.0 * (c - b); // the farthest point where we want to go
+ r = (b - a) * (fb - fc);
+ q = (b - c) * (fb - fa);
+ if (fabs(q - r) < tiny) { // avoid 0 denominator
+ tmp = q > r? tiny : 0.0 - tiny;
+ } else tmp = q - r;
+ u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp); // u is the parabolic extrapolation point
+ if ((b > u && u > c) || (b < u && u < c)) { // u lies between b and c
+ fu = func(u, data);
+ if (fu < fc) { // (b,u,c) bracket the minimum
+ a = b; b = u; fa = fb; fb = fu;
+ break;
+ } else if (fu > fb) { // (a,b,u) bracket the minimum
+ c = u; fc = fu;
+ break;
+ }
+ u = c + gold1 * (c - b); fu = func(u, data); // golden section extrapolation
+ } else if ((c > u && u > bound) || (c < u && u < bound)) { // u lies between c and bound
+ fu = func(u, data);
+ if (fu < fc) { // fb > fc > fu
+ b = c; c = u; u = c + gold1 * (c - b);
+ fb = fc; fc = fu; fu = func(u, data);
+ } else { // (b,c,u) bracket the minimum
+ a = b; b = c; c = u;
+ fa = fb; fb = fc; fc = fu;
+ break;
+ }
+ } else if ((u > bound && bound > c) || (u < bound && bound < c)) { // u goes beyond the bound
+ u = bound; fu = func(u, data);
+ } else { // u goes the other way around, use golden section extrapolation
+ u = c + gold1 * (c - b); fu = func(u, data);
+ }
+ a = b; b = c; c = u;
+ fa = fb; fb = fc; fc = fu;
+ }
+ if (a > c) u = a, a = c, c = u; // swap
+
+ // now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm
+ e = d = 0.0;
+ w = v = b; fv = fw = fb;
+ for (iter = 0; iter != max_iter; ++iter) {
+ mid = 0.5 * (a + c);
+ tol2 = 2.0 * (tol1 = tol * fabs(b) + tiny);
+ if (fabs(b - mid) <= (tol2 - 0.5 * (c - a))) {
+ *xmin = b; return fb; // found
+ }
+ if (fabs(e) > tol1) {
+ // related to parabolic interpolation
+ r = (b - w) * (fb - fv);
+ q = (b - v) * (fb - fw);
+ p = (b - v) * q - (b - w) * r;
+ q = 2.0 * (q - r);
+ if (q > 0.0) p = 0.0 - p;
+ else q = 0.0 - q;
+ eold = e; e = d;
+ if (fabs(p) >= fabs(0.5 * q * eold) || p <= q * (a - b) || p >= q * (c - b)) {
+ d = gold2 * (e = (b >= mid ? a - b : c - b));
+ } else {
+ d = p / q; u = b + d; // actual parabolic interpolation happens here
+ if (u - a < tol2 || c - u < tol2)
+ d = (mid > b)? tol1 : 0.0 - tol1;
+ }
+ } else d = gold2 * (e = (b >= mid ? a - b : c - b)); // golden section interpolation
+ u = fabs(d) >= tol1 ? b + d : b + (d > 0.0? tol1 : -tol1);
+ fu = func(u, data);
+ if (fu <= fb) { // u is the minimum point so far
+ if (u >= b) a = b;
+ else c = b;
+ v = w; w = b; b = u; fv = fw; fw = fb; fb = fu;
+ } else { // adjust (a,c) and (u,v,w)
+ if (u < b) a = u;
+ else c = u;
+ if (fu <= fw || w == b) {
+ v = w; w = u;
+ fv = fw; fw = fu;
+ } else if (fu <= fv || v == b || v == w) {
+ v = u; fv = fu;
+ }
+ }
+ }
+ *xmin = b;
+ return fb;
+}
diff --git a/bcftools/kmin.h b/bcftools/kmin.h
new file mode 100644
index 0000000..6feba45
--- /dev/null
+++ b/bcftools/kmin.h
@@ -0,0 +1,46 @@
+/*
+ Copyright (c) 2008, 2010 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef KMIN_H
+#define KMIN_H
+
+#define KMIN_RADIUS 0.5
+#define KMIN_EPS 1e-7
+#define KMIN_MAXCALL 50000
+
+typedef double (*kmin_f)(int, double*, void*);
+typedef double (*kmin1_f)(double, void*);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls);
+ double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/bcftools/main.c b/bcftools/main.c
new file mode 100644
index 0000000..f08b5c7
--- /dev/null
+++ b/bcftools/main.c
@@ -0,0 +1,264 @@
+/* main.c -- main bcftools command front-end.
+
+ Copyright (C) 2012-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <ctype.h>
+#include <htslib/hts.h>
+#include "version.h"
+#include "bcftools.h"
+
+int main_tabix(int argc, char *argv[]);
+int main_vcfindex(int argc, char *argv[]);
+int main_vcfstats(int argc, char *argv[]);
+int main_vcfisec(int argc, char *argv[]);
+int main_vcfmerge(int argc, char *argv[]);
+int main_vcfquery(int argc, char *argv[]);
+int main_vcffilter(int argc, char *argv[]);
+int main_vcfsom(int argc, char *argv[]);
+int main_vcfnorm(int argc, char *argv[]);
+int main_vcfgtcheck(int argc, char *argv[]);
+int main_vcfview(int argc, char *argv[]);
+int main_vcfcall(int argc, char *argv[]);
+int main_vcfannotate(int argc, char *argv[]);
+int main_vcfroh(int argc, char *argv[]);
+int main_vcfconcat(int argc, char *argv[]);
+int main_reheader(int argc, char *argv[]);
+int main_vcfconvert(int argc, char *argv[]);
+int main_vcfcnv(int argc, char *argv[]);
+#if USE_GPL
+int main_polysomy(int argc, char *argv[]);
+#endif
+int main_plugin(int argc, char *argv[]);
+int main_consensus(int argc, char *argv[]);
+
+typedef struct
+{
+ int (*func)(int, char*[]);
+ const char *alias, *help;
+}
+cmd_t;
+
+static cmd_t cmds[] =
+{
+ { .func = NULL,
+ .alias = "Indexing",
+ .help = NULL
+ },
+ { .func = main_vcfindex,
+ .alias = "index",
+ .help = "index VCF/BCF files"
+ },
+ { .func = main_tabix,
+ .alias = "tabix",
+ .help = "-tabix for BGZF'd BED, GFF, SAM, VCF and more" // do not advertise; only keep here for testing
+ },
+
+ { .func = NULL,
+ .alias = "VCF/BCF manipulation",
+ .help = NULL
+ },
+
+ { .func = main_vcfannotate,
+ .alias = "annotate",
+ .help = "annotate and edit VCF/BCF files",
+ },
+ { .func = main_vcfconcat,
+ .alias = "concat",
+ .help = "concatenate VCF/BCF files from the same set of samples"
+ },
+ { .func = main_vcfconvert,
+ .alias = "convert",
+ .help = "convert VCF/BCF files to different formats and back"
+ },
+ { .func = main_vcfisec,
+ .alias = "isec",
+ .help = "intersections of VCF/BCF files"
+ },
+ { .func = main_vcfmerge,
+ .alias = "merge",
+ .help = "merge VCF/BCF files files from non-overlapping sample sets"
+ },
+ { .func = main_vcfnorm,
+ .alias = "norm",
+ .help = "left-align and normalize indels"
+ },
+ { .func = main_plugin,
+ .alias = "plugin",
+ .help = "user-defined plugins"
+ },
+ { .func = main_vcfquery,
+ .alias = "query",
+ .help = "transform VCF/BCF into user-defined formats"
+ },
+ { .func = main_reheader,
+ .alias = "reheader",
+ .help = "modify VCF/BCF header, change sample names"
+ },
+ { .func = main_vcfview,
+ .alias = "view",
+ .help = "VCF/BCF conversion, view, subset and filter VCF/BCF files"
+ },
+
+ { .func = NULL,
+ .alias = "VCF/BCF analysis",
+ .help = NULL
+ },
+
+ { .func = main_vcfcall,
+ .alias = "call",
+ .help = "SNP/indel calling"
+ },
+ { .func = main_consensus,
+ .alias = "consensus",
+ .help = "create consensus sequence by applying VCF variants"
+ },
+ { .func = main_vcfcnv,
+ .alias = "cnv",
+ .help = "HMM CNV calling"
+ },
+ { .func = main_vcffilter,
+ .alias = "filter",
+ .help = "filter VCF/BCF files using fixed thresholds"
+ },
+ { .func = main_vcfgtcheck,
+ .alias = "gtcheck",
+ .help = "check sample concordance, detect sample swaps and contamination"
+ },
+#if USE_GPL
+ { .func = main_polysomy,
+ .alias = "polysomy",
+ .help = "detect number of chromosomal copies",
+ },
+#endif
+ { .func = main_vcfroh,
+ .alias = "roh",
+ .help = "identify runs of autozygosity (HMM)",
+ },
+ { .func = main_vcfstats,
+ .alias = "stats",
+ .help = "produce VCF/BCF stats"
+ },
+
+ { .func = main_vcfsom,
+ .alias = "som",
+ .help = "-filter using Self-Organized Maps (experimental)" // do not advertise
+
+ },
+ { .func = NULL,
+ .alias = NULL,
+ .help = NULL
+ }
+};
+
+char *bcftools_version(void)
+{
+ return BCFTOOLS_VERSION;
+}
+
+static void usage(FILE *fp)
+{
+ fprintf(fp, "\n");
+ fprintf(fp, "Program: bcftools (Tools for variant calling and manipulating VCFs and BCFs)\n");
+#if USE_GPL
+ fprintf(fp, "License: GNU GPLv3+, due to use of the GNU Scientific Library\n");
+#endif
+ fprintf(fp, "Version: %s (using htslib %s)\n", bcftools_version(), hts_version());
+ fprintf(fp, "\n");
+ fprintf(fp, "Usage: bcftools [--version|--version-only] [--help] <command> <argument>\n");
+ fprintf(fp, "\n");
+ fprintf(fp, "Commands:\n");
+
+ int i = 0;
+ const char *sep = NULL;
+ while (cmds[i].alias)
+ {
+ if ( !cmds[i].func ) sep = cmds[i].alias;
+ if ( sep )
+ {
+ fprintf(fp, "\n -- %s\n", sep);
+ sep = NULL;
+ }
+ if ( cmds[i].func && cmds[i].help[0]!='-' ) fprintf(fp, " %-12s %s\n", cmds[i].alias, cmds[i].help);
+ i++;
+ }
+ fprintf(fp,"\n");
+ fprintf(fp,
+ " Most commands accept VCF, bgzipped VCF, and BCF with the file type detected\n"
+ " automatically even when streaming from a pipe. Indexed VCF and BCF will work\n"
+ " in all situations. Un-indexed VCF and BCF and streams will work in most but\n"
+ " not all situations.\n");
+ fprintf(fp,"\n");
+}
+
+int main(int argc, char *argv[])
+{
+ if (argc < 2) { usage(stderr); return 1; }
+
+ if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
+ printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2015 Genome Research Ltd.\n", bcftools_version(), hts_version());
+#if USE_GPL
+ printf("License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
+#else
+ printf("License Expat: The MIT/Expat license\n");
+#endif
+ printf("This is free software: you are free to change and redistribute it.\nThere is NO WARRANTY, to the extent permitted by law.\n");
+ return 0;
+ }
+ else if (strcmp(argv[1], "--version-only") == 0) {
+ printf("%s+htslib-%s\n", bcftools_version(), hts_version());
+ return 0;
+ }
+ else if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0) {
+ if (argc == 2) { usage(stdout); return 0; }
+ // Otherwise change "bcftools help COMMAND [...]" to "bcftools COMMAND";
+ // main_xyz() functions by convention display the subcommand's usage
+ // when invoked without any arguments.
+ argv++;
+ argc = 2;
+ }
+ else if ( argv[1][0]=='+' )
+ {
+ // "bcftools plugin name" can be run as "bcftools +name"
+ argv[1]++;
+ argv[0] = "plugin";
+ argv--;
+ argc++;
+ }
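+ // (Illustrative note: e.g. "bcftools +fill-tags in.vcf" is dispatched by the
+ // loop below exactly as "bcftools plugin fill-tags in.vcf" would be.)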
+
+ int i = 0;
+ while (cmds[i].alias)
+ {
+ if (cmds[i].func && strcmp(argv[1],cmds[i].alias)==0)
+ {
+ return cmds[i].func(argc-1,argv+1);
+ }
+ i++;
+ }
+ fprintf(stderr, "[E::%s] unrecognized command '%s'\n", __func__, argv[1]);
+ return 1;
+}
+
diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c
new file mode 100644
index 0000000..f180e56
--- /dev/null
+++ b/bcftools/main.c.pysam.c
@@ -0,0 +1,266 @@
+#include "pysam.h"
+
+/* main.c -- main bcftools command front-end.
+
+ Copyright (C) 2012-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <ctype.h>
+#include <htslib/hts.h>
+#include "version.h"
+#include "bcftools.h"
+
+int main_tabix(int argc, char *argv[]);
+int main_vcfindex(int argc, char *argv[]);
+int main_vcfstats(int argc, char *argv[]);
+int main_vcfisec(int argc, char *argv[]);
+int main_vcfmerge(int argc, char *argv[]);
+int main_vcfquery(int argc, char *argv[]);
+int main_vcffilter(int argc, char *argv[]);
+int main_vcfsom(int argc, char *argv[]);
+int main_vcfnorm(int argc, char *argv[]);
+int main_vcfgtcheck(int argc, char *argv[]);
+int main_vcfview(int argc, char *argv[]);
+int main_vcfcall(int argc, char *argv[]);
+int main_vcfannotate(int argc, char *argv[]);
+int main_vcfroh(int argc, char *argv[]);
+int main_vcfconcat(int argc, char *argv[]);
+int main_reheader(int argc, char *argv[]);
+int main_vcfconvert(int argc, char *argv[]);
+int main_vcfcnv(int argc, char *argv[]);
+#if USE_GPL
+int main_polysomy(int argc, char *argv[]);
+#endif
+int main_plugin(int argc, char *argv[]);
+int main_consensus(int argc, char *argv[]);
+
+typedef struct
+{
+ int (*func)(int, char*[]);
+ const char *alias, *help;
+}
+cmd_t;
+
+static cmd_t cmds[] =
+{
+ { .func = NULL,
+ .alias = "Indexing",
+ .help = NULL
+ },
+ { .func = main_vcfindex,
+ .alias = "index",
+ .help = "index VCF/BCF files"
+ },
+ { .func = main_tabix,
+ .alias = "tabix",
+ .help = "-tabix for BGZF'd BED, GFF, SAM, VCF and more" // do not advertise; only keep here for testing
+ },
+
+ { .func = NULL,
+ .alias = "VCF/BCF manipulation",
+ .help = NULL
+ },
+
+ { .func = main_vcfannotate,
+ .alias = "annotate",
+ .help = "annotate and edit VCF/BCF files",
+ },
+ { .func = main_vcfconcat,
+ .alias = "concat",
+ .help = "concatenate VCF/BCF files from the same set of samples"
+ },
+ { .func = main_vcfconvert,
+ .alias = "convert",
+ .help = "convert VCF/BCF files to different formats and back"
+ },
+ { .func = main_vcfisec,
+ .alias = "isec",
+ .help = "intersections of VCF/BCF files"
+ },
+ { .func = main_vcfmerge,
+ .alias = "merge",
+ .help = "merge VCF/BCF files files from non-overlapping sample sets"
+ },
+ { .func = main_vcfnorm,
+ .alias = "norm",
+ .help = "left-align and normalize indels"
+ },
+ { .func = main_plugin,
+ .alias = "plugin",
+ .help = "user-defined plugins"
+ },
+ { .func = main_vcfquery,
+ .alias = "query",
+ .help = "transform VCF/BCF into user-defined formats"
+ },
+ { .func = main_reheader,
+ .alias = "reheader",
+ .help = "modify VCF/BCF header, change sample names"
+ },
+ { .func = main_vcfview,
+ .alias = "view",
+ .help = "VCF/BCF conversion, view, subset and filter VCF/BCF files"
+ },
+
+ { .func = NULL,
+ .alias = "VCF/BCF analysis",
+ .help = NULL
+ },
+
+ { .func = main_vcfcall,
+ .alias = "call",
+ .help = "SNP/indel calling"
+ },
+ { .func = main_consensus,
+ .alias = "consensus",
+ .help = "create consensus sequence by applying VCF variants"
+ },
+ { .func = main_vcfcnv,
+ .alias = "cnv",
+ .help = "HMM CNV calling"
+ },
+ { .func = main_vcffilter,
+ .alias = "filter",
+ .help = "filter VCF/BCF files using fixed thresholds"
+ },
+ { .func = main_vcfgtcheck,
+ .alias = "gtcheck",
+ .help = "check sample concordance, detect sample swaps and contamination"
+ },
+#if USE_GPL
+ { .func = main_polysomy,
+ .alias = "polysomy",
+ .help = "detect number of chromosomal copies",
+ },
+#endif
+ { .func = main_vcfroh,
+ .alias = "roh",
+ .help = "identify runs of autozygosity (HMM)",
+ },
+ { .func = main_vcfstats,
+ .alias = "stats",
+ .help = "produce VCF/BCF stats"
+ },
+
+ { .func = main_vcfsom,
+ .alias = "som",
+ .help = "-filter using Self-Organized Maps (experimental)" // do not advertise
+
+ },
+ { .func = NULL,
+ .alias = NULL,
+ .help = NULL
+ }
+};
+
+char *bcftools_version(void)
+{
+ return BCFTOOLS_VERSION;
+}
+
+static void usage(FILE *fp)
+{
+ fprintf(fp, "\n");
+ fprintf(fp, "Program: bcftools (Tools for variant calling and manipulating VCFs and BCFs)\n");
+#if USE_GPL
+ fprintf(fp, "License: GNU GPLv3+, due to use of the GNU Scientific Library\n");
+#endif
+ fprintf(fp, "Version: %s (using htslib %s)\n", bcftools_version(), hts_version());
+ fprintf(fp, "\n");
+ fprintf(fp, "Usage: bcftools [--version|--version-only] [--help] <command> <argument>\n");
+ fprintf(fp, "\n");
+ fprintf(fp, "Commands:\n");
+
+ int i = 0;
+ const char *sep = NULL;
+ while (cmds[i].alias)
+ {
+ if ( !cmds[i].func ) sep = cmds[i].alias;
+ if ( sep )
+ {
+ fprintf(fp, "\n -- %s\n", sep);
+ sep = NULL;
+ }
+ if ( cmds[i].func && cmds[i].help[0]!='-' ) fprintf(fp, " %-12s %s\n", cmds[i].alias, cmds[i].help);
+ i++;
+ }
+ fprintf(fp,"\n");
+ fprintf(fp,
+ " Most commands accept VCF, bgzipped VCF, and BCF with the file type detected\n"
+ " automatically even when streaming from a pipe. Indexed VCF and BCF will work\n"
+ " in all situations. Un-indexed VCF and BCF and streams will work in most but\n"
+ " not all situations.\n");
+ fprintf(fp,"\n");
+}
+
+int bcftools_main(int argc, char *argv[])
+{
+ if (argc < 2) { usage(pysamerr); return 1; }
+
+ if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
+ printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2015 Genome Research Ltd.\n", bcftools_version(), hts_version());
+#if USE_GPL
+ printf("License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
+#else
+ printf("License Expat: The MIT/Expat license\n");
+#endif
+ printf("This is free software: you are free to change and redistribute it.\nThere is NO WARRANTY, to the extent permitted by law.\n");
+ return 0;
+ }
+ else if (strcmp(argv[1], "--version-only") == 0) {
+ printf("%s+htslib-%s\n", bcftools_version(), hts_version());
+ return 0;
+ }
+ else if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0) {
+ if (argc == 2) { usage(stdout); return 0; }
+ // Otherwise change "bcftools help COMMAND [...]" to "bcftools COMMAND";
+ // main_xyz() functions by convention display the subcommand's usage
+ // when invoked without any arguments.
+ argv++;
+ argc = 2;
+ }
+ else if ( argv[1][0]=='+' )
+ {
+ // "bcftools plugin name" can be run as "bcftools +name"
+ argv[1]++;
+ argv[0] = "plugin";
+ argv--;
+ argc++;
+ }
+
+ int i = 0;
+ while (cmds[i].alias)
+ {
+ if (cmds[i].func && strcmp(argv[1],cmds[i].alias)==0)
+ {
+ return cmds[i].func(argc-1,argv+1);
+ }
+ i++;
+ }
+ fprintf(pysamerr, "[E::%s] unrecognized command '%s'\n", __func__, argv[1]);
+ return 1;
+}
+
diff --git a/bcftools/mcall.c b/bcftools/mcall.c
new file mode 100644
index 0000000..495f849
--- /dev/null
+++ b/bcftools/mcall.c
@@ -0,0 +1,1537 @@
+/* mcall.c -- multiallelic and rare variant calling.
+
+ Copyright (C) 2012-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <math.h>
+#include <htslib/kfunc.h>
+#include "call.h"
+
+// Using priors for GTs does not seem to be mathematically justified. Although
+// it seems effective in removing false calls, it also flips a significant
+// proportion of HET genotypes. Better is to filter by FORMAT/GQ using
+// `bcftools filter`.
+#define USE_PRIOR_FOR_GTS 0
+
+// Go with uniform PLs for samples with no coverage. If unset, missing
+// genotypes is reported instead.
+#define FLAT_PDG_FOR_MISSING 0
+
+// Estimate QS (combined quality and allele frequencies) from PLs
+#define QS_FROM_PDG 0
+
+
+void qcall_init(call_t *call) { return; }
+void qcall_destroy(call_t *call) { return; }
+int qcall(call_t *call, bcf1_t *rec)
+{
+ // QCall format:
+ // chromosome, position, reference allele, depth, mapping quality, 0, ..
+ error("TODO: qcall output\n");
+ return 0;
+}
+
+void call_init_pl2p(call_t *call)
+{
+ int i;
+ for (i=0; i<256; i++)
+ call->pl2p[i] = pow(10., -i/10.);
+}
+
+// Macros for accessing call->trio and call->ntrio
+#define FTYPE_222 0 // family type: all diploid
+#define FTYPE_121 1 // chrX, the child is a boy
+#define FTYPE_122 2 // chrX, a girl
+#define FTYPE_101 3 // chrY, boy
+#define FTYPE_100 4 // chrY, girl
+
+#define GT_SKIP 0xf // empty genotype (chrY in females)
+
+#define IS_POW2(x) (!((x) & ((x) - 1))) // zero is permitted
+#define IS_HOM(x) IS_POW2(x)
+
+// Pkij = P(k|i,j) tells how likely it is to be a het if the parents
+// are homs etc. The consistency of i,j,k has been already checked.
+// Parameters are alleles and ploidy of father, mother, kid
+// Returns 2/Pkij.
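+// For example (per the diploid cases below): two HET parents with a HOM child
+// have P(k|i,j) = 1/4, so the function returns 8; a HET child of the same
+// parents has P(k|i,j) = 1/2 and the function returns 4.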
+int calc_Pkij(int fals, int mals, int kals, int fpl, int mpl, int kpl)
+{
+ int als = fals|mals|kals;
+ if ( IS_HOM(als) ) return 2; // all are the same: child must be a HOM, P=1
+
+ if ( fpl==1 )
+ {
+ if ( kpl==1 ) // chr X, the child is a boy, the copy is inherited from the mother
+ {
+ if ( IS_HOM(mals) ) return 2; // 0 11 -> P(1) = 1
+ return 4; // 0 01 -> P(0) = P(1) = 1/2
+ }
+ // chr X, the child is a girl
+ if ( IS_HOM(mals) ) return 2; // 0 11 -> P(01) = 1
+ return 4; // 0 01 -> P(00) = P(01) = 1/2
+ }
+
+ if ( IS_HOM(fals) && IS_HOM(mals) ) return 2; // 00 11 01, the child must be a HET, P=1
+ if ( !IS_HOM(fals) && !IS_HOM(mals) )
+ {
+ if ( IS_HOM(kals) ) return 8; // 01 01 00 or 01 01 11, P(k=HOM) = 1/4
+ return 4; // 01 01 01, P(k=HET) = 1/2
+ }
+ return 4; // 00 01, P(k=HET) = P(k=HOM) = 1/2
+}
+
+// Initialize ntrio and trio: ntrio lists the number of possible
+// genotypes given combination of haploid/diploid genomes and the
+// number of alleles. trio lists allowed genotype combinations:
+// 4bit: 2/Pkij, 4: father, 4: mother, 4: child
+// See also mcall_call_trio_genotypes()
+//
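+// For example, a packed entry t (built below as "Pkij<<12 | i<<8 | j<<4 | k")
+// unpacks as: 2/Pkij = t>>12, father = t>>8 & 0xf, mother = t>>4 & 0xf,
+// child = t & 0xf, where the 4-bit fields are indices into the list of
+// possible diploid genotypes, or GT_SKIP.
+//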
+static void mcall_init_trios(call_t *call)
+{
+ // 23, 138, 478 possible diploid trio genotypes with 2, 3, 4 alleles
+ call->ntrio[FTYPE_222][2] = 15; call->ntrio[FTYPE_222][3] = 78; call->ntrio[FTYPE_222][4] = 250;
+ call->ntrio[FTYPE_121][2] = 8; call->ntrio[FTYPE_121][3] = 27; call->ntrio[FTYPE_121][4] = 64;
+ call->ntrio[FTYPE_122][2] = 8; call->ntrio[FTYPE_122][3] = 27; call->ntrio[FTYPE_122][4] = 64;
+ call->ntrio[FTYPE_101][2] = 2; call->ntrio[FTYPE_101][3] = 3; call->ntrio[FTYPE_101][4] = 4;
+ call->ntrio[FTYPE_100][2] = 2; call->ntrio[FTYPE_100][3] = 3; call->ntrio[FTYPE_100][4] = 4;
+
+ int nals, itype;
+ for (itype=0; itype<=4; itype++)
+ {
+ for (nals=2; nals<=4; nals++)
+ call->trio[itype][nals] = (uint16_t*) malloc(sizeof(uint16_t)*call->ntrio[itype][nals]);
+ }
+
+ // max 10 possible diploid genotypes
+ int gts[10];
+ for (nals=2; nals<=4; nals++)
+ {
+ int i,j,k, n = 0, ngts = 0;
+ for (i=0; i<nals; i++)
+ for (j=0; j<=i; j++)
+ gts[ngts++] = 1<<i | 1<<j;
+
+ // 222: all diploid
+ // i,j,k: father, mother, child
+ for (i=0; i<ngts; i++)
+ for (j=0; j<ngts; j++)
+ for (k=0; k<ngts; k++)
+ {
+ if ( ((gts[i]|gts[j])&gts[k]) != gts[k] ) continue; // skip if the child k carries an allele present in neither i nor j
+ if ( !(gts[i] & gts[k]) || !(gts[j] & gts[k]) ) continue; // one copy from father, one from mother
+ int Pkij = calc_Pkij(gts[i],gts[j],gts[k], 2,2,2);
+ call->trio[FTYPE_222][nals][n++] = Pkij<<12 | i<<8 | j<<4 | k; // father, mother, child
+ }
+ assert( n==call->ntrio[FTYPE_222][nals] );
+
+ // 121: chrX, boy
+ n = 0;
+ for (i=0; i<ngts; i++)
+ for (j=0; j<ngts; j++)
+ for (k=0; k<ngts; k++)
+ {
+ if ( !IS_HOM(gts[i]) || !IS_HOM(gts[k]) ) continue; // father nor boy can be diploid
+ if ( ((gts[i]|gts[j])&gts[k]) != gts[k] ) continue;
+ if ( !(gts[j] & gts[k]) ) continue; // boy must inherit the copy from mother
+ int Pkij = calc_Pkij(gts[i],gts[j],gts[k], 1,2,1);
+ call->trio[FTYPE_121][nals][n++] = Pkij<<12 | i<<8 | j<<4 | k;
+ }
+ assert( n==call->ntrio[FTYPE_121][nals] );
+
+ // 122: chrX, girl
+ n = 0;
+ for (i=0; i<ngts; i++)
+ for (j=0; j<ngts; j++)
+ for (k=0; k<ngts; k++)
+ {
+ if ( !IS_HOM(gts[i]) ) continue;
+ if ( ((gts[i]|gts[j])&gts[k]) != gts[k] ) continue;
+ if ( !(gts[i] & gts[k]) ) continue; // girl must inherit one copy from the father and one from the mother
+ if ( !(gts[j] & gts[k]) ) continue;
+ int Pkij = calc_Pkij(gts[i],gts[j],gts[k], 1,2,2);
+ call->trio[FTYPE_122][nals][n++] = Pkij<<12 | i<<8 | j<<4 | k;
+ }
+ assert( n==call->ntrio[FTYPE_122][nals] );
+
+ // 101: chrY, boy
+ n = 0;
+ for (i=0; i<ngts; i++)
+ for (k=0; k<ngts; k++)
+ {
+ if ( !IS_HOM(gts[i]) || !IS_HOM(gts[k]) ) continue;
+ if ( (gts[i]&gts[k]) != gts[k] ) continue;
+ call->trio[FTYPE_101][nals][n++] = 1<<12 | i<<8 | GT_SKIP<<4 | k;
+ }
+ assert( n==call->ntrio[FTYPE_101][nals] );
+
+ // 100: chrY, girl
+ n = 0;
+ for (i=0; i<ngts; i++)
+ {
+ if ( !IS_POW2(gts[i]) ) continue;
+ call->trio[FTYPE_100][nals][n++] = 1<<12 | i<<8 | GT_SKIP<<4 | GT_SKIP;
+ }
+ assert( n==call->ntrio[FTYPE_100][nals] );
+
+ }
+ call->GLs = (double*) calloc(bcf_hdr_nsamples(call->hdr)*10,sizeof(double));
+
+ int i, j;
+ for (i=0; i<call->nfams; i++)
+ {
+ family_t *fam = &call->fams[i];
+ int ploidy[3];
+ for (j=0; j<3; j++)
+ ploidy[j] = call->ploidy[fam->sample[j]];
+
+ if ( ploidy[FATHER]==2 ) // not X, not Y
+ {
+ if ( ploidy[MOTHER]!=2 || ploidy[CHILD]!=2 )
+ error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
+ fam->type = FTYPE_222;
+ continue;
+ }
+ if ( ploidy[FATHER]!=1 || ploidy[MOTHER]==1 )
+ error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
+ if ( ploidy[MOTHER]==2 ) // X
+ {
+ if ( ploidy[CHILD]==0 )
+ error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
+ fam->type = ploidy[CHILD]==2 ? FTYPE_122 : FTYPE_121; // a girl or a boy
+ }
+ else // Y
+ {
+ if ( ploidy[CHILD]==2 )
+ error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
+ fam->type = ploidy[CHILD]==0 ? FTYPE_100 : FTYPE_101; // a girl or a boy
+ }
+ }
+}
+static void mcall_destroy_trios(call_t *call)
+{
+ int i, j;
+ for (i=2; i<=4; i++)
+ for (j=0; j<=4; j++)
+ free(call->trio[j][i]);
+}
+
+void mcall_init(call_t *call)
+{
+ call_init_pl2p(call);
+
+ call->nqsum = 5;
+ call->qsum = (float*) malloc(sizeof(float)*call->nqsum); // will be expanded later if necessary
+ call->nals_map = 5;
+ call->als_map = (int*) malloc(sizeof(int)*call->nals_map);
+ call->npl_map = 5*(5+1)/2; // will be expanded later if necessary
+ call->pl_map = (int*) malloc(sizeof(int)*call->npl_map);
+ call->gts = (int32_t*) calloc(bcf_hdr_nsamples(call->hdr)*2,sizeof(int32_t)); // assuming at most diploid everywhere
+
+ if ( call->flag & CALL_CONSTR_TRIO )
+ {
+ call->cgts = (int32_t*) calloc(bcf_hdr_nsamples(call->hdr),sizeof(int32_t));
+ call->ugts = (int32_t*) calloc(bcf_hdr_nsamples(call->hdr),sizeof(int32_t));
+ mcall_init_trios(call);
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=CGT,Number=1,Type=Integer,Description=\"Constrained Genotype (0-based index to Number=G ordering).\">");
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=UGT,Number=1,Type=Integer,Description=\"Unconstrained Genotype (0-based index to Number=G ordering).\">");
+ }
+ if ( call->flag & CALL_CONSTR_ALLELES ) call->vcmp = vcmp_init();
+
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+ if ( call->output_tags & CALL_FMT_GQ )
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Phred-scaled Genotype Quality\">");
+ if ( call->output_tags & CALL_FMT_GP )
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Phred-scaled genotype posterior probabilities\">");
+ if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
+ call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr));
+ bcf_hdr_append(call->hdr,"##INFO=<ID=ICB,Number=1,Type=Float,Description=\"Inbreeding Coefficient Binomial test (bigger is better)\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=HOB,Number=1,Type=Float,Description=\"Bias in the number of HOMs number (smaller is better)\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes for each ALT allele, in the same order as listed\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Average mapping quality\">");
+
+ // init the prior
+ if ( call->theta>0 )
+ {
+ int i, n = 0;
+ if ( !call->ploidy ) n = 2*bcf_hdr_nsamples(call->hdr); // all are diploid
+ else
+ {
+ for (i=0; i<bcf_hdr_nsamples(call->hdr); i++)
+ n += call->ploidy[i];
+ }
+ // Watterson factor, here aM_1 = aM_2 = 1
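+ // (i.e. aM = 1 + 1/2 + ... + 1/(n-1), the harmonic-number correction for n haploid genomes)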
+ double aM = 1;
+ for (i=2; i<n; i++) aM += 1./i;
+ call->theta *= aM;
+ if ( call->theta >= 1 )
+ {
+ fprintf(stderr,"The prior is too big (theta*aM=%.2f), going with 0.99\n", call->theta);
+ call->theta = 0.99;
+ }
+ call->theta = log(call->theta);
+ }
+
+ return;
+}
+
+void mcall_destroy(call_t *call)
+{
+ if (call->vcmp) vcmp_destroy(call->vcmp);
+ free(call->itmp);
+ mcall_destroy_trios(call);
+ free(call->GPs);
+ free(call->GLs);
+ free(call->GQs);
+ free(call->anno16);
+ free(call->PLs);
+ free(call->qsum);
+ free(call->als_map);
+ free(call->pl_map);
+ free(call->gts); free(call->cgts); free(call->ugts);
+ free(call->pdg);
+ free(call->als);
+ free(call->ac);
+ return;
+}
+
+
+// Inits P(D|G): convert PLs from log space and normalize. In case of zero
+// depth, missing PLs are all zero. In this case, pdg's are set to 0
+// so that the corresponding genotypes can be set as missing and the
+// qual calculation is not affected.
+// Missing values are replaced by generic likelihoods when X (unseen allele) is
+// present.
+// NB: While the -m calling model uses the pdgs in canonical order,
+// the original samtools -c calling code uses pdgs in reverse order (AA comes
+// first, RR last).
+// NB: Ploidy is not taken into account here, which is incorrect.
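+// Worked example (illustrative): for a diploid sample with PL = {0,30,60},
+// the raw values are 10^0, 10^-3 and 10^-6; after normalization
+// pdg ~= {0.999, 0.001, 1e-6}, i.e. the hom-ref genotype is strongly favoured.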
+void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unseen)
+{
+ int i, j, nals;
+
+ // find out the number of alleles, expecting diploid genotype likelihoods
+ bcf_gt2alleles(n_gt-1, &i, &nals);
+ assert( i==nals );
+ nals++;
+
+ for (i=0; i<n_smpl; i++)
+ {
+ double sum = 0;
+ for (j=0; j<n_gt; j++)
+ {
+ if ( PLs[j]==bcf_int32_vector_end )
+ {
+ // We expect diploid genotype likelihoods. If not diploid, treat as missing
+ j = 0;
+ break;
+ }
+ if ( PLs[j]==bcf_int32_missing ) break;
+ assert( PLs[j]<256 );
+ pdg[j] = pl2p[ PLs[j] ];
+ sum += pdg[j];
+ }
+
+ if ( j==0 )
+ {
+ // The first value is missing (the LK of RR), which indicates that
+ // all values are missing.
+ j = sum = n_gt;
+ }
+ else if ( j<n_gt && unseen<0 )
+ {
+ // Some of the values are missing and the unseen allele LK is not
+ // available. In such a case, we set LK to a very small value.
+ sum = 0;
+ for (j=0; j<n_gt; j++)
+ {
+ assert( PLs[j]!=bcf_int32_vector_end );
+ if ( PLs[j]==bcf_int32_missing ) PLs[j] = 255;
+ assert( PLs[j]<256 );
+ pdg[j] = pl2p[ PLs[j] ];
+ sum += pdg[j];
+ }
+ }
+ if ( j<n_gt )
+ {
+ // Missing values present, fill with unseen allele LK. This can be only
+ // as good as the merge was.
+ int ia,ib, k;
+ j = 0;
+ sum = 0;
+ for (ia=0; ia<nals; ia++)
+ {
+ for (ib=0; ib<=ia; ib++)
+ {
+ if ( PLs[j]==bcf_int32_missing )
+ {
+ k = bcf_alleles2gt(ia,unseen);
+ if ( PLs[k]==bcf_int32_missing ) k = bcf_alleles2gt(ib,unseen);
+ if ( PLs[k]==bcf_int32_missing ) k = bcf_alleles2gt(unseen,unseen);
+ if ( PLs[k]==bcf_int32_missing )
+ {
+ // The PLs for the unseen allele X are missing, as are those for ia and ib.
+ // This can happen with incremental calling, when one of the merged
+ // files had all alleles A,C,G,T; in such a case, X was not present.
+ // Use a very small value instead.
+ PLs[j] = 255;
+ }
+ else
+ PLs[j] = PLs[k];
+ }
+ pdg[j] = pl2p[ PLs[j] ];
+ sum += pdg[j];
+ j++;
+ }
+ }
+ }
+ // Normalize: sum_i pdg_i = 1
+ if ( sum==n_gt )
+ {
+ // all missing
+ #if FLAT_PDG_FOR_MISSING
+ for (j=0; j<n_gt; j++) pdg[j] = 1./n_gt;
+ #else
+ for (j=0; j<n_gt; j++) pdg[j] = 0;
+ #endif
+ }
+ else
+ for (j=0; j<n_gt; j++) pdg[j] /= sum;
+
+ PLs += n_gt;
+ pdg += n_gt;
+ }
+}
+
+/*
+ Allele frequency estimated as:
+ #A = \sum_i (2*P_AA + P_AB)
+ F_A = #A / ( #A + #B )
+ where i runs across all samples
+*/
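+// Worked example (illustrative): a single diploid sample with
+// pdg = {P_AA,P_AB,P_BB} = {0.1,0.6,0.3} contributes
+// qsum = {2*0.1+0.6, 0.6+2*0.3} = {0.8,1.2}, normalized to {0.4,0.6}.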
+void estimate_qsum(call_t *call, bcf1_t *rec)
+{
+ double *pdg = call->pdg;
+ int ngts = rec->n_allele*(rec->n_allele+1)/2;
+ int i,nsmpl = bcf_hdr_nsamples(call->hdr);
+
+ hts_expand(float,rec->n_allele,call->nqsum,call->qsum);
+ for (i=0; i<rec->n_allele; i++) call->qsum[i] = 0;
+
+ for (i=0; i<nsmpl; i++)
+ {
+ int a, b, k = 0;
+ for (a=0; a<rec->n_allele; a++)
+ {
+ for (b=0; b<=a; b++)
+ {
+ call->qsum[a] += pdg[k];
+ call->qsum[b] += pdg[k];
+ k++;
+ }
+ }
+ pdg += ngts;
+ }
+ float sum = 0;
+ for (i=0; i<rec->n_allele; i++) sum += call->qsum[i];
+ if ( sum ) for (i=0; i<rec->n_allele; i++) call->qsum[i] /= sum;
+}
+
+// Create mapping between old and new (trimmed) alleles
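+// Worked example (illustrative): with nals=3 and als=0b101 (alleles 0 and 2
+// kept), als_map = {0,-1,1} and pl_map = {0,3,5}, i.e. the new PL entries
+// 0,1,2 are taken from the old PL entries for 0/0, 2/0 and 2/2.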
+void init_allele_trimming_maps(call_t *call, int als, int nals)
+{
+ int i, j;
+
+ // als_map: old(i) -> new(j)
+ for (i=0, j=0; i<nals; i++)
+ {
+ if ( als & 1<<i ) call->als_map[i] = j++;
+ else call->als_map[i] = -1;
+ }
+
+ if ( !call->pl_map ) return;
+
+ // pl_map: new(k) -> old(l)
+ int k = 0, l = 0;
+ for (i=0; i<nals; i++)
+ {
+ for (j=0; j<=i; j++)
+ {
+ if ( (als & 1<<i) && (als & 1<<j) ) call->pl_map[k++] = l;
+ l++;
+ }
+ }
+}
+
+double binom_dist(int N, double p, int k)
+{
+ int mean = (int) (N*p);
+ if ( mean==k ) return 1.0;
+
+ double log_p = (k-mean)*log(p) + (mean-k)*log(1.0-p);
+ if ( k > N - k ) k = N - k;
+ if ( mean > N - mean ) mean = N - mean;
+
+ if ( k < mean ) { int tmp = k; k = mean; mean = tmp; }
+ double diff = k - mean;
+
+ double val = 1.0;
+ int i;
+ for (i=0; i<diff; i++)
+ val = val * (N-mean-i) / (k-i);
+
+ return exp(log_p)/val;
+}
+
+
+// Inbreeding Coefficient, binomial test
+float calc_ICB(int nref, int nalt, int nhets, int ndiploid)
+{
+ if ( !nref || !nalt || !ndiploid ) return HUGE_VAL;
+
+ double fref = (double)nref/(nref+nalt); // fraction of reference alleles
+ double falt = (double)nalt/(nref+nalt); // non-ref als
+ double q = 2*fref*falt; // probability of a het, assuming HWE
+ double mean = q*ndiploid;
+
+ //fprintf(stderr,"\np=%e N=%d k=%d .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid);
+
+ // Can we use normal approximation? The second condition is for performance only
+ // and is not well justified.
+ if ( (mean>10 && (1-q)*ndiploid>10 ) || ndiploid>200 )
+ {
+ //fprintf(stderr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))));
+ return exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)));
+ }
+
+ return binom_dist(ndiploid, q, nhets);
+}
+
+float calc_HOB(int nref, int nalt, int nhets, int ndiploid)
+{
+ if ( !nref || !nalt || !ndiploid ) return HUGE_VAL;
+
+ double fref = (double)nref/(nref+nalt); // fraction of reference alleles
+ double falt = (double)nalt/(nref+nalt); // non-ref als
+ return fabs((double)nhets/ndiploid - 2*fref*falt);
+}
+
+/**
+ * log(sum_i exp(a_i))
+ */
+static inline double logsumexp(double *vals, int nvals)
+{
+ int i;
+ double max_exp = vals[0];
+ for (i=1; i<nvals; i++)
+ if ( max_exp < vals[i] ) max_exp = vals[i];
+
+ double sum = 0;
+ for (i=0; i<nvals; i++)
+ sum += exp(vals[i] - max_exp);
+
+ return log(sum) + max_exp;
+}
+/** log(exp(a)+exp(b)) */
+static inline double logsumexp2(double a, double b)
+{
+ if ( a>b )
+ return log(1 + exp(b-a)) + a;
+ else
+ return log(1 + exp(a-b)) + b;
+}
+
+// Macro to set the most likely alleles
+#define UPDATE_MAX_LKs(als) { \
+ if ( max_lk<lk_tot ) { max_lk = lk_tot; max_als = (als); } \
+ if ( lk_tot_set ) lk_sum = logsumexp2(lk_tot,lk_sum); \
+}
+
+#define SWAP(type_t,x,y) {type_t tmp; tmp = x; x = y; y = tmp; }
+
+// Determine the most likely combination of alleles. In this implementation,
+// at most tri-allelic sites are considered. Returns the number of alleles.
+static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
+{
+ int ia,ib,ic; // iterators over up to three alleles
+ int max_als=0; // most likely combination of alleles
+ double ref_lk = 0, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles
+ double lk_sum = -HUGE_VAL; // for normalizing the likelihoods
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int ngts = nals*(nals+1)/2;
+
+ // Single allele
+ for (ia=0; ia<nals; ia++)
+ {
+ double lk_tot = 0;
+ int lk_tot_set = 0;
+ int iaa = (ia+1)*(ia+2)/2-1; // index in PL which corresponds to the homozygous "ia/ia" genotype
+ int isample;
+ double *pdg = call->pdg + iaa;
+ for (isample=0; isample<nsmpl; isample++)
+ {
+ if ( *pdg ) { lk_tot += log(*pdg); lk_tot_set = 1; }
+ pdg += ngts;
+ }
+ if ( ia==0 ) ref_lk = lk_tot; // likelihood of 0/0 for all samples
+ else lk_tot += call->theta; // the prior
+ UPDATE_MAX_LKs(1<<ia);
+ }
+
+ // Two alleles
+ if ( nals>1 )
+ {
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( call->qsum[ia]==0 ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( call->qsum[ib]==0 ) continue;
+ double lk_tot = 0;
+ int lk_tot_set = 0;
+ double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]);
+ double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]);
+ double fab = 2*fa*fb; fa *= fa; fb *= fb;
+ int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
+ double *pdg = call->pdg;
+ for (isample=0; isample<nsmpl; isample++)
+ {
+ double val = 0;
+ if ( !call->ploidy || call->ploidy[isample]==2 )
+ val = fa*pdg[iaa] + fb*pdg[ibb] + fab*pdg[iab];
+ else if ( call->ploidy && call->ploidy[isample]==1 )
+ val = fa*pdg[iaa] + fb*pdg[ibb];
+ if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
+ pdg += ngts;
+ }
+ if ( ia!=0 ) lk_tot += call->theta; // the prior
+ if ( ib!=0 ) lk_tot += call->theta;
+ UPDATE_MAX_LKs(1<<ia|1<<ib);
+ }
+ }
+ }
+
+ // Three alleles
+ if ( nals>2 )
+ {
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( call->qsum[ia]==0 ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( call->qsum[ib]==0 ) continue;
+ int ibb = (ib+1)*(ib+2)/2-1;
+ int iab = iaa - ia + ib;
+ for (ic=0; ic<ib; ic++)
+ {
+ if ( call->qsum[ic]==0 ) continue;
+ double lk_tot = 0;
+ int lk_tot_set = 1;
+ double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
+ double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
+ double fc = call->qsum[ic]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
+ double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; fa *= fa; fb *= fb; fc *= fc;
+ int isample, icc = (ic+1)*(ic+2)/2-1;
+ int iac = iaa - ia + ic, ibc = ibb - ib + ic;
+ double *pdg = call->pdg;
+ for (isample=0; isample<nsmpl; isample++)
+ {
+ double val = 0;
+ if ( !call->ploidy || call->ploidy[isample]==2 )
+ val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
+ else if ( call->ploidy && call->ploidy[isample]==1 )
+ val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc];
+ if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
+ pdg += ngts;
+ }
+ if ( ia!=0 ) lk_tot += call->theta; // the prior
+ if ( ib!=0 ) lk_tot += call->theta; // the prior
+ if ( ic!=0 ) lk_tot += call->theta; // the prior
+ UPDATE_MAX_LKs(1<<ia|1<<ib|1<<ic);
+ }
+ }
+ }
+ }
+
+ call->ref_lk = ref_lk;
+ call->lk_sum = lk_sum;
+ *out_als = max_als;
+
+ int i, n = 0;
+ for (i=0; i<nals; i++) if ( max_als & 1<<i) n++;
+
+ return n;
+}
+
+static void mcall_set_ref_genotypes(call_t *call, int nals)
+{
+ int i;
+ int ngts = nals*(nals+1)/2;
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+
+ for (i=0; i<nals; i++) call->ac[i] = 0;
+ call->nhets = 0;
+ call->ndiploid = 0;
+
+ // Set all genotypes to 0/0 or 0
+ int *gts = call->gts;
+ double *pdg = call->pdg;
+ int isample;
+ for (isample = 0; isample < nsmpl; isample++)
+ {
+ int ploidy = call->ploidy ? call->ploidy[isample] : 2;
+ for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
+ if ( i==ngts || !ploidy )
+ {
+ gts[0] = bcf_gt_missing;
+ gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
+ }
+ else
+ {
+ gts[0] = bcf_gt_unphased(0);
+ gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end;
+ call->ac[0] += ploidy;
+ }
+ gts += 2;
+ pdg += ngts;
+ }
+}
+
+static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+{
+ int ia, ib, i;
+ int ngts = nals*(nals+1)/2;
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int nout_gts = nout_als*(nout_als+1)/2;
+ hts_expand(float,nout_gts*nsmpl,call->nGPs,call->GPs);
+
+ for (i=0; i<nout_als; i++) call->ac[i] = 0;
+ call->nhets = 0;
+ call->ndiploid = 0;
+
+ #if USE_PRIOR_FOR_GTS
+ float prior = exp(call->theta);
+ #endif
+ float *gps = call->GPs - nout_gts;
+ double *pdg = call->pdg - ngts;
+ int *gts = call->gts - 2;
+
+ int isample;
+ for (isample = 0; isample < nsmpl; isample++)
+ {
+ int ploidy = call->ploidy ? call->ploidy[isample] : 2;
+ assert( ploidy>=0 && ploidy<=2 );
+
+ pdg += ngts;
+ gts += 2;
+ gps += nout_gts;
+
+ if ( !ploidy )
+ {
+ gts[0] = bcf_gt_missing;
+ gts[1] = bcf_int32_vector_end;
+ gps[0] = -1;
+ continue;
+ }
+
+ #if !FLAT_PDG_FOR_MISSING
+ // Skip samples with zero depth, they have all pdg's equal to 0
+ for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
+ if ( i==ngts )
+ {
+ gts[0] = bcf_gt_missing;
+ gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
+ gps[0] = -1;
+ continue;
+ }
+ #endif
+
+ if ( ploidy==2 ) call->ndiploid++;
+
+ // Default fallback for the case all LKs are the same
+ gts[0] = bcf_gt_unphased(0);
+ gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end;
+
+ // Non-zero depth, determine the most likely genotype
+ double best_lk = 0;
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
+ int iaa = (ia+1)*(ia+2)/2-1; // PL index of the ia/ia genotype
+ double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia];
+ #if USE_PRIOR_FOR_GTS
+ if ( ia!=0 ) lk *= prior;
+ #endif
+ int igt = ploidy==2 ? bcf_alleles2gt(call->als_map[ia],call->als_map[ia]) : call->als_map[ia];
+ gps[igt] = lk;
+ if ( best_lk < lk )
+ {
+ best_lk = lk;
+ gts[0] = bcf_gt_unphased(call->als_map[ia]);
+ }
+ }
+ if ( ploidy==2 )
+ {
+ gts[1] = gts[0];
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( !(out_als & 1<<ia) ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( !(out_als & 1<<ib) ) continue;
+ int iab = iaa - ia + ib;
+ double lk = 2*pdg[iab]*call->qsum[ia]*call->qsum[ib];
+ #if USE_PRIOR_FOR_GTS
+ if ( ia!=0 ) lk *= prior;
+ if ( ib!=0 ) lk *= prior;
+ #endif
+ int igt = bcf_alleles2gt(call->als_map[ia],call->als_map[ib]);
+ gps[igt] = lk;
+ if ( best_lk < lk )
+ {
+ best_lk = lk;
+ gts[0] = bcf_gt_unphased(call->als_map[ib]);
+ gts[1] = bcf_gt_unphased(call->als_map[ia]);
+ }
+ }
+ }
+ if ( gts[0] != gts[1] ) call->nhets++;
+ }
+ else
+ gts[1] = bcf_int32_vector_end;
+
+ call->ac[ bcf_gt_allele(gts[0]) ]++;
+ if ( gts[1]!=bcf_int32_vector_end ) call->ac[ bcf_gt_allele(gts[1]) ]++;
+ }
+ if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
+ {
+ double max, sum;
+ for (isample=0; isample<nsmpl; isample++)
+ {
+ gps = call->GPs + isample*nout_gts;
+
+ int nmax;
+ if ( call->ploidy )
+ {
+ if ( call->ploidy[isample]==2 ) nmax = nout_gts;
+ else if ( call->ploidy[isample]==1 ) nmax = nout_als;
+ else nmax = 0;
+ }
+ else nmax = nout_gts;
+
+ max = gps[0];
+ if ( max<0 || nmax==0 )
+ {
+ // no call
+ if ( call->output_tags & CALL_FMT_GP )
+ {
+ for (i=0; i<nmax; i++) gps[i] = 0;
+ if ( nmax==0 ) { bcf_float_set_missing(gps[i]); nmax++; }
+ if ( nmax < nout_gts ) bcf_float_set_vector_end(gps[nmax]);
+ }
+ call->GQs[isample] = 0;
+ continue;
+ }
+ sum = gps[0];
+ for (i=1; i<nmax; i++)
+ {
+ if ( max < gps[i] ) max = gps[i];
+ sum += gps[i];
+ }
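+ // 4.34294 ~= 10/ln(10): phred-scale the probability that the best
+ // genotype call is wrong (1 - max/sum)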
+ max = -4.34294*log(1 - max/sum);
+ call->GQs[isample] = max<=INT8_MAX ? max : INT8_MAX;
+ if ( call->output_tags & CALL_FMT_GP )
+ {
+ assert( max );
+ for (i=0; i<nmax; i++) gps[i] = (int)(-4.34294*log(gps[i]/sum));
+ if ( nmax < nout_gts ) bcf_float_set_vector_end(gps[nmax]);
+ }
+ }
+ }
+ if ( call->output_tags & CALL_FMT_GP )
+ bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*nout_gts);
+ if ( call->output_tags & CALL_FMT_GQ )
+ bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl);
+}
+
+
+/**
+ Pm = P(mendelian) .. parameter to vary, 1-Pm is the probability of novel mutation.
+ When trio_Pm_ins is negative, Pm is calculated dynamically
+ according to indel length. For simplicity, only the
+ first ALT is considered.
+ Pkij = P(k|i,j) .. probability that the genotype combination i,j,k is consistent
+ with mendelian inheritance (the likelihood that offspring
+ of two HETs is a HOM is smaller than it being a HET)
+
+ P_uc(F=i,M=j,K=k) = P(F=i) . P(M=j) . P(K=k) .. unconstrained P
+ P_c(F=i,M=j,K=k) = P_uc . Pkij .. constrained P
+ P(F=i,M=j,K=k) = P_uc . (1 - Pm) + P_c . Pm
+ = P_uc . [1 - Pm + Pkij . Pm]
+
+ We choose genotype combination i,j,k which maximizes P(F=i,M=j,K=k). This
+ probability gives the quality GQ(Trio).
+ Individual qualities are calculated as
+ GQ(F=i,M=j,K=k) = P(F=i,M=j,K=k) / \sum_{x,y} P(F=i,M=x,K=y)
+ */
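+// In the formula above, a genotype combination consistent with Mendelian
+// inheritance (Pkij close to 1) keeps essentially its unconstrained probability
+// when Pm is close to 1, whereas an inconsistent combination (Pkij = 0) is
+// down-weighted by the factor (1 - Pm), i.e. by the novel-mutation probability.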
+static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+{
+ int ia, ib, i;
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int ngts = nals*(nals+1)/2;
+ int nout_gts = nout_als*(nout_als+1)/2;
+ double *gls = call->GLs - nout_gts;
+ double *pdg = call->pdg - ngts;
+
+ // Calculate individuals' genotype likelihoods P(X=i)
+ int isample;
+ for (isample = 0; isample < nsmpl; isample++)
+ {
+ int ploidy = call->ploidy ? call->ploidy[isample] : 2;
+ int32_t *gts = call->ugts + isample;
+
+ gls += nout_gts;
+ pdg += ngts;
+
+ // Skip samples with all pdg's equal to 0. These have zero depth.
+ for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
+ if ( i==ngts || !ploidy )
+ {
+ gts[0] = -1;
+ gls[0] = 1;
+ continue;
+ }
+
+ for (i=0; i<nout_gts; i++) gls[i] = -HUGE_VAL;
+
+ double sum_lk = 0;
+ double best_lk = 0;
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
+ int iaa = bcf_alleles2gt(ia,ia); // PL index of the ia/ia genotype
+ int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
+ double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia];
+ sum_lk += lk;
+ gls[idx] = lk;
+ if ( best_lk < lk )
+ {
+ best_lk = lk;
+ gts[0] = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
+ }
+ }
+ if ( ploidy==2 )
+ {
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( !(out_als & 1<<ia) ) continue;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( !(out_als & 1<<ib) ) continue;
+ int iab = bcf_alleles2gt(ia,ib);
+ int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ib]);
+ double lk = 2*pdg[iab]*call->qsum[ia]*call->qsum[ib];
+ sum_lk += lk;
+ gls[idx] = lk;
+ if ( best_lk < lk )
+ {
+ best_lk = lk;
+ gts[0] = bcf_alleles2gt(call->als_map[ib],call->als_map[ia]);
+ }
+ }
+ }
+ }
+ for (i=0; i<nout_gts; i++)
+ if ( gls[i]!=-HUGE_VAL ) gls[i] = log(gls[i]/sum_lk);
+ }
+
+ // Set novel mutation rate for this site: using first ALT allele for simplicity.
+ double trio_Pm;
+ if ( call->trio_Pm_ins<0 && call->trio_Pm_del<0 ) trio_Pm = call->trio_Pm_SNPs; // the same Pm for indels and SNPs requested
+ else
+ {
+ int ret = bcf_get_variant_types(rec);
+ if ( !(ret & VCF_INDEL) ) trio_Pm = call->trio_Pm_SNPs;
+ else
+ {
+ if ( call->trio_Pm_ins<0 ) // dynamic calculation, trio_Pm_del holds the scaling factor
+ {
+ trio_Pm = rec->d.var[1].n<0 ? -21.9313 - 0.2856*rec->d.var[1].n : -22.8689 + 0.2994*rec->d.var[1].n;
+ trio_Pm = 1 - call->trio_Pm_del * exp(trio_Pm);
+ }
+ else // snps and indels set explicitly
+ {
+ trio_Pm = rec->d.var[1].n<0 ? call->trio_Pm_del : call->trio_Pm_ins;
+ }
+ }
+ }
+
+ // Calculate constrained likelihoods and determine genotypes
+ int ifm;
+ for (ifm=0; ifm<call->nfams; ifm++)
+ {
+ family_t *fam = &call->fams[ifm];
+ int ntrio = call->ntrio[fam->type][nout_als];
+ uint16_t *trio = call->trio[fam->type][nout_als];
+
+ // Unconstrained likelihood
+ int uc_itr = 0;
+ double uc_lk = 0;
+ for (i=0; i<3; i++) // for father, mother, child
+ {
+ int ismpl = fam->sample[i];
+ double *gl = call->GLs + nout_gts*ismpl;
+ if ( gl[0]==1 ) continue;
+ int j, jmax = 0;
+ double max = gl[0];
+ for (j=1; j<nout_gts; j++)
+ if ( max < gl[j] ) { max = gl[j]; jmax = j; }
+ uc_lk += max;
+ uc_itr |= jmax << ((2-i)*4);
+ }
+
+ // Best constrained likelihood
+ int c_itr = -1, itr, uc_is_mendelian = 0;
+ double c_lk = -HUGE_VAL;
+ for (itr=0; itr<ntrio; itr++) // for each trio genotype combination
+ {
+ double lk = 0;
+ int npresent = 0;
+ for (i=0; i<3; i++) // for father, mother, child
+ {
+ int ismpl = fam->sample[i];
+ double *gl = call->GLs + nout_gts*ismpl;
+ if ( gl[0]==1 ) continue;
+ int igt = trio[itr]>>((2-i)*4) & 0xf;
+ assert( !call->ploidy || call->ploidy[ismpl]>0 );
+ if ( igt==GT_SKIP ) continue;
+ lk += gl[igt];
+ npresent++;
+ // fprintf(stderr," %e", gl[igt]);
+ }
+ // fprintf(stderr,"\t\t");
+ double Pkij = npresent==3 ? (double)2/(trio[itr]>>12) : 1; // with missing genotypes Pkij's are different
+ lk += log(1 - trio_Pm * (1 - Pkij));
+ // fprintf(stderr,"%d%d%d\t%e\t%.2f\n", trio[itr]>>8&0xf,trio[itr]>>4&0xf,trio[itr]&0xf, lk, Pkij);
+ if ( c_lk < lk ) { c_lk = lk; c_itr = trio[itr]; }
+ if ( uc_itr==trio[itr] ) uc_is_mendelian = 1;
+ }
+
+ if ( !uc_is_mendelian )
+ {
+ uc_lk += log(1 - trio_Pm);
+ // fprintf(stderr,"c_lk=%e uc_lk=%e c_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,uc_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf);
+ if ( c_lk < uc_lk ) { c_lk = uc_lk; c_itr = uc_itr; }
+ }
+ // fprintf(stderr,"best_lk=%e best_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf);
+
+ // Set genotypes for father, mother, child and calculate genotype qualities
+ for (i=0; i<3; i++)
+ {
+ // GT
+ int ismpl = fam->sample[i];
+ int igt = c_itr>>((2-i)*4) & 0xf;
+ double *gl = call->GLs + nout_gts*ismpl;
+ int32_t *gts = call->cgts + ismpl;
+ if ( gl[0]==1 || igt==GT_SKIP ) // zero depth, set missing genotypes
+ {
+ gts[0] = -1;
+ // bcf_float_set_missing(call->GQs[ismpl]);
+ continue;
+ }
+ gts[0] = igt;
+
+ #if 0
+ // todo: Genotype Qualities
+ //
+ // GQ: for each family member i sum over all genotypes j,k keeping igt fixed
+ double lk_sum = 0;
+ for (itr=0; itr<ntrio; itr++)
+ {
+ if ( igt != (trio[itr]>>((2-i)*4) & 0xf) ) continue;
+ double lk = 0;
+ int j;
+ for (j=0; j<3; j++)
+ {
+ int jsmpl = fam->sample[j];
+ double *gl = call->GLs + ngts*jsmpl;
+ if ( gl[0]==1 ) continue;
+ int jgt = trio[itr]>>((2-j)*4) & 0xf;
+ if ( jgt==GT_SKIP ) continue;
+ lk += gl[jgt];
+ }
+ double Pkij = (double)2/(trio[itr]>>12);
+ lk += log(1 - trio_Pm * (1 - Pkij));
+ lk_sum = logsumexp2(lk_sum, lk);
+ }
+ if ( !uc_is_mendelian && (best_itr>>((2-i)*4)&0xf)==(uc_itr>>((2-i)*4)&0xf) ) lk_sum = logsumexp2(lk_sum,uc_lk);
+ call->GQs[ismpl] = -4.3429*(best_lk - lk_sum);
+ #endif
+ }
+ }
+
+ for (i=0; i<4; i++) call->ac[i] = 0;
+ call->nhets = 0;
+ call->ndiploid = 0;
+
+ // Test if CGT,UGT are needed
+ int ucgts_needed = 0;
+ int32_t *cgts = call->cgts - 1;
+ int32_t *ugts = call->ugts - 1;
+ int32_t *gts = call->gts - 2;
+ for (isample = 0; isample < nsmpl; isample++)
+ {
+ int ploidy = call->ploidy ? call->ploidy[isample] : 2;
+ cgts++;
+ ugts++;
+ gts += 2;
+ if ( ugts[0]==-1 )
+ {
+ gts[0] = bcf_gt_missing;
+ gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
+ continue;
+ }
+ int a,b;
+ if ( cgts[0]!=ugts[0] )
+ {
+ bcf_gt2alleles(cgts[0], &a, &b);
+ gts[0] = bcf_gt_unphased(a);
+ gts[1] = ploidy==1 ? bcf_int32_vector_end : bcf_gt_unphased(b);
+ }
+ else
+ {
+ bcf_gt2alleles(ugts[0], &a, &b);
+ gts[0] = bcf_gt_unphased(a);
+ gts[1] = ploidy==1 ? bcf_int32_vector_end : bcf_gt_unphased(b);
+ }
+ if ( cgts[0]!=ugts[0] ) ucgts_needed = 1;
+ call->ac[a]++;
+ if ( ploidy==2 )
+ {
+ call->ac[b]++;
+ call->ndiploid++;
+ if ( a!=b ) call->nhets++;
+ }
+ }
+ if ( ucgts_needed )
+ {
+ // Some GTs are different
+ bcf_update_format_int32(call->hdr,rec,"UGT",call->ugts,nsmpl);
+ bcf_update_format_int32(call->hdr,rec,"CGT",call->cgts,nsmpl);
+ }
+}
+
+static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+{
+ int ngts = nals*(nals+1)/2;
+ int npls_src = ngts, npls_dst = nout_als*(nout_als+1)/2; // number of PL values in diploid samples, ori and new
+ if ( call->all_diploid && npls_src == npls_dst ) return;
+
+ int *pls_src = call->PLs, *pls_dst = call->PLs;
+
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int isample, ia;
+ for (isample = 0; isample < nsmpl; isample++)
+ {
+ int ploidy = call->ploidy ? call->ploidy[isample] : 2;
+ if ( ploidy==2 )
+ {
+ for (ia=0; ia<npls_dst; ia++)
+ pls_dst[ia] = pls_src[ call->pl_map[ia] ];
+ }
+ else if ( ploidy==1 )
+ {
+ for (ia=0; ia<nout_als; ia++)
+ {
+ int isrc = (ia+1)*(ia+2)/2-1;
+ pls_dst[ia] = pls_src[ call->pl_map[isrc] ];
+ }
+ if ( ia<npls_dst ) pls_dst[ia] = bcf_int32_vector_end;
+ }
+ else
+ {
+ pls_dst[0] = bcf_int32_missing;
+ pls_dst[1] = bcf_int32_vector_end; // relying on nout_als>1 in mcall()
+ }
+ pls_src += npls_src;
+ pls_dst += npls_dst;
+ }
+ bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*nsmpl);
+}
+
+void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+{
+ int i, ret;
+
+ // at the moment we have DPR,AD,ADF,ADR all Number=R,Type=Integer,
+ // so only dealing with these cases at the moment
+ for (i=0; i<rec->n_info; i++)
+ {
+ bcf_info_t *info = &rec->d.info[i];
+ int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_INFO,info->key);
+ if ( vlen!=BCF_VL_R ) continue;
+ int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key);
+ if ( type!=BCF_HT_INT ) continue;
+
+ ret = bcf_get_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), &call->itmp, &call->n_itmp);
+ if ( ret>0 )
+ {
+ assert( ret==nals );
+ if ( out_als==1 )
+ bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->itmp, 1);
+ else
+ {
+ int j;
+ for (j=0; j<nals; j++)
+ {
+ if ( call->als_map[j]==-1 ) continue; // to be dropped
+ call->PLs[ call->als_map[j] ] = call->itmp[j]; // reusing PLs storage which is not used at this point
+ }
+ bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->PLs, nout_als);
+ }
+ }
+ }
+
+ for (i=0; i<rec->n_fmt; i++)
+ {
+ bcf_fmt_t *fmt = &rec->d.fmt[i];
+ int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id);
+ if ( vlen!=BCF_VL_R ) continue;
+ int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id);
+ if ( type!=BCF_HT_INT ) continue;
+
+ ret = bcf_get_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), &call->itmp, &call->n_itmp);
+ if ( ret>0 )
+ {
+ int j, nsmpl = bcf_hdr_nsamples(call->hdr);
+ int ndp = ret / nsmpl;
+ assert( ndp==nals );
+ if ( out_als==1 )
+ {
+ for (j=0; j<nsmpl; j++)
+ call->PLs[j] = call->itmp[j*ndp];
+
+ bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl);
+ }
+ else
+ {
+ int k;
+ for (j=0; j<nsmpl; j++)
+ {
+ int32_t *dp_dst = call->PLs + j*nout_als;
+ int32_t *dp_src = call->itmp + j*ndp;
+ for (k=0; k<nals; k++)
+ {
+ if ( call->als_map[k]==-1 ) continue; // to be dropped
+ dp_dst[ call->als_map[k] ] = dp_src[k]; // reusing PLs storage which is not used at this point
+ }
+ }
+ bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl*nout_als);
+ }
+ }
+ }
+}
+
+
+// NB: in this function we temporarily use call->als_map for a different
+// purpose: to store the mapping from new (target) alleles to original alleles.
+//
+static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
+{
+ bcf_sr_regions_t *tgt = call->srs->targets;
+ if ( tgt->nals>5 ) error("Maximum accepted number of alleles is 5, got %d\n", tgt->nals);
+ hts_expand(char*,tgt->nals+1,call->nals,call->als);
+
+ int has_new = 0;
+
+ int i, j, nals = 1;
+ for (i=1; i<call->nals_map; i++) call->als_map[i] = -1;
+
+ if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], tgt->als[0]) < 0 )
+ error("The reference alleles are not compatible at %s:%d .. %s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,tgt->als[0],rec->d.allele[0]);
+
+ // create mapping from new to old alleles
+ call->als[0] = tgt->als[0];
+ call->als_map[0] = 0;
+
+ for (i=1; i<tgt->nals; i++)
+ {
+ call->als[nals] = tgt->als[i];
+ j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]);
+
+ if ( j+1==*unseen ) error("Cannot constrain to %s\n",tgt->als[i]);
+
+ if ( j>=0 )
+ {
+ // existing allele
+ call->als_map[nals] = j+1;
+ }
+ else
+ {
+ // There is a new allele in targets which is not present in VCF.
+ // We use the X allele to estimate PLs. Note that X may not be
+ // present at multiallelic indels sites. In that case we use the
+ // last allele anyway, because the least likely allele comes last
+ // in mpileup's ALT output.
+ call->als_map[nals] = (*unseen)>=0 ? *unseen : rec->n_allele - 1;
+ has_new = 1;
+ }
+ nals++;
+ }
+ if ( *unseen )
+ {
+ call->als_map[nals] = *unseen;
+ call->als[nals] = rec->d.allele[*unseen];
+ nals++;
+ }
+
+ if ( !has_new && nals==rec->n_allele ) return;
+ bcf_update_alleles(call->hdr, rec, (const char**)call->als, nals);
+
+ // create mapping from new PL to old PL
+ int k = 0;
+ for (i=0; i<nals; i++)
+ {
+ for (j=0; j<=i; j++)
+ {
+ int a = call->als_map[i], b = call->als_map[j];
+ call->pl_map[k++] = a>b ? a*(a+1)/2 + b : b*(b+1)/2 + a;
+ }
+ }
+
+ // update PL
+ call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int npls_ori = call->nPLs / nsmpl;
+ int npls_new = k;
+ hts_expand(int32_t,npls_new*nsmpl,call->n_itmp,call->itmp);
+ int *ori_pl = call->PLs, *new_pl = call->itmp;
+ for (i=0; i<nsmpl; i++)
+ {
+ for (k=0; k<npls_new; k++)
+ {
+ new_pl[k] = ori_pl[call->pl_map[k]];
+ if ( new_pl[k]==bcf_int32_missing && *unseen>=0 )
+ {
+ // missing value, and there is an unseen allele: identify the
+ // alleles and use the lk of either AX or XX
+ int k_ori = call->pl_map[k], ia, ib;
+ bcf_gt2alleles(k_ori, &ia, &ib);
+ k_ori = bcf_alleles2gt(ia,*unseen);
+ if ( ori_pl[k_ori]==bcf_int32_missing ) k_ori = bcf_alleles2gt(ib,*unseen);
+ if ( ori_pl[k_ori]==bcf_int32_missing ) k_ori = bcf_alleles2gt(*unseen,*unseen);
+ new_pl[k] = ori_pl[k_ori];
+ }
+ if ( !k && new_pl[k]==bcf_int32_vector_end ) new_pl[k]=bcf_int32_missing;
+ }
+ ori_pl += npls_ori;
+ new_pl += npls_new;
+ }
+ bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl);
+
+ // update QS
+ float qsum[5];
+ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum);
+ for (i=0; i<nals; i++)
+ qsum[i] = call->als_map[i]<nqs ? call->qsum[call->als_map[i]] : 0;
+ bcf_update_info_float(call->hdr, rec, "QS", qsum, nals);
+
+ if ( *unseen ) *unseen = nals-1;
+}
+
+
+/**
+ * This function implements the multiallelic calling model. It has two major parts:
+ * 1) determine the most likely set of alleles and calculate the quality of ref/non-ref site
+ * 2) determine and set the genotypes
+ * In various places in between, the BCF record gets updated.
+ */
+int mcall(call_t *call, bcf1_t *rec)
+{
+ int i, unseen = call->unseen;
+
+ // Force alleles when calling genotypes given alleles was requested
+ if ( call->flag & CALL_CONSTR_ALLELES ) mcall_constrain_alleles(call, rec, &unseen);
+
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int nals = rec->n_allele;
+ hts_expand(int,nals,call->nac,call->ac);
+ hts_expand(int,nals,call->nals_map,call->als_map);
+ hts_expand(int,nals*(nals+1)/2,call->npl_map,call->pl_map);
+
+ // Get the genotype likelihoods
+ call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
+ if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals ) // a mixture of diploid and haploid or haploid only
+ error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs);
+
+ // Convert PLs to probabilities
+ int ngts = nals*(nals+1)/2;
+ hts_expand(double, call->nPLs, call->npdg, call->pdg);
+ set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts, unseen);
+
+ #if QS_FROM_PDG
+ estimate_qsum(call, rec);
+ #else
+ // Get sum of qualities
+ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum);
+ if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1);
+ if ( nqs < nals )
+ {
+ // Some of the listed alleles do not have the corresponding QS field. This is
+ // typically a ref-only site with X in ALT.
+
+ hts_expand(float,nals,call->nqsum,call->qsum);
+ for (i=nqs; i<nals; i++) call->qsum[i] = 0;
+ }
+ float qsum_tot = 0;
+ for (i=0; i<nals; i++) qsum_tot += call->qsum[i];
+ if ( !call->qsum[0] )
+ {
+ // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value,
+ // an equivalent of a single reference read.
+ if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 )
+ error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1);
+ if ( call->itmp[0] )
+ {
+ call->qsum[0] = 1.0 / call->itmp[0] / nsmpl;
+ qsum_tot += call->qsum[0];
+ }
+ }
+ if ( qsum_tot ) for (i=0; i<nals; i++) call->qsum[i] /= qsum_tot;
+ #endif
+
+ // Find the best combination of alleles
+ int out_als, nout;
+ if ( nals > 8*sizeof(out_als) )
+ {
+ fprintf(stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1);
+ return 0;
+ }
+ nout = mcall_find_best_alleles(call, nals, &out_als);
+
+ // Make sure the REF allele is always present
+ if ( !(out_als&1) )
+ {
+ out_als |= 1;
+ nout++;
+ }
+ int is_variant = out_als==1 ? 0 : 1;
+ if ( call->flag & CALL_VARONLY && !is_variant ) return 0;
+
+ // With -A, keep all ALTs except X
+ if ( call->flag & CALL_KEEPALT )
+ {
+ nout = 0;
+ for (i=0; i<nals; i++)
+ {
+ if ( i>0 && i==unseen ) continue;
+ out_als |= 1<<i;
+ nout++;
+ }
+ }
+
+ int nAC = 0;
+ if ( out_als==1 ) // only REF allele on output
+ {
+ init_allele_trimming_maps(call, 1, nals);
+ mcall_set_ref_genotypes(call,nals);
+ bcf_update_format_int32(call->hdr, rec, "PL", NULL, 0); // remove PL, useless now
+ }
+ else
+ {
+ // The most likely set of alleles includes a non-reference allele (or one was enforced), so call genotypes.
+ // Note that it is a valid outcome if the called genotypes exclude some of the ALTs.
+ init_allele_trimming_maps(call, out_als, nals);
+ if ( !is_variant )
+ mcall_set_ref_genotypes(call,nals); // running with -A, prevent mcall_call_genotypes from putting some ALT back
+ else if ( call->flag & CALL_CONSTR_TRIO )
+ {
+ if ( nout>4 )
+ {
+ fprintf(stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1);
+ return 0;
+ }
+ mcall_call_trio_genotypes(call, rec, nals,nout,out_als);
+ }
+ else
+ mcall_call_genotypes(call,rec,nals,nout,out_als);
+
+ // Skip the site if all samples are 0/0. This can happen occasionally.
+ nAC = 0;
+ for (i=1; i<nout; i++) nAC += call->ac[i];
+ if ( !nAC && call->flag & CALL_VARONLY ) return 0;
+ mcall_trim_PLs(call, rec, nals, nout, out_als);
+ }
+ if ( nals!=nout ) mcall_trim_numberR(call, rec, nals, nout, out_als);
+
+ // Set QUAL and calculate HWE-related annotations
+ if ( nAC )
+ {
+ float icb = calc_ICB(call->ac[0],nAC, call->nhets, call->ndiploid);
+ if ( icb != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "ICB", &icb, 1);
+
+ float hob = calc_HOB(call->ac[0],nAC, call->nhets, call->ndiploid);
+ if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1);
+
+ // Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set
+ rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : fabs(-4.343*(call->ref_lk - call->lk_sum));
+ }
+ else
+ {
+ // Set the quality of a REF site
+ rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : -4.343*log(1 - exp(call->ref_lk - call->lk_sum));
+ }
+ if ( rec->qual>999 ) rec->qual = 999;
+ if ( rec->qual>50 ) rec->qual = rint(rec->qual);
+
+ // AC, AN
+ if ( nout>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, nout-1);
+ nAC += call->ac[0];
+ bcf_update_info_int32(call->hdr, rec, "AN", &nAC, 1);
+
+ // Remove unused alleles
+ hts_expand(char*,nout,call->nals,call->als);
+ for (i=0; i<nals; i++)
+ if ( call->als_map[i]>=0 ) call->als[call->als_map[i]] = rec->d.allele[i];
+ bcf_update_alleles(call->hdr, rec, (const char**)call->als, nout);
+ bcf_update_genotypes(call->hdr, rec, call->gts, nsmpl*2);
+
+ // DP4 tag
+ if ( bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16)==16 )
+ {
+ int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3];
+ bcf_update_info_int32(call->hdr, rec, "DP4", dp, 4);
+
+ int32_t mq = (call->anno16[8]+call->anno16[10])/(call->anno16[0]+call->anno16[1]+call->anno16[2]+call->anno16[3]);
+ bcf_update_info_int32(call->hdr, rec, "MQ", &mq, 1);
+ }
+
+ bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag
+ bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
+
+ return nout;
+}
+
diff --git a/bcftools/mcall.c.pysam.c b/bcftools/mcall.c.pysam.c
new file mode 100644
index 0000000..b4c4a99
--- /dev/null
+++ b/bcftools/mcall.c.pysam.c
@@ -0,0 +1,1539 @@
+#include "pysam.h"
+
+/* mcall.c -- multiallelic and rare variant calling.
+
+ Copyright (C) 2012-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <math.h>
+#include <htslib/kfunc.h>
+#include "call.h"
+
+// Using priors for GTs does not seem to be mathematically justified. Although
+// it seems effective in removing false calls, it also flips a significant
+// proportion of HET genotypes. Better is to filter by FORMAT/GQ using
+// `bcftools filter`.
+#define USE_PRIOR_FOR_GTS 0
+
+// Go with uniform PLs for samples with no coverage. If unset, missing
+// genotypes are reported instead.
+#define FLAT_PDG_FOR_MISSING 0
+
+// Estimate QS (combined quality and allele frequencies) from PLs
+#define QS_FROM_PDG 0
+
+
+void qcall_init(call_t *call) { return; }
+void qcall_destroy(call_t *call) { return; }
+int qcall(call_t *call, bcf1_t *rec)
+{
+ // QCall format:
+ // chromosome, position, reference allele, depth, mapping quality, 0, ..
+ error("TODO: qcall output\n");
+ return 0;
+}
+
+void call_init_pl2p(call_t *call)
+{
+ int i;
+ for (i=0; i<256; i++)
+ call->pl2p[i] = pow(10., -i/10.);
+}
+
+// Macros for accessing call->trio and call->ntrio
+#define FTYPE_222 0 // family type: all diploid
+#define FTYPE_121 1 // chrX, the child is a boy
+#define FTYPE_122 2 // chrX, a girl
+#define FTYPE_101 3 // chrY, boy
+#define FTYPE_100 4 // chrY, girl
+
+#define GT_SKIP 0xf // empty genotype (chrY in females)
+
+#define IS_POW2(x) (!((x) & ((x) - 1))) // zero is permitted
+#define IS_HOM(x) IS_POW2(x)
+
+// Pkij = P(k|i,j) tells how likely it is to be a het if the parents
+// are homs etc. The consistency of i,j,k has been already checked.
+// Parameters are alleles and ploidy of father, mother, kid
+// Returns 2/Pkij.
+int calc_Pkij(int fals, int mals, int kals, int fpl, int mpl, int kpl)
+{
+ int als = fals|mals|kals;
+ if ( IS_HOM(als) ) return 2; // all are the same: child must be a HOM, P=1
+
+ if ( fpl==1 )
+ {
+ if ( kpl==1 ) // chr X, the child is a boy, the copy is inherited from the mother
+ {
+ if ( IS_HOM(mals) ) return 2; // 0 11 -> P(1) = 1
+ return 4; // 0 01 -> P(0) = P(1) = 1/2
+ }
+ // chr X, the child is a girl
+ if ( IS_HOM(mals) ) return 2; // 0 11 -> P(01) = 1
+ return 4; // 0 01 -> P(00) = P(01) = 1/2
+ }
+
+ if ( IS_HOM(fals) && IS_HOM(mals) ) return 2; // 00 11 01, the child must be a HET, P=1
+ if ( !IS_HOM(fals) && !IS_HOM(mals) )
+ {
+ if ( IS_HOM(kals) ) return 8; // 01 01 00 or 01 01 11, P(k=HOM) = 1/4
+ return 4; // 01 01 01, P(k=HET) = 1/2
+ }
+ return 4; // 00 01, P(k=HET) = P(k=HOM) = 1/2
+}
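+
+// For illustration (hypothetical call): two HET parents and a HET child on an
+// autosome, encoded as allele bitmasks 0x3 (alleles 0 and 1), give
+//   calc_Pkij(0x3,0x3,0x3, 2,2,2) == 4, i.e. Pkij = 2/4 = 1/2,
+// while a HOM child of the same parents gives
+//   calc_Pkij(0x3,0x3,0x1, 2,2,2) == 8, i.e. Pkij = 1/4.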
+
+// Initialize ntrio and trio: ntrio lists the number of possible
+// genotypes given combination of haploid/diploid genomes and the
+// number of alleles. trio lists allowed genotype combinations:
+// 4bit: 2/Pkij, 4: father, 4: mother, 4: child
+// See also mcall_call_trio_genotypes()
+//
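+// For example, an entry built as Pkij<<12 | i<<8 | j<<4 | k decodes back as
+//   father = entry>>8 & 0xf, mother = entry>>4 & 0xf, child = entry & 0xf,
+//   2/Pkij = entry>>12,
+// which is how mcall_call_trio_genotypes() reads it.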
+static void mcall_init_trios(call_t *call)
+{
+ // 23, 138, 478 possible diploid trio genotypes with 2, 3, 4 alleles
+ call->ntrio[FTYPE_222][2] = 15; call->ntrio[FTYPE_222][3] = 78; call->ntrio[FTYPE_222][4] = 250;
+ call->ntrio[FTYPE_121][2] = 8; call->ntrio[FTYPE_121][3] = 27; call->ntrio[FTYPE_121][4] = 64;
+ call->ntrio[FTYPE_122][2] = 8; call->ntrio[FTYPE_122][3] = 27; call->ntrio[FTYPE_122][4] = 64;
+ call->ntrio[FTYPE_101][2] = 2; call->ntrio[FTYPE_101][3] = 3; call->ntrio[FTYPE_101][4] = 4;
+ call->ntrio[FTYPE_100][2] = 2; call->ntrio[FTYPE_100][3] = 3; call->ntrio[FTYPE_100][4] = 4;
+
+ int nals, itype;
+ for (itype=0; itype<=4; itype++)
+ {
+ for (nals=2; nals<=4; nals++)
+ call->trio[itype][nals] = (uint16_t*) malloc(sizeof(uint16_t)*call->ntrio[itype][nals]);
+ }
+
+ // max 10 possible diploid genotypes
+ int gts[10];
+ for (nals=2; nals<=4; nals++)
+ {
+ int i,j,k, n = 0, ngts = 0;
+ for (i=0; i<nals; i++)
+ for (j=0; j<=i; j++)
+ gts[ngts++] = 1<<i | 1<<j;
+
+ // 222: all diploid
+ // i,j,k: father, mother, child
+ for (i=0; i<ngts; i++)
+ for (j=0; j<ngts; j++)
+ for (k=0; k<ngts; k++)
+ {
+ if ( ((gts[i]|gts[j])&gts[k]) != gts[k] ) continue; // skip: an allele of k is present in neither i nor j
+ if ( !(gts[i] & gts[k]) || !(gts[j] & gts[k]) ) continue; // one copy from father, one from mother
+ int Pkij = calc_Pkij(gts[i],gts[j],gts[k], 2,2,2);
+ call->trio[FTYPE_222][nals][n++] = Pkij<<12 | i<<8 | j<<4 | k; // father, mother, child
+ }
+ assert( n==call->ntrio[FTYPE_222][nals] );
+
+ // 121: chrX, boy
+ n = 0;
+ for (i=0; i<ngts; i++)
+ for (j=0; j<ngts; j++)
+ for (k=0; k<ngts; k++)
+ {
+ if ( !IS_HOM(gts[i]) || !IS_HOM(gts[k]) ) continue; // neither father nor boy can be diploid
+ if ( ((gts[i]|gts[j])&gts[k]) != gts[k] ) continue;
+ if ( !(gts[j] & gts[k]) ) continue; // boy must inherit the copy from mother
+ int Pkij = calc_Pkij(gts[i],gts[j],gts[k], 1,2,1);
+ call->trio[FTYPE_121][nals][n++] = Pkij<<12 | i<<8 | j<<4 | k;
+ }
+ assert( n==call->ntrio[FTYPE_121][nals] );
+
+ // 122: chrX, girl
+ n = 0;
+ for (i=0; i<ngts; i++)
+ for (j=0; j<ngts; j++)
+ for (k=0; k<ngts; k++)
+ {
+ if ( !IS_HOM(gts[i]) ) continue;
+ if ( ((gts[i]|gts[j])&gts[k]) != gts[k] ) continue;
+ if ( !(gts[i] & gts[k]) ) continue; // girl must inherit one copy from the father and one from the mother
+ if ( !(gts[j] & gts[k]) ) continue;
+ int Pkij = calc_Pkij(gts[i],gts[j],gts[k], 1,2,2);
+ call->trio[FTYPE_122][nals][n++] = Pkij<<12 | i<<8 | j<<4 | k;
+ }
+ assert( n==call->ntrio[FTYPE_122][nals] );
+
+ // 101: chrY, boy
+ n = 0;
+ for (i=0; i<ngts; i++)
+ for (k=0; k<ngts; k++)
+ {
+ if ( !IS_HOM(gts[i]) || !IS_HOM(gts[k]) ) continue;
+ if ( (gts[i]&gts[k]) != gts[k] ) continue;
+ call->trio[FTYPE_101][nals][n++] = 1<<12 | i<<8 | GT_SKIP<<4 | k;
+ }
+ assert( n==call->ntrio[FTYPE_101][nals] );
+
+ // 100: chrY, girl
+ n = 0;
+ for (i=0; i<ngts; i++)
+ {
+ if ( !IS_POW2(gts[i]) ) continue;
+ call->trio[FTYPE_100][nals][n++] = 1<<12 | i<<8 | GT_SKIP<<4 | GT_SKIP;
+ }
+ assert( n==call->ntrio[FTYPE_100][nals] );
+
+ }
+ call->GLs = (double*) calloc(bcf_hdr_nsamples(call->hdr)*10,sizeof(double));
+
+ int i, j;
+ for (i=0; i<call->nfams; i++)
+ {
+ family_t *fam = &call->fams[i];
+ int ploidy[3];
+ for (j=0; j<3; j++)
+ ploidy[j] = call->ploidy[fam->sample[j]];
+
+ if ( ploidy[FATHER]==2 ) // not X, not Y
+ {
+ if ( ploidy[MOTHER]!=2 || ploidy[CHILD]!=2 )
+ error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
+ fam->type = FTYPE_222;
+ continue;
+ }
+ if ( ploidy[FATHER]!=1 || ploidy[MOTHER]==1 )
+ error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
+ if ( ploidy[MOTHER]==2 ) // X
+ {
+ if ( ploidy[CHILD]==0 )
+ error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
+ fam->type = ploidy[CHILD]==2 ? FTYPE_122 : FTYPE_121; // a girl or a boy
+ }
+ else // Y
+ {
+ if ( ploidy[CHILD]==2 )
+ error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
+ fam->type = ploidy[CHILD]==0 ? FTYPE_100 : FTYPE_101; // a girl or a boy
+ }
+ }
+}
+static void mcall_destroy_trios(call_t *call)
+{
+ int i, j;
+ for (i=2; i<=4; i++)
+ for (j=0; j<=4; j++)
+ free(call->trio[j][i]);
+}
+
+void mcall_init(call_t *call)
+{
+ call_init_pl2p(call);
+
+ call->nqsum = 5;
+ call->qsum = (float*) malloc(sizeof(float)*call->nqsum); // will be expanded later if necessary
+ call->nals_map = 5;
+ call->als_map = (int*) malloc(sizeof(int)*call->nals_map);
+ call->npl_map = 5*(5+1)/2; // will be expanded later if necessary
+ call->pl_map = (int*) malloc(sizeof(int)*call->npl_map);
+ call->gts = (int32_t*) calloc(bcf_hdr_nsamples(call->hdr)*2,sizeof(int32_t)); // assuming at most diploid everywhere
+
+ if ( call->flag & CALL_CONSTR_TRIO )
+ {
+ call->cgts = (int32_t*) calloc(bcf_hdr_nsamples(call->hdr),sizeof(int32_t));
+ call->ugts = (int32_t*) calloc(bcf_hdr_nsamples(call->hdr),sizeof(int32_t));
+ mcall_init_trios(call);
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=CGT,Number=1,Type=Integer,Description=\"Constrained Genotype (0-based index to Number=G ordering).\">");
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=UGT,Number=1,Type=Integer,Description=\"Unconstrained Genotype (0-based index to Number=G ordering).\">");
+ }
+ if ( call->flag & CALL_CONSTR_ALLELES ) call->vcmp = vcmp_init();
+
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+ if ( call->output_tags & CALL_FMT_GQ )
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Phred-scaled Genotype Quality\">");
+ if ( call->output_tags & CALL_FMT_GP )
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Phred-scaled genotype posterior probabilities\">");
+ if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
+ call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr));
+ bcf_hdr_append(call->hdr,"##INFO=<ID=ICB,Number=1,Type=Float,Description=\"Inbreeding Coefficient Binomial test (bigger is better)\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=HOB,Number=1,Type=Float,Description=\"Bias in the number of HOMs number (smaller is better)\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes for each ALT allele, in the same order as listed\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">");
+ bcf_hdr_append(call->hdr,"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Average mapping quality\">");
+
+ // init the prior
+ if ( call->theta>0 )
+ {
+ int i, n = 0;
+ if ( !call->ploidy ) n = 2*bcf_hdr_nsamples(call->hdr); // all are diploid
+ else
+ {
+ for (i=0; i<bcf_hdr_nsamples(call->hdr); i++)
+ n += call->ploidy[i];
+ }
+ // Watterson factor, here aM_1 = aM_2 = 1
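+ // (e.g. with n=4 haploid copies, aM = 1 + 1/2 + 1/3 ~ 1.83)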
+ double aM = 1;
+ for (i=2; i<n; i++) aM += 1./i;
+ call->theta *= aM;
+ if ( call->theta >= 1 )
+ {
+ fprintf(pysamerr,"The prior is too big (theta*aM=%.2f), going with 0.99\n", call->theta);
+ call->theta = 0.99;
+ }
+ call->theta = log(call->theta);
+ }
+
+ return;
+}
+
+void mcall_destroy(call_t *call)
+{
+ if (call->vcmp) vcmp_destroy(call->vcmp);
+ free(call->itmp);
+ mcall_destroy_trios(call);
+ free(call->GPs);
+ free(call->GLs);
+ free(call->GQs);
+ free(call->anno16);
+ free(call->PLs);
+ free(call->qsum);
+ free(call->als_map);
+ free(call->pl_map);
+ free(call->gts); free(call->cgts); free(call->ugts);
+ free(call->pdg);
+ free(call->als);
+ free(call->ac);
+ return;
+}
+
+
+// Inits P(D|G): convert PLs from log space and normalize. In case of zero
+// depth, missing PLs are all zero. In this case, pdg's are set to 0
+// so that the corresponding genotypes can be set as missing and the
+// qual calculation is not affected.
+// Missing values are replaced by generic likelihoods when X (unseen allele) is
+// present.
+// NB: While the -m calling model uses the pdgs in canonical order,
+// the original samtools -c calling code uses pdgs in reverse order (AA comes
+// first, RR last).
+// NB: Ploidy is not taken into account here, which is incorrect.
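+// For example (hypothetical PLs): a diploid sample with PL=[0,30,255] gives raw
+// values pl2p = [1, 1e-3, ~3.2e-26]; after normalization pdg ~ [0.999, 0.001, 3e-26],
+// i.e. the hom-ref genotype is by far the most likely.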
+void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unseen)
+{
+ int i, j, nals;
+
+ // find out the number of alleles, expecting diploid genotype likelihoods
+ bcf_gt2alleles(n_gt-1, &i, &nals);
+ assert( i==nals );
+ nals++;
+
+ for (i=0; i<n_smpl; i++)
+ {
+ double sum = 0;
+ for (j=0; j<n_gt; j++)
+ {
+ if ( PLs[j]==bcf_int32_vector_end )
+ {
+ // We expect diploid genotype likelihoods. If not diploid, treat as missing
+ j = 0;
+ break;
+ }
+ if ( PLs[j]==bcf_int32_missing ) break;
+ assert( PLs[j]<256 );
+ pdg[j] = pl2p[ PLs[j] ];
+ sum += pdg[j];
+ }
+
+ if ( j==0 )
+ {
+ // First value is missing (LK of RR), this indicates that
+ // all values are missing.
+ j = sum = n_gt;
+ }
+ else if ( j<n_gt && unseen<0 )
+ {
+ // Some of the values are missing and the unseen allele LK is not
+ // available. In such a case, we set LK to a very small value.
+ sum = 0;
+ for (j=0; j<n_gt; j++)
+ {
+ assert( PLs[j]!=bcf_int32_vector_end );
+ if ( PLs[j]==bcf_int32_missing ) PLs[j] = 255;
+ assert( PLs[j]<256 );
+ pdg[j] = pl2p[ PLs[j] ];
+ sum += pdg[j];
+ }
+ }
+ if ( j<n_gt )
+ {
+ // Missing values present, fill with unseen allele LK. This can be only
+ // as good as the merge was.
+ int ia,ib, k;
+ j = 0;
+ sum = 0;
+ for (ia=0; ia<nals; ia++)
+ {
+ for (ib=0; ib<=ia; ib++)
+ {
+ if ( PLs[j]==bcf_int32_missing )
+ {
+ k = bcf_alleles2gt(ia,unseen);
+ if ( PLs[k]==bcf_int32_missing ) k = bcf_alleles2gt(ib,unseen);
+ if ( PLs[k]==bcf_int32_missing ) k = bcf_alleles2gt(unseen,unseen);
+ if ( PLs[k]==bcf_int32_missing )
+ {
+ // The PLs for unseen allele X are not present as well as for ia, ib.
+ // This can happen with incremental calling, when one of the merged
+ // files had all alleles A,C,G,T, in such a case, X was not present.
+ // Use a very small value instead.
+ PLs[j] = 255;
+ }
+ else
+ PLs[j] = PLs[k];
+ }
+ pdg[j] = pl2p[ PLs[j] ];
+ sum += pdg[j];
+ j++;
+ }
+ }
+ }
+ // Normalize: sum_i pdg_i = 1
+ if ( sum==n_gt )
+ {
+ // all missing
+ #if FLAT_PDG_FOR_MISSING
+ for (j=0; j<n_gt; j++) pdg[j] = 1./n_gt;
+ #else
+ for (j=0; j<n_gt; j++) pdg[j] = 0;
+ #endif
+ }
+ else
+ for (j=0; j<n_gt; j++) pdg[j] /= sum;
+
+ PLs += n_gt;
+ pdg += n_gt;
+ }
+}
+
+/*
+ Allele frequency estimated as:
+ #A = \sum_i (2*P_AA + P_AB)
+ F_A = #A / ( #A + #B )
+ where i runs across all samples
+*/
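+// For example (hypothetical pdg values): a single biallelic sample with
+// pdg = [P_AA,P_AB,P_BB] = [0.1,0.8,0.1] contributes #A = 2*0.1+0.8 = 1.0 and
+// #B = 2*0.1+0.8 = 1.0, giving F_A = F_B = 0.5.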
+void estimate_qsum(call_t *call, bcf1_t *rec)
+{
+ double *pdg = call->pdg;
+ int ngts = rec->n_allele*(rec->n_allele+1)/2;
+ int i,nsmpl = bcf_hdr_nsamples(call->hdr);
+
+ hts_expand(float,rec->n_allele,call->nqsum,call->qsum);
+ for (i=0; i<rec->n_allele; i++) call->qsum[i] = 0;
+
+ for (i=0; i<nsmpl; i++)
+ {
+ int a, b, k = 0;
+ for (a=0; a<rec->n_allele; a++)
+ {
+ for (b=0; b<=a; b++)
+ {
+ call->qsum[a] += pdg[k];
+ call->qsum[b] += pdg[k];
+ k++;
+ }
+ }
+ pdg += ngts;
+ }
+ float sum = 0;
+ for (i=0; i<rec->n_allele; i++) sum += call->qsum[i];
+ if ( sum ) for (i=0; i<rec->n_allele; i++) call->qsum[i] /= sum;
+}
+
+// Create mapping between old and new (trimmed) alleles
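+// For example (hypothetical case): with nals=3 and als=0x5 (keep alleles 0 and 2),
+// als_map becomes {0,-1,1} and pl_map becomes {0,3,5}, i.e. the new PL triple
+// (0/0, 0/2, 2/2) is taken from old PL positions 0, 3 and 5.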
+void init_allele_trimming_maps(call_t *call, int als, int nals)
+{
+ int i, j;
+
+ // als_map: old(i) -> new(j)
+ for (i=0, j=0; i<nals; i++)
+ {
+ if ( als & 1<<i ) call->als_map[i] = j++;
+ else call->als_map[i] = -1;
+ }
+
+ if ( !call->pl_map ) return;
+
+ // pl_map: new(k) -> old(l)
+ int k = 0, l = 0;
+ for (i=0; i<nals; i++)
+ {
+ for (j=0; j<=i; j++)
+ {
+ if ( (als & 1<<i) && (als & 1<<j) ) call->pl_map[k++] = l;
+ l++;
+ }
+ }
+}
+
+double binom_dist(int N, double p, int k)
+{
+ int mean = (int) (N*p);
+ if ( mean==k ) return 1.0;
+
+ double log_p = (k-mean)*log(p) + (mean-k)*log(1.0-p);
+ if ( k > N - k ) k = N - k;
+ if ( mean > N - mean ) mean = N - mean;
+
+ if ( k < mean ) { int tmp = k; k = mean; mean = tmp; }
+ double diff = k - mean;
+
+ double val = 1.0;
+ int i;
+ for (i=0; i<diff; i++)
+ val = val * (N-mean-i) / (k-i);
+
+ return exp(log_p)/val;
+}
+
+
+// Inbreeding Coefficient, binomial test
+float calc_ICB(int nref, int nalt, int nhets, int ndiploid)
+{
+ if ( !nref || !nalt || !ndiploid ) return HUGE_VAL;
+
+ double fref = (double)nref/(nref+nalt); // fraction of reference alleles
+ double falt = (double)nalt/(nref+nalt); // non-ref als
+ double q = 2*fref*falt; // probability of a het, assuming HWE
+ double mean = q*ndiploid;
+
+ //fprintf(pysamerr,"\np=%e N=%d k=%d .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid);
+
+ // Can we use normal approximation? The second condition is for performance only
+ // and is not well justified.
+ if ( (mean>10 && (1-q)*ndiploid>10 ) || ndiploid>200 )
+ {
+ //fprintf(pysamerr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))));
+ return exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)));
+ }
+
+ return binom_dist(ndiploid, q, nhets);
+}
+
+float calc_HOB(int nref, int nalt, int nhets, int ndiploid)
+{
+ if ( !nref || !nalt || !ndiploid ) return HUGE_VAL;
+
+ double fref = (double)nref/(nref+nalt); // fraction of reference alleles
+ double falt = (double)nalt/(nref+nalt); // non-ref als
+ return fabs((double)nhets/ndiploid - 2*fref*falt);
+}
+
+/**
+ * log(sum_i exp(a_i))
+ */
+static inline double logsumexp(double *vals, int nvals)
+{
+ int i;
+ double max_exp = vals[0];
+ for (i=1; i<nvals; i++)
+ if ( max_exp < vals[i] ) max_exp = vals[i];
+
+ double sum = 0;
+ for (i=0; i<nvals; i++)
+ sum += exp(vals[i] - max_exp);
+
+ return log(sum) + max_exp;
+}
+/** log(exp(a)+exp(b)) */
+static inline double logsumexp2(double a, double b)
+{
+ if ( a>b )
+ return log(1 + exp(b-a)) + a;
+ else
+ return log(1 + exp(a-b)) + b;
+}
+
+// Macro to set the most likely alleles
+#define UPDATE_MAX_LKs(als) { \
+ if ( max_lk<lk_tot ) { max_lk = lk_tot; max_als = (als); } \
+ if ( lk_tot_set ) lk_sum = logsumexp2(lk_tot,lk_sum); \
+}
+
+#define SWAP(type_t,x,y) {type_t tmp; tmp = x; x = y; y = tmp; }
+
+// Determine the most likely combination of alleles. In this implementation,
+// at most tri-allelic sites are considered. Returns the number of alleles.
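+// The chosen combination is returned via *out_als as a bitmask over the input
+// alleles; e.g. with nals=3, *out_als==0x3 means REF plus the first ALT were
+// selected and the return value is 2.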
+static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
+{
+ int ia,ib,ic; // iterators over up to three alleles
+ int max_als=0; // most likely combination of alleles
+ double ref_lk = 0, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles
+ double lk_sum = -HUGE_VAL; // for normalizing the likelihoods
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int ngts = nals*(nals+1)/2;
+
+ // Single allele
+ for (ia=0; ia<nals; ia++)
+ {
+ double lk_tot = 0;
+ int lk_tot_set = 0;
+ int iaa = (ia+1)*(ia+2)/2-1; // index in PL which corresponds to the homozygous "ia/ia" genotype
+ int isample;
+ double *pdg = call->pdg + iaa;
+ for (isample=0; isample<nsmpl; isample++)
+ {
+ if ( *pdg ) { lk_tot += log(*pdg); lk_tot_set = 1; }
+ pdg += ngts;
+ }
+ if ( ia==0 ) ref_lk = lk_tot; // likelihood of 0/0 for all samples
+ else lk_tot += call->theta; // the prior
+ UPDATE_MAX_LKs(1<<ia);
+ }
+
+ // Two alleles
+ if ( nals>1 )
+ {
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( call->qsum[ia]==0 ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( call->qsum[ib]==0 ) continue;
+ double lk_tot = 0;
+ int lk_tot_set = 0;
+ double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]);
+ double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]);
+ double fab = 2*fa*fb; fa *= fa; fb *= fb;
+ int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
+ double *pdg = call->pdg;
+ for (isample=0; isample<nsmpl; isample++)
+ {
+ double val = 0;
+ if ( !call->ploidy || call->ploidy[isample]==2 )
+ val = fa*pdg[iaa] + fb*pdg[ibb] + fab*pdg[iab];
+ else if ( call->ploidy && call->ploidy[isample]==1 )
+ val = fa*pdg[iaa] + fb*pdg[ibb];
+ if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
+ pdg += ngts;
+ }
+ if ( ia!=0 ) lk_tot += call->theta; // the prior
+ if ( ib!=0 ) lk_tot += call->theta;
+ UPDATE_MAX_LKs(1<<ia|1<<ib);
+ }
+ }
+ }
+
+ // Three alleles
+ if ( nals>2 )
+ {
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( call->qsum[ia]==0 ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( call->qsum[ib]==0 ) continue;
+ int ibb = (ib+1)*(ib+2)/2-1;
+ int iab = iaa - ia + ib;
+ for (ic=0; ic<ib; ic++)
+ {
+ if ( call->qsum[ic]==0 ) continue;
+ double lk_tot = 0;
+ int lk_tot_set = 1;
+ double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
+ double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
+ double fc = call->qsum[ic]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
+ double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; fa *= fa; fb *= fb; fc *= fc;
+ int isample, icc = (ic+1)*(ic+2)/2-1;
+ int iac = iaa - ia + ic, ibc = ibb - ib + ic;
+ double *pdg = call->pdg;
+ for (isample=0; isample<nsmpl; isample++)
+ {
+ double val = 0;
+ if ( !call->ploidy || call->ploidy[isample]==2 )
+ val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
+ else if ( call->ploidy && call->ploidy[isample]==1 )
+ val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc];
+ if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
+ pdg += ngts;
+ }
+ if ( ia!=0 ) lk_tot += call->theta; // the prior
+ if ( ib!=0 ) lk_tot += call->theta; // the prior
+ if ( ic!=0 ) lk_tot += call->theta; // the prior
+ UPDATE_MAX_LKs(1<<ia|1<<ib|1<<ic);
+ }
+ }
+ }
+ }
+
+ call->ref_lk = ref_lk;
+ call->lk_sum = lk_sum;
+ *out_als = max_als;
+
+ int i, n = 0;
+ for (i=0; i<nals; i++) if ( max_als & 1<<i) n++;
+
+ return n;
+}
+
+static void mcall_set_ref_genotypes(call_t *call, int nals)
+{
+ int i;
+ int ngts = nals*(nals+1)/2;
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+
+ for (i=0; i<nals; i++) call->ac[i] = 0;
+ call->nhets = 0;
+ call->ndiploid = 0;
+
+ // Set all genotypes to 0/0 or 0
+ int *gts = call->gts;
+ double *pdg = call->pdg;
+ int isample;
+ for (isample = 0; isample < nsmpl; isample++)
+ {
+ int ploidy = call->ploidy ? call->ploidy[isample] : 2;
+ for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
+ if ( i==ngts || !ploidy )
+ {
+ gts[0] = bcf_gt_missing;
+ gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
+ }
+ else
+ {
+ gts[0] = bcf_gt_unphased(0);
+ gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end;
+ call->ac[0] += ploidy;
+ }
+ gts += 2;
+ pdg += ngts;
+ }
+}
+
+static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+{
+ int ia, ib, i;
+ int ngts = nals*(nals+1)/2;
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int nout_gts = nout_als*(nout_als+1)/2;
+ hts_expand(float,nout_gts*nsmpl,call->nGPs,call->GPs);
+
+ for (i=0; i<nout_als; i++) call->ac[i] = 0;
+ call->nhets = 0;
+ call->ndiploid = 0;
+
+ #if USE_PRIOR_FOR_GTS
+ float prior = exp(call->theta);
+ #endif
+ float *gps = call->GPs - nout_gts;
+ double *pdg = call->pdg - ngts;
+ int *gts = call->gts - 2;
+
+ int isample;
+ for (isample = 0; isample < nsmpl; isample++)
+ {
+ int ploidy = call->ploidy ? call->ploidy[isample] : 2;
+ assert( ploidy>=0 && ploidy<=2 );
+
+ pdg += ngts;
+ gts += 2;
+ gps += nout_gts;
+
+ if ( !ploidy )
+ {
+ gts[0] = bcf_gt_missing;
+ gts[1] = bcf_int32_vector_end;
+ gps[0] = -1;
+ continue;
+ }
+
+ #if !FLAT_PDG_FOR_MISSING
+ // Skip samples with zero depth, they have all pdg's equal to 0
+ for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
+ if ( i==ngts )
+ {
+ gts[0] = bcf_gt_missing;
+ gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
+ gps[0] = -1;
+ continue;
+ }
+ #endif
+
+ if ( ploidy==2 ) call->ndiploid++;
+
+ // Default fallback for the case all LKs are the same
+ gts[0] = bcf_gt_unphased(0);
+ gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end;
+
+ // Non-zero depth, determine the most likely genotype
+ double best_lk = 0;
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
+ int iaa = (ia+1)*(ia+2)/2-1; // PL index of the ia/ia genotype
+ double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia];
+ #if USE_PRIOR_FOR_GTS
+ if ( ia!=0 ) lk *= prior;
+ #endif
+ int igt = ploidy==2 ? bcf_alleles2gt(call->als_map[ia],call->als_map[ia]) : call->als_map[ia];
+ gps[igt] = lk;
+ if ( best_lk < lk )
+ {
+ best_lk = lk;
+ gts[0] = bcf_gt_unphased(call->als_map[ia]);
+ }
+ }
+ if ( ploidy==2 )
+ {
+ gts[1] = gts[0];
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( !(out_als & 1<<ia) ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( !(out_als & 1<<ib) ) continue;
+ int iab = iaa - ia + ib;
+ double lk = 2*pdg[iab]*call->qsum[ia]*call->qsum[ib];
+ #if USE_PRIOR_FOR_GTS
+ if ( ia!=0 ) lk *= prior;
+ if ( ib!=0 ) lk *= prior;
+ #endif
+ int igt = bcf_alleles2gt(call->als_map[ia],call->als_map[ib]);
+ gps[igt] = lk;
+ if ( best_lk < lk )
+ {
+ best_lk = lk;
+ gts[0] = bcf_gt_unphased(call->als_map[ib]);
+ gts[1] = bcf_gt_unphased(call->als_map[ia]);
+ }
+ }
+ }
+ if ( gts[0] != gts[1] ) call->nhets++;
+ }
+ else
+ gts[1] = bcf_int32_vector_end;
+
+ call->ac[ bcf_gt_allele(gts[0]) ]++;
+ if ( gts[1]!=bcf_int32_vector_end ) call->ac[ bcf_gt_allele(gts[1]) ]++;
+ }
+ if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
+ {
+ double max, sum;
+ for (isample=0; isample<nsmpl; isample++)
+ {
+ gps = call->GPs + isample*nout_gts;
+
+ int nmax;
+ if ( call->ploidy )
+ {
+ if ( call->ploidy[isample]==2 ) nmax = nout_gts;
+ else if ( call->ploidy[isample]==1 ) nmax = nout_als;
+ else nmax = 0;
+ }
+ else nmax = nout_gts;
+
+ max = gps[0];
+ if ( max<0 || nmax==0 )
+ {
+ // no call
+ if ( call->output_tags & CALL_FMT_GP )
+ {
+ for (i=0; i<nmax; i++) gps[i] = 0;
+ if ( nmax==0 ) { bcf_float_set_missing(gps[i]); nmax++; }
+ if ( nmax < nout_gts ) bcf_float_set_vector_end(gps[nmax]);
+ }
+ call->GQs[isample] = 0;
+ continue;
+ }
+ sum = gps[0];
+ for (i=1; i<nmax; i++)
+ {
+ if ( max < gps[i] ) max = gps[i];
+ sum += gps[i];
+ }
+ max = -4.34294*log(1 - max/sum);
+ call->GQs[isample] = max<=INT8_MAX ? max : INT8_MAX;
+ if ( call->output_tags & CALL_FMT_GP )
+ {
+ assert( max );
+ for (i=0; i<nmax; i++) gps[i] = (int)(-4.34294*log(gps[i]/sum));
+ if ( nmax < nout_gts ) bcf_float_set_vector_end(gps[nmax]);
+ }
+ }
+ }
+ if ( call->output_tags & CALL_FMT_GP )
+ bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*nout_gts);
+ if ( call->output_tags & CALL_FMT_GQ )
+ bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl);
+}
+
+
+/**
+ Pm = P(mendelian) .. parameter to vary, 1-Pm is the probability of novel mutation.
+ When trio_Pm_ins is negative, Pm is calculated dynamically
+ according to indel length. For simplicity, only the
+ first ALT is considered.
+ Pkij = P(k|i,j) .. probability that the genotype combination i,j,k is consistent
+ with mendelian inheritance (the likelihood that offspring
+ of two HETs is a HOM is smaller than it being a HET)
+
+ P_uc(F=i,M=j,K=k) = P(F=i) . P(M=j) . P(K=k) .. unconstrained P
+ P_c(F=i,M=j,K=k) = P_uc . Pkij .. constrained P
+ P(F=i,M=j,K=k) = P_uc . (1 - Pm) + P_c . Pm
+ = P_uc . [1 - Pm + Pkij . Pm]
+
+ We choose genotype combination i,j,k which maximizes P(F=i,M=j,K=k). This
+ probability gives the quality GQ(Trio).
+ Individual qualities are calculated as
+ GQ(F=i,M=j,K=k) = P(F=i,M=j,K=k) / \sum_{x,y} P(F=i,M=x,K=y)
+ */
+static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+{
+ int ia, ib, i;
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int ngts = nals*(nals+1)/2;
+ int nout_gts = nout_als*(nout_als+1)/2;
+ double *gls = call->GLs - nout_gts;
+ double *pdg = call->pdg - ngts;
+
+ // Calculate individuals' genotype likelihoods P(X=i)
+ int isample;
+ for (isample = 0; isample < nsmpl; isample++)
+ {
+ int ploidy = call->ploidy ? call->ploidy[isample] : 2;
+ int32_t *gts = call->ugts + isample;
+
+ gls += nout_gts;
+ pdg += ngts;
+
+ // Skip samples with all pdg's equal to 0. These have zero depth.
+ for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
+ if ( i==ngts || !ploidy )
+ {
+ gts[0] = -1;
+ gls[0] = 1;
+ continue;
+ }
+
+ for (i=0; i<nout_gts; i++) gls[i] = -HUGE_VAL;
+
+ double sum_lk = 0;
+ double best_lk = 0;
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
+ int iaa = bcf_alleles2gt(ia,ia); // PL index of the ia/ia genotype
+ int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
+ double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia];
+ sum_lk += lk;
+ gls[idx] = lk;
+ if ( best_lk < lk )
+ {
+ best_lk = lk;
+ gts[0] = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
+ }
+ }
+ if ( ploidy==2 )
+ {
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( !(out_als & 1<<ia) ) continue;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( !(out_als & 1<<ib) ) continue;
+ int iab = bcf_alleles2gt(ia,ib);
+ int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ib]);
+ double lk = 2*pdg[iab]*call->qsum[ia]*call->qsum[ib];
+ sum_lk += lk;
+ gls[idx] = lk;
+ if ( best_lk < lk )
+ {
+ best_lk = lk;
+ gts[0] = bcf_alleles2gt(call->als_map[ib],call->als_map[ia]);
+ }
+ }
+ }
+ }
+ for (i=0; i<nout_gts; i++)
+ if ( gls[i]!=-HUGE_VAL ) gls[i] = log(gls[i]/sum_lk);
+ }
+
+ // Set novel mutation rate for this site: using first ALT allele for simplicity.
+ double trio_Pm;
+ if ( call->trio_Pm_ins<0 && call->trio_Pm_del<0 ) trio_Pm = call->trio_Pm_SNPs; // the same Pm for indels and SNPs requested
+ else
+ {
+ int ret = bcf_get_variant_types(rec);
+ if ( !(ret & VCF_INDEL) ) trio_Pm = call->trio_Pm_SNPs;
+ else
+ {
+ if ( call->trio_Pm_ins<0 ) // dynamic calculation, trio_Pm_del holds the scaling factor
+ {
+ trio_Pm = rec->d.var[1].n<0 ? -21.9313 - 0.2856*rec->d.var[1].n : -22.8689 + 0.2994*rec->d.var[1].n;
+ trio_Pm = 1 - call->trio_Pm_del * exp(trio_Pm);
+ }
+ else // snps and indels set explicitly
+ {
+ trio_Pm = rec->d.var[1].n<0 ? call->trio_Pm_del : call->trio_Pm_ins;
+ }
+ }
+ }
+
+ // Calculate constrained likelihoods and determine genotypes
+ int ifm;
+ for (ifm=0; ifm<call->nfams; ifm++)
+ {
+ family_t *fam = &call->fams[ifm];
+ int ntrio = call->ntrio[fam->type][nout_als];
+ uint16_t *trio = call->trio[fam->type][nout_als];
+
+ // Unconstrained likelihood
+ int uc_itr = 0;
+ double uc_lk = 0;
+ for (i=0; i<3; i++) // for father, mother, child
+ {
+ int ismpl = fam->sample[i];
+ double *gl = call->GLs + nout_gts*ismpl;
+ if ( gl[0]==1 ) continue;
+ int j, jmax = 0;
+ double max = gl[0];
+ for (j=1; j<nout_gts; j++)
+ if ( max < gl[j] ) { max = gl[j]; jmax = j; }
+ uc_lk += max;
+ uc_itr |= jmax << ((2-i)*4);
+ }
+
+ // Best constrained likelihood
+ int c_itr = -1, itr, uc_is_mendelian = 0;
+ double c_lk = -HUGE_VAL;
+ for (itr=0; itr<ntrio; itr++) // for each trio genotype combination
+ {
+ double lk = 0;
+ int npresent = 0;
+ for (i=0; i<3; i++) // for father, mother, child
+ {
+ int ismpl = fam->sample[i];
+ double *gl = call->GLs + nout_gts*ismpl;
+ if ( gl[0]==1 ) continue;
+ int igt = trio[itr]>>((2-i)*4) & 0xf;
+ assert( !call->ploidy || call->ploidy[ismpl]>0 );
+ if ( igt==GT_SKIP ) continue;
+ lk += gl[igt];
+ npresent++;
+ // fprintf(pysamerr," %e", gl[igt]);
+ }
+ // fprintf(pysamerr,"\t\t");
+ double Pkij = npresent==3 ? (double)2/(trio[itr]>>12) : 1; // with missing genotypes Pkij's are different
+ lk += log(1 - trio_Pm * (1 - Pkij));
+ // fprintf(pysamerr,"%d%d%d\t%e\t%.2f\n", trio[itr]>>8&0xf,trio[itr]>>4&0xf,trio[itr]&0xf, lk, Pkij);
+ if ( c_lk < lk ) { c_lk = lk; c_itr = trio[itr]; }
+ if ( uc_itr==trio[itr] ) uc_is_mendelian = 1;
+ }
+
+ if ( !uc_is_mendelian )
+ {
+ uc_lk += log(1 - trio_Pm);
+ // fprintf(pysamerr,"c_lk=%e uc_lk=%e c_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,uc_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf);
+ if ( c_lk < uc_lk ) { c_lk = uc_lk; c_itr = uc_itr; }
+ }
+ // fprintf(pysamerr,"best_lk=%e best_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf);
+
+ // Set genotypes for father, mother, child and calculate genotype qualities
+ for (i=0; i<3; i++)
+ {
+ // GT
+ int ismpl = fam->sample[i];
+ int igt = c_itr>>((2-i)*4) & 0xf;
+ double *gl = call->GLs + nout_gts*ismpl;
+ int32_t *gts = call->cgts + ismpl;
+ if ( gl[0]==1 || igt==GT_SKIP ) // zero depth, set missing genotypes
+ {
+ gts[0] = -1;
+ // bcf_float_set_missing(call->GQs[ismpl]);
+ continue;
+ }
+ gts[0] = igt;
+
+ #if 0
+ // todo: Genotype Qualities
+ //
+ // GQ: for each family member i sum over all genotypes j,k keeping igt fixed
+ double lk_sum = 0;
+ for (itr=0; itr<ntrio; itr++)
+ {
+ if ( igt != (trio[itr]>>((2-i)*4) & 0xf) ) continue;
+ double lk = 0;
+ int j;
+ for (j=0; j<3; j++)
+ {
+ int jsmpl = fam->sample[j];
+ double *gl = call->GLs + ngts*jsmpl;
+ if ( gl[0]==1 ) continue;
+ int jgt = trio[itr]>>((2-j)*4) & 0xf;
+ if ( jgt==GT_SKIP ) continue;
+ lk += gl[jgt];
+ }
+ double Pkij = (double)2/(trio[itr]>>12);
+ lk += log(1 - trio_Pm * (1 - Pkij));
+ lk_sum = logsumexp2(lk_sum, lk);
+ }
+ if ( !uc_is_mendelian && (best_itr>>((2-i)*4)&0xf)==(uc_itr>>((2-i)*4)&0xf) ) lk_sum = logsumexp2(lk_sum,uc_lk);
+ call->GQs[ismpl] = -4.3429*(best_lk - lk_sum);
+ #endif
+ }
+ }
+
+ for (i=0; i<4; i++) call->ac[i] = 0;
+ call->nhets = 0;
+ call->ndiploid = 0;
+
+ // Test if CGT,UGT are needed
+ int ucgts_needed = 0;
+ int32_t *cgts = call->cgts - 1;
+ int32_t *ugts = call->ugts - 1;
+ int32_t *gts = call->gts - 2;
+ for (isample = 0; isample < nsmpl; isample++)
+ {
+ int ploidy = call->ploidy ? call->ploidy[isample] : 2;
+ cgts++;
+ ugts++;
+ gts += 2;
+ if ( ugts[0]==-1 )
+ {
+ gts[0] = bcf_gt_missing;
+ gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
+ continue;
+ }
+ int a,b;
+ if ( cgts[0]!=ugts[0] )
+ {
+ bcf_gt2alleles(cgts[0], &a, &b);
+ gts[0] = bcf_gt_unphased(a);
+ gts[1] = ploidy==1 ? bcf_int32_vector_end : bcf_gt_unphased(b);
+ }
+ else
+ {
+ bcf_gt2alleles(ugts[0], &a, &b);
+ gts[0] = bcf_gt_unphased(a);
+ gts[1] = ploidy==1 ? bcf_int32_vector_end : bcf_gt_unphased(b);
+ }
+ if ( cgts[0]!=ugts[0] ) ucgts_needed = 1;
+ call->ac[a]++;
+ if ( ploidy==2 )
+ {
+ call->ac[b]++;
+ call->ndiploid++;
+ if ( a!=b ) call->nhets++;
+ }
+ }
+ if ( ucgts_needed )
+ {
+ // Some GTs are different
+ bcf_update_format_int32(call->hdr,rec,"UGT",call->ugts,nsmpl);
+ bcf_update_format_int32(call->hdr,rec,"CGT",call->cgts,nsmpl);
+ }
+}
+
+static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+{
+ int ngts = nals*(nals+1)/2;
+ int npls_src = ngts, npls_dst = nout_als*(nout_als+1)/2; // number of PL values in diploid samples, ori and new
+ if ( call->all_diploid && npls_src == npls_dst ) return;
+
+ int *pls_src = call->PLs, *pls_dst = call->PLs;
+
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int isample, ia;
+ for (isample = 0; isample < nsmpl; isample++)
+ {
+ int ploidy = call->ploidy ? call->ploidy[isample] : 2;
+ if ( ploidy==2 )
+ {
+ for (ia=0; ia<npls_dst; ia++)
+ pls_dst[ia] = pls_src[ call->pl_map[ia] ];
+ }
+ else if ( ploidy==1 )
+ {
+ for (ia=0; ia<nout_als; ia++)
+ {
+ int isrc = (ia+1)*(ia+2)/2-1;
+ pls_dst[ia] = pls_src[ call->pl_map[isrc] ];
+ }
+ if ( ia<npls_dst ) pls_dst[ia] = bcf_int32_vector_end;
+ }
+ else
+ {
+ pls_dst[0] = bcf_int32_missing;
+ pls_dst[1] = bcf_int32_vector_end; // relying on nout_als>1 in mcall()
+ }
+ pls_src += npls_src;
+ pls_dst += npls_dst;
+ }
+ bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*nsmpl);
+}
+
+void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+{
+ int i, ret;
+
+ // at the moment we have DPR,AD,ADF,ADR all Number=R,Type=Integer,
+ // so only dealing with these cases at the moment
+ for (i=0; i<rec->n_info; i++)
+ {
+ bcf_info_t *info = &rec->d.info[i];
+ int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_INFO,info->key);
+ if ( vlen!=BCF_VL_R ) continue;
+ int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key);
+ if ( type!=BCF_HT_INT ) continue;
+
+ ret = bcf_get_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), &call->itmp, &call->n_itmp);
+ if ( ret>0 )
+ {
+ assert( ret==nals );
+ if ( out_als==1 )
+ bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->itmp, 1);
+ else
+ {
+ int j;
+ for (j=0; j<nals; j++)
+ {
+ if ( call->als_map[j]==-1 ) continue; // to be dropped
+ call->PLs[ call->als_map[j] ] = call->itmp[j]; // reusing PLs storage which is not used at this point
+ }
+ bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->PLs, nout_als);
+ }
+ }
+ }
+
+ for (i=0; i<rec->n_fmt; i++)
+ {
+ bcf_fmt_t *fmt = &rec->d.fmt[i];
+ int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id);
+ if ( vlen!=BCF_VL_R ) continue;
+ int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id);
+ if ( type!=BCF_HT_INT ) continue;
+
+ ret = bcf_get_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), &call->itmp, &call->n_itmp);
+ if ( ret>0 )
+ {
+ int j, nsmpl = bcf_hdr_nsamples(call->hdr);
+ int ndp = ret / nsmpl;
+ assert( ndp==nals );
+ if ( out_als==1 )
+ {
+ for (j=0; j<nsmpl; j++)
+ call->PLs[j] = call->itmp[j*ndp];
+
+ bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl);
+ }
+ else
+ {
+ int k;
+ for (j=0; j<nsmpl; j++)
+ {
+ int32_t *dp_dst = call->PLs + j*nout_als;
+ int32_t *dp_src = call->itmp + j*ndp;
+ for (k=0; k<nals; k++)
+ {
+ if ( call->als_map[k]==-1 ) continue; // to be dropped
+ dp_dst[ call->als_map[k] ] = dp_src[k]; // reusing PLs storage which is not used at this point
+ }
+ }
+ bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl*nout_als);
+ }
+ }
+ }
+}
+
+
+// NB: in this function we temporarily use call->als_map for a different
+// purpose: to store the mapping from new (target) alleles to the original alleles.
+//
+static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
+{
+ bcf_sr_regions_t *tgt = call->srs->targets;
+ if ( tgt->nals>5 ) error("Maximum accepted number of alleles is 5, got %d\n", tgt->nals);
+ hts_expand(char*,tgt->nals+1,call->nals,call->als);
+
+ int has_new = 0;
+
+ int i, j, nals = 1;
+ for (i=1; i<call->nals_map; i++) call->als_map[i] = -1;
+
+ if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], tgt->als[0]) < 0 )
+ error("The reference alleles are not compatible at %s:%d .. %s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,tgt->als[0],rec->d.allele[0]);
+
+ // create mapping from new to old alleles
+ call->als[0] = tgt->als[0];
+ call->als_map[0] = 0;
+
+ for (i=1; i<tgt->nals; i++)
+ {
+ call->als[nals] = tgt->als[i];
+ j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]);
+
+ if ( j+1==*unseen ) error("Cannot constrain to %s\n",tgt->als[i]);
+
+ if ( j>=0 )
+ {
+ // existing allele
+ call->als_map[nals] = j+1;
+ }
+ else
+ {
+ // There is a new allele in targets which is not present in VCF.
+ // We use the X allele to estimate PLs. Note that X may not be
+ // present at multiallelic indel sites. In that case we use the
+ // last allele anyway, because the least likely allele comes last
+ // in mpileup's ALT output.
+ call->als_map[nals] = (*unseen)>=0 ? *unseen : rec->n_allele - 1;
+ has_new = 1;
+ }
+ nals++;
+ }
+ if ( *unseen )
+ {
+ call->als_map[nals] = *unseen;
+ call->als[nals] = rec->d.allele[*unseen];
+ nals++;
+ }
+
+ if ( !has_new && nals==rec->n_allele ) return;
+ bcf_update_alleles(call->hdr, rec, (const char**)call->als, nals);
+
+ // create mapping from new PL to old PL
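+ // pl_map[new PL index] = old PL index, using the same k*(k+1)/2+j ordering of diploid genotypes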
+ int k = 0;
+ for (i=0; i<nals; i++)
+ {
+ for (j=0; j<=i; j++)
+ {
+ int a = call->als_map[i], b = call->als_map[j];
+ call->pl_map[k++] = a>b ? a*(a+1)/2 + b : b*(b+1)/2 + a;
+ }
+ }
+
+ // update PL
+ call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int npls_ori = call->nPLs / nsmpl;
+ int npls_new = k;
+ hts_expand(int32_t,npls_new*nsmpl,call->n_itmp,call->itmp);
+ int *ori_pl = call->PLs, *new_pl = call->itmp;
+ for (i=0; i<nsmpl; i++)
+ {
+ for (k=0; k<npls_new; k++)
+ {
+ new_pl[k] = ori_pl[call->pl_map[k]];
+ if ( new_pl[k]==bcf_int32_missing && *unseen>=0 )
+ {
+ // missing value, and there is an unseen allele: identify the
+ // alleles and use the lk of either AX or XX
+ int k_ori = call->pl_map[k], ia, ib;
+ bcf_gt2alleles(k_ori, &ia, &ib);
+ k_ori = bcf_alleles2gt(ia,*unseen);
+ if ( ori_pl[k_ori]==bcf_int32_missing ) k_ori = bcf_alleles2gt(ib,*unseen);
+ if ( ori_pl[k_ori]==bcf_int32_missing ) k_ori = bcf_alleles2gt(*unseen,*unseen);
+ new_pl[k] = ori_pl[k_ori];
+ }
+ if ( !k && new_pl[k]==bcf_int32_vector_end ) new_pl[k]=bcf_int32_missing;
+ }
+ ori_pl += npls_ori;
+ new_pl += npls_new;
+ }
+ bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl);
+
+ // update QS
+ float qsum[5];
+ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum);
+ for (i=0; i<nals; i++)
+ qsum[i] = call->als_map[i]<nqs ? call->qsum[call->als_map[i]] : 0;
+ bcf_update_info_float(call->hdr, rec, "QS", qsum, nals);
+
+ if ( *unseen ) *unseen = nals-1;
+}
+
+
+/**
+ * This function implements the multiallelic calling model. It has two major parts:
+ * 1) determine the most likely set of alleles and calculate the quality of ref/non-ref site
+ * 2) determine and set the genotypes
+ * In various places in between, the BCF record gets updated.
+ */
+int mcall(call_t *call, bcf1_t *rec)
+{
+ int i, unseen = call->unseen;
+
+ // Force alleles when calling genotypes given alleles was requested
+ if ( call->flag & CALL_CONSTR_ALLELES ) mcall_constrain_alleles(call, rec, &unseen);
+
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int nals = rec->n_allele;
+ hts_expand(int,nals,call->nac,call->ac);
+ hts_expand(int,nals,call->nals_map,call->als_map);
+ hts_expand(int,nals*(nals+1)/2,call->npl_map,call->pl_map);
+
+ // Get the genotype likelihoods
+ call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
+ if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals ) // a mixture of diploid and haploid or haploid only
+ error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs);
+
+ // Convert PLs to probabilities
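+ // (PL values are phred-scaled genotype likelihoods, -10*log10 P(D|G), recovered here as 10^(-PL/10))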
+ int ngts = nals*(nals+1)/2;
+ hts_expand(double, call->nPLs, call->npdg, call->pdg);
+ set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts, unseen);
+
+ #if QS_FROM_PDG
+ estimate_qsum(call, rec);
+ #else
+ // Get sum of qualities
+ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum);
+ if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1);
+ if ( nqs < nals )
+ {
+ // Some of the listed alleles do not have the corresponding QS field. This is
+ // typically a ref-only site with X in ALT.
+
+ hts_expand(float,nals,call->nqsum,call->qsum);
+ for (i=nqs; i<nals; i++) call->qsum[i] = 0;
+ }
+ float qsum_tot = 0;
+ for (i=0; i<nals; i++) qsum_tot += call->qsum[i];
+ if ( !call->qsum[0] )
+ {
+ // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value,
+ // the equivalent of a single reference read.
+ if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 )
+ error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1);
+ if ( call->itmp[0] )
+ {
+ call->qsum[0] = 1.0 / call->itmp[0] / nsmpl;
+ qsum_tot += call->qsum[0];
+ }
+ }
+ if ( qsum_tot ) for (i=0; i<nals; i++) call->qsum[i] /= qsum_tot;
+ #endif
+
+ // Find the best combination of alleles
+ int out_als, nout;
+ if ( nals > 8*sizeof(out_als) )
+ {
+ fprintf(pysamerr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1);
+ return 0;
+ }
+ nout = mcall_find_best_alleles(call, nals, &out_als);
+
+ // Make sure the REF allele is always present
+ if ( !(out_als&1) )
+ {
+ out_als |= 1;
+ nout++;
+ }
+ int is_variant = out_als==1 ? 0 : 1;
+ if ( call->flag & CALL_VARONLY && !is_variant ) return 0;
+
+ // With -A, keep all ALTs except X
+ if ( call->flag & CALL_KEEPALT )
+ {
+ nout = 0;
+ for (i=0; i<nals; i++)
+ {
+ if ( i>0 && i==unseen ) continue;
+ out_als |= 1<<i;
+ nout++;
+ }
+ }
+
+ int nAC = 0;
+ if ( out_als==1 ) // only REF allele on output
+ {
+ init_allele_trimming_maps(call, 1, nals);
+ mcall_set_ref_genotypes(call,nals);
+ bcf_update_format_int32(call->hdr, rec, "PL", NULL, 0); // remove PL, useless now
+ }
+ else
+ {
+ // The most likely set of alleles includes non-reference allele (or was enforced), call genotypes.
+ // Note that it is a valid outcome if the called genotypes exclude some of the ALTs.
+ init_allele_trimming_maps(call, out_als, nals);
+ if ( !is_variant )
+ mcall_set_ref_genotypes(call,nals); // running with -A, prevent mcall_call_genotypes from putting some ALT back
+ else if ( call->flag & CALL_CONSTR_TRIO )
+ {
+ if ( nout>4 )
+ {
+ fprintf(pysamerr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1);
+ return 0;
+ }
+ mcall_call_trio_genotypes(call, rec, nals,nout,out_als);
+ }
+ else
+ mcall_call_genotypes(call,rec,nals,nout,out_als);
+
+ // Skip the site if all samples are 0/0. This can happen occasionally.
+ nAC = 0;
+ for (i=1; i<nout; i++) nAC += call->ac[i];
+ if ( !nAC && call->flag & CALL_VARONLY ) return 0;
+ mcall_trim_PLs(call, rec, nals, nout, out_als);
+ }
+ if ( nals!=nout ) mcall_trim_numberR(call, rec, nals, nout, out_als);
+
+ // Set QUAL and calculate HWE-related annotations
+ if ( nAC )
+ {
+ float icb = calc_ICB(call->ac[0],nAC, call->nhets, call->ndiploid);
+ if ( icb != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "ICB", &icb, 1);
+
+ float hob = calc_HOB(call->ac[0],nAC, call->nhets, call->ndiploid);
+ if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1);
+
+ // Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set
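+ // 4.343 ~= 10/ln(10): converts the natural-log likelihood difference to the phred scale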
+ rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : fabs(-4.343*(call->ref_lk - call->lk_sum));
+ }
+ else
+ {
+ // Set the quality of a REF site
+ rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : -4.343*log(1 - exp(call->ref_lk - call->lk_sum));
+ }
+ if ( rec->qual>999 ) rec->qual = 999;
+ if ( rec->qual>50 ) rec->qual = rint(rec->qual);
+
+ // AC, AN
+ if ( nout>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, nout-1);
+ nAC += call->ac[0];
+ bcf_update_info_int32(call->hdr, rec, "AN", &nAC, 1);
+
+ // Remove unused alleles
+ hts_expand(char*,nout,call->nals,call->als);
+ for (i=0; i<nals; i++)
+ if ( call->als_map[i]>=0 ) call->als[call->als_map[i]] = rec->d.allele[i];
+ bcf_update_alleles(call->hdr, rec, (const char**)call->als, nout);
+ bcf_update_genotypes(call->hdr, rec, call->gts, nsmpl*2);
+
+ // DP4 tag
+ if ( bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16)==16 )
+ {
+ int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3];
+ bcf_update_info_int32(call->hdr, rec, "DP4", dp, 4);
+
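+ // MQ: average mapping quality, computed from the I16 mapping-quality sums divided by the total depth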
+ int32_t mq = (call->anno16[8]+call->anno16[10])/(call->anno16[0]+call->anno16[1]+call->anno16[2]+call->anno16[3]);
+ bcf_update_info_int32(call->hdr, rec, "MQ", &mq, 1);
+ }
+
+ bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag
+ bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
+
+ return nout;
+}
+
diff --git a/bcftools/ploidy.c b/bcftools/ploidy.c
new file mode 100644
index 0000000..160bc3e
--- /dev/null
+++ b/bcftools/ploidy.c
@@ -0,0 +1,254 @@
+/*
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <htslib/regidx.h>
+#include <htslib/khash_str2int.h>
+#include <htslib/kseq.h>
+#include <htslib/hts.h>
+#include "bcftools.h"
+#include "ploidy.h"
+
+struct _ploidy_t
+{
+ int nsex, msex; // number of genders; msex is the number of allocated elements in the id2sex array
+ int dflt, min, max; // ploidy: default, min and max (only explicitly listed)
+ int *sex2dflt;
+ regidx_t *idx;
+ void *sex2id;
+ char **id2sex;
+ kstring_t tmp_str;
+};
+
+typedef struct
+{
+ int sex, ploidy;
+}
+sex_ploidy_t;
+
+
+regidx_t *ploidy_regions(ploidy_t *ploidy)
+{
+ return ploidy->idx;
+}
+
+int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+{
+ int i, ret;
+ ploidy_t *ploidy = (ploidy_t*) usr;
+ void *sex2id = ploidy->sex2id;
+
+ // Check for special case of default ploidy "* * * <sex> <ploidy>"
+ int default_ploidy_def = 0;
+
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( ss[0]=='*' && (!ss[1] || isspace(ss[1])) )
+ default_ploidy_def = 1; // definition of default ploidy, chr="*"
+ else
+ {
+ // Fill CHR,FROM,TO
+ ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL);
+ if ( ret!=0 ) return ret;
+ }
+
+ // Skip the fields already parsed by regidx_parse_tab
+ ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ for (i=0; i<3; i++)
+ {
+ while ( *ss && !isspace(*ss) ) ss++;
+ if ( !*ss ) return -2; // wrong number of fields
+ while ( *ss && isspace(*ss) ) ss++;
+ }
+ if ( !*ss ) return -2;
+
+ // Parse the payload
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ if ( !*se || se==ss ) error("Could not parse: %s\n", line);
+ ploidy->tmp_str.l = 0;
+ kputsn(ss,se-ss,&ploidy->tmp_str);
+
+ sex_ploidy_t *sp = (sex_ploidy_t*) payload;
+ if ( khash_str2int_get(sex2id, ploidy->tmp_str.s, &sp->sex) != 0 )
+ {
+ ploidy->nsex++;
+ hts_expand0(char*,ploidy->nsex,ploidy->msex,ploidy->id2sex);
+ ploidy->id2sex[ploidy->nsex-1] = strdup(ploidy->tmp_str.s);
+ sp->sex = khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
+ ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
+ ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt;
+ }
+
+ ss = se;
+ while ( *se && isspace(*se) ) se++;
+ if ( !*se ) error("Could not parse: %s\n", line);
+ sp->ploidy = strtol(ss,&se,10);
+ if ( ss==se ) error("Could not parse: %s\n", line);
+ if ( sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy;
+ if ( sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy;
+
+ // Special case, chr="*" stands for a default value
+ if ( default_ploidy_def )
+ {
+ ploidy->sex2dflt[ploidy->nsex-1] = sp->ploidy;
+ return -1;
+ }
+
+ return 0;
+}
+
+ploidy_t *ploidy_init(const char *fname, int dflt)
+{
+ ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
+ if ( !pld ) return NULL;
+
+ pld->dflt = pld->min = pld->max = dflt;
+ pld->sex2id = khash_str2int_init();
+ pld->idx = regidx_init(fname,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
+ if ( !pld->idx )
+ {
+ ploidy_destroy(pld);
+ pld = NULL;
+ }
+ return pld;
+}
+
+ploidy_t *ploidy_init_string(const char *str, int dflt)
+{
+ ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
+ if ( !pld ) return NULL;
+
+ pld->dflt = pld->min = pld->max = dflt;
+ pld->sex2id = khash_str2int_init();
+ pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
+
+ kstring_t tmp = {0,0,0};
+ const char *ss = str;
+ while ( *ss )
+ {
+ while ( *ss && isspace(*ss) ) ss++;
+ const char *se = ss;
+ while ( *se && *se!='\r' && *se!='\n' ) se++;
+ tmp.l = 0;
+ kputsn(ss, se-ss, &tmp);
+ regidx_insert(pld->idx,tmp.s);
+ while ( *se && isspace(*se) ) se++;
+ ss = se;
+ }
+ regidx_insert(pld->idx,NULL);
+ free(tmp.s);
+
+ return pld;
+}
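+
+/* A minimal usage sketch (illustrative only; the sex names and ploidies below are
+   arbitrary). The string uses the same whitespace-separated format as the ploidy
+   file, one record per line:
+
+       ploidy_t *pld = ploidy_init_string("X 1 60000 M 1\n* * * M 2\n* * * F 2\n", 2);
+       int max = ploidy_max(pld);    // 2
+       ploidy_destroy(pld);
+*/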
+
+void ploidy_destroy(ploidy_t *ploidy)
+{
+ if ( ploidy->sex2id ) khash_str2int_destroy_free(ploidy->sex2id);
+ if ( ploidy->idx ) regidx_destroy(ploidy->idx);
+ free(ploidy->id2sex);
+ free(ploidy->tmp_str.s);
+ free(ploidy->sex2dflt);
+ free(ploidy);
+}
+
+int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min, int *max)
+{
+ regitr_t itr;
+ int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, &itr);
+
+ if ( !sex2ploidy && !min && !max ) return ret;
+
+ if ( !ret )
+ {
+ // no overlap
+ if ( min ) *min = ploidy->dflt;
+ if ( max ) *max = ploidy->dflt;
+ if ( sex2ploidy )
+ for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->sex2dflt[i];
+ return 0;
+ }
+
+ int _min = INT_MAX, _max = -1;
+ if ( sex2ploidy ) for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->dflt;
+
+ while ( REGITR_OVERLAP(itr,pos,pos) )
+ {
+ int sex = REGITR_PAYLOAD(itr,sex_ploidy_t).sex;
+ int pld = REGITR_PAYLOAD(itr,sex_ploidy_t).ploidy;
+ if ( pld!=ploidy->dflt )
+ {
+ if ( sex2ploidy ) sex2ploidy[ sex ] = pld;
+ if ( _min > pld ) _min = pld;
+ if ( _max < pld ) _max = pld;
+ }
+ itr.i++;
+ }
+ if ( _max==-1 ) _max = _min = ploidy->dflt;
+ if ( max ) *max = _max;
+ if ( min ) *min = _min;
+
+ return 1;
+}
+
+int ploidy_nsex(ploidy_t *ploidy)
+{
+ return ploidy->nsex;
+}
+
+char *ploidy_id2sex(ploidy_t *ploidy, int id)
+{
+ if ( id<0 || id>=ploidy->nsex ) return NULL;
+ return ploidy->id2sex[id];
+}
+
+int ploidy_sex2id(ploidy_t *ploidy, char *sex)
+{
+ int id;
+ if ( khash_str2int_get(ploidy->sex2id,sex,&id)!=0 ) return -1;
+ return id;
+}
+
+int ploidy_add_sex(ploidy_t *ploidy, const char *sex)
+{
+ int id;
+ if ( khash_str2int_get(ploidy->sex2id, sex, &id)==0 ) return id;
+ ploidy->nsex++;
+ hts_expand0(char*,ploidy->nsex,ploidy->msex,ploidy->id2sex);
+ ploidy->id2sex[ploidy->nsex-1] = strdup(sex);
+ ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
+ ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt;
+ return khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
+}
+
+int ploidy_max(ploidy_t *ploidy)
+{
+ return ploidy->dflt > ploidy->max ? ploidy->dflt : ploidy->max;
+}
+
+int ploidy_min(ploidy_t *ploidy)
+{
+ return ploidy->dflt < ploidy->min ? ploidy->dflt : ploidy->min;
+}
+
diff --git a/bcftools/ploidy.c.pysam.c b/bcftools/ploidy.c.pysam.c
new file mode 100644
index 0000000..4f567a3
--- /dev/null
+++ b/bcftools/ploidy.c.pysam.c
@@ -0,0 +1,256 @@
+#include "pysam.h"
+
+/*
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <htslib/regidx.h>
+#include <htslib/khash_str2int.h>
+#include <htslib/kseq.h>
+#include <htslib/hts.h>
+#include "bcftools.h"
+#include "ploidy.h"
+
+struct _ploidy_t
+{
+ int nsex, msex; // number of genders; msex is the number of allocated elements in the id2sex array
+ int dflt, min, max; // ploidy: default, min and max (only explicitly listed)
+ int *sex2dflt;
+ regidx_t *idx;
+ void *sex2id;
+ char **id2sex;
+ kstring_t tmp_str;
+};
+
+typedef struct
+{
+ int sex, ploidy;
+}
+sex_ploidy_t;
+
+
+regidx_t *ploidy_regions(ploidy_t *ploidy)
+{
+ return ploidy->idx;
+}
+
+int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+{
+ int i, ret;
+ ploidy_t *ploidy = (ploidy_t*) usr;
+ void *sex2id = ploidy->sex2id;
+
+ // Check for special case of default ploidy "* * * <sex> <ploidy>"
+ int default_ploidy_def = 0;
+
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( ss[0]=='*' && (!ss[1] || isspace(ss[1])) )
+ default_ploidy_def = 1; // definition of default ploidy, chr="*"
+ else
+ {
+ // Fill CHR,FROM,TO
+ ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL);
+ if ( ret!=0 ) return ret;
+ }
+
+ // Skip the fields already parsed by regidx_parse_tab
+ ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ for (i=0; i<3; i++)
+ {
+ while ( *ss && !isspace(*ss) ) ss++;
+ if ( !*ss ) return -2; // wrong number of fields
+ while ( *ss && isspace(*ss) ) ss++;
+ }
+ if ( !*ss ) return -2;
+
+ // Parse the payload
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ if ( !*se || se==ss ) error("Could not parse: %s\n", line);
+ ploidy->tmp_str.l = 0;
+ kputsn(ss,se-ss,&ploidy->tmp_str);
+
+ sex_ploidy_t *sp = (sex_ploidy_t*) payload;
+ if ( khash_str2int_get(sex2id, ploidy->tmp_str.s, &sp->sex) != 0 )
+ {
+ ploidy->nsex++;
+ hts_expand0(char*,ploidy->nsex,ploidy->msex,ploidy->id2sex);
+ ploidy->id2sex[ploidy->nsex-1] = strdup(ploidy->tmp_str.s);
+ sp->sex = khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
+ ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
+ ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt;
+ }
+
+ ss = se;
+ while ( *se && isspace(*se) ) se++;
+ if ( !*se ) error("Could not parse: %s\n", line);
+ sp->ploidy = strtol(ss,&se,10);
+ if ( ss==se ) error("Could not parse: %s\n", line);
+ if ( sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy;
+ if ( sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy;
+
+ // Special case, chr="*" stands for a default value
+ if ( default_ploidy_def )
+ {
+ ploidy->sex2dflt[ploidy->nsex-1] = sp->ploidy;
+ return -1;
+ }
+
+ return 0;
+}
+
+ploidy_t *ploidy_init(const char *fname, int dflt)
+{
+ ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
+ if ( !pld ) return NULL;
+
+ pld->dflt = pld->min = pld->max = dflt;
+ pld->sex2id = khash_str2int_init();
+ pld->idx = regidx_init(fname,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
+ if ( !pld->idx )
+ {
+ ploidy_destroy(pld);
+ pld = NULL;
+ }
+ return pld;
+}
+
+ploidy_t *ploidy_init_string(const char *str, int dflt)
+{
+ ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
+ if ( !pld ) return NULL;
+
+ pld->dflt = pld->min = pld->max = dflt;
+ pld->sex2id = khash_str2int_init();
+ pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
+
+ kstring_t tmp = {0,0,0};
+ const char *ss = str;
+ while ( *ss )
+ {
+ while ( *ss && isspace(*ss) ) ss++;
+ const char *se = ss;
+ while ( *se && *se!='\r' && *se!='\n' ) se++;
+ tmp.l = 0;
+ kputsn(ss, se-ss, &tmp);
+ regidx_insert(pld->idx,tmp.s);
+ while ( *se && isspace(*se) ) se++;
+ ss = se;
+ }
+ regidx_insert(pld->idx,NULL);
+ free(tmp.s);
+
+ return pld;
+}
+
+void ploidy_destroy(ploidy_t *ploidy)
+{
+ if ( ploidy->sex2id ) khash_str2int_destroy_free(ploidy->sex2id);
+ if ( ploidy->idx ) regidx_destroy(ploidy->idx);
+ free(ploidy->id2sex);
+ free(ploidy->tmp_str.s);
+ free(ploidy->sex2dflt);
+ free(ploidy);
+}
+
+int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min, int *max)
+{
+ regitr_t itr;
+ int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, &itr);
+
+ if ( !sex2ploidy && !min && !max ) return ret;
+
+ if ( !ret )
+ {
+ // no overlap
+ if ( min ) *min = ploidy->dflt;
+ if ( max ) *max = ploidy->dflt;
+ if ( sex2ploidy )
+ for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->sex2dflt[i];
+ return 0;
+ }
+
+ int _min = INT_MAX, _max = -1;
+ if ( sex2ploidy ) for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->dflt;
+
+ while ( REGITR_OVERLAP(itr,pos,pos) )
+ {
+ int sex = REGITR_PAYLOAD(itr,sex_ploidy_t).sex;
+ int pld = REGITR_PAYLOAD(itr,sex_ploidy_t).ploidy;
+ if ( pld!=ploidy->dflt )
+ {
+ if ( sex2ploidy ) sex2ploidy[ sex ] = pld;
+ if ( _min > pld ) _min = pld;
+ if ( _max < pld ) _max = pld;
+ }
+ itr.i++;
+ }
+ if ( _max==-1 ) _max = _min = ploidy->dflt;
+ if ( max ) *max = _max;
+ if ( min ) *min = _min;
+
+ return 1;
+}
+
+int ploidy_nsex(ploidy_t *ploidy)
+{
+ return ploidy->nsex;
+}
+
+char *ploidy_id2sex(ploidy_t *ploidy, int id)
+{
+ if ( id<0 || id>=ploidy->nsex ) return NULL;
+ return ploidy->id2sex[id];
+}
+
+int ploidy_sex2id(ploidy_t *ploidy, char *sex)
+{
+ int id;
+ if ( khash_str2int_get(ploidy->sex2id,sex,&id)!=0 ) return -1;
+ return id;
+}
+
+int ploidy_add_sex(ploidy_t *ploidy, const char *sex)
+{
+ int id;
+ if ( khash_str2int_get(ploidy->sex2id, sex, &id)==0 ) return id;
+ ploidy->nsex++;
+ hts_expand0(char*,ploidy->nsex,ploidy->msex,ploidy->id2sex);
+ ploidy->id2sex[ploidy->nsex-1] = strdup(sex);
+ ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
+ ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt;
+ return khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
+}
+
+int ploidy_max(ploidy_t *ploidy)
+{
+ return ploidy->dflt > ploidy->max ? ploidy->dflt : ploidy->max;
+}
+
+int ploidy_min(ploidy_t *ploidy)
+{
+ return ploidy->dflt < ploidy->min ? ploidy->dflt : ploidy->min;
+}
+
diff --git a/bcftools/ploidy.h b/bcftools/ploidy.h
new file mode 100644
index 0000000..6deef73
--- /dev/null
+++ b/bcftools/ploidy.h
@@ -0,0 +1,129 @@
+/*
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+/*
+ Lookup from region and sex to ploidy.
+
+ Example of usage:
+
+ int default_ploidy = 2;
+ ploidy_t *pld = ploidy_init(fname, default_ploidy);
+
+ int nsex = ploidy_nsex(pld);
+ int *sex2ploidy = malloc(sizeof(int)*nsex);
+
+ ploidy_query(pld, "X",60000, sex2ploidy, NULL, NULL);
+ for (i=0; i<nsex; i++)
+ printf("ploidy of %s is %d\n", ploidy_id2sex(pld,i), sex2ploidy[i]);
+
+ ploidy_destroy(pld);
+
+ An example of ploidy file format follows. The coordinates are 1-based and
+ inclusive. The "*" records define the default ploidy for each sex. If not
+ present, the default_ploidy passed to ploidy_init is used instead:
+ X 1 60000 M 1
+ X 2699521 154931043 M 1
+ Y 1 59373566 M 1
+ Y 1 59373566 F 0
+ MT 1 16569 M 1
+ MT 1 16569 F 1
+ * * * M 2
+ * * * F 2
+*/
+
+#ifndef __PLOIDY_H__
+#define __PLOIDY_H__
+
+#include <htslib/regidx.h>
+
+typedef struct _ploidy_t ploidy_t;
+
+/*
+ * ploidy_init()
+ * @param fname: input file name or NULL if default ploidy from example above should be used
+ * @param dflt: default ploidy to use for unlisted regions (the '* * *' records have precedence).
+ *
+ * Returns new structure on success or NULL on error.
+ */
+ploidy_t *ploidy_init(const char *fname, int dflt);
+
+/* Same as ploidy_init() but the whole file is passed as a single string */
+ploidy_t *ploidy_init_string(const char *str, int dflt);
+
+/*
+ * ploidy_destroy() - free memory allocated by ploidy_init
+ */
+void ploidy_destroy(ploidy_t *ploidy);
+
+/*
+ * ploidy_query() - query ploidy at a position for all genders at once
+ * @param seq: chromosome name
+ * @param pos: 0-based position
+ * @param sex2ploidy: if not NULL, array will be filled with mapping from sex id to ploidy
+ * @param min: if not NULL, the minimum ploidy encountered will be set
+ * @param max: if not NULL, the maximum ploidy encountered will be set
+ *
+ * Returns 1 if the position is listed in the regions or 0 otherwise.
+ */
+int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min, int *max);
+
+/*
+ * ploidy_nsex() - return number of recognised genders
+ */
+int ploidy_nsex(ploidy_t *ploidy);
+
+/*
+ * ploidy_id2sex() - mapping between numeric gender id and name
+ *
+ * Returns gender name (e.g. "M" or "F" in the example above)
+ * or NULL if there is no such id.
+ */
+char *ploidy_id2sex(ploidy_t *ploidy, int id);
+
+/*
+ * ploidy_sex2id() - mapping between gender name and its numeric id
+ *
+ * Returns numeric id or -1 if not present.
+ */
+int ploidy_sex2id(ploidy_t *ploidy, char *sex);
+
+/*
+ * ploidy_add_sex() - register new gender name. This function is
+ * useful when a gender has the default ploidy for all regions
+ * and is not listed in the file passed to ploidy_init()
+ *
+ * Returns numeric id of the added sex, regardless of whether the string was
+ * newly added or was already present in the dictionary.
+ */
+int ploidy_add_sex(ploidy_t *ploidy, const char *sex);
+
+/** Returns region index for raw access */
+regidx_t *ploidy_regions(ploidy_t *ploidy);
+
+/** Return the minimum / maximum recognised ploidy */
+int ploidy_max(ploidy_t *ploidy);
+int ploidy_min(ploidy_t *ploidy);
+
+#endif
+
diff --git a/bcftools/prob1.c b/bcftools/prob1.c
new file mode 100644
index 0000000..8f4463f
--- /dev/null
+++ b/bcftools/prob1.c
@@ -0,0 +1,529 @@
+/* prob1.c -- mathematical utility functions.
+
+ Copyright (C) 2010, 2011 Broad Institute.
+ Copyright (C) 2012, 2013 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <limits.h>
+#include <zlib.h>
+#include "prob1.h"
+
+// #include "kstring.h"
+// #include "kseq.h"
+// KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define MC_MAX_EM_ITER 16
+#define MC_EM_EPS 1e-5
+#define MC_DEF_INDEL 0.15
+
+gzFile bcf_p1_fp_lk;
+
+void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x)
+{
+ int i;
+ for (i = 0; i < ma->M; ++i)
+ ma->phi_indel[i] = ma->phi[i] * x;
+ ma->phi_indel[ma->M] = 1. - ma->phi[ma->M] * x;
+}
+
+static void init_prior(int type, double theta, int M, double *phi)
+{
+ int i;
+ if (type == MC_PTYPE_COND2) {
+ for (i = 0; i <= M; ++i)
+ phi[i] = 2. * (i + 1) / (M + 1) / (M + 2);
+ } else if (type == MC_PTYPE_FLAT) {
+ for (i = 0; i <= M; ++i)
+ phi[i] = 1. / (M + 1);
+ } else {
+ double sum;
+ for (i = 0, sum = 0.; i < M; ++i)
+ sum += (phi[i] = theta / (M - i));
+ phi[M] = 1. - sum;
+ }
+}
+
+void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta)
+{
+ init_prior(type, theta, ma->M, ma->phi);
+ bcf_p1_indel_prior(ma, MC_DEF_INDEL);
+}
+
+void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta)
+{
+ if (ma->n1 <= 0 || ma->n1 >= ma->M) return;
+ init_prior(type, theta, 2*ma->n1, ma->phi1);
+ init_prior(type, theta, 2*(ma->n - ma->n1), ma->phi2);
+}
+
+
+/* Initialise a bcf_p1aux_t */
+bcf_p1aux_t *bcf_p1_init(int n_smpl, uint8_t *ploidy)
+{
+ bcf_p1aux_t *ma;
+ int i;
+ ma = (bcf_p1aux_t*) calloc(1, sizeof(bcf_p1aux_t));
+ ma->n1 = -1;
+ ma->n = n_smpl;
+ ma->M = 2 * n_smpl;
+ if (ploidy) {
+ ma->ploidy = (uint8_t*) malloc(n_smpl);
+ memcpy(ma->ploidy, ploidy, n_smpl);
+ for (i = 0, ma->M = 0; i < n_smpl; ++i) ma->M += ploidy[i];
+ if (ma->M == 2 * n_smpl) {
+ free(ma->ploidy);
+ ma->ploidy = 0;
+ }
+ }
+ ma->q2p = (double*) calloc(256, sizeof(double));
+ ma->pdg = (double*) calloc(3 * ma->n, sizeof(double));
+ ma->phi = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->phi_indel = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->phi1 = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->phi2 = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->z = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->zswap = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->z1 = (double*) calloc(ma->M + 1, sizeof(double)); // actually we do not need it to be this large
+ ma->z2 = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->afs = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->afs1 = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->lf = (double*) calloc(ma->M + 1, sizeof(double));
+ for (i = 0; i < 256; ++i)
+ ma->q2p[i] = pow(10., -i / 10.);
+ for (i = 0; i <= ma->M; ++i) ma->lf[i] = lgamma(i + 1);
+ bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior
+ return ma;
+}
+
+int bcf_p1_get_M(bcf_p1aux_t *b) { return b->M; }
+
+int bcf_p1_set_n1(bcf_p1aux_t *b, int n1)
+{
+ if (n1 == 0 || n1 >= b->n) return -1;
+ if (b->M != b->n * 2) {
+ fprintf(stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__);
+ return -1;
+ }
+ b->n1 = n1;
+ return 0;
+}
+
+void bcf_p1_destroy(bcf_p1aux_t *ma)
+{
+ if (ma) {
+ int k;
+ free(ma->lf);
+ if (ma->hg && ma->n1 > 0) {
+ for (k = 0; k <= 2*ma->n1; ++k) free(ma->hg[k]);
+ free(ma->hg);
+ }
+ free(ma->ploidy); free(ma->q2p); free(ma->pdg);
+ free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2);
+ free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2);
+ free(ma->afs); free(ma->afs1);
+ free(ma);
+ }
+}
+
+extern double kf_gammap(double s, double z);
+int test16(bcf1_t *b, anno16_t *a);
+
+/* Calculate P(D|g) */
+static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
+{
+ int i, j;
+ long *p, tmp;
+ p = (long*) alloca(b->n_allele * sizeof(long));
+ memset(p, 0, sizeof(long) * b->n_allele);
+
+ // Set P(D|g) for each sample and sum phred likelihoods across all samples to create lk
+ for (j = 0; j < ma->n; ++j) {
+ // Fetch the PL array for the sample
+ const int *pi = ma->PL + j * ma->PL_len;
+ // Fetch the P(D|g) array for the sample
+ double *pdg = ma->pdg + j * 3;
+ pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]];
+ for (i = 0; i < b->n_allele; ++i)
+ p[i] += (int)pi[(i+1)*(i+2)/2-1];
+ }
+ for (i = 0; i < b->n_allele; ++i) p[i] = p[i]<<4 | i;
+ for (i = 1; i < b->n_allele; ++i) // insertion sort
+ for (j = i; j > 0 && p[j] < p[j-1]; --j)
+ tmp = p[j], p[j] = p[j-1], p[j-1] = tmp;
+ for (i = b->n_allele - 1; i >= 0; --i)
+ if ((p[i]&0xf) == 0) break;
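+ // i is now the position of the REF allele (low 4 bits == 0) in the list sorted by increasing summed PL; 0 means REF is the best-supported allele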
+ return i;
+}
+
+
+/* f0 is minor allele fraction */
+int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
+{
+ double sum, g[3];
+ double max, f3[3], *pdg = ma->pdg + k * 3;
+ int q, i, max_i, ploidy;
+ /* determine ploidy */
+ ploidy = ma->ploidy? ma->ploidy[k] : 2;
+ if (ploidy == 2) {
+ /* given allele frequency we can determine how many of each
+ * genotype we have by HWE p=1-q PP=p^2 PQ&QP=2*p*q QQ=q^2 */
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ } else {
+ f3[0] = 1. - f0; f3[1] = 0; f3[2] = f0;
+ }
+ for (i = 0, sum = 0.; i < 3; ++i)
+ sum += (g[i] = pdg[i] * f3[i]);
+ /* normalise g and then determine max */
+ for (i = 0, max = -1., max_i = 0; i < 3; ++i) {
+ g[i] /= sum;
+ if (g[i] > max) max = g[i], max_i = i;
+ }
+ max = 1. - max;
+ if (max < 1e-308) max = 1e-308;
+ q = (int)(-4.343 * log(max) + .499);
+ if (q > 99) q = 99;
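+ // pack the result: phred-scaled genotype quality in the upper bits, index of the most likely genotype (0..2) in the two lowest bits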
+ return q<<2|max_i;
+}
+
+// If likelihoods fall below this they get squashed to 0
+#define TINY 1e-20
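+// Core of the allele-frequency-spectrum computation: a per-sample dynamic programming
+// pass in which z[k] ends up proportional to the likelihood of the data given k
+// reference alleles among the chromosomes processed so far; the per-sample
+// normalisation factors are accumulated on the log scale in ma->t.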
+static void mc_cal_y_core(bcf_p1aux_t *ma, int beg)
+{
+ double *z[2], *tmp, *pdg;
+ int _j, last_min, last_max;
+ assert(beg == 0 || ma->M == ma->n*2);
+ z[0] = ma->z;
+ z[1] = ma->zswap;
+ pdg = ma->pdg;
+ memset(z[0], 0, sizeof(double) * (ma->M + 1));
+ memset(z[1], 0, sizeof(double) * (ma->M + 1));
+ z[0][0] = 1.;
+ last_min = last_max = 0;
+ ma->t = 0.;
+ if (ma->M == ma->n * 2) {
+ int M = 0;
+ for (_j = beg; _j < ma->n; ++_j) {
+ int k, j = _j - beg, _min = last_min, _max = last_max, M0;
+ double p[3], sum;
+ M0 = M; M += 2;
+ // Fetch P(D|g) for this sample
+ pdg = ma->pdg + _j * 3;
+ p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2];
+ for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
+ for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
+ _max += 2;
+ if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k];
+ if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1];
+ for (k = _min < 2? 2 : _min; k <= _max; ++k)
+ z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / (M * (M - 1.)));
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ if (_min >= 2) z[1][_min-2] = 0.;
+ // If we are not on the last sample
+ if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
+ if (_j == ma->n1 - 1) { // set pop1; ma->n1==-1 when unset
+ ma->t1 = ma->t;
+ memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1));
+ }
+ tmp = z[0]; z[0] = z[1]; z[1] = tmp;
+ last_min = _min; last_max = _max;
+ }
+ //for (_j = 0; _j < last_min; ++_j) z[0][_j] = 0.; // TODO: are these necessary?
+ //for (_j = last_max + 1; _j < ma->M; ++_j) z[0][_j] = 0.;
+ } else { // this block is very similar to the block above; these two might be merged in future
+ int j, M = 0;
+ for (j = 0; j < ma->n; ++j) {
+ int k, M0, _min = last_min, _max = last_max;
+ double p[3], sum;
+ // Fetch P(D|g) for this sample
+ pdg = ma->pdg + j * 3;
+ for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
+ for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
+ M0 = M;
+ M += ma->ploidy[j];
+ if (ma->ploidy[j] == 1) {
+ p[0] = pdg[0]; p[1] = pdg[2];
+ _max++;
+ if (_min == 0) k = 0, z[1][k] = (M0+1-k) * p[0] * z[0][k];
+ for (k = _min < 1? 1 : _min; k <= _max; ++k)
+ z[1][k] = (M0+1-k) * p[0] * z[0][k] + k * p[1] * z[0][k-1];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / M);
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ // If we are not on the last sample
+ if (j < ma->n - 1) z[1][_max+1] = 0.;
+ } else if (ma->ploidy[j] == 2) {
+ p[0] = pdg[0]; p[1] = 2 * pdg[1]; p[2] = pdg[2];
+ _max += 2;
+ if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k];
+ if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1];
+ for (k = _min < 2? 2 : _min; k <= _max; ++k)
+ z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / (M * (M - 1.)));
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ if (_min >= 2) z[1][_min-2] = 0.;
+ // If we are not on the last sample
+ if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
+ }
+ tmp = z[0]; z[0] = z[1]; z[1] = tmp;
+ last_min = _min; last_max = _max;
+ }
+ }
+ if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1));
+ if (bcf_p1_fp_lk)
+ gzwrite(bcf_p1_fp_lk, ma->z, sizeof(double) * (ma->M + 1));
+}
+
+static void mc_cal_y(bcf_p1aux_t *ma)
+{
+ if (ma->n1 > 0 && ma->n1 < ma->n && ma->M == ma->n * 2) { // NB: ma->n1 is ineffective when there are haploid samples
+ int k;
+ long double x;
+ memset(ma->z1, 0, sizeof(double) * (2 * ma->n1 + 1));
+ memset(ma->z2, 0, sizeof(double) * (2 * (ma->n - ma->n1) + 1));
+ ma->t1 = ma->t2 = 0.;
+ mc_cal_y_core(ma, ma->n1);
+ ma->t2 = ma->t;
+ memcpy(ma->z2, ma->z, sizeof(double) * (2 * (ma->n - ma->n1) + 1));
+ mc_cal_y_core(ma, 0);
+ // rescale z
+ x = expl(ma->t - (ma->t1 + ma->t2));
+ for (k = 0; k <= ma->M; ++k) ma->z[k] *= x;
+ } else mc_cal_y_core(ma, 0);
+}
+
+#define CONTRAST_TINY 1e-30
+
+extern double kf_gammaq(double s, double z); // incomplete gamma function for chi^2 test
+
+static inline double chi2_test(int a, int b, int c, int d)
+{
+ double x, z;
+ x = (double)(a+b) * (c+d) * (b+d) * (a+c);
+ if (x == 0.) return 1;
+ z = a * d - b * c;
+ return kf_gammaq(.5, .5 * z * z * (a+b+c+d) / x);
+}
+
+// chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)]
+static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int k1, int k2, double x[3])
+{
+ double p = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2];
+ int n1 = p1->n1, n2 = p1->n - p1->n1;
+ if (p < CONTRAST_TINY) return -1;
+ if (.5*k1/n1 < .5*k2/n2) x[1] += p;
+ else if (.5*k1/n1 > .5*k2/n2) x[2] += p;
+ else x[0] += p;
+ return p * chi2_test(k1, k2, (n1<<1) - k1, (n2<<1) - k2);
+}
+
+static double contrast2(bcf_p1aux_t *p1, double ret[3])
+{
+ int k, k1, k2, k10, k20, n1, n2;
+ double sum;
+ // get n1 and n2
+ n1 = p1->n1; n2 = p1->n - p1->n1;
+ if (n1 <= 0 || n2 <= 0) return 0.;
+ if (p1->hg == 0) { // initialize the hypergeometric distribution
+ /* NB: the hg matrix may take a lot of memory when there are many samples. There is a way
+ to avoid precomputing this matrix, but it is slower and quite intricate. The following
+ computation in this block can be accelerated with a similar strategy, but perhaps this
+ is not a serious concern for now. */
+ double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1));
+ p1->hg = (double**) calloc(2*n1+1, sizeof(double*));
+ for (k1 = 0; k1 <= 2*n1; ++k1) {
+ p1->hg[k1] = (double*)calloc(2*n2+1, sizeof(double));
+ for (k2 = 0; k2 <= 2*n2; ++k2)
+ p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp));
+ }
+ }
+ { // compute
+ long double suml = 0;
+ for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k];
+ sum = suml;
+ }
+ { // get the max k1 and k2
+ double max;
+ int max_k;
+ for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) {
+ double x = p1->phi1[k] * p1->z1[k];
+ if (x > max) max = x, max_k = k;
+ }
+ k10 = max_k;
+ for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) {
+ double x = p1->phi2[k] * p1->z2[k];
+ if (x > max) max = x, max_k = k;
+ }
+ k20 = max_k;
+ }
+ { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N.
+ double x[3], y;
+ long double z = 0., L[2];
+ x[0] = x[1] = x[2] = 0; L[0] = L[1] = 0;
+ for (k1 = k10; k1 >= 0; --k1) {
+ for (k2 = k20; k2 >= 0; --k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ }
+ ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2];
+ x[0] = x[1] = x[2] = 0;
+ for (k1 = k10 + 1; k1 <= 2*n1; ++k1) {
+ for (k2 = k20; k2 >= 0; --k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ }
+ ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2];
+ if (ret[0] + ret[1] + ret[2] < 0.95) { // in case something went wrong
+ ret[0] = ret[1] = ret[2] = 0; L[0] = L[1] = 0;
+ for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1)
+ for (k2 = 0; k2 <= 2*n2; ++k2)
+ if ((y = contrast2_aux(p1, sum, k1, k2, ret)) >= 0) z += y;
+ if (ret[0] + ret[1] + ret[2] < 0.95) // It seems that this may be caused by floating point errors. I do not really understand why...
+ z = 1.0, ret[0] = ret[1] = ret[2] = 1./3;
+ }
+ return (double)z;
+ }
+}
+
+static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded)
+{
+ int k;
+ long double sum = 0., sum2;
+ double *phi = ma->is_indel? ma->phi_indel : ma->phi;
+ memset(ma->afs1, 0, sizeof(double) * (ma->M + 1));
+ mc_cal_y(ma);
+ // compute AFS
+ // MP15: is this using equation 20 from doi:10.1093/bioinformatics/btr509?
+ for (k = 0, sum = 0.; k <= ma->M; ++k)
+ sum += (long double)phi[k] * ma->z[k];
+ for (k = 0; k <= ma->M; ++k) {
+ ma->afs1[k] = phi[k] * ma->z[k] / sum;
+ if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.;
+ }
+ // compute folded variant probability
+ for (k = 0, sum = 0.; k <= ma->M; ++k)
+ sum += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k];
+ for (k = 1, sum2 = 0.; k < ma->M; ++k)
+ sum2 += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k];
+ *p_var_folded = sum2 / sum;
+ *p_ref_folded = (phi[k] + phi[ma->M - k]) / 2. * (ma->z[ma->M] + ma->z[0]) / sum;
+ // the expected frequency
+ for (k = 0, sum = 0.; k <= ma->M; ++k) {
+ ma->afs[k] += ma->afs1[k];
+ sum += k * ma->afs1[k];
+ }
+ return sum / ma->M;
+}
+
+int bcf_p1_cal(call_t *call, bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
+{
+ int i, k;
+ long double sum = 0.;
+ ma->is_indel = bcf_is_snp(b) ? 0 : 1;
+ rst->perm_rank = -1;
+
+ ma->PL = call->PLs;
+ ma->PL_len = call->nPLs / b->n_sample;
+ if (b->n_allele < 2) return -1; // FIXME: find a better solution
+
+ rst->rank0 = cal_pdg(b, ma);
+ rst->f_exp = mc_cal_afs(ma, &rst->p_ref_folded, &rst->p_var_folded);
+ rst->p_ref = ma->afs1[ma->M];
+ for (k = 0, sum = 0.; k < ma->M; ++k)
+ sum += ma->afs1[k];
+ rst->p_var = (double)sum;
+ { // compute the allele count
+ double max = -1;
+ rst->ac = -1;
+ for (k = 0; k <= ma->M; ++k)
+ if (max < ma->z[k]) max = ma->z[k], rst->ac = k;
+ rst->ac = ma->M - rst->ac;
+ }
+ // calculate f_flat and f_em
+ for (k = 0, sum = 0.; k <= ma->M; ++k)
+ sum += (long double)ma->z[k];
+ rst->f_flat = 0.;
+ for (k = 0; k <= ma->M; ++k) {
+ double p = ma->z[k] / sum;
+ rst->f_flat += k * p;
+ }
+ rst->f_flat /= ma->M;
+ { // estimate equal-tail credible interval (95% level)
+ int l, h;
+ double p;
+ for (i = 0, p = 0.; i <= ma->M; ++i)
+ if (p + ma->afs1[i] > 0.025) break;
+ else p += ma->afs1[i];
+ l = i;
+ for (i = ma->M, p = 0.; i >= 0; --i)
+ if (p + ma->afs1[i] > 0.025) break;
+ else p += ma->afs1[i];
+ h = i;
+ rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M;
+ }
+ if (ma->n1 > 0) { // compute LRT
+ double max0, max1, max2;
+ for (k = 0, max0 = -1; k <= ma->M; ++k)
+ if (max0 < ma->z[k]) max0 = ma->z[k];
+ for (k = 0, max1 = -1; k <= ma->n1 * 2; ++k)
+ if (max1 < ma->z1[k]) max1 = ma->z1[k];
+ for (k = 0, max2 = -1; k <= ma->M - ma->n1 * 2; ++k)
+ if (max2 < ma->z2[k]) max2 = ma->z2[k];
+ rst->lrt = log(max1 * max2 / max0);
+ rst->lrt = rst->lrt < 0? 1 : kf_gammaq(.5, rst->lrt);
+ } else rst->lrt = -1.0;
+ rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0;
+ if (do_contrast && rst->p_var > 0.5) // skip contrast2() if the locus is a strong non-variant
+ rst->p_chi2 = contrast2(ma, rst->cmp);
+ return 0;
+}
+
+void bcf_p1_dump_afs(bcf_p1aux_t *ma)
+{
+ int k;
+ fprintf(stderr, "[afs]");
+ for (k = 0; k <= ma->M; ++k)
+ fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]);
+ fprintf(stderr, "\n");
+ memset(ma->afs, 0, sizeof(double) * (ma->M + 1));
+}
diff --git a/bcftools/prob1.c.pysam.c b/bcftools/prob1.c.pysam.c
new file mode 100644
index 0000000..bad2478
--- /dev/null
+++ b/bcftools/prob1.c.pysam.c
@@ -0,0 +1,531 @@
+#include "pysam.h"
+
+/* prob1.c -- mathematical utility functions.
+
+ Copyright (C) 2010, 2011 Broad Institute.
+ Copyright (C) 2012, 2013 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <limits.h>
+#include <zlib.h>
+#include "prob1.h"
+
+// #include "kstring.h"
+// #include "kseq.h"
+// KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define MC_MAX_EM_ITER 16
+#define MC_EM_EPS 1e-5
+#define MC_DEF_INDEL 0.15
+
+gzFile bcf_p1_fp_lk;
+
+void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x)
+{
+ int i;
+ for (i = 0; i < ma->M; ++i)
+ ma->phi_indel[i] = ma->phi[i] * x;
+ ma->phi_indel[ma->M] = 1. - ma->phi[ma->M] * x;
+}
+
+static void init_prior(int type, double theta, int M, double *phi)
+{
+ int i;
+ if (type == MC_PTYPE_COND2) {
+ for (i = 0; i <= M; ++i)
+ phi[i] = 2. * (i + 1) / (M + 1) / (M + 2);
+ } else if (type == MC_PTYPE_FLAT) {
+ for (i = 0; i <= M; ++i)
+ phi[i] = 1. / (M + 1);
+ } else {
+ double sum;
+ for (i = 0, sum = 0.; i < M; ++i)
+ sum += (phi[i] = theta / (M - i));
+ phi[M] = 1. - sum;
+ }
+}
+
+void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta)
+{
+ init_prior(type, theta, ma->M, ma->phi);
+ bcf_p1_indel_prior(ma, MC_DEF_INDEL);
+}
+
+void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta)
+{
+ if (ma->n1 <= 0 || ma->n1 >= ma->M) return;
+ init_prior(type, theta, 2*ma->n1, ma->phi1);
+ init_prior(type, theta, 2*(ma->n - ma->n1), ma->phi2);
+}
+
+
+/* Initialise a bcf_p1aux_t */
+bcf_p1aux_t *bcf_p1_init(int n_smpl, uint8_t *ploidy)
+{
+ bcf_p1aux_t *ma;
+ int i;
+ ma = (bcf_p1aux_t*) calloc(1, sizeof(bcf_p1aux_t));
+ ma->n1 = -1;
+ ma->n = n_smpl;
+ ma->M = 2 * n_smpl;
+ if (ploidy) {
+ ma->ploidy = (uint8_t*) malloc(n_smpl);
+ memcpy(ma->ploidy, ploidy, n_smpl);
+ for (i = 0, ma->M = 0; i < n_smpl; ++i) ma->M += ploidy[i];
+ if (ma->M == 2 * n_smpl) {
+ free(ma->ploidy);
+ ma->ploidy = 0;
+ }
+ }
+ ma->q2p = (double*) calloc(256, sizeof(double));
+ ma->pdg = (double*) calloc(3 * ma->n, sizeof(double));
+ ma->phi = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->phi_indel = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->phi1 = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->phi2 = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->z = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->zswap = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->z1 = (double*) calloc(ma->M + 1, sizeof(double)); // actually we do not need it to be this large
+ ma->z2 = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->afs = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->afs1 = (double*) calloc(ma->M + 1, sizeof(double));
+ ma->lf = (double*) calloc(ma->M + 1, sizeof(double));
+ for (i = 0; i < 256; ++i)
+ ma->q2p[i] = pow(10., -i / 10.);
+ for (i = 0; i <= ma->M; ++i) ma->lf[i] = lgamma(i + 1);
+ bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior
+ return ma;
+}
+
+int bcf_p1_get_M(bcf_p1aux_t *b) { return b->M; }
+
+int bcf_p1_set_n1(bcf_p1aux_t *b, int n1)
+{
+ if (n1 == 0 || n1 >= b->n) return -1;
+ if (b->M != b->n * 2) {
+ fprintf(pysamerr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__);
+ return -1;
+ }
+ b->n1 = n1;
+ return 0;
+}
+
+void bcf_p1_destroy(bcf_p1aux_t *ma)
+{
+ if (ma) {
+ int k;
+ free(ma->lf);
+ if (ma->hg && ma->n1 > 0) {
+ for (k = 0; k <= 2*ma->n1; ++k) free(ma->hg[k]);
+ free(ma->hg);
+ }
+ free(ma->ploidy); free(ma->q2p); free(ma->pdg);
+ free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2);
+ free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2);
+ free(ma->afs); free(ma->afs1);
+ free(ma);
+ }
+}
+
+extern double kf_gammap(double s, double z);
+int test16(bcf1_t *b, anno16_t *a);
+
+/* Calculate P(D|g) */
+static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
+{
+ int i, j;
+ long *p, tmp;
+ p = (long*) alloca(b->n_allele * sizeof(long));
+ memset(p, 0, sizeof(long) * b->n_allele);
+
+ // Set P(D|g) for each sample and sum phred likelihoods across all samples to create lk
+ for (j = 0; j < ma->n; ++j) {
+ // Fetch the PL array for the sample
+ const int *pi = ma->PL + j * ma->PL_len;
+ // Fetch the P(D|g) array for the sample
+ double *pdg = ma->pdg + j * 3;
+ pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]];
+ for (i = 0; i < b->n_allele; ++i)
+ p[i] += (int)pi[(i+1)*(i+2)/2-1];
+ }
+ for (i = 0; i < b->n_allele; ++i) p[i] = p[i]<<4 | i;
+ for (i = 1; i < b->n_allele; ++i) // insertion sort
+ for (j = i; j > 0 && p[j] < p[j-1]; --j)
+ tmp = p[j], p[j] = p[j-1], p[j-1] = tmp;
+ for (i = b->n_allele - 1; i >= 0; --i)
+ if ((p[i]&0xf) == 0) break;
+ return i;
+}
+
+
+/* f0 is minor allele fraction */
+int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
+{
+ double sum, g[3];
+ double max, f3[3], *pdg = ma->pdg + k * 3;
+ int q, i, max_i, ploidy;
+ /* determine ploidy */
+ ploidy = ma->ploidy? ma->ploidy[k] : 2;
+ if (ploidy == 2) {
+ /* given allele frequency we can determine how many of each
+ * genotype we have by HWE p=1-q PP=p^2 PQ&QP=2*p*q QQ=q^2 */
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ } else {
+ f3[0] = 1. - f0; f3[1] = 0; f3[2] = f0;
+ }
+ for (i = 0, sum = 0.; i < 3; ++i)
+ sum += (g[i] = pdg[i] * f3[i]);
+ /* normalise g and then determine max */
+ for (i = 0, max = -1., max_i = 0; i < 3; ++i) {
+ g[i] /= sum;
+ if (g[i] > max) max = g[i], max_i = i;
+ }
+ max = 1. - max;
+ if (max < 1e-308) max = 1e-308;
+ q = (int)(-4.343 * log(max) + .499);
+ if (q > 99) q = 99;
+ return q<<2|max_i;
+}
+
+// If likelihoods fall below this they get squashed to 0
+#define TINY 1e-20
+static void mc_cal_y_core(bcf_p1aux_t *ma, int beg)
+{
+ double *z[2], *tmp, *pdg;
+ int _j, last_min, last_max;
+ assert(beg == 0 || ma->M == ma->n*2);
+ z[0] = ma->z;
+ z[1] = ma->zswap;
+ pdg = ma->pdg;
+ memset(z[0], 0, sizeof(double) * (ma->M + 1));
+ memset(z[1], 0, sizeof(double) * (ma->M + 1));
+ z[0][0] = 1.;
+ last_min = last_max = 0;
+ ma->t = 0.;
+ if (ma->M == ma->n * 2) {
+ int M = 0;
+ for (_j = beg; _j < ma->n; ++_j) {
+ int k, j = _j - beg, _min = last_min, _max = last_max, M0;
+ double p[3], sum;
+ M0 = M; M += 2;
+ // Fetch P(D|g) for this sample
+ pdg = ma->pdg + _j * 3;
+ p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2];
+ for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
+ for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
+ _max += 2;
+ if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k];
+ if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1];
+ for (k = _min < 2? 2 : _min; k <= _max; ++k)
+ z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / (M * (M - 1.)));
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ if (_min >= 2) z[1][_min-2] = 0.;
+ // If we are not on the last sample
+ if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
+ if (_j == ma->n1 - 1) { // set pop1; ma->n1==-1 when unset
+ ma->t1 = ma->t;
+ memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1));
+ }
+ tmp = z[0]; z[0] = z[1]; z[1] = tmp;
+ last_min = _min; last_max = _max;
+ }
+ //for (_j = 0; _j < last_min; ++_j) z[0][_j] = 0.; // TODO: are these necessary?
+ //for (_j = last_max + 1; _j < ma->M; ++_j) z[0][_j] = 0.;
+ } else { // this block is very similar to the block above; these two might be merged in future
+ int j, M = 0;
+ for (j = 0; j < ma->n; ++j) {
+ int k, M0, _min = last_min, _max = last_max;
+ double p[3], sum;
+ // Fetch P(D|g) for this sample
+ pdg = ma->pdg + j * 3;
+ for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
+ for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
+ M0 = M;
+ M += ma->ploidy[j];
+ if (ma->ploidy[j] == 1) {
+ p[0] = pdg[0]; p[1] = pdg[2];
+ _max++;
+ if (_min == 0) k = 0, z[1][k] = (M0+1-k) * p[0] * z[0][k];
+ for (k = _min < 1? 1 : _min; k <= _max; ++k)
+ z[1][k] = (M0+1-k) * p[0] * z[0][k] + k * p[1] * z[0][k-1];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / M);
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ // If we are not on the last sample
+ if (j < ma->n - 1) z[1][_max+1] = 0.;
+ } else if (ma->ploidy[j] == 2) {
+ p[0] = pdg[0]; p[1] = 2 * pdg[1]; p[2] = pdg[2];
+ _max += 2;
+ if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k];
+ if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1];
+ for (k = _min < 2? 2 : _min; k <= _max; ++k)
+ z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / (M * (M - 1.)));
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ if (_min >= 2) z[1][_min-2] = 0.;
+ // If we are not on the last sample
+ if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
+ }
+ tmp = z[0]; z[0] = z[1]; z[1] = tmp;
+ last_min = _min; last_max = _max;
+ }
+ }
+ if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1));
+ if (bcf_p1_fp_lk)
+ gzwrite(bcf_p1_fp_lk, ma->z, sizeof(double) * (ma->M + 1));
+}
+
+static void mc_cal_y(bcf_p1aux_t *ma)
+{
+ if (ma->n1 > 0 && ma->n1 < ma->n && ma->M == ma->n * 2) { // NB: ma->n1 is ineffective when there are haploid samples
+ int k;
+ long double x;
+ memset(ma->z1, 0, sizeof(double) * (2 * ma->n1 + 1));
+ memset(ma->z2, 0, sizeof(double) * (2 * (ma->n - ma->n1) + 1));
+ ma->t1 = ma->t2 = 0.;
+ mc_cal_y_core(ma, ma->n1);
+ ma->t2 = ma->t;
+ memcpy(ma->z2, ma->z, sizeof(double) * (2 * (ma->n - ma->n1) + 1));
+ mc_cal_y_core(ma, 0);
+ // rescale z
+ x = expl(ma->t - (ma->t1 + ma->t2));
+ for (k = 0; k <= ma->M; ++k) ma->z[k] *= x;
+ } else mc_cal_y_core(ma, 0);
+}
+
+#define CONTRAST_TINY 1e-30
+
+extern double kf_gammaq(double s, double z); // incomplete gamma function for chi^2 test
+
+static inline double chi2_test(int a, int b, int c, int d)
+{
+ double x, z;
+ x = (double)(a+b) * (c+d) * (b+d) * (a+c);
+ if (x == 0.) return 1;
+ z = a * d - b * c;
+ return kf_gammaq(.5, .5 * z * z * (a+b+c+d) / x);
+}
+
+// chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)]
+static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int k1, int k2, double x[3])
+{
+ double p = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2];
+ int n1 = p1->n1, n2 = p1->n - p1->n1;
+ if (p < CONTRAST_TINY) return -1;
+ if (.5*k1/n1 < .5*k2/n2) x[1] += p;
+ else if (.5*k1/n1 > .5*k2/n2) x[2] += p;
+ else x[0] += p;
+ return p * chi2_test(k1, k2, (n1<<1) - k1, (n2<<1) - k2);
+}
+
+static double contrast2(bcf_p1aux_t *p1, double ret[3])
+{
+ int k, k1, k2, k10, k20, n1, n2;
+ double sum;
+ // get n1 and n2
+ n1 = p1->n1; n2 = p1->n - p1->n1;
+ if (n1 <= 0 || n2 <= 0) return 0.;
+ if (p1->hg == 0) { // initialize the hypergeometric distribution
+ /* NB: the hg matrix may take a lot of memory when there are many samples. There is a way
+ to avoid precomputing this matrix, but it is slower and quite intricate. The following
+ computation in this block can be accelerated with a similar strategy, but perhaps this
+ is not a serious concern for now. */
+ double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1));
+ p1->hg = (double**) calloc(2*n1+1, sizeof(double*));
+ for (k1 = 0; k1 <= 2*n1; ++k1) {
+ p1->hg[k1] = (double*)calloc(2*n2+1, sizeof(double));
+ for (k2 = 0; k2 <= 2*n2; ++k2)
+ p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp));
+ }
+ }
+ { // compute
+ long double suml = 0;
+ for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k];
+ sum = suml;
+ }
+ { // get the max k1 and k2
+ double max;
+ int max_k;
+ for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) {
+ double x = p1->phi1[k] * p1->z1[k];
+ if (x > max) max = x, max_k = k;
+ }
+ k10 = max_k;
+ for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) {
+ double x = p1->phi2[k] * p1->z2[k];
+ if (x > max) max = x, max_k = k;
+ }
+ k20 = max_k;
+ }
+ { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N.
+ double x[3], y;
+ long double z = 0., L[2];
+ x[0] = x[1] = x[2] = 0; L[0] = L[1] = 0;
+ for (k1 = k10; k1 >= 0; --k1) {
+ for (k2 = k20; k2 >= 0; --k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ }
+ ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2];
+ x[0] = x[1] = x[2] = 0;
+ for (k1 = k10 + 1; k1 <= 2*n1; ++k1) {
+ for (k2 = k20; k2 >= 0; --k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ }
+ ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2];
+ if (ret[0] + ret[1] + ret[2] < 0.95) { // fall back in case something went wrong
+ ret[0] = ret[1] = ret[2] = 0; L[0] = L[1] = 0;
+ for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1)
+ for (k2 = 0; k2 <= 2*n2; ++k2)
+ if ((y = contrast2_aux(p1, sum, k1, k2, ret)) >= 0) z += y;
+ if (ret[0] + ret[1] + ret[2] < 0.95) // It seems that this may be caused by floating point errors. I do not really understand why...
+ z = 1.0, ret[0] = ret[1] = ret[2] = 1./3;
+ }
+ return (double)z;
+ }
+}
+
+static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded)
+{
+ int k;
+ long double sum = 0., sum2;
+ double *phi = ma->is_indel? ma->phi_indel : ma->phi;
+ memset(ma->afs1, 0, sizeof(double) * (ma->M + 1));
+ mc_cal_y(ma);
+ // compute AFS
+ // MP15: is this using equation 20 from doi:10.1093/bioinformatics/btr509?
+ for (k = 0, sum = 0.; k <= ma->M; ++k)
+ sum += (long double)phi[k] * ma->z[k];
+ for (k = 0; k <= ma->M; ++k) {
+ ma->afs1[k] = phi[k] * ma->z[k] / sum;
+ if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.;
+ }
+ // compute folded variant probability
+ for (k = 0, sum = 0.; k <= ma->M; ++k)
+ sum += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k];
+ for (k = 1, sum2 = 0.; k < ma->M; ++k)
+ sum2 += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k];
+ *p_var_folded = sum2 / sum;
+ *p_ref_folded = (phi[k] + phi[ma->M - k]) / 2. * (ma->z[ma->M] + ma->z[0]) / sum;
+ // the expected frequency
+ for (k = 0, sum = 0.; k <= ma->M; ++k) {
+ ma->afs[k] += ma->afs1[k];
+ sum += k * ma->afs1[k];
+ }
+ return sum / ma->M;
+}
+
+int bcf_p1_cal(call_t *call, bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
+{
+ int i, k;
+ long double sum = 0.;
+ ma->is_indel = bcf_is_snp(b) ? 0 : 1;
+ rst->perm_rank = -1;
+
+ ma->PL = call->PLs;
+ ma->PL_len = call->nPLs / b->n_sample;
+ if (b->n_allele < 2) return -1; // FIXME: find a better solution
+
+ rst->rank0 = cal_pdg(b, ma);
+ rst->f_exp = mc_cal_afs(ma, &rst->p_ref_folded, &rst->p_var_folded);
+ rst->p_ref = ma->afs1[ma->M];
+ for (k = 0, sum = 0.; k < ma->M; ++k)
+ sum += ma->afs1[k];
+ rst->p_var = (double)sum;
+ { // compute the allele count
+ double max = -1;
+ rst->ac = -1;
+ for (k = 0; k <= ma->M; ++k)
+ if (max < ma->z[k]) max = ma->z[k], rst->ac = k;
+ rst->ac = ma->M - rst->ac;
+ }
+ // calculate f_flat and f_em
+ for (k = 0, sum = 0.; k <= ma->M; ++k)
+ sum += (long double)ma->z[k];
+ rst->f_flat = 0.;
+ for (k = 0; k <= ma->M; ++k) {
+ double p = ma->z[k] / sum;
+ rst->f_flat += k * p;
+ }
+ rst->f_flat /= ma->M;
+ { // estimate equal-tail credible interval (95% level)
+ int l, h;
+ double p;
+ for (i = 0, p = 0.; i <= ma->M; ++i)
+ if (p + ma->afs1[i] > 0.025) break;
+ else p += ma->afs1[i];
+ l = i;
+ for (i = ma->M, p = 0.; i >= 0; --i)
+ if (p + ma->afs1[i] > 0.025) break;
+ else p += ma->afs1[i];
+ h = i;
+ rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M;
+ }
+ if (ma->n1 > 0) { // compute LRT
+ double max0, max1, max2;
+ for (k = 0, max0 = -1; k <= ma->M; ++k)
+ if (max0 < ma->z[k]) max0 = ma->z[k];
+ for (k = 0, max1 = -1; k <= ma->n1 * 2; ++k)
+ if (max1 < ma->z1[k]) max1 = ma->z1[k];
+ for (k = 0, max2 = -1; k <= ma->M - ma->n1 * 2; ++k)
+ if (max2 < ma->z2[k]) max2 = ma->z2[k];
+ rst->lrt = log(max1 * max2 / max0);
+ rst->lrt = rst->lrt < 0? 1 : kf_gammaq(.5, rst->lrt);
+ } else rst->lrt = -1.0;
+ rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0;
+ if (do_contrast && rst->p_var > 0.5) // skip contrast2() if the locus is a strong non-variant
+ rst->p_chi2 = contrast2(ma, rst->cmp);
+ return 0;
+}
+
+void bcf_p1_dump_afs(bcf_p1aux_t *ma)
+{
+ int k;
+ fprintf(pysamerr, "[afs]");
+ for (k = 0; k <= ma->M; ++k)
+ fprintf(pysamerr, " %d:%.3lf", k, ma->afs[ma->M - k]);
+ fprintf(pysamerr, "\n");
+ memset(ma->afs, 0, sizeof(double) * (ma->M + 1));
+}
diff --git a/bcftools/prob1.h b/bcftools/prob1.h
new file mode 100644
index 0000000..1594d3f
--- /dev/null
+++ b/bcftools/prob1.h
@@ -0,0 +1,93 @@
+/* prob1.h -- mathematical utility functions.
+
+ Copyright (C) 2010, 2011 Broad Institute.
+ Copyright (C) 2012, 2013 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#ifndef BCF_PROB1_H
+#define BCF_PROB1_H
+
+#include <htslib/vcf.h>
+#include "call.h"
+
+typedef struct {
+ int n; // Number of samples
+ int M; // Total number of chromosomes across all samples (n*2 if all samples are diploid)
+ int n1;
+ int is_indel;
+ uint8_t *ploidy; // haploid or diploid ONLY
+ double *q2p, *pdg; // q2p maps from phred-scaled values to real-space likelihoods, pdg -> P(D|g)
+ double *phi; // Probability of seeing k reference alleles
+ double *phi_indel;
+ double *z, *zswap; // aux for afs
+ double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set
+ double **hg; // hypergeometric distribution
+ double *lf; // log factorial
+ double t, t1, t2;
+ double *afs, *afs1; // afs: accumulated allele frequency spectrum (AFS); afs1: site posterior distribution
+ const int *PL; // point to PL
+ int PL_len;
+ int cons_llr; // pair and trio calling
+ int64_t cons_gt;
+} bcf_p1aux_t;
+
+typedef struct {
+ int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal()
+ int ac; // ML alternative allele count
+ double f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var;
+ double cil, cih;
+ double cmp[3], p_chi2, lrt; // used by contrast2()
+} bcf_p1rst_t;
+
+typedef struct {
+ double p[4];
+ double edb, mqb, bqb; // end distance bias, mapQ bias, baseQ bias
+ int mq, depth, is_tested, d[4];
+} anno16_t;
+
+#define MC_PTYPE_FULL 1
+#define MC_PTYPE_COND2 2
+#define MC_PTYPE_FLAT 3
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bcf_p1aux_t *bcf_p1_init(int n_smpl, uint8_t *ploidy);
+ void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta);
+ void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta);
+ void bcf_p1_destroy(bcf_p1aux_t *ma);
+ void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma);
+ int bcf_p1_cal(call_t *call, bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst);
+ int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k);
+ void bcf_p1_dump_afs(bcf_p1aux_t *ma);
+ int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn);
+ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1);
+ void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called
+
+ int bcf_em1(call_t *call, const bcf1_t *b, int n1, int flag, double x[10]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
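
As a rough sketch of how the API declared above fits together, a caller allocates the auxiliary structure for a fixed number of samples, installs a prior, and later frees it; per-site work then goes through bcf_p1_cal(). The snippet below is illustrative only and assumes all samples are diploid (a NULL ploidy array falls back to ploidy 2, as bcf_p1_call_gt() shows):

    #include "prob1.h"

    /* Illustrative lifecycle: 10 diploid samples, full prior with theta = 1e-3. */
    static bcf_p1aux_t *p1_setup(void)
    {
        bcf_p1aux_t *p1 = bcf_p1_init(10, NULL);     /* NULL ploidy => treated as diploid */
        bcf_p1_init_prior(p1, MC_PTYPE_FULL, 1e-3);  /* prior type and scaled mutation rate */
        return p1;                                    /* released later with bcf_p1_destroy(p1) */
    }
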
diff --git a/bcftools/pysam.h b/bcftools/pysam.h
new file mode 100644
index 0000000..008cbbd
--- /dev/null
+++ b/bcftools/pysam.h
@@ -0,0 +1,5 @@
+#ifndef PYSAM_H
+#define PYSAM_H
+#include "stdio.h"
+extern FILE * pysamerr;
+#endif
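
The header above only declares the pysamerr stream. The *.pysam.c copies of each source file include it so that diagnostics which plain bcftools writes to stderr go to a stream pysam controls instead (compare tabix.c with tabix.c.pysam.c below). A small sketch of that pattern, with a made-up message:

    #include <stdio.h>
    #include "pysam.h"

    static void report(const char *msg)
    {
        fprintf(pysamerr, "%s\n", msg);  /* would be stderr in the unwrapped sources */
    }
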
diff --git a/bcftools/rbuf.h b/bcftools/rbuf.h
new file mode 100644
index 0000000..3d2805c
--- /dev/null
+++ b/bcftools/rbuf.h
@@ -0,0 +1,201 @@
+/* rbuf.h -- round buffers.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#ifndef __RBUF_H__
+#define __RBUF_H__
+
+#include <string.h>
+
+typedef struct
+{
+ int m,n,f; // m: allocated size, n: number of elements in the buffer, f: first element
+}
+rbuf_t;
+
+/**
+ * rbuf_init() - initialize round buffer
+ * @rbuf: the rbuf_t holder
+ * @size: the maximum number of elements
+ *
+ */
+static inline void rbuf_init(rbuf_t *rbuf, int size)
+{
+ rbuf->m = size; rbuf->n = rbuf->f = 0;
+}
+/**
+ * rbuf_kth() - get index of the k-th element of the round buffer
+ * @rbuf: the rbuf_t holder
+ * @k: 0-based index
+ */
+static inline int rbuf_kth(rbuf_t *rbuf, int k)
+{
+ if ( k >= rbuf->n || k<0 ) return -1;
+ int i = k + rbuf->f;
+ if ( i >= rbuf->m ) i -= rbuf->m;
+ return i;
+}
+/**
+ * rbuf_last() - get index of the last element of the round buffer
+ * @rbuf: the rbuf_t holder
+ *
+ */
+#define rbuf_last(rbuf) rbuf_kth(rbuf, (rbuf)->n - 1)
+
+/**
+ * rbuf_next() - get index of the next element in the round buffer
+ * @rbuf: the rbuf_t holder
+ * @i: pointer to the last rbuf index. Set to -1 before the first call.
+ *
+ * Sets i to the next position in the buffer. The return value indicates if
+ * the position points to a valid element (1) or if there are no more elements
+ * after *i (0). When the end is reached, *i is set to the first element in the
+ * buffer.
+ */
+static inline int rbuf_next(rbuf_t *rbuf, int *i)
+{
+ if ( !rbuf->n ) return 0;
+ if ( *i==-1 ) { *i = rbuf->f; return 1; }
+ int n = (rbuf->f <= *i) ? *i - rbuf->f + 1 : *i + rbuf->m - rbuf->f + 1;
+ if ( ++(*i) >= rbuf->m ) *i = 0;
+ if ( n < rbuf->n ) return 1;
+ *i = rbuf->f;
+ return 0;
+}
+/**
+ * rbuf_prev() - get index of the previous element in the round buffer
+ * @rbuf: the rbuf_t holder
+ * @i: pointer to the last rbuf index. Set to -1 before the first call.
+ *
+ * Sets i to the previous position in the buffer. The return value indicates if
+ * the position points to a valid element (1) or if there are no more elements
+ * before *i (0).
+ */
+static inline int rbuf_prev(rbuf_t *rbuf, int *i)
+{
+ if ( !rbuf->n || *i==rbuf->f ) return 0;
+ if ( *i==-1 )
+ {
+ *i = rbuf_last(rbuf);
+ return 1;
+ }
+ if ( --(*i) < 0 ) *i = rbuf->m - 1;
+ return 1;
+}
+/**
+ * rbuf_prepend() - register new element at the start of the round buffer
+ * @rbuf: the rbuf_t holder
+ *
+ * Returns index of the newly inserted element.
+ */
+static inline int rbuf_prepend(rbuf_t *rbuf)
+{
+ if ( rbuf->n < rbuf->m ) rbuf->n++;
+
+ rbuf->f = rbuf->f > 0 ? rbuf->f - 1 : rbuf->m - 1;
+ return rbuf->f;
+}
+/**
+ * rbuf_append() - register new element at the end of the round buffer
+ * @rbuf: the rbuf_t holder
+ *
+ * Returns index of the newly inserted element.
+ */
+static inline int rbuf_append(rbuf_t *rbuf)
+{
+ if ( rbuf->n < rbuf->m )
+ {
+ rbuf->n++;
+ int i = rbuf->f + rbuf->n;
+ return i <= rbuf->m ? i - 1 : i - rbuf->m - 1;
+ }
+
+ rbuf->f++;
+ if ( rbuf->f >= rbuf->m )
+ {
+ rbuf->f = 0;
+ return rbuf->m - 1;
+ }
+ return rbuf->f - 1;
+}
+/**
+ * rbuf_shift() - removes first element from the buffer
+ * @rbuf: the rbuf_t holder
+ *
+ * Returns index of the removed element.
+ */
+static inline int rbuf_shift(rbuf_t *rbuf)
+{
+ if ( !rbuf->n ) return -1;
+ int ret = rbuf->f;
+ rbuf->f++;
+ if ( rbuf->f >= rbuf->m ) rbuf->f = 0;
+ rbuf->n--;
+ return ret;
+}
+/**
+ * rbuf_shift_n() - removes first n elements from the buffer
+ * @rbuf: the rbuf_t holder
+ * @n: number of elements to remove
+ */
+static inline void rbuf_shift_n(rbuf_t *rbuf, int n)
+{
+ if ( n >= rbuf->n )
+ {
+ rbuf->n = rbuf->f = 0;
+ return;
+ }
+ rbuf->n -= n;
+ rbuf->f += n;
+ if ( rbuf->f >= rbuf->m ) rbuf->f -= rbuf->m;
+}
+
+/**
+ * rbuf_expand0() - expand round buffer and set the newly allocated elements to 0
+ * @rbuf: the rbuf holder
+ * @type_t: data type
+ * @n: requested number of elements
+ * @data: data array to be realloced
+ *
+ * Note: The new array is linearized and leaves the rbuf.f offset untouched,
+ * thus the size of the new buffer is determined by the current position.
+ */
+#define rbuf_expand0(rbuf,type_t,n,data) \
+{ \
+ if ( n > (rbuf)->m ) \
+ { \
+ int m = n + (rbuf)->f; \
+ m--, m|=m>>1, m|=m>>2, m|=m>>4, m|=m>>8, m|=m>>16, m++; /* kroundup32 */ \
+ data = (type_t*) realloc(data, sizeof(type_t)*m); \
+ type_t *ptr = data; \
+ memset(ptr+(rbuf)->m,0,sizeof(type_t)*(m-(rbuf)->m)); \
+ if ( (rbuf)->f ) \
+ { \
+ memcpy(ptr+(rbuf)->m,ptr,sizeof(type_t)*(rbuf)->f); \
+ memset(ptr,0,sizeof(type_t)*(rbuf)->f); \
+ } \
+ (rbuf)->m = m; \
+ } \
+}
+
+#endif
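
A short usage sketch of the round buffer above, built only from the inline helpers defined in this header; the element array and its capacity are made up for illustration:

    #include <stdio.h>
    #include "rbuf.h"

    int main(void)
    {
        double vals[4];              /* element storage is owned by the caller */
        rbuf_t rb;
        int x, i;
        rbuf_init(&rb, 4);           /* capacity of 4 elements */

        for (x = 0; x < 6; x++)      /* append 6 values; the two oldest are overwritten */
            vals[rbuf_append(&rb)] = x;

        i = -1;
        while ( rbuf_next(&rb, &i) ) /* iterate from oldest to newest: prints 2 3 4 5 */
            printf("%g\n", vals[i]);

        rbuf_shift(&rb);             /* drop the oldest element */
        printf("oldest is now %g\n", vals[rbuf_kth(&rb, 0)]);  /* 3 */
        return 0;
    }
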
diff --git a/bcftools/tabix.c b/bcftools/tabix.c
new file mode 100644
index 0000000..2f24b92
--- /dev/null
+++ b/bcftools/tabix.c
@@ -0,0 +1,129 @@
+/* tabix.c -- tabix subcommand.
+
+ Copyright (C) 2012 Broad Institute.
+ Copyright (C) 2013 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <htslib/bgzf.h>
+#include <htslib/tbx.h>
+
+int main_tabix(int argc, char *argv[])
+{
+ int c, min_shift = -1, is_force = 0, is_all = 0;
+ tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
+ while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0)
+ if (c == '0') conf.preset |= TBX_UCSC;
+ else if (c == 'f') is_force = 1;
+ else if (c == 'a') is_all = 1;
+ else if (c == 'm') min_shift = atoi(optarg);
+ else if (c == 's') conf.sc = atoi(optarg);
+ else if (c == 'b') conf.bc = atoi(optarg);
+ else if (c == 'e') conf.ec = atoi(optarg);
+ else if (c == 'c') conf.meta_char = *optarg;
+ else if (c == 'S') conf.line_skip = atoi(optarg);
+ else if (c == 'p') {
+ if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
+ else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
+ else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
+ else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
+ else {
+ fprintf(stderr, "The type '%s' not recognised\n", optarg);
+ return 1;
+ }
+
+ }
+ if (optind == argc) {
+ fprintf(stderr, "\nUsage: bcftools tabix [options] <in.gz> [reg1 [...]]\n\n");
+ fprintf(stderr, "Options: -p STR preset: gff, bed, sam or vcf [gff]\n");
+ fprintf(stderr, " -s INT column number for sequence names (suppressed by -p) [1]\n");
+ fprintf(stderr, " -b INT column number for region start [4]\n");
+ fprintf(stderr, " -e INT column number for region end (if no end, set INT to -b) [5]\n");
+ fprintf(stderr, " -0 specify coordinates are zero-based\n");
+ fprintf(stderr, " -S INT skip first INT lines [0]\n");
+ fprintf(stderr, " -c CHAR skip lines starting with CHAR [null]\n");
+ fprintf(stderr, " -a print all records\n");
+ fprintf(stderr, " -f force to overwrite existing index\n");
+ fprintf(stderr, " -m INT set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n");
+ fprintf(stderr, "\n");
+ return 1;
+ }
+ if (is_all) { // read without random access
+ kstring_t s;
+ BGZF *fp;
+ s.l = s.m = 0; s.s = 0;
+ fp = bgzf_open(argv[optind], "r");
+ while (bgzf_getline(fp, '\n', &s) >= 0) puts(s.s);
+ bgzf_close(fp);
+ free(s.s);
+ } else if (optind + 2 > argc) { // create index
+ if ( !conf_ptr )
+ {
+ // auto-detect file type by file name
+ int l = strlen(argv[optind]);
+ int strcasecmp(const char *s1, const char *s2);
+ if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf;
+ }
+ if ( conf_ptr ) conf = *conf_ptr;
+
+ if (!is_force) {
+ char *fn;
+ FILE *fp;
+ fn = (char*)alloca(strlen(argv[optind]) + 5);
+ strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi");
+ if ((fp = fopen(fn, "rb")) != 0) {
+ fclose(fp);
+ fprintf(stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__);
+ return 1;
+ }
+ }
+ if ( tbx_index_build(argv[optind], min_shift, &conf) )
+ {
+ fprintf(stderr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n");
+ return 1;
+ }
+ } else { // read with random access
+ tbx_t *tbx;
+ BGZF *fp;
+ kstring_t s;
+ int i;
+ if ((tbx = tbx_index_load(argv[optind])) == 0) return 1;
+ if ((fp = bgzf_open(argv[optind], "r")) == 0) return 1;
+ s.s = 0; s.l = s.m = 0;
+ for (i = optind + 1; i < argc; ++i) {
+ hts_itr_t *itr;
+ if ((itr = tbx_itr_querys(tbx, argv[i])) == 0) continue;
+ while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) puts(s.s);
+ tbx_itr_destroy(itr);
+ }
+ free(s.s);
+ bgzf_close(fp);
+ tbx_destroy(tbx);
+ }
+ return 0;
+}
diff --git a/bcftools/tabix.c.pysam.c b/bcftools/tabix.c.pysam.c
new file mode 100644
index 0000000..0eb328f
--- /dev/null
+++ b/bcftools/tabix.c.pysam.c
@@ -0,0 +1,131 @@
+#include "pysam.h"
+
+/* tabix.c -- tabix subcommand.
+
+ Copyright (C) 2012 Broad Institute.
+ Copyright (C) 2013 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <htslib/bgzf.h>
+#include <htslib/tbx.h>
+
+int main_tabix(int argc, char *argv[])
+{
+ int c, min_shift = -1, is_force = 0, is_all = 0;
+ tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
+ while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0)
+ if (c == '0') conf.preset |= TBX_UCSC;
+ else if (c == 'f') is_force = 1;
+ else if (c == 'a') is_all = 1;
+ else if (c == 'm') min_shift = atoi(optarg);
+ else if (c == 's') conf.sc = atoi(optarg);
+ else if (c == 'b') conf.bc = atoi(optarg);
+ else if (c == 'e') conf.ec = atoi(optarg);
+ else if (c == 'c') conf.meta_char = *optarg;
+ else if (c == 'S') conf.line_skip = atoi(optarg);
+ else if (c == 'p') {
+ if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
+ else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
+ else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
+ else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
+ else {
+ fprintf(pysamerr, "The type '%s' not recognised\n", optarg);
+ return 1;
+ }
+
+ }
+ if (optind == argc) {
+ fprintf(pysamerr, "\nUsage: bcftools tabix [options] <in.gz> [reg1 [...]]\n\n");
+ fprintf(pysamerr, "Options: -p STR preset: gff, bed, sam or vcf [gff]\n");
+ fprintf(pysamerr, " -s INT column number for sequence names (suppressed by -p) [1]\n");
+ fprintf(pysamerr, " -b INT column number for region start [4]\n");
+ fprintf(pysamerr, " -e INT column number for region end (if no end, set INT to -b) [5]\n");
+ fprintf(pysamerr, " -0 specify coordinates are zero-based\n");
+ fprintf(pysamerr, " -S INT skip first INT lines [0]\n");
+ fprintf(pysamerr, " -c CHAR skip lines starting with CHAR [null]\n");
+ fprintf(pysamerr, " -a print all records\n");
+ fprintf(pysamerr, " -f force to overwrite existing index\n");
+ fprintf(pysamerr, " -m INT set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n");
+ fprintf(pysamerr, "\n");
+ return 1;
+ }
+ if (is_all) { // read without random access
+ kstring_t s;
+ BGZF *fp;
+ s.l = s.m = 0; s.s = 0;
+ fp = bgzf_open(argv[optind], "r");
+ while (bgzf_getline(fp, '\n', &s) >= 0) puts(s.s);
+ bgzf_close(fp);
+ free(s.s);
+ } else if (optind + 2 > argc) { // create index
+ if ( !conf_ptr )
+ {
+ // auto-detect file type by file name
+ int l = strlen(argv[optind]);
+ int strcasecmp(const char *s1, const char *s2);
+ if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf;
+ }
+ if ( conf_ptr ) conf = *conf_ptr;
+
+ if (!is_force) {
+ char *fn;
+ FILE *fp;
+ fn = (char*)alloca(strlen(argv[optind]) + 5);
+ strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi");
+ if ((fp = fopen(fn, "rb")) != 0) {
+ fclose(fp);
+ fprintf(pysamerr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__);
+ return 1;
+ }
+ }
+ if ( tbx_index_build(argv[optind], min_shift, &conf) )
+ {
+ fprintf(pysamerr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n");
+ return 1;
+ }
+ } else { // read with random access
+ tbx_t *tbx;
+ BGZF *fp;
+ kstring_t s;
+ int i;
+ if ((tbx = tbx_index_load(argv[optind])) == 0) return 1;
+ if ((fp = bgzf_open(argv[optind], "r")) == 0) return 1;
+ s.s = 0; s.l = s.m = 0;
+ for (i = optind + 1; i < argc; ++i) {
+ hts_itr_t *itr;
+ if ((itr = tbx_itr_querys(tbx, argv[i])) == 0) continue;
+ while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) puts(s.s);
+ tbx_itr_destroy(itr);
+ }
+ free(s.s);
+ bgzf_close(fp);
+ tbx_destroy(tbx);
+ }
+ return 0;
+}
diff --git a/bcftools/tsv2vcf.c b/bcftools/tsv2vcf.c
new file mode 100644
index 0000000..8826f18
--- /dev/null
+++ b/bcftools/tsv2vcf.c
@@ -0,0 +1,121 @@
+/* tsv2vcf.c -- convert from whitespace-separated fields to VCF
+
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <ctype.h>
+#include "tsv2vcf.h"
+
+tsv_t *tsv_init(const char *str)
+{
+ tsv_t *tsv = (tsv_t *) calloc(1,sizeof(tsv_t));
+ kstring_t tmp = {0,0,0};
+ const char *ss = str, *se = ss;
+ tsv->ncols = 0;
+ while ( *ss )
+ {
+ if ( *se && *se!=',' ) { se++; continue; }
+ tsv->ncols++;
+ tsv->cols = (tsv_col_t*) realloc(tsv->cols,sizeof(tsv_col_t)*tsv->ncols);
+ tsv->cols[tsv->ncols-1].name = NULL;
+ tsv->cols[tsv->ncols-1].setter = NULL;
+ tmp.l = 0;
+ kputsn(ss, se-ss, &tmp);
+ if ( strcasecmp("-",tmp.s) )
+ tsv->cols[tsv->ncols-1].name = strdup(tmp.s);
+ if ( !*se ) break;
+ ss = ++se;
+ }
+ free(tmp.s);
+ return tsv;
+}
+
+void tsv_destroy(tsv_t *tsv)
+{
+ int i;
+ for (i=0; i<tsv->ncols; i++) free(tsv->cols[i].name);
+ free(tsv->cols);
+ free(tsv);
+}
+
+int tsv_register(tsv_t *tsv, const char *id, tsv_setter_t setter, void *usr)
+{
+ int i;
+ for (i=0; i<tsv->ncols; i++)
+ {
+ if ( !tsv->cols[i].name || strcasecmp(tsv->cols[i].name,id) ) continue;
+ tsv->cols[i].setter = setter;
+ tsv->cols[i].usr = usr;
+ return 0;
+ }
+ return -1;
+}
+
+int tsv_parse(tsv_t *tsv, bcf1_t *rec, char *str)
+{
+ int status = 0;
+ tsv->icol = 0;
+ tsv->ss = tsv->se = str;
+ while ( *tsv->ss && tsv->icol < tsv->ncols )
+ {
+ while ( *tsv->se && !isspace(*tsv->se) ) tsv->se++;
+ if ( tsv->cols[tsv->icol].setter )
+ {
+ int ret = tsv->cols[tsv->icol].setter(tsv,rec,tsv->cols[tsv->icol].usr);
+ if ( ret<0 ) return -1;
+ status++;
+ }
+ while ( *tsv->se && isspace(*tsv->se) ) tsv->se++;
+ tsv->ss = tsv->se;
+ tsv->icol++;
+ }
+ return status ? 0 : -1;
+}
+
+int tsv_setter_chrom(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ char tmp = *tsv->se;
+ *tsv->se = 0;
+ rec->rid = bcf_hdr_name2id((bcf_hdr_t*)usr, tsv->ss);
+ *tsv->se = tmp;
+ return rec->rid==-1 ? -1 : 0;
+}
+
+int tsv_setter_pos(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ char *endptr;
+ rec->pos = strtol(tsv->ss, &endptr, 10) - 1;
+ if ( tsv->ss==endptr ) return -1;
+ return 0;
+}
+
+int tsv_setter_id(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ char tmp = *tsv->se;
+ *tsv->se = 0;
+ bcf_update_id((bcf_hdr_t*)usr, rec, tsv->ss);
+ *tsv->se = tmp;
+ return 0;
+}
+
+
diff --git a/bcftools/tsv2vcf.c.pysam.c b/bcftools/tsv2vcf.c.pysam.c
new file mode 100644
index 0000000..1da48d5
--- /dev/null
+++ b/bcftools/tsv2vcf.c.pysam.c
@@ -0,0 +1,123 @@
+#include "pysam.h"
+
+/* tsv2vcf.c -- convert from whitespace-separated fields to VCF
+
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <ctype.h>
+#include "tsv2vcf.h"
+
+tsv_t *tsv_init(const char *str)
+{
+ tsv_t *tsv = (tsv_t *) calloc(1,sizeof(tsv_t));
+ kstring_t tmp = {0,0,0};
+ const char *ss = str, *se = ss;
+ tsv->ncols = 0;
+ while ( *ss )
+ {
+ if ( *se && *se!=',' ) { se++; continue; }
+ tsv->ncols++;
+ tsv->cols = (tsv_col_t*) realloc(tsv->cols,sizeof(tsv_col_t)*tsv->ncols);
+ tsv->cols[tsv->ncols-1].name = NULL;
+ tsv->cols[tsv->ncols-1].setter = NULL;
+ tmp.l = 0;
+ kputsn(ss, se-ss, &tmp);
+ if ( strcasecmp("-",tmp.s) )
+ tsv->cols[tsv->ncols-1].name = strdup(tmp.s);
+ if ( !*se ) break;
+ ss = ++se;
+ }
+ free(tmp.s);
+ return tsv;
+}
+
+void tsv_destroy(tsv_t *tsv)
+{
+ int i;
+ for (i=0; i<tsv->ncols; i++) free(tsv->cols[i].name);
+ free(tsv->cols);
+ free(tsv);
+}
+
+int tsv_register(tsv_t *tsv, const char *id, tsv_setter_t setter, void *usr)
+{
+ int i;
+ for (i=0; i<tsv->ncols; i++)
+ {
+ if ( !tsv->cols[i].name || strcasecmp(tsv->cols[i].name,id) ) continue;
+ tsv->cols[i].setter = setter;
+ tsv->cols[i].usr = usr;
+ return 0;
+ }
+ return -1;
+}
+
+int tsv_parse(tsv_t *tsv, bcf1_t *rec, char *str)
+{
+ int status = 0;
+ tsv->icol = 0;
+ tsv->ss = tsv->se = str;
+ while ( *tsv->ss && tsv->icol < tsv->ncols )
+ {
+ while ( *tsv->se && !isspace(*tsv->se) ) tsv->se++;
+ if ( tsv->cols[tsv->icol].setter )
+ {
+ int ret = tsv->cols[tsv->icol].setter(tsv,rec,tsv->cols[tsv->icol].usr);
+ if ( ret<0 ) return -1;
+ status++;
+ }
+ while ( *tsv->se && isspace(*tsv->se) ) tsv->se++;
+ tsv->ss = tsv->se;
+ tsv->icol++;
+ }
+ return status ? 0 : -1;
+}
+
+int tsv_setter_chrom(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ char tmp = *tsv->se;
+ *tsv->se = 0;
+ rec->rid = bcf_hdr_name2id((bcf_hdr_t*)usr, tsv->ss);
+ *tsv->se = tmp;
+ return rec->rid==-1 ? -1 : 0;
+}
+
+int tsv_setter_pos(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ char *endptr;
+ rec->pos = strtol(tsv->ss, &endptr, 10) - 1;
+ if ( tsv->ss==endptr ) return -1;
+ return 0;
+}
+
+int tsv_setter_id(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ char tmp = *tsv->se;
+ *tsv->se = 0;
+ bcf_update_id((bcf_hdr_t*)usr, rec, tsv->ss);
+ *tsv->se = tmp;
+ return 0;
+}
+
+
diff --git a/bcftools/tsv2vcf.h b/bcftools/tsv2vcf.h
new file mode 100644
index 0000000..6fe5b45
--- /dev/null
+++ b/bcftools/tsv2vcf.h
@@ -0,0 +1,85 @@
+/* tsv2vcf.h -- convert from whitespace-separated fields to VCF
+
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#ifndef __TSV2VCF_H__
+#define __TSV2VCF_H__
+
+#include <htslib/vcf.h>
+
+typedef struct _tsv_t tsv_t;
+typedef int (*tsv_setter_t)(tsv_t *, bcf1_t *, void *);
+
+typedef struct
+{
+ char *name;
+ tsv_setter_t setter;
+ void *usr;
+}
+tsv_col_t;
+
+struct _tsv_t
+{
+ int ncols, icol;
+ tsv_col_t *cols;
+ char *se, *ss;
+};
+
+tsv_t *tsv_init(const char *str);
+void tsv_destroy(tsv_t *tsv);
+int tsv_register(tsv_t *tsv, const char *id, tsv_setter_t setter, void *usr);
+
+/**
+ * tsv_parse() - parse tsv line and fill VCF record
+ * Returns 0 on success or -1 on parse error
+ */
+int tsv_parse(tsv_t *tsv, bcf1_t *rec, char *str);
+
+/**
+ * tsv_next() - position ss,se to next field; first pass with ss=se=str
+ * Returns 0 on success, or -1 if no more fields
+ */
+static inline int tsv_next(tsv_t *tsv)
+{
+ if ( !*tsv->se ) return -1;
+ if ( tsv->ss==tsv->se )
+ {
+ while ( *tsv->se && !isspace(*tsv->se) ) tsv->se++;
+ return 0;
+ }
+ while ( *tsv->se && isspace(*tsv->se) ) tsv->se++;
+ tsv->ss = tsv->se;
+ while ( *tsv->se && !isspace(*tsv->se) ) tsv->se++;
+ return 0;
+}
+
+/**
+ * The setters return 0 on success or negative value if the line is to be skipped.
+ */
+int tsv_setter_chrom(tsv_t *tsv, bcf1_t *rec, void *usr);
+int tsv_setter_pos(tsv_t *tsv, bcf1_t *rec, void *usr);
+int tsv_setter_id(tsv_t *tsv, bcf1_t *rec, void *usr);
+
+#endif
+
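
As a loose illustration of the tsv2vcf API above: the caller names the expected columns in tsv_init() (a '-' column name is ignored), registers one setter per column of interest, and then feeds each whitespace-separated line to tsv_parse(). The sketch below is hypothetical and assumes hdr is an existing bcf_hdr_t with the relevant contigs defined; in real use the tsv_t would be created once and reused for many lines:

    #include <htslib/vcf.h>
    #include "tsv2vcf.h"

    /* Hypothetical: fill CHROM, POS and ID of a bcf1_t from a "CHROM POS ID" line. */
    static int parse_line(bcf_hdr_t *hdr, bcf1_t *rec, char *line)
    {
        tsv_t *tsv = tsv_init("CHROM,POS,ID");
        tsv_register(tsv, "CHROM", tsv_setter_chrom, hdr);
        tsv_register(tsv, "POS",   tsv_setter_pos,   NULL);
        tsv_register(tsv, "ID",    tsv_setter_id,    hdr);
        int ret = tsv_parse(tsv, rec, line);   /* 0 on success, -1 on parse error */
        tsv_destroy(tsv);
        return ret;
    }
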
diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c
new file mode 100644
index 0000000..96a1649
--- /dev/null
+++ b/bcftools/vcfannotate.c
@@ -0,0 +1,1760 @@
+/* vcfannotate.c -- Annotate and edit VCF/BCF files.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/kseq.h>
+#include <htslib/khash_str2int.h>
+#include <dlfcn.h>
+#include "bcftools.h"
+#include "vcmp.h"
+#include "filter.h"
+#include "convert.h"
+
+struct _args_t;
+
+typedef struct _rm_tag_t
+{
+ char *key;
+ int hdr_id;
+ void (*handler)(struct _args_t *, bcf1_t *, struct _rm_tag_t *);
+}
+rm_tag_t;
+
+typedef struct
+{
+ char **cols;
+ int ncols, mcols;
+ char **als;
+ int nals, mals;
+ kstring_t line;
+ int rid, start, end;
+}
+annot_line_t;
+
+#define REPLACE_MISSING 0 // replace only missing values
+#define REPLACE_ALL 1 // replace both missing and existing values
+#define REPLACE_EXISTING 2 // replace only if tgt is not missing
+#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise
+typedef struct _annot_col_t
+{
+ int icol, replace, number; // number: one of BCF_VL_* types
+ char *hdr_key;
+ int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*);
+}
+annot_col_t;
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+#define MARK_LISTED 1
+#define MARK_UNLISTED 2
+
+typedef struct _args_t
+{
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr, *hdr_out;
+ htsFile *out_fh;
+ int output_type, n_threads;
+ bcf_sr_regions_t *tgts;
+
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+
+ rm_tag_t *rm; // tags scheduled for removal
+ int nrm;
+ int flt_keep_pass; // when all filters removed, reset to PASS
+
+ vcmp_t *vcmp; // for matching annotation and VCF lines by allele
+ annot_line_t *alines; // buffered annotation lines
+ int nalines, malines;
+ int ref_idx, alt_idx, chr_idx, from_idx, to_idx; // -1 if not present
+ annot_col_t *cols; // column indexes and setters
+ int ncols;
+
+ char *set_ids_fmt;
+ convert_t *set_ids;
+ int set_ids_replace;
+
+ int *sample_map, nsample_map, sample_is_file; // map[idst] -> isrc
+ int mtmpi, mtmpf, mtmps;
+ int mtmpi2, mtmpf2, mtmps2;
+ int mtmpi3, mtmpf3, mtmps3;
+ int32_t *tmpi, *tmpi2, *tmpi3;
+ float *tmpf, *tmpf2, *tmpf3;
+ char *tmps, *tmps2, **tmpp, **tmpp2;
+ kstring_t tmpks;
+
+ char **argv, *output_fname, *targets_fname, *regions_list, *header_fname;
+ char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites;
+ int argc, drop_header, tgts_is_vcf, mark_sites_logic;
+}
+args_t;
+
+char *msprintf(const char *fmt, ...);
+
+void remove_id(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ bcf_update_id(args->hdr,line,NULL);
+}
+void remove_filter(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ if ( !tag->key ) bcf_update_filter(args->hdr, line, NULL, args->flt_keep_pass);
+ else bcf_remove_filter(args->hdr, line, tag->hdr_id, args->flt_keep_pass);
+}
+void remove_qual(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ bcf_float_set_missing(line->qual);
+}
+void remove_info(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ // remove all INFO fields
+ if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
+
+ int i;
+ for (i=0; i<line->n_info; i++)
+ {
+ bcf_info_t *inf = &line->d.info[i];
+ if ( inf->vptr_free )
+ {
+ free(inf->vptr - inf->vptr_off);
+ inf->vptr_free = 0;
+ }
+ line->d.shared_dirty |= BCF1_DIRTY_INF;
+ inf->vptr = NULL;
+ }
+}
+void remove_info_tag(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ bcf_update_info(args->hdr, line, tag->key, NULL, 0, BCF_HT_INT); // the type does not matter with n=0
+}
+void remove_format_tag(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ bcf_update_format(args->hdr, line, tag->key, NULL, 0, BCF_HT_INT); // the type does not matter with n=0
+}
+void remove_format(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ // remove all FORMAT fields except GT
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+
+ int i;
+ for (i=0; i<line->n_fmt; i++)
+ {
+ bcf_fmt_t *fmt = &line->d.fmt[i];
+ const char *key = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id);
+ if ( key[0]=='G' && key[1]=='T' && !key[2] ) continue;
+
+ if ( fmt->p_free )
+ {
+ free(fmt->p - fmt->p_off);
+ fmt->p_free = 0;
+ }
+ line->d.indiv_dirty = 1;
+ fmt->p = NULL;
+ }
+}
+
+static void remove_hdr_lines(bcf_hdr_t *hdr, int type)
+{
+ int i = 0, nrm = 0;
+ while ( i<hdr->nhrec )
+ {
+ if ( hdr->hrec[i]->type!=type ) { i++; continue; }
+ bcf_hrec_t *hrec = hdr->hrec[i];
+ if ( type==BCF_HL_FMT )
+ {
+ // everything except FORMAT/GT
+ int id = bcf_hrec_find_key(hrec, "ID");
+ if ( id>=0 && !strcmp(hrec->vals[id],"GT") ) { i++; continue; }
+ }
+ nrm++;
+ hdr->nhrec--;
+ if ( i < hdr->nhrec )
+ memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
+ bcf_hrec_destroy(hrec);
+ }
+ if ( nrm ) bcf_hdr_sync(hdr);
+}
+
+static void init_remove_annots(args_t *args)
+{
+ int keep_info = 0, keep_fmt = 0, keep_flt = 0;
+ void *keep = khash_str2int_init();
+ kstring_t str = {0,0,0};
+ char *ss = args->remove_annots;
+ while ( *ss )
+ {
+ args->nrm++;
+ args->rm = (rm_tag_t*) realloc(args->rm,sizeof(rm_tag_t)*args->nrm);
+ rm_tag_t *tag = &args->rm[args->nrm-1];
+ tag->key = NULL;
+
+ int type = BCF_HL_GEN;
+ if ( !strncasecmp("INFO/",ss,5) ) { type = BCF_HL_INFO; ss += 5; }
+ else if ( !strncasecmp("INF/",ss,4) ) { type = BCF_HL_INFO; ss += 4; }
+ else if ( !strncasecmp("FORMAT/",ss,7) ) { type = BCF_HL_FMT; ss += 7; }
+ else if ( !strncasecmp("FMT/",ss,4) ) { type = BCF_HL_FMT; ss += 4; }
+ else if ( !strncasecmp("FILTER/",ss,7) ) { type = BCF_HL_FLT; ss += 7; }
+ else if ( !strncasecmp("^INFO/",ss,6) ) { type = BCF_HL_INFO; ss += 6; keep_info = 1; }
+ else if ( !strncasecmp("^INF/",ss,5) ) { type = BCF_HL_INFO; ss += 5; keep_info = 1; }
+ else if ( !strncasecmp("^FORMAT/",ss,8) ) { type = BCF_HL_FMT; ss += 8; keep_fmt = 1; }
+ else if ( !strncasecmp("^FMT/",ss,5) ) { type = BCF_HL_FMT; ss += 5; keep_fmt = 1; }
+ else if ( !strncasecmp("^FILTER/",ss,8) ) { type = BCF_HL_FLT; ss += 8; keep_flt = 1; }
+
+ char *se = ss;
+ while ( *se && *se!=',' ) se++;
+ str.l = 0;
+ kputsn(ss, se-ss, &str);
+
+ if ( type==BCF_HL_FLT )
+ {
+ if ( !keep_flt )
+ {
+ args->flt_keep_pass = 1;
+ tag->handler = remove_filter;
+ tag->key = strdup(str.s);
+ tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key);
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s);
+ bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
+ }
+ else
+ {
+ int value, ret = khash_str2int_get(keep, str.s, &value);
+ if ( ret==-1 ) khash_str2int_set(keep, strdup(str.s),1<<BCF_HL_FLT);
+ else khash_str2int_set(keep, str.s, value | 1<<BCF_HL_FLT);
+ args->nrm--;
+ }
+ }
+ else if ( type!=BCF_HL_GEN )
+ {
+ int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,str.s);
+ if ( !bcf_hdr_idinfo_exists(args->hdr,type,id) )
+ {
+ fprintf(stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s);
+ args->nrm--;
+ }
+ else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) )
+ {
+ int value, ret = khash_str2int_get(keep, str.s, &value);
+ if ( ret==-1 ) khash_str2int_set(keep, strdup(str.s),1<<type);
+ else khash_str2int_set(keep, str.s, value | 1<<type);
+ args->nrm--;
+ }
+ else
+ {
+ tag->key = strdup(str.s);
+ if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag;
+ else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
+ bcf_hdr_remove(args->hdr_out,type,tag->key);
+ }
+ }
+ else if ( !strcasecmp("ID",str.s) ) tag->handler = remove_id;
+ else if ( !strcasecmp("FILTER",str.s) )
+ {
+ tag->handler = remove_filter;
+ remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
+ }
+ else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual;
+ else if ( !strcasecmp("INFO",str.s) )
+ {
+ tag->handler = remove_info;
+ remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
+ }
+ else if ( !strcasecmp("FMT",str.s) || !strcasecmp("FORMAT",str.s) )
+ {
+ tag->handler = remove_format;
+ remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
+ }
+ else if ( str.l )
+ {
+ if ( str.s[0]=='#' && str.s[1]=='#' )
+ bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
+ else
+ bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
+ args->nrm--;
+ }
+
+ ss = *se ? se+1 : se;
+ }
+ free(str.s);
+ if ( keep_flt || keep_info || keep_fmt )
+ {
+ int j;
+ for (j=0; j<args->hdr->nhrec; j++)
+ {
+ bcf_hrec_t *hrec = args->hdr->hrec[j];
+ if ( hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FMT ) continue;
+ if ( !keep_flt && hrec->type==BCF_HL_FLT ) continue;
+ if ( !keep_info && hrec->type==BCF_HL_INFO ) continue;
+ if ( !keep_fmt && hrec->type==BCF_HL_FMT ) continue;
+ int k = bcf_hrec_find_key(hrec,"ID");
+ assert( k>=0 ); // this should always be true for valid VCFs
+ int value, ret = khash_str2int_get(keep,hrec->vals[k],&value);
+ if ( ret==0 && value>>hrec->type ) // keep
+ {
+ if ( hrec->type==BCF_HL_FLT && !strcmp("PASS",hrec->vals[k]) ) args->flt_keep_pass = 1;
+ continue;
+ }
+ args->nrm++;
+ args->rm = (rm_tag_t*) realloc(args->rm,sizeof(rm_tag_t)*args->nrm);
+ rm_tag_t *tag = &args->rm[args->nrm-1];
+ if ( hrec->type==BCF_HL_INFO ) tag->handler = remove_info_tag;
+ else if ( hrec->type==BCF_HL_FMT ) tag->handler = remove_format_tag;
+ else
+ {
+ tag->handler = remove_filter;
+ tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]);
+ }
+ tag->key = strdup(hrec->vals[k]);
+ bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
+ }
+ }
+ khash_str2int_destroy_free(keep);
+ if ( !args->nrm ) error("No matching tag in -x %s\n", args->remove_annots);
+ bcf_hdr_sync(args->hdr_out);
+}
+static void init_header_lines(args_t *args)
+{
+ htsFile *file = hts_open(args->header_fname, "rb");
+ if ( !file ) error("Error reading %s\n", args->header_fname);
+ kstring_t str = {0,0,0};
+ while ( hts_getline(file, KS_SEP_LINE, &str) > 0 )
+ {
+ if ( bcf_hdr_append(args->hdr_out,str.s) ) error("Could not parse %s: %s\n", args->header_fname, str.s);
+ bcf_hdr_append(args->hdr,str.s); // the input file may not have the header line if run with -h (and nothing else)
+ }
+ hts_close(file);
+ free(str.s);
+ bcf_hdr_sync(args->hdr_out);
+ bcf_hdr_sync(args->hdr);
+}
+static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ // note: so far this works only with one filter, not a list of filters
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
+ hts_expand(int,1,args->mtmpi,args->tmpi);
+ args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]);
+ if ( args->tmpi[0]<0 ) error("The FILTER is not defined in the header: %s\n", tab->cols[col->icol]);
+ if ( col->replace==SET_OR_APPEND ) { bcf_add_filter(args->hdr_out,line,args->tmpi[0]); return 0; }
+ if ( col->replace!=REPLACE_MISSING )
+ {
+ bcf_update_filter(args->hdr_out,line,NULL,0);
+ bcf_update_filter(args->hdr_out,line,args->tmpi,1);
+ return 0;
+ }
+
+ // only update missing FILTER
+ if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
+ if ( !line->d.n_flt )
+ bcf_update_filter(args->hdr_out,line,args->tmpi,1);
+ return 0;
+}
+static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ int i;
+ bcf1_t *rec = (bcf1_t*) data;
+ if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT);
+ if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
+ if ( !rec->d.n_flt ) return 0; // don't overwrite with a missing value
+ if ( col->replace==SET_OR_APPEND || col->replace==REPLACE_MISSING )
+ {
+ if ( col->replace==REPLACE_MISSING && line->d.n_flt ) return 0; // only update missing FILTER
+ for (i=0; i<rec->d.n_flt; i++)
+ {
+ const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
+ bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt));
+ }
+ return 0;
+ }
+ hts_expand(int,rec->d.n_flt,args->mtmpi,args->tmpi);
+ for (i=0; i<rec->d.n_flt; i++)
+ {
+ const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
+ args->tmpi[i] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt);
+ }
+ bcf_update_filter(args->hdr_out,line,NULL,0);
+ bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
+ return 0;
+}
+static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ // possible cases:
+ // IN ANNOT OUT ACHIEVED_BY
+ // x y x -c +ID
+ // x y y -c ID
+ // x y x,y -c =ID
+ // x . x -c +ID, ID
+ // x . . -x ID
+ // . y y -c +ID, -c ID
+ //
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
+ if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,tab->cols[col->icol]);
+ if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]);
+
+ // running with +ID, only update missing ids
+ if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
+ return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]);
+ return 0;
+}
+static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "."
+ if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,rec->d.id);
+ if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,rec->d.id);
+
+ // running with +ID, only update missing ids
+ if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
+ return bcf_update_id(args->hdr_out,line,rec->d.id);
+ return 0;
+}
+static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ char *str = tab->cols[col->icol];
+ if ( str[0]=='.' && str[1]==0 ) return 0; // empty
+
+ if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(line->qual) ) return 0;
+
+ line->qual = strtod(str, &str);
+ if ( str == tab->cols[col->icol] )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+ return 0;
+}
+static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ if ( bcf_float_is_missing(rec->qual) ) return 0;
+ if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(line->qual) ) return 0;
+ line->qual = rec->qual;
+ return 0;
+}
+static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ char *str = tab->cols[col->icol];
+ if ( str[0]=='.' && str[1]==0 ) return 0;
+
+ if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,1);
+ if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,0);
+ error("Could not parse %s at %s:%d .. [%s]\n", bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+ return -1;
+}
+static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key,NULL,NULL);
+ bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,flag);
+ return 0;
+}
+static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi)
+{
+ if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+
+ int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
+ int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
+ if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1);
+
+ // fill in any missing values in the target VCF (or all, if not present)
+ int ntmpi2 = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ if ( ntmpi2 < ndst ) hts_expand(int32_t,ndst,args->mtmpi2,args->tmpi2);
+
+ int i;
+ for (i=0; i<ndst; i++)
+ {
+ if ( map[i]<0 )
+ {
+ if ( ntmpi2 < ndst ) args->tmpi2[i] = bcf_int32_missing;
+ continue;
+ }
+ if ( ntmpi2==ndst && col->replace==REPLACE_MISSING
+ && args->tmpi2[i]!=bcf_int32_missing
+ && args->tmpi2[i]!=bcf_int32_vector_end ) continue;
+
+ args->tmpi2[i] = args->tmpi[ map[i] ];
+ }
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst);
+ return 0;
+}
+static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ char *str = tab->cols[col->icol], *end = str;
+ if ( str[0]=='.' && str[1]==0 ) return 0;
+
+ int ntmpi = 0;
+ while ( *end )
+ {
+ int val = strtol(str, &end, 10);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+ ntmpi++;
+ hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
+ args->tmpi[ntmpi-1] = val;
+ str = end+1;
+ }
+
+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
+ return setter_ARinfo_int32(args,line,col,tab->nals,tab->als,ntmpi);
+
+ if ( col->replace==REPLACE_MISSING )
+ {
+ int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
+ }
+
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi);
+ return 0;
+}
+static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi);
+ if ( ntmpi < 0 ) return 0; // nothing to add
+
+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
+ return setter_ARinfo_int32(args,line,col,rec->n_allele,rec->d.allele,ntmpi);
+
+ if ( col->replace==REPLACE_MISSING )
+ {
+ int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
+ }
+
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi);
+ return 0;
+}
+static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
+{
+ if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+
+ int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
+ int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
+ if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1);
+
+ // fill in any missing values in the target VCF (or all, if not present)
+ int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ if ( ntmpf2 < ndst ) hts_expand(float,ndst,args->mtmpf2,args->tmpf2);
+
+ int i;
+ for (i=0; i<ndst; i++)
+ {
+ if ( map[i]<0 )
+ {
+ if ( ntmpf2 < ndst ) bcf_float_set_missing(args->tmpf2[i]);
+ continue;
+ }
+ if ( ntmpf2==ndst && col->replace==REPLACE_MISSING
+ && !bcf_float_is_missing(args->tmpf2[i])
+ && !bcf_float_is_vector_end(args->tmpf2[i]) ) continue;
+
+ args->tmpf2[i] = args->tmpf[ map[i] ];
+ }
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst);
+ return 0;
+}
+static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ char *str = tab->cols[col->icol], *end = str;
+ if ( str[0]=='.' && str[1]==0 ) return 0;
+
+ int ntmpf = 0;
+ while ( *end )
+ {
+ double val = strtod(str, &end);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+ ntmpf++;
+ hts_expand(float,ntmpf,args->mtmpf,args->tmpf);
+ args->tmpf[ntmpf-1] = val;
+ str = end+1;
+ }
+
+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
+ return setter_ARinfo_real(args,line,col,tab->nals,tab->als,ntmpf);
+
+ if ( col->replace==REPLACE_MISSING )
+ {
+ int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
+ }
+
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf);
+ return 0;
+}
+static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf);
+ if ( ntmpf < 0 ) return 0; // nothing to add
+
+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
+ return setter_ARinfo_real(args,line,col,rec->n_allele,rec->d.allele,ntmpf);
+
+ if ( col->replace==REPLACE_MISSING )
+ {
+ int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
+ }
+
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf);
+ return 0;
+}
+int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
+static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als)
+{
+ int nsrc = 1, lsrc = 0;
+ while ( args->tmps[lsrc] )
+ {
+ if ( args->tmps[lsrc]==',' ) nsrc++;
+ lsrc++;
+ }
+ if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+
+ int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
+ int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
+ if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1);
+
+ // fill in any missing values in the target VCF (or all, if not present)
+ int i, empty = 0, nstr, mstr = args->tmpks.m;
+ nstr = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmpks.s, &mstr);
+ args->tmpks.m = mstr;
+ if ( nstr<0 || (nstr==1 && args->tmpks.s[0]=='.' && args->tmpks.s[1]==0) )
+ {
+ empty = 0;
+ args->tmpks.l = 0;
+ kputc('.',&args->tmpks);
+ for (i=1; i<ndst; i++) kputs(",.",&args->tmpks);
+ }
+ else args->tmpks.l = nstr;
+ for (i=0; i<ndst; i++)
+ {
+ if ( map[i]<0 )
+ {
+ if ( empty ) copy_string_field(".",0,1,&args->tmpks,i);
+ continue;
+ }
+ if ( col->replace==REPLACE_MISSING )
+ {
+ // Do not replace filled values. The field must be looked up again because
+ // of realloc in copy_string_field
+ int n = 0;
+ char *str = args->tmpks.s;
+ while ( *str && n<i )
+ {
+ if ( *str==',' ) n++;
+ str++;
+ }
+ if ( str[0]!='.' || (str[1]!=',' && str[1]!=0) ) continue; // value already set
+ }
+ int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i);
+ assert( ret==0 );
+ }
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmpks.s);
+ return 0;
+}
+static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ int len = strlen(tab->cols[col->icol]);
+ if ( !len ) return 0;
+ hts_expand(char,len+1,args->mtmps,args->tmps);
+ memcpy(args->tmps,tab->cols[col->icol],len+1);
+ if ( args->tmps[0]=='.' && args->tmps[1]==0 ) return 0;
+
+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
+ return setter_ARinfo_string(args,line,col,tab->nals,tab->als);
+
+ if ( col->replace==REPLACE_MISSING )
+ {
+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2);
+ if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
+ }
+
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps);
+ return 0;
+}
+static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmps,&args->mtmps);
+ if ( ntmps < 0 ) return 0; // nothing to add
+
+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
+ return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele);
+
+ if ( col->replace==REPLACE_MISSING )
+ {
+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2);
+ if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
+ }
+
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps);
+ return 0;
+}
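+// Transfer FORMAT/GT from the annotation VCF. Without -s/-S the genotypes are
+// copied verbatim; with a sample map the three branches below handle GT absent
+// from the destination record, destination ploidy >= source, and destination
+// ploidy < source.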
+static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_genotypes(args->files->readers[1].header,rec,&args->tmpi,&args->mtmpi);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+
+ if ( !args->sample_map )
+ return bcf_update_genotypes(args->hdr_out,line,args->tmpi,nsrc);
+
+ int i, j, ndst = bcf_get_genotypes(args->hdr,line,&args->tmpi2,&args->mtmpi2);
+ if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
+ nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
+ if ( ndst<=0 ) // field not present in dst file
+ {
+ if ( col->replace==REPLACE_EXISTING ) return 0;
+ hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ int32_t *dst = args->tmpi2 + nsrc*i;
+ if ( args->sample_map[i]==-1 )
+ {
+ dst[0] = bcf_gt_missing;
+ for (j=1; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ }
+ else
+ {
+ int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ }
+ }
+ return bcf_update_genotypes(args->hdr_out,line,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ }
+ else if ( ndst >= nsrc )
+ {
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ if ( args->sample_map[i]==-1 ) continue;
+ int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ int32_t *dst = args->tmpi2 + ndst*i;
+ if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(dst[0]) ) continue;
+ if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(dst[0]) ) continue;
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
+ }
+ return bcf_update_genotypes(args->hdr_out,line,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ }
+ else // ndst < nsrc
+ {
+ hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ int32_t *ori = args->tmpi2 + ndst*i;
+ int32_t *dst = args->tmpi3 + nsrc*i;
+ int keep_ori = 0;
+ if ( args->sample_map[i]==-1 ) keep_ori = 1;
+ else if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
+ else if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
+ if ( keep_ori )
+ {
+ for (j=0; j<ndst; j++) dst[j] = ori[j];
+ for (; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ }
+ else
+ {
+ int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ }
+ }
+ return bcf_update_genotypes(args->hdr_out,line,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ }
+}
+static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+
+ if ( !args->sample_map )
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsrc);
+
+ int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key,&args->tmpi2,&args->mtmpi2);
+ if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
+ nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
+ if ( ndst<=0 )
+ {
+ if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present
+ hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ int32_t *dst = args->tmpi2 + nsrc*i;
+ if ( args->sample_map[i]==-1 )
+ {
+ dst[0] = bcf_int32_missing;
+ for (j=1; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ }
+ else
+ {
+ int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ }
+ }
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ }
+ else if ( ndst >= nsrc )
+ {
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ if ( args->sample_map[i]==-1 ) continue;
+ int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ int32_t *dst = args->tmpi2 + ndst*i;
+ if ( col->replace==REPLACE_EXISTING && dst[0]==bcf_int32_missing ) continue;
+ if ( col->replace==REPLACE_MISSING && dst[0]!=bcf_int32_missing ) continue;
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
+ }
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ }
+ else // ndst < nsrc
+ {
+ hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ int32_t *ori = args->tmpi2 + ndst*i;
+ int32_t *dst = args->tmpi3 + nsrc*i;
+ int keep_ori = 0;
+ if ( args->sample_map[i]==-1 ) keep_ori = 1;
+ else if ( col->replace==REPLACE_EXISTING && ori[0]==bcf_int32_missing ) keep_ori = 1;
+ else if ( col->replace==REPLACE_MISSING && ori[0]!=bcf_int32_missing ) keep_ori = 1;
+ if ( keep_ori )
+ {
+ for (j=0; j<ndst; j++) dst[j] = ori[j];
+ for (; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ }
+ else
+ {
+ int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ }
+ }
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ }
+}
+static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+
+ if ( !args->sample_map )
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsrc);
+
+ int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key,&args->tmpf2,&args->mtmpf2);
+ if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
+ nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
+ if ( ndst<=0 )
+ {
+ if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present
+ hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ float *dst = args->tmpf2 + nsrc*i;
+ if ( args->sample_map[i]==-1 )
+ {
+ bcf_float_set_missing(dst[0]);
+ for (j=1; j<nsrc; j++) bcf_float_set_vector_end(dst[j]);
+ }
+ else
+ {
+ float *src = args->tmpf + nsrc*args->sample_map[i];
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ }
+ }
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ }
+ else if ( ndst >= nsrc )
+ {
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ if ( args->sample_map[i]==-1 ) continue;
+ float *src = args->tmpf + nsrc*args->sample_map[i];
+ float *dst = args->tmpf2 + ndst*i;
+ if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(dst[0]) ) continue;
+ if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(dst[0]) ) continue;
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ for (; j<ndst; j++) bcf_float_set_vector_end(dst[j]);
+ }
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ }
+ else // ndst < nsrc
+ {
+ hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ float *ori = args->tmpf2 + ndst*i;
+ float *dst = args->tmpf3 + nsrc*i;
+ int keep_ori = 0;
+ if ( args->sample_map[i]==-1 ) keep_ori = 1;
+ else if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(ori[0]) ) keep_ori = 1;
+ else if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(ori[0]) ) keep_ori = 1;
+ if ( keep_ori )
+ {
+ for (j=0; j<ndst; j++) dst[j] = ori[j];
+ for (; j<nsrc; j++) bcf_float_set_vector_end(dst[j]);
+ }
+ else
+ {
+ float *src = args->tmpf + nsrc*args->sample_map[i];
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ }
+ }
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ }
+}
+static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ args->tmpp[0] = args->tmps;
+ int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmpp,&args->mtmps);
+ args->tmps = args->tmpp[0]; // tmps might be realloced
+ if ( ret==-3 ) return 0; // the tag is not present
+ if ( ret<=0 ) return 1; // error
+
+ if ( !args->sample_map )
+ return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp,bcf_hdr_nsamples(args->hdr_out));
+
+ int i;
+ args->tmpp2[0] = args->tmps2;
+ ret = bcf_get_format_string(args->hdr,line,col->hdr_key,&args->tmpp2,&args->mtmps2);
+ args->tmps2 = args->tmpp2[0]; // tmps2 might be realloced
+
+ if ( ret<=0 ) // not present in dst
+ {
+ hts_expand(char,bcf_hdr_nsamples(args->hdr_out)*2,args->mtmps2,args->tmps2);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ args->tmps2[2*i] = '.';
+ args->tmps2[2*i+1] = 0;
+ args->tmpp2[i] = args->tmps2+2*i;
+ }
+ }
+
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ int isrc = args->sample_map[i];
+ if ( isrc==-1 ) continue;
+ args->tmpp2[i] = args->tmpp[isrc];
+ }
+ return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp2,bcf_hdr_nsamples(args->hdr_out));
+}
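+// Build args->sample_map so that sample_map[idst] gives the index of the matching
+// sample in the annotation (source) file, or -1 when there is none. With -s/-S
+// each entry is either a single sample name or a whitespace-separated "src dst" pair.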
+static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_samples)
+{
+ int i;
+ if ( !args->sample_names )
+ {
+ int nmatch = 0, order_ok = 1;
+ for (i=0; i<bcf_hdr_nsamples(src); i++)
+ {
+ int id = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, src->samples[i]);
+ if ( id!=-1 )
+ {
+ nmatch++;
+ if ( i!=id ) order_ok = 0;
+ }
+ }
+ if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok && !need_samples )
+ return; // the same samples in both files
+
+ if ( !nmatch ) error("No matching samples found in the source and the destination file\n");
+ if ( nmatch!=bcf_hdr_nsamples(src) || nmatch!=bcf_hdr_nsamples(dst) ) fprintf(stderr,"%d sample(s) in common\n", nmatch);
+
+ args->nsample_map = bcf_hdr_nsamples(dst);
+ args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
+ for (i=0; i<args->nsample_map; i++)
+ {
+ int id = bcf_hdr_id2int(src, BCF_DT_SAMPLE, dst->samples[i]);
+ args->sample_map[i] = id; // idst -> isrc, -1 if not present
+ }
+ return;
+ }
+
+ args->nsample_map = bcf_hdr_nsamples(dst);
+ args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
+ for (i=0; i<args->nsample_map; i++) args->sample_map[i] = -1;
+
+ int nsamples = 0;
+ char **samples = hts_readlist(args->sample_names, args->sample_is_file, &nsamples);
+ for (i=0; i<nsamples; i++)
+ {
+ int isrc, idst;
+ char *ss = samples[i], *se = samples[i];
+ while ( *se && !isspace(*se) ) se++;
+ if ( !*se )
+ {
+ // only one sample name
+ isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
+ if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
+ idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
+ if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
+ args->sample_map[idst] = isrc;
+ continue;
+ }
+ *se = 0;
+ isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
+ if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
+
+ ss = se+1;
+ while ( isspace(*ss) ) ss++;
+ se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
+ idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
+ if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
+
+ args->sample_map[idst] = isrc;
+ }
+ for (i=0; i<nsamples; i++) free(samples[i]);
+ free(samples);
+}
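+// When annotating from a VCF, expand the "^" exclusion syntax of -c: "^INFO/TAG"
+// and "^FORMAT/TAG" (or "^FMT/TAG") mean "carry over all INFO/FORMAT fields except
+// TAG". Excluded names are collected in skip_info/skip_fmt and the column string is
+// rewritten with plain "INFO"/"FORMAT" entries; a bare "^TAG" is treated as "^INFO/TAG".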
+static char *columns_complement(char *columns, void **skip_info, void **skip_fmt)
+{
+ kstring_t str = {0,0,0};
+ char *ss = columns, *se = ss;
+ while ( *ss )
+ {
+ if ( *se && *se!=',' ) { se++; continue; }
+ if ( *ss!='^' )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputsn(ss, se-ss, &str);
+ if ( !*se ) break;
+ ss = ++se;
+ continue;
+ }
+
+ if ( !strncasecmp("^INFO/",ss,6) )
+ {
+ if ( !*skip_info )
+ {
+ *skip_info = khash_str2int_init();
+ if ( str.l ) kputc(',',&str);
+ kputs("INFO",&str);
+ }
+ char tmp = *se; *se = 0;
+ khash_str2int_inc(*skip_info, strdup(ss+6));
+ *se = tmp;
+ }
+ else if ( !strncasecmp("^FORMAT/",ss,8) || !strncasecmp("^FMT/",ss,5) )
+ {
+ int n = !strncasecmp("^FMT/",ss,5) ? 5 : 8;
+ if ( !*skip_fmt )
+ {
+ *skip_fmt = khash_str2int_init();
+ if ( str.l ) kputc(',',&str);
+ kputs("FORMAT",&str);
+ }
+ char tmp = *se; *se = 0;
+ khash_str2int_inc(*skip_fmt, strdup(ss+n));
+ *se = tmp;
+ }
+ else
+ {
+ if ( !*skip_info )
+ {
+ *skip_info = khash_str2int_init();
+ if ( str.l ) kputc(',',&str);
+ kputs("INFO",&str);
+ }
+ char tmp = *se; *se = 0;
+ khash_str2int_inc(*skip_info, strdup(ss+1));
+ *se = tmp;
+ }
+
+ if ( !*se ) break;
+ ss = ++se;
+ }
+ free(columns);
+ return str.s;
+}
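+// Parse the -c column list. A leading '+' updates only missing values (REPLACE_MISSING),
+// '-' updates only existing values (REPLACE_EXISTING), '=' appends instead of overwriting
+// (SET_OR_APPEND); with no prefix both missing and existing values are replaced (REPLACE_ALL).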
+static void init_columns(args_t *args)
+{
+ void *skip_fmt = NULL, *skip_info = NULL;
+ if ( args->tgts_is_vcf )
+ args->columns = columns_complement(args->columns, &skip_info, &skip_fmt);
+
+ kstring_t str = {0,0,0}, tmp = {0,0,0};
+ char *ss = args->columns, *se = ss;
+ args->ncols = 0;
+ int i = -1, has_fmt_str = 0, force_samples = -1;
+ while ( *ss )
+ {
+ if ( *se && *se!=',' ) { se++; continue; }
+ int replace = REPLACE_ALL;
+ if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; }
+ else if ( *ss=='-' ) { replace = REPLACE_EXISTING; ss++; }
+ else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; }
+ i++;
+ str.l = 0;
+ kputsn(ss, se-ss, &str);
+ if ( !str.s[0] || !strcasecmp("-",str.s) ) ;
+ else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = i;
+ else if ( !strcasecmp("POS",str.s) ) args->from_idx = i;
+ else if ( !strcasecmp("FROM",str.s) ) args->from_idx = i;
+ else if ( !strcasecmp("TO",str.s) ) args->to_idx = i;
+ else if ( !strcasecmp("REF",str.s) ) args->ref_idx = i;
+ else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = i;
+ else if ( !strcasecmp("ID",str.s) )
+ {
+ if ( replace==REPLACE_EXISTING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = i;
+ col->replace = replace;
+ col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
+ col->hdr_key = strdup(str.s);
+ }
+ else if ( !strcasecmp("FILTER",str.s) )
+ {
+ if ( replace==REPLACE_EXISTING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = i;
+ col->replace = replace;
+ col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
+ col->hdr_key = strdup(str.s);
+ if ( args->tgts_is_vcf )
+ {
+ bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
+ int j;
+ for (j=0; j<tgts_hdr->nhrec; j++)
+ {
+ bcf_hrec_t *hrec = tgts_hdr->hrec[j];
+ if ( hrec->type!=BCF_HL_FLT ) continue;
+ int k = bcf_hrec_find_key(hrec,"ID");
+ assert( k>=0 ); // this should always be true for valid VCFs
+ tmp.l = 0;
+ bcf_hrec_format(hrec, &tmp);
+ bcf_hdr_append(args->hdr_out, tmp.s);
+ }
+ bcf_hdr_sync(args->hdr_out);
+ }
+ }
+ else if ( !strcasecmp("QUAL",str.s) )
+ {
+ if ( replace==REPLACE_EXISTING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
+ if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = i;
+ col->replace = replace;
+ col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
+ col->hdr_key = strdup(str.s);
+ }
+ else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
+ {
+ if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
+ bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
+ int j;
+ for (j=0; j<tgts_hdr->nhrec; j++)
+ {
+ bcf_hrec_t *hrec = tgts_hdr->hrec[j];
+ if ( hrec->type!=BCF_HL_INFO ) continue;
+ int k = bcf_hrec_find_key(hrec,"ID");
+ assert( k>=0 ); // this should always be true for valid VCFs
+ if ( skip_info && khash_str2int_has_key(skip_info,hrec->vals[k]) ) continue;
+ tmp.l = 0;
+ bcf_hrec_format(hrec, &tmp);
+ bcf_hdr_append(args->hdr_out, tmp.s);
+ bcf_hdr_sync(args->hdr_out);
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = -1;
+ col->replace = replace;
+ col->hdr_key = strdup(hrec->vals[k]);
+ col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
+ switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
+ {
+ case BCF_HT_FLAG: col->setter = vcf_setter_info_flag; break;
+ case BCF_HT_INT: col->setter = vcf_setter_info_int; break;
+ case BCF_HT_REAL: col->setter = vcf_setter_info_real; break;
+ case BCF_HT_STR: col->setter = vcf_setter_info_str; break;
+ default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
+ }
+ }
+ }
+ else if ( args->tgts_is_vcf && (!strcasecmp("FORMAT",str.s) || !strcasecmp("FMT",str.s)) ) // All FORMAT fields
+ {
+ bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
+ if ( force_samples<0 ) force_samples = replace;
+ if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ int j;
+ for (j=0; j<tgts_hdr->nhrec; j++)
+ {
+ bcf_hrec_t *hrec = tgts_hdr->hrec[j];
+ if ( hrec->type!=BCF_HL_FMT) continue;
+ int k = bcf_hrec_find_key(hrec,"ID");
+ assert( k>=0 ); // this should always be true for valid VCFs
+ if ( skip_fmt && khash_str2int_has_key(skip_fmt,hrec->vals[k]) ) continue;
+ tmp.l = 0;
+ bcf_hrec_format(hrec, &tmp);
+ bcf_hdr_append(args->hdr_out, tmp.s);
+ bcf_hdr_sync(args->hdr_out);
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = -1;
+ col->replace = replace;
+ col->hdr_key = strdup(hrec->vals[k]);
+ if ( !strcasecmp("GT",col->hdr_key) ) col->setter = vcf_setter_format_gt;
+ else
+ switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
+ {
+ case BCF_HT_INT: col->setter = vcf_setter_format_int; break;
+ case BCF_HT_REAL: col->setter = vcf_setter_format_real; break;
+ case BCF_HT_STR: col->setter = vcf_setter_format_str; has_fmt_str = 1; break;
+ default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id));
+ }
+ }
+ }
+ else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) )
+ {
+ if ( !args->tgts_is_vcf )
+ error("Error: FORMAT fields can be carried over from a VCF file only.\n");
+
+ char *key = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
+ if ( force_samples<0 ) force_samples = replace;
+ if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL);
+ tmp.l = 0;
+ bcf_hrec_format(hrec, &tmp);
+ bcf_hdr_append(args->hdr_out, tmp.s);
+ bcf_hdr_sync(args->hdr_out);
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key);
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = -1;
+ col->replace = replace;
+ col->hdr_key = strdup(key);
+ if ( !strcasecmp("GT",key) ) col->setter = vcf_setter_format_gt;
+ else
+ switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
+ {
+ case BCF_HT_INT: col->setter = vcf_setter_format_int; break;
+ case BCF_HT_REAL: col->setter = vcf_setter_format_real; break;
+ case BCF_HT_STR: col->setter = vcf_setter_format_str; has_fmt_str = 1; break;
+ default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id));
+ }
+ }
+ else
+ {
+ if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
+ if ( !strncasecmp("INFO/",str.s,5) ) { memmove(str.s,str.s+5,str.l-4); }
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s);
+ if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) )
+ {
+ if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line
+ {
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", str.s, NULL);
+ if ( !hrec ) error("The tag \"%s\" is not defined in %s\n", str.s,args->files->readers[1].fname);
+ tmp.l = 0;
+ bcf_hrec_format(hrec, &tmp);
+ bcf_hdr_append(args->hdr_out, tmp.s);
+ bcf_hdr_sync(args->hdr_out);
+ hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s);
+ }
+ else
+ error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
+ assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
+ }
+
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = i;
+ col->replace = replace;
+ col->hdr_key = strdup(str.s);
+ col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
+ switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
+ {
+ case BCF_HT_FLAG: col->setter = args->tgts_is_vcf ? vcf_setter_info_flag : setter_info_flag; break;
+ case BCF_HT_INT: col->setter = args->tgts_is_vcf ? vcf_setter_info_int : setter_info_int; break;
+ case BCF_HT_REAL: col->setter = args->tgts_is_vcf ? vcf_setter_info_real : setter_info_real; break;
+ case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_info_str : setter_info_str; break;
+ default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
+ }
+ }
+ if ( !*se ) break;
+ ss = ++se;
+ }
+ free(str.s);
+ free(tmp.s);
+ if ( args->to_idx==-1 ) args->to_idx = args->from_idx;
+ free(args->columns);
+ if ( skip_info ) khash_str2int_destroy_free(skip_info);
+ if ( skip_fmt ) khash_str2int_destroy_free(skip_fmt);
+ if ( has_fmt_str )
+ {
+ int n = bcf_hdr_nsamples(args->hdr_out) > bcf_hdr_nsamples(args->files->readers[1].header) ? bcf_hdr_nsamples(args->hdr_out) : bcf_hdr_nsamples(args->files->readers[1].header);
+ args->tmpp = (char**)malloc(sizeof(char*)*n);
+ args->tmpp2 = (char**)malloc(sizeof(char*)*n);
+ }
+ if ( force_samples>=0 )
+ set_samples(args, args->files->readers[1].header, args->hdr, force_samples==REPLACE_ALL ? 0 : 1);
+}
+
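+// Rename sequences in the output header according to the --rename-chrs map file,
+// one whitespace-separated "old new" pair per line; contigs not present in the
+// header are silently skipped.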
+static void rename_chrs(args_t *args, char *fname)
+{
+ int n, i;
+ char **map = hts_readlist(fname, 1, &n);
+ if ( !map ) error("Could not read: %s\n", fname);
+ for (i=0; i<n; i++)
+ {
+ char *ss = map[i];
+ while ( *ss && !isspace(*ss) ) ss++;
+ if ( !*ss ) error("Could not parse: %s\n", fname);
+ *ss = 0;
+ int rid = bcf_hdr_name2id(args->hdr_out, map[i]);
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_CTG, "ID", map[i], NULL);
+ if ( !hrec ) continue; // the sequence not present
+ int j = bcf_hrec_find_key(hrec, "ID");
+ assert( j>=0 );
+ free(hrec->vals[j]);
+ ss++;
+ while ( *ss && isspace(*ss) ) ss++;
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ *se = 0;
+ hrec->vals[j] = strdup(ss);
+ args->hdr_out->id[BCF_DT_CTG][rid].key = hrec->vals[j];
+ }
+ for (i=0; i<n; i++) free(map[i]);
+ free(map);
+}
+
+static void init_data(args_t *args)
+{
+ args->hdr = args->files->readers[0].header;
+ args->hdr_out = bcf_hdr_dup(args->hdr);
+
+ if ( args->remove_annots ) init_remove_annots(args);
+ if ( args->header_fname ) init_header_lines(args);
+ if ( args->targets_fname && args->tgts_is_vcf )
+ {
+ // reading annots from a VCF
+ if ( !bcf_sr_add_reader(args->files, args->targets_fname) )
+ error("Failed to open %s: %s\n", args->targets_fname,bcf_sr_strerror(args->files->errnum));
+ }
+ if ( args->columns ) init_columns(args);
+ if ( args->targets_fname && !args->tgts_is_vcf )
+ {
+ if ( !args->columns ) error("The -c option not given\n");
+ if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n");
+ if ( args->from_idx==-1 ) error("The -c POS option not given\n");
+ if ( args->to_idx==-1 ) args->to_idx = -args->from_idx - 1;
+
+ args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->from_idx,args->to_idx);
+ if ( !args->tgts ) error("Could not initialize the annotation file: %s\n", args->targets_fname);
+ if ( !args->tgts->tbx ) error("Expected tabix-indexed annotation file: %s\n", args->targets_fname);
+ }
+ args->vcmp = vcmp_init();
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
+
+ if ( args->set_ids_fmt )
+ {
+ if ( args->set_ids_fmt[0]=='+' ) { args->set_ids_replace = 0; args->set_ids_fmt++; }
+ args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt);
+ }
+
+ if ( args->mark_sites )
+ {
+ if ( !args->targets_fname ) error("The -a option not given\n");
+ if ( args->tgts_is_vcf ) error("Apologies, this has not been implemented yet: -a is a VCF\n"); // very easy to add..
+ bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
+ args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
+ }
+
+ bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
+ if ( !args->drop_header )
+ {
+ if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs);
+
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ bcf_hdr_write(args->out_fh, args->hdr_out);
+ }
+}
+
+static void destroy_data(args_t *args)
+{
+ int i;
+ for (i=0; i<args->nrm; i++) free(args->rm[i].key);
+ free(args->rm);
+ if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
+ if (args->vcmp) vcmp_destroy(args->vcmp);
+ for (i=0; i<args->ncols; i++)
+ free(args->cols[i].hdr_key);
+ free(args->cols);
+ for (i=0; i<args->malines; i++)
+ {
+ free(args->alines[i].cols);
+ free(args->alines[i].als);
+ free(args->alines[i].line.s);
+ }
+ free(args->alines);
+ if ( args->tgts ) bcf_sr_regions_destroy(args->tgts);
+ free(args->tmpks.s);
+ free(args->tmpi);
+ free(args->tmpf);
+ free(args->tmps);
+ free(args->tmpp);
+ free(args->tmpi2);
+ free(args->tmpf2);
+ free(args->tmps2);
+ free(args->tmpp2);
+ free(args->tmpi3);
+ free(args->tmpf3);
+ if ( args->set_ids )
+ convert_destroy(args->set_ids);
+ if ( args->filter )
+ filter_destroy(args->filter);
+ if (args->out_fh) hts_close(args->out_fh);
+ free(args->sample_map);
+}
+
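+// Keep a buffer of annotation lines overlapping the current record: drop buffered
+// lines that end before the record's position, then pull further overlapping lines
+// from the tabix-indexed -a file while bcf_sr_regions_overlap() reports an overlap.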
+static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int end_pos)
+{
+ if ( args->nalines && args->alines[0].rid != line->rid ) args->nalines = 0;
+
+ int i = 0;
+ while ( i<args->nalines )
+ {
+ if ( line->pos > args->alines[i].end )
+ {
+ args->nalines--;
+ if ( args->nalines && i<args->nalines )
+ {
+ annot_line_t tmp = args->alines[i];
+ memmove(&args->alines[i],&args->alines[i+1],(args->nalines-i)*sizeof(annot_line_t));
+ args->alines[args->nalines] = tmp;
+ }
+ }
+ else i++;
+ }
+
+ if ( args->ref_idx==-1 && args->nalines ) return;
+
+ while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) )
+ {
+ args->nalines++;
+ hts_expand0(annot_line_t,args->nalines,args->malines,args->alines);
+ annot_line_t *tmp = &args->alines[args->nalines-1];
+ tmp->rid = line->rid;
+ tmp->start = args->tgts->start;
+ tmp->end = args->tgts->end;
+ tmp->line.l = 0;
+ kputs(args->tgts->line.s, &tmp->line);
+ char *s = tmp->line.s;
+ tmp->ncols = 1;
+ hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols);
+ tmp->cols[0] = s;
+ while ( *s )
+ {
+ if ( *s=='\t' )
+ {
+ tmp->ncols++;
+ hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols);
+ tmp->cols[tmp->ncols-1] = s+1;
+ *s = 0;
+ }
+ s++;
+ }
+ if ( args->ref_idx != -1 )
+ {
+ assert( args->ref_idx < tmp->ncols );
+ assert( args->alt_idx < tmp->ncols );
+ tmp->nals = 2;
+ hts_expand(char*,tmp->nals,tmp->mals,tmp->als);
+ tmp->als[0] = tmp->cols[args->ref_idx];
+ tmp->als[1] = s = tmp->cols[args->alt_idx];
+ while ( *s )
+ {
+ if ( *s==',' )
+ {
+ tmp->nals++;
+ hts_expand(char*,tmp->nals,tmp->mals,tmp->als);
+ tmp->als[tmp->nals-1] = s+1;
+ *s = 0;
+ }
+ s++;
+ }
+ int iseq = args->tgts->iseq;
+ if ( bcf_sr_regions_next(args->tgts)<0 || args->tgts->iseq!=iseq ) break;
+ }
+ else break;
+ }
+}
+
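+// Annotate a single VCF record: run the -x removal handlers, then copy values either
+// from a matching buffered line of a tab-delimited -a file (matched by position and,
+// when REF/ALT columns are given, by allele) or from the synced annotation VCF,
+// update the -m mark-sites flag, and finally reset the ID column when -I was given.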
+static void annotate(args_t *args, bcf1_t *line)
+{
+ int i, j;
+ for (i=0; i<args->nrm; i++)
+ args->rm[i].handler(args, line, &args->rm[i]);
+
+ if ( args->tgts )
+ {
+ // Buffer annotation lines. When multiple ALT alleles are present in the
+ // annotation file, at least one must match one of the VCF alleles.
+ int len = 0;
+ bcf_get_variant_types(line);
+ for (i=1; i<line->n_allele; i++)
+ if ( len > line->d.var[i].n ) len = line->d.var[i].n;
+ int end_pos = len<0 ? line->pos - len : line->pos;
+ buffer_annot_lines(args, line, line->pos, end_pos);
+ for (i=0; i<args->nalines; i++)
+ {
+ if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue;
+ if ( args->ref_idx != -1 )
+ {
+ if ( vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs not compatible
+ for (j=1; j<args->alines[i].nals; j++)
+ {
+ if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) break; // no ALT allele in VCF and annot file has "."
+ if ( vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]) >= 0 ) break;
+ }
+ if ( j==args->alines[i].nals ) continue; // none of the annot alleles present in VCF's ALT
+ }
+ break;
+ }
+
+ if ( i<args->nalines )
+ {
+ // there is a matching line
+ for (j=0; j<args->ncols; j++)
+ if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) )
+ error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+
+ }
+
+ if ( args->mark_sites )
+ {
+ // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87
+ if ( args->mark_sites_logic==MARK_LISTED )
+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,i<args->nalines?1:0);
+ else
+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,i<args->nalines?0:1);
+ }
+ }
+ else if ( args->files->nreaders == 2 && bcf_sr_has_line(args->files,1) )
+ {
+ bcf1_t *aline = bcf_sr_get_line(args->files,1);
+ for (j=0; j<args->ncols; j++)
+ if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
+ error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ }
+ if ( args->set_ids )
+ {
+ args->tmpks.l = 0;
+ convert_line(args->set_ids, line, &args->tmpks);
+ if ( args->tmpks.l )
+ {
+ int replace = 0;
+ if ( args->set_ids_replace ) replace = 1;
+ else if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) ) replace = 1;
+ if ( replace )
+ bcf_update_id(args->hdr_out,line,args->tmpks.s);
+ }
+ }
+}
+
+static void usage(args_t *args)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Annotate and edit VCF/BCF files.\n");
+ fprintf(stderr, "Usage: bcftools annotate [options] <in.vcf.gz>\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -a, --annotations <file> VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n");
+ fprintf(stderr, " -c, --columns <list> list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
+ fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
+ fprintf(stderr, " -I, --set-id [+]<format> set ID column, see man page for details\n");
+ fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -m, --mark-sites [+-]<tag> add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
+ fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " --rename-chrs <file> rename sequences according to map file: from\\tto\n");
+ fprintf(stderr, " -s, --samples [^]<list> comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(stderr, " -S, --samples-file [^]<file> file of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(stderr, " -x, --remove <list> list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n");
+ fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
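+/* Illustrative invocations (the file names below are hypothetical):
+ *   # carry over all INFO fields except DP from another VCF
+ *   bcftools annotate -a annots.vcf.gz -c ^INFO/DP input.vcf.gz
+ *   # annotate from a tabix-indexed tab file; a tag not yet defined in the
+ *   # header must first be declared via -h header lines
+ *   bcftools annotate -a annots.tab.gz -h hdr.txt -c CHROM,POS,REF,ALT,INFO/TAG input.vcf.gz
+ */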
+int main_vcfannotate(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->files = bcf_sr_init();
+ args->output_fname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1;
+ args->set_ids_replace = 1;
+ int regions_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"mark-sites",required_argument,NULL,'m'},
+ {"set-id",required_argument,NULL,'I'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"annotations",required_argument,NULL,'a'},
+ {"include",required_argument,NULL,'i'},
+ {"exclude",required_argument,NULL,'e'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"remove",required_argument,NULL,'x'},
+ {"columns",required_argument,NULL,'c'},
+ {"rename-chrs",required_argument,NULL,1},
+ {"header-lines",required_argument,NULL,'h'},
+ {"samples",required_argument,NULL,'s'},
+ {"samples-file",required_argument,NULL,'S'},
+ {NULL,0,NULL,0}
+ };
+ while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0)
+ {
+ switch (c) {
+ case 'm':
+ args->mark_sites_logic = MARK_LISTED;
+ if ( optarg[0]=='+' ) args->mark_sites = optarg+1;
+ else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; }
+ else args->mark_sites = optarg;
+ break;
+ case 'I': args->set_ids_fmt = optarg; break;
+ case 's': args->sample_names = optarg; break;
+ case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
+ case 'c': args->columns = strdup(optarg); break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ };
+ break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'x': args->remove_annots = optarg; break;
+ case 'a': args->targets_fname = optarg; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 'h': args->header_fname = optarg; break;
+ case 1 : args->rename_chrs = optarg; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case '?': usage(args); break;
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(args);
+ }
+ else fname = argv[optind];
+
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ if ( args->targets_fname )
+ {
+ htsFile *fp = hts_open(args->targets_fname,"r");
+ htsFormat type = *hts_get_format(fp);
+ hts_close(fp);
+
+ if ( type.format==vcf || type.format==bcf )
+ {
+ args->tgts_is_vcf = 1;
+ args->files->require_index = 1;
+ args->files->collapse |= COLLAPSE_SOME;
+ }
+ }
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+
+ init_data(args);
+ while ( bcf_sr_next_line(args->files) )
+ {
+ if ( !bcf_sr_has_line(args->files,0) ) continue;
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( line->errcode ) error("Encountered error, cannot proceed. Please check the error output above.\n");
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) continue;
+ }
+ annotate(args, line);
+ bcf_write1(args->out_fh, args->hdr_out, line);
+ }
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c
new file mode 100644
index 0000000..1d86dbe
--- /dev/null
+++ b/bcftools/vcfannotate.c.pysam.c
@@ -0,0 +1,1762 @@
+#include "pysam.h"
+
+/* vcfannotate.c -- Annotate and edit VCF/BCF files.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/kseq.h>
+#include <htslib/khash_str2int.h>
+#include <dlfcn.h>
+#include "bcftools.h"
+#include "vcmp.h"
+#include "filter.h"
+#include "convert.h"
+
+struct _args_t;
+
+typedef struct _rm_tag_t
+{
+ char *key;
+ int hdr_id;
+ void (*handler)(struct _args_t *, bcf1_t *, struct _rm_tag_t *);
+}
+rm_tag_t;
+
+typedef struct
+{
+ char **cols;
+ int ncols, mcols;
+ char **als;
+ int nals, mals;
+ kstring_t line;
+ int rid, start, end;
+}
+annot_line_t;
+
+#define REPLACE_MISSING 0 // replace only missing values
+#define REPLACE_ALL 1 // replace both missing and existing values
+#define REPLACE_EXISTING 2 // replace only if tgt is not missing
+#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise
+typedef struct _annot_col_t
+{
+ int icol, replace, number; // number: one of BCF_VL_* types
+ char *hdr_key;
+ int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*);
+}
+annot_col_t;
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+#define MARK_LISTED 1
+#define MARK_UNLISTED 2
+
+typedef struct _args_t
+{
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr, *hdr_out;
+ htsFile *out_fh;
+ int output_type, n_threads;
+ bcf_sr_regions_t *tgts;
+
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+
+ rm_tag_t *rm; // tags scheduled for removal
+ int nrm;
+ int flt_keep_pass; // when all filters removed, reset to PASS
+
+ vcmp_t *vcmp; // for matching annotation and VCF lines by allele
+ annot_line_t *alines; // buffered annotation lines
+ int nalines, malines;
+ int ref_idx, alt_idx, chr_idx, from_idx, to_idx; // -1 if not present
+ annot_col_t *cols; // column indexes and setters
+ int ncols;
+
+ char *set_ids_fmt;
+ convert_t *set_ids;
+ int set_ids_replace;
+
+ int *sample_map, nsample_map, sample_is_file; // map[idst] -> isrc
+ int mtmpi, mtmpf, mtmps;
+ int mtmpi2, mtmpf2, mtmps2;
+ int mtmpi3, mtmpf3, mtmps3;
+ int32_t *tmpi, *tmpi2, *tmpi3;
+ float *tmpf, *tmpf2, *tmpf3;
+ char *tmps, *tmps2, **tmpp, **tmpp2;
+ kstring_t tmpks;
+
+ char **argv, *output_fname, *targets_fname, *regions_list, *header_fname;
+ char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites;
+ int argc, drop_header, tgts_is_vcf, mark_sites_logic;
+}
+args_t;
+
+char *msprintf(const char *fmt, ...);
+
+void remove_id(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ bcf_update_id(args->hdr,line,NULL);
+}
+void remove_filter(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ if ( !tag->key ) bcf_update_filter(args->hdr, line, NULL, args->flt_keep_pass);
+ else bcf_remove_filter(args->hdr, line, tag->hdr_id, args->flt_keep_pass);
+}
+void remove_qual(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ bcf_float_set_missing(line->qual);
+}
+void remove_info(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ // remove all INFO fields
+ if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
+
+ int i;
+ for (i=0; i<line->n_info; i++)
+ {
+ bcf_info_t *inf = &line->d.info[i];
+ if ( inf->vptr_free )
+ {
+ free(inf->vptr - inf->vptr_off);
+ inf->vptr_free = 0;
+ }
+ line->d.shared_dirty |= BCF1_DIRTY_INF;
+ inf->vptr = NULL;
+ }
+}
+void remove_info_tag(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ bcf_update_info(args->hdr, line, tag->key, NULL, 0, BCF_HT_INT); // the type does not matter with n=0
+}
+void remove_format_tag(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ bcf_update_format(args->hdr, line, tag->key, NULL, 0, BCF_HT_INT); // the type does not matter with n=0
+}
+void remove_format(args_t *args, bcf1_t *line, rm_tag_t *tag)
+{
+ // remove all FORMAT fields except GT
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+
+ int i;
+ for (i=0; i<line->n_fmt; i++)
+ {
+ bcf_fmt_t *fmt = &line->d.fmt[i];
+ const char *key = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id);
+ if ( key[0]=='G' && key[1]=='T' && !key[2] ) continue;
+
+ if ( fmt->p_free )
+ {
+ free(fmt->p - fmt->p_off);
+ fmt->p_free = 0;
+ }
+ line->d.indiv_dirty = 1;
+ fmt->p = NULL;
+ }
+}
+
+static void remove_hdr_lines(bcf_hdr_t *hdr, int type)
+{
+ int i = 0, nrm = 0;
+ while ( i<hdr->nhrec )
+ {
+ if ( hdr->hrec[i]->type!=type ) { i++; continue; }
+ bcf_hrec_t *hrec = hdr->hrec[i];
+ if ( type==BCF_HL_FMT )
+ {
+ // everything except FORMAT/GT
+ int id = bcf_hrec_find_key(hrec, "ID");
+ if ( id>=0 && !strcmp(hrec->vals[id],"GT") ) { i++; continue; }
+ }
+ nrm++;
+ hdr->nhrec--;
+ if ( i < hdr->nhrec )
+ memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
+ bcf_hrec_destroy(hrec);
+ }
+ if ( nrm ) bcf_hdr_sync(hdr);
+}
+
+static void init_remove_annots(args_t *args)
+{
+ int keep_info = 0, keep_fmt = 0, keep_flt = 0;
+ void *keep = khash_str2int_init();
+ kstring_t str = {0,0,0};
+ char *ss = args->remove_annots;
+ while ( *ss )
+ {
+ args->nrm++;
+ args->rm = (rm_tag_t*) realloc(args->rm,sizeof(rm_tag_t)*args->nrm);
+ rm_tag_t *tag = &args->rm[args->nrm-1];
+ tag->key = NULL;
+
+ int type = BCF_HL_GEN;
+ if ( !strncasecmp("INFO/",ss,5) ) { type = BCF_HL_INFO; ss += 5; }
+ else if ( !strncasecmp("INF/",ss,4) ) { type = BCF_HL_INFO; ss += 4; }
+ else if ( !strncasecmp("FORMAT/",ss,7) ) { type = BCF_HL_FMT; ss += 7; }
+ else if ( !strncasecmp("FMT/",ss,4) ) { type = BCF_HL_FMT; ss += 4; }
+ else if ( !strncasecmp("FILTER/",ss,7) ) { type = BCF_HL_FLT; ss += 7; }
+ else if ( !strncasecmp("^INFO/",ss,6) ) { type = BCF_HL_INFO; ss += 6; keep_info = 1; }
+ else if ( !strncasecmp("^INF/",ss,5) ) { type = BCF_HL_INFO; ss += 5; keep_info = 1; }
+ else if ( !strncasecmp("^FORMAT/",ss,8) ) { type = BCF_HL_FMT; ss += 8; keep_fmt = 1; }
+ else if ( !strncasecmp("^FMT/",ss,5) ) { type = BCF_HL_FMT; ss += 5; keep_fmt = 1; }
+ else if ( !strncasecmp("^FILTER/",ss,8) ) { type = BCF_HL_FLT; ss += 8; keep_flt = 1; }
+
+ char *se = ss;
+ while ( *se && *se!=',' ) se++;
+ str.l = 0;
+ kputsn(ss, se-ss, &str);
+
+ if ( type==BCF_HL_FLT )
+ {
+ if ( !keep_flt )
+ {
+ args->flt_keep_pass = 1;
+ tag->handler = remove_filter;
+ tag->key = strdup(str.s);
+ tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key);
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s);
+ bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
+ }
+ else
+ {
+ int value, ret = khash_str2int_get(keep, str.s, &value);
+ if ( ret==-1 ) khash_str2int_set(keep, strdup(str.s),1<<BCF_HL_FLT);
+ else khash_str2int_set(keep, str.s, value | 1<<BCF_HL_FLT);
+ args->nrm--;
+ }
+ }
+ else if ( type!=BCF_HL_GEN )
+ {
+ int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,str.s);
+ if ( !bcf_hdr_idinfo_exists(args->hdr,type,id) )
+ {
+ fprintf(pysamerr,"Warning: The tag \"%s\" not defined in the header\n", str.s);
+ args->nrm--;
+ }
+ else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) )
+ {
+ int value, ret = khash_str2int_get(keep, str.s, &value);
+ if ( ret==-1 ) khash_str2int_set(keep, strdup(str.s),1<<type);
+ else khash_str2int_set(keep, str.s, value | 1<<type);
+ args->nrm--;
+ }
+ else
+ {
+ tag->key = strdup(str.s);
+ if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag;
+ else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
+ bcf_hdr_remove(args->hdr_out,type,tag->key);
+ }
+ }
+ else if ( !strcasecmp("ID",str.s) ) tag->handler = remove_id;
+ else if ( !strcasecmp("FILTER",str.s) )
+ {
+ tag->handler = remove_filter;
+ remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
+ }
+ else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual;
+ else if ( !strcasecmp("INFO",str.s) )
+ {
+ tag->handler = remove_info;
+ remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
+ }
+ else if ( !strcasecmp("FMT",str.s) || !strcasecmp("FORMAT",str.s) )
+ {
+ tag->handler = remove_format;
+ remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
+ }
+ else if ( str.l )
+ {
+ if ( str.s[0]=='#' && str.s[1]=='#' )
+ bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
+ else
+ bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
+ args->nrm--;
+ }
+
+ ss = *se ? se+1 : se;
+ }
+ free(str.s);
+ if ( keep_flt || keep_info || keep_fmt )
+ {
+ int j;
+ for (j=0; j<args->hdr->nhrec; j++)
+ {
+ bcf_hrec_t *hrec = args->hdr->hrec[j];
+ if ( hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FMT ) continue;
+ if ( !keep_flt && hrec->type==BCF_HL_FLT ) continue;
+ if ( !keep_info && hrec->type==BCF_HL_INFO ) continue;
+ if ( !keep_fmt && hrec->type==BCF_HL_FMT ) continue;
+ int k = bcf_hrec_find_key(hrec,"ID");
+ assert( k>=0 ); // this should always be true for valid VCFs
+ int value, ret = khash_str2int_get(keep,hrec->vals[k],&value);
+ if ( ret==0 && value>>hrec->type ) // keep
+ {
+ if ( hrec->type==BCF_HL_FLT && !strcmp("PASS",hrec->vals[k]) ) args->flt_keep_pass = 1;
+ continue;
+ }
+ args->nrm++;
+ args->rm = (rm_tag_t*) realloc(args->rm,sizeof(rm_tag_t)*args->nrm);
+ rm_tag_t *tag = &args->rm[args->nrm-1];
+ if ( hrec->type==BCF_HL_INFO ) tag->handler = remove_info_tag;
+ else if ( hrec->type==BCF_HL_FMT ) tag->handler = remove_format_tag;
+ else
+ {
+ tag->handler = remove_filter;
+ tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]);
+ }
+ tag->key = strdup(hrec->vals[k]);
+ bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
+ }
+ }
+ khash_str2int_destroy_free(keep);
+ if ( !args->nrm ) error("No matching tag in -x %s\n", args->remove_annots);
+ bcf_hdr_sync(args->hdr_out);
+}
+static void init_header_lines(args_t *args)
+{
+ htsFile *file = hts_open(args->header_fname, "rb");
+ if ( !file ) error("Error reading %s\n", args->header_fname);
+ kstring_t str = {0,0,0};
+ while ( hts_getline(file, KS_SEP_LINE, &str) > 0 )
+ {
+ if ( bcf_hdr_append(args->hdr_out,str.s) ) error("Could not parse %s: %s\n", args->header_fname, str.s);
+ bcf_hdr_append(args->hdr,str.s); // the input file may not have the header line if run with -h (and nothing else)
+ }
+ hts_close(file);
+ free(str.s);
+ bcf_hdr_sync(args->hdr_out);
+ bcf_hdr_sync(args->hdr);
+}
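+// Naming convention for the setters below: setter_* functions take their value
+// from a tab-delimited annotation line (annot_line_t), vcf_setter_* functions
+// take it from a record of the annotation VCF (the second synced reader).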
+static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ // note: so far this works only with one filter, not a list of filters
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
+ hts_expand(int,1,args->mtmpi,args->tmpi);
+ args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]);
+ if ( args->tmpi[0]<0 ) error("The FILTER is not defined in the header: %s\n", tab->cols[col->icol]);
+ if ( col->replace==SET_OR_APPEND ) { bcf_add_filter(args->hdr_out,line,args->tmpi[0]); return 0; }
+ if ( col->replace!=REPLACE_MISSING )
+ {
+ bcf_update_filter(args->hdr_out,line,NULL,0);
+ bcf_update_filter(args->hdr_out,line,args->tmpi,1);
+ return 0;
+ }
+
+ // only update missing FILTER
+ if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
+ if ( !line->d.n_flt )
+ bcf_update_filter(args->hdr_out,line,args->tmpi,1);
+ return 0;
+}
+static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ int i;
+ bcf1_t *rec = (bcf1_t*) data;
+ if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT);
+ if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
+ if ( !rec->d.n_flt ) return 0; // don't overwrite with a missing value
+ if ( col->replace==SET_OR_APPEND || col->replace==REPLACE_MISSING )
+ {
+ if ( col->replace==REPLACE_MISSING && line->d.n_flt ) return 0; // only update missing FILTER
+ for (i=0; i<rec->d.n_flt; i++)
+ {
+ const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
+ bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt));
+ }
+ return 0;
+ }
+ hts_expand(int,rec->d.n_flt,args->mtmpi,args->tmpi);
+ for (i=0; i<rec->d.n_flt; i++)
+ {
+ const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
+ args->tmpi[i] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt);
+ }
+ bcf_update_filter(args->hdr_out,line,NULL,0);
+ bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
+ return 0;
+}
+static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ // possible cases:
+ // IN ANNOT OUT ACHIEVED_BY
+ // x y x -c +ID
+ // x y y -c ID
+ // x y x,y -c =ID
+ // x . x -c +ID, ID
+ // x . . -x ID
+ // . y y -c +ID, -c ID
+ //
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
+ if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,tab->cols[col->icol]);
+ if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]);
+
+ // running with +ID, only update missing ids
+ if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
+ return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]);
+ return 0;
+}
+static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "."
+ if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,rec->d.id);
+ if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,rec->d.id);
+
+ // running with +ID, only update missing ids
+ if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
+ return bcf_update_id(args->hdr_out,line,rec->d.id);
+ return 0;
+}
+static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ char *str = tab->cols[col->icol];
+ if ( str[0]=='.' && str[1]==0 ) return 0; // empty
+
+ if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(line->qual) ) return 0;
+
+ line->qual = strtod(str, &str);
+ if ( str == tab->cols[col->icol] )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+ return 0;
+}
+static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ if ( bcf_float_is_missing(rec->qual) ) return 0;
+ if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(line->qual) ) return 0;
+ line->qual = rec->qual;
+ return 0;
+}
+static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ char *str = tab->cols[col->icol];
+ if ( str[0]=='.' && str[1]==0 ) return 0;
+
+ if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,1);
+ if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,0);
+ error("Could not parse %s at %s:%d .. [%s]\n", bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+ return -1;
+}
+static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key,NULL,NULL);
+ bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,flag);
+ return 0;
+}
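+// For Number=A and Number=R INFO tags the values must be remapped from the
+// allele order of the annotation source to the allele order of the target
+// record; vcmp_map_ARvalues() provides that mapping and unmapped positions are
+// left as (or set to) the missing value.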
+static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi)
+{
+ if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+
+ int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
+ int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
+ if ( !map ) error("REF alleles not compatible at %s:%d\n");
+
+ // fill in any missing values in the target VCF (or all, if not present)
+ int ntmpi2 = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ if ( ntmpi2 < ndst ) hts_expand(int32_t,ndst,args->mtmpi2,args->tmpi2);
+
+ int i;
+ for (i=0; i<ndst; i++)
+ {
+ if ( map[i]<0 )
+ {
+ if ( ntmpi2 < ndst ) args->tmpi2[i] = bcf_int32_missing;
+ continue;
+ }
+ if ( ntmpi2==ndst && col->replace==REPLACE_MISSING
+ && args->tmpi2[i]!=bcf_int32_missing
+ && args->tmpi2[i]!=bcf_int32_vector_end ) continue;
+
+ args->tmpi2[i] = args->tmpi[ map[i] ];
+ }
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst);
+ return 0;
+}
+static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ char *str = tab->cols[col->icol], *end = str;
+ if ( str[0]=='.' && str[1]==0 ) return 0;
+
+ int ntmpi = 0;
+ while ( *end )
+ {
+ int val = strtol(str, &end, 10);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+ ntmpi++;
+ hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
+ args->tmpi[ntmpi-1] = val;
+ str = end+1;
+ }
+
+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
+ return setter_ARinfo_int32(args,line,col,tab->nals,tab->als,ntmpi);
+
+ if ( col->replace==REPLACE_MISSING )
+ {
+ int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
+ }
+
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi);
+ return 0;
+}
+static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi);
+ if ( ntmpi < 0 ) return 0; // nothing to add
+
+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
+ return setter_ARinfo_int32(args,line,col,rec->n_allele,rec->d.allele,ntmpi);
+
+ if ( col->replace==REPLACE_MISSING )
+ {
+ int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
+ }
+
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi);
+ return 0;
+}
+static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
+{
+ if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+
+ int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
+ int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
+ if ( !map ) error("REF alleles not compatible at %s:%d\n");
+
+ // fill in any missing values in the target VCF (or all, if not present)
+ int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ if ( ntmpf2 < ndst ) hts_expand(float,ndst,args->mtmpf2,args->tmpf2);
+
+ int i;
+ for (i=0; i<ndst; i++)
+ {
+ if ( map[i]<0 )
+ {
+ if ( ntmpf2 < ndst ) bcf_float_set_missing(args->tmpf2[i]);
+ continue;
+ }
+ if ( ntmpf2==ndst && col->replace==REPLACE_MISSING
+ && !bcf_float_is_missing(args->tmpf2[i])
+ && !bcf_float_is_vector_end(args->tmpf2[i]) ) continue;
+
+ args->tmpf2[i] = args->tmpf[ map[i] ];
+ }
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst);
+ return 0;
+}
+static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ char *str = tab->cols[col->icol], *end = str;
+ if ( str[0]=='.' && str[1]==0 ) return 0;
+
+ int ntmpf = 0;
+ while ( *end )
+ {
+ double val = strtod(str, &end);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+ ntmpf++;
+ hts_expand(float,ntmpf,args->mtmpf,args->tmpf);
+ args->tmpf[ntmpf-1] = val;
+ str = end+1;
+ }
+
+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
+ return setter_ARinfo_real(args,line,col,tab->nals,tab->als,ntmpf);
+
+ if ( col->replace==REPLACE_MISSING )
+ {
+ int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
+ }
+
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf);
+ return 0;
+}
+static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf);
+ if ( ntmpf < 0 ) return 0; // nothing to add
+
+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
+ return setter_ARinfo_real(args,line,col,rec->n_allele,rec->d.allele,ntmpf);
+
+ if ( col->replace==REPLACE_MISSING )
+ {
+ int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
+ }
+
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf);
+ return 0;
+}
+int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
+static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als)
+{
+ int nsrc = 1, lsrc = 0;
+ while ( args->tmps[lsrc] )
+ {
+ if ( args->tmps[lsrc]==',' ) nsrc++;
+ lsrc++;
+ }
+ if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+
+ int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
+ int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
+ if ( !map ) error("REF alleles not compatible at %s:%d\n");
+
+ // fill in any missing values in the target VCF (or all, if not present)
+ int i, empty = 0, nstr, mstr = args->tmpks.m;
+ nstr = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmpks.s, &mstr);
+ args->tmpks.m = mstr;
+ if ( nstr<0 || (nstr==1 && args->tmpks.s[0]=='.' && args->tmpks.s[1]==0) )
+ {
+ empty = 0;
+ args->tmpks.l = 0;
+ kputc('.',&args->tmpks);
+ for (i=1; i<ndst; i++) kputs(",.",&args->tmpks);
+ }
+ else args->tmpks.l = nstr;
+ for (i=0; i<ndst; i++)
+ {
+ if ( map[i]<0 )
+ {
+ if ( empty ) copy_string_field(".",0,1,&args->tmpks,i);
+ continue;
+ }
+ if ( col->replace==REPLACE_MISSING )
+ {
+ // Do not replace filled values. The field must be looked up again because
+ // of realloc in copy_string_field
+ int n = 0;
+ char *str = args->tmpks.s;
+ while ( *str && n<i )
+ {
+ if ( *str==',' ) n++;
+ str++;
+ }
+ if ( str[0]!='.' || (str[1]!=',' && str[1]!=0) ) continue; // value already set
+ }
+ int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i);
+ assert( ret==0 );
+ }
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmpks.s);
+ return 0;
+}
+static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ int len = strlen(tab->cols[col->icol]);
+ if ( !len ) return 0;
+ hts_expand(char,len+1,args->mtmps,args->tmps);
+ memcpy(args->tmps,tab->cols[col->icol],len+1);
+ if ( args->tmps[0]=='.' && args->tmps[1]==0 ) return 0;
+
+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
+ return setter_ARinfo_string(args,line,col,tab->nals,tab->als);
+
+ if ( col->replace==REPLACE_MISSING )
+ {
+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2);
+ if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
+ }
+
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps);
+ return 0;
+}
+static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmps,&args->mtmps);
+ if ( ntmps < 0 ) return 0; // nothing to add
+
+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
+ return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele);
+
+ if ( col->replace==REPLACE_MISSING )
+ {
+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2);
+ if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
+ }
+
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps);
+ return 0;
+}
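+// The FORMAT setters below support the -s/-S sample mapping and distinguish
+// three cases: the tag is absent in the destination (per-sample values are
+// copied, unmapped samples get missing values), the destination vector is at
+// least as long as the source (values are overwritten in place), or it is
+// shorter (values are rewritten into a larger temporary buffer).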
+static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_genotypes(args->files->readers[1].header,rec,&args->tmpi,&args->mtmpi);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+
+ if ( !args->sample_map )
+ return bcf_update_genotypes(args->hdr_out,line,args->tmpi,nsrc);
+
+ int i, j, ndst = bcf_get_genotypes(args->hdr,line,&args->tmpi2,&args->mtmpi2);
+ if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
+ nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
+ if ( ndst<=0 ) // field not present in dst file
+ {
+ if ( col->replace==REPLACE_EXISTING ) return 0;
+ hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ int32_t *dst = args->tmpi2 + nsrc*i;
+ if ( args->sample_map[i]==-1 )
+ {
+ dst[0] = bcf_gt_missing;
+ for (j=1; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ }
+ else
+ {
+ int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ }
+ }
+ return bcf_update_genotypes(args->hdr_out,line,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ }
+ else if ( ndst >= nsrc )
+ {
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ if ( args->sample_map[i]==-1 ) continue;
+ int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ int32_t *dst = args->tmpi2 + ndst*i;
+ if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(dst[0]) ) continue;
+ if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(dst[0]) ) continue;
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
+ }
+ return bcf_update_genotypes(args->hdr_out,line,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ }
+ else // ndst < nsrc
+ {
+ hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ int32_t *ori = args->tmpi2 + ndst*i;
+ int32_t *dst = args->tmpi3 + nsrc*i;
+ int keep_ori = 0;
+ if ( args->sample_map[i]==-1 ) keep_ori = 1;
+ else if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
+ else if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
+ if ( keep_ori )
+ {
+ for (j=0; j<ndst; j++) dst[j] = ori[j];
+ for (; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ }
+ else
+ {
+ int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ }
+ }
+ return bcf_update_genotypes(args->hdr_out,line,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ }
+}
+static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+
+ if ( !args->sample_map )
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsrc);
+
+ int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key,&args->tmpi2,&args->mtmpi2);
+ if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
+ nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
+ if ( ndst<=0 )
+ {
+ if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present
+ hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ int32_t *dst = args->tmpi2 + nsrc*i;
+ if ( args->sample_map[i]==-1 )
+ {
+ dst[0] = bcf_int32_missing;
+ for (j=1; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ }
+ else
+ {
+ int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ }
+ }
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ }
+ else if ( ndst >= nsrc )
+ {
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ if ( args->sample_map[i]==-1 ) continue;
+ int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ int32_t *dst = args->tmpi2 + ndst*i;
+ if ( col->replace==REPLACE_EXISTING && dst[0]==bcf_int32_missing ) continue;
+ if ( col->replace==REPLACE_MISSING && dst[0]!=bcf_int32_missing ) continue;
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
+ }
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ }
+ else // ndst < nsrc
+ {
+ hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ int32_t *ori = args->tmpi2 + ndst*i;
+ int32_t *dst = args->tmpi3 + nsrc*i;
+ int keep_ori = 0;
+ if ( args->sample_map[i]==-1 ) keep_ori = 1;
+ else if ( col->replace==REPLACE_EXISTING && ori[0]==bcf_int32_missing ) keep_ori = 1;
+ else if ( col->replace==REPLACE_MISSING && ori[0]!=bcf_int32_missing ) keep_ori = 1;
+ if ( keep_ori )
+ {
+ for (j=0; j<ndst; j++) dst[j] = ori[j];
+ for (; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ }
+ else
+ {
+ int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ }
+ }
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ }
+}
+static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+
+ if ( !args->sample_map )
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsrc);
+
+ int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key,&args->tmpf2,&args->mtmpf2);
+ if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
+ nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
+ if ( ndst<=0 )
+ {
+ if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present
+ hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ float *dst = args->tmpf2 + nsrc*i;
+ if ( args->sample_map[i]==-1 )
+ {
+ bcf_float_set_missing(dst[0]);
+ for (j=1; j<nsrc; j++) bcf_float_set_vector_end(dst[j]);
+ }
+ else
+ {
+ float *src = args->tmpf + nsrc*args->sample_map[i];
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ }
+ }
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ }
+ else if ( ndst >= nsrc )
+ {
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ if ( args->sample_map[i]==-1 ) continue;
+ float *src = args->tmpf + nsrc*args->sample_map[i];
+ float *dst = args->tmpf2 + ndst*i;
+ if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(dst[0]) ) continue;
+ if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(dst[0]) ) continue;
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ for (; j<ndst; j++) bcf_float_set_vector_end(dst[j]);
+ }
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ }
+ else // ndst < nsrc
+ {
+ hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ float *ori = args->tmpf2 + ndst*i;
+ float *dst = args->tmpf3 + nsrc*i;
+ int keep_ori = 0;
+ if ( args->sample_map[i]==-1 ) keep_ori = 1;
+ else if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(ori[0]) ) keep_ori = 1;
+ else if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(ori[0]) ) keep_ori = 1;
+ if ( keep_ori )
+ {
+ for (j=0; j<ndst; j++) dst[j] = ori[j];
+ for (; j<nsrc; j++) bcf_float_set_vector_end(dst[j]);
+ }
+ else
+ {
+ float *src = args->tmpf + nsrc*args->sample_map[i];
+ for (j=0; j<nsrc; j++) dst[j] = src[j];
+ }
+ }
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ }
+}
+static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ args->tmpp[0] = args->tmps;
+ int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmpp,&args->mtmps);
+ args->tmps = args->tmpp[0]; // tmps might be realloced
+ if ( ret==-3 ) return 0; // the tag is not present
+ if ( ret<=0 ) return 1; // error
+
+ if ( !args->sample_map )
+ return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp,bcf_hdr_nsamples(args->hdr_out));
+
+ int i;
+ args->tmpp2[0] = args->tmps2;
+ ret = bcf_get_format_string(args->hdr,line,col->hdr_key,&args->tmpp2,&args->mtmps2);
+ args->tmps2 = args->tmpp2[0]; // tmps2 might be realloced
+
+ if ( ret<=0 ) // not present in dst
+ {
+ hts_expand(char,bcf_hdr_nsamples(args->hdr_out)*2,args->mtmps2,args->tmps2);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ args->tmps2[2*i] = '.';
+ args->tmps2[2*i+1] = 0;
+ args->tmpp2[i] = args->tmps2+2*i;
+ }
+ }
+
+ for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ {
+ int isrc = args->sample_map[i];
+ if ( isrc==-1 ) continue;
+ args->tmpp2[i] = args->tmpp[isrc];
+ }
+ return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp2,bcf_hdr_nsamples(args->hdr_out));
+}
+static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_samples)
+{
+ int i;
+ if ( !args->sample_names )
+ {
+ int nmatch = 0, order_ok = 1;
+ for (i=0; i<bcf_hdr_nsamples(src); i++)
+ {
+ int id = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, src->samples[i]);
+ if ( id!=-1 )
+ {
+ nmatch++;
+ if ( i!=id ) order_ok = 0;
+ }
+ }
+ if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok && !need_samples )
+ return; // the same samples in both files
+
+ if ( !nmatch ) error("No matching samples found in the source and the destination file\n");
+ if ( nmatch!=bcf_hdr_nsamples(src) || nmatch!=bcf_hdr_nsamples(dst) ) fprintf(pysamerr,"%d sample(s) in common\n", nmatch);
+
+ args->nsample_map = bcf_hdr_nsamples(dst);
+ args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
+ for (i=0; i<args->nsample_map; i++)
+ {
+ int id = bcf_hdr_id2int(src, BCF_DT_SAMPLE, dst->samples[i]);
+ args->sample_map[i] = id; // idst -> isrc, -1 if not present
+ }
+ return;
+ }
+
+ args->nsample_map = bcf_hdr_nsamples(dst);
+ args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
+ for (i=0; i<args->nsample_map; i++) args->sample_map[i] = -1;
+
+ int nsamples = 0;
+ char **samples = hts_readlist(args->sample_names, args->sample_is_file, &nsamples);
+ for (i=0; i<nsamples; i++)
+ {
+ int isrc, idst;
+ char *ss = samples[i], *se = samples[i];
+ while ( *se && !isspace(*se) ) se++;
+ if ( !*se )
+ {
+ // only one sample name
+ isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
+ if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
+ idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
+ if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
+ args->sample_map[idst] = isrc;
+ continue;
+ }
+ *se = 0;
+ isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
+ if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
+
+ ss = se+1;
+ while ( isspace(*ss) ) ss++;
+ se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
+ idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
+ if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
+
+ args->sample_map[idst] = isrc;
+ }
+ for (i=0; i<nsamples; i++) free(samples[i]);
+ free(samples);
+}
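+// Translate ^INFO/TAG and ^FORMAT/TAG exclusions in the -c list into plain
+// "INFO" and "FORMAT" requests plus skip_info/skip_fmt hashes, which
+// init_columns() consults when expanding the wildcard requests.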
+static char *columns_complement(char *columns, void **skip_info, void **skip_fmt)
+{
+ kstring_t str = {0,0,0};
+ char *ss = columns, *se = ss;
+ while ( *ss )
+ {
+ if ( *se && *se!=',' ) { se++; continue; }
+ if ( *ss!='^' )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputsn(ss, se-ss, &str);
+ if ( !*se ) break;
+ ss = ++se;
+ continue;
+ }
+
+ if ( !strncasecmp("^INFO/",ss,6) )
+ {
+ if ( !*skip_info )
+ {
+ *skip_info = khash_str2int_init();
+ if ( str.l ) kputc(',',&str);
+ kputs("INFO",&str);
+ }
+ char tmp = *se; *se = 0;
+ khash_str2int_inc(*skip_info, strdup(ss+6));
+ *se = tmp;
+ }
+ else if ( !strncasecmp("^FORMAT/",ss,8) || !strncasecmp("^FMT/",ss,5) )
+ {
+ int n = !strncasecmp("^FMT/",ss,5) ? 5 : 8;
+ if ( !*skip_fmt )
+ {
+ *skip_fmt = khash_str2int_init();
+ if ( str.l ) kputc(',',&str);
+ kputs("FORMAT",&str);
+ }
+ char tmp = *se; *se = 0;
+ khash_str2int_inc(*skip_fmt, strdup(ss+n));
+ *se = tmp;
+ }
+ else
+ {
+ if ( !*skip_info )
+ {
+ *skip_info = khash_str2int_init();
+ if ( str.l ) kputc(',',&str);
+ kputs("INFO",&str);
+ }
+ char tmp = *se; *se = 0;
+ khash_str2int_inc(*skip_info, strdup(ss+1));
+ *se = tmp;
+ }
+
+ if ( !*se ) break;
+ ss = ++se;
+ }
+ free(columns);
+ return str.s;
+}
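+// Parse the -c/--columns list. A leading '+' selects REPLACE_MISSING (fill
+// only missing values), '-' selects REPLACE_EXISTING (overwrite only existing
+// values), '=' selects SET_OR_APPEND, and no prefix means REPLACE_ALL.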
+static void init_columns(args_t *args)
+{
+ void *skip_fmt = NULL, *skip_info = NULL;
+ if ( args->tgts_is_vcf )
+ args->columns = columns_complement(args->columns, &skip_info, &skip_fmt);
+
+ kstring_t str = {0,0,0}, tmp = {0,0,0};
+ char *ss = args->columns, *se = ss;
+ args->ncols = 0;
+ int i = -1, has_fmt_str = 0, force_samples = -1;
+ while ( *ss )
+ {
+ if ( *se && *se!=',' ) { se++; continue; }
+ int replace = REPLACE_ALL;
+ if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; }
+ else if ( *ss=='-' ) { replace = REPLACE_EXISTING; ss++; }
+ else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; }
+ i++;
+ str.l = 0;
+ kputsn(ss, se-ss, &str);
+ if ( !str.s[0] || !strcasecmp("-",str.s) ) ;
+ else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = i;
+ else if ( !strcasecmp("POS",str.s) ) args->from_idx = i;
+ else if ( !strcasecmp("FROM",str.s) ) args->from_idx = i;
+ else if ( !strcasecmp("TO",str.s) ) args->to_idx = i;
+ else if ( !strcasecmp("REF",str.s) ) args->ref_idx = i;
+ else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = i;
+ else if ( !strcasecmp("ID",str.s) )
+ {
+ if ( replace==REPLACE_EXISTING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = i;
+ col->replace = replace;
+ col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
+ col->hdr_key = strdup(str.s);
+ }
+ else if ( !strcasecmp("FILTER",str.s) )
+ {
+ if ( replace==REPLACE_EXISTING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = i;
+ col->replace = replace;
+ col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
+ col->hdr_key = strdup(str.s);
+ if ( args->tgts_is_vcf )
+ {
+ bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
+ int j;
+ for (j=0; j<tgts_hdr->nhrec; j++)
+ {
+ bcf_hrec_t *hrec = tgts_hdr->hrec[j];
+ if ( hrec->type!=BCF_HL_FLT ) continue;
+ int k = bcf_hrec_find_key(hrec,"ID");
+ assert( k>=0 ); // this should always be true for valid VCFs
+ tmp.l = 0;
+ bcf_hrec_format(hrec, &tmp);
+ bcf_hdr_append(args->hdr_out, tmp.s);
+ }
+ bcf_hdr_sync(args->hdr_out);
+ }
+ }
+ else if ( !strcasecmp("QUAL",str.s) )
+ {
+ if ( replace==REPLACE_EXISTING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
+ if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = i;
+ col->replace = replace;
+ col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
+ col->hdr_key = strdup(str.s);
+ }
+ else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
+ {
+ if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
+ bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
+ int j;
+ for (j=0; j<tgts_hdr->nhrec; j++)
+ {
+ bcf_hrec_t *hrec = tgts_hdr->hrec[j];
+ if ( hrec->type!=BCF_HL_INFO ) continue;
+ int k = bcf_hrec_find_key(hrec,"ID");
+ assert( k>=0 ); // this should always be true for valid VCFs
+ if ( skip_info && khash_str2int_has_key(skip_info,hrec->vals[k]) ) continue;
+ tmp.l = 0;
+ bcf_hrec_format(hrec, &tmp);
+ bcf_hdr_append(args->hdr_out, tmp.s);
+ bcf_hdr_sync(args->hdr_out);
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = -1;
+ col->replace = replace;
+ col->hdr_key = strdup(hrec->vals[k]);
+ col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
+ switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
+ {
+ case BCF_HT_FLAG: col->setter = vcf_setter_info_flag; break;
+ case BCF_HT_INT: col->setter = vcf_setter_info_int; break;
+ case BCF_HT_REAL: col->setter = vcf_setter_info_real; break;
+ case BCF_HT_STR: col->setter = vcf_setter_info_str; break;
+ default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
+ }
+ }
+ }
+ else if ( args->tgts_is_vcf && (!strcasecmp("FORMAT",str.s) || !strcasecmp("FMT",str.s)) ) // All FORMAT fields
+ {
+ bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
+ if ( force_samples<0 ) force_samples = replace;
+ if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ int j;
+ for (j=0; j<tgts_hdr->nhrec; j++)
+ {
+ bcf_hrec_t *hrec = tgts_hdr->hrec[j];
+ if ( hrec->type!=BCF_HL_FMT) continue;
+ int k = bcf_hrec_find_key(hrec,"ID");
+ assert( k>=0 ); // this should always be true for valid VCFs
+ if ( skip_fmt && khash_str2int_has_key(skip_fmt,hrec->vals[k]) ) continue;
+ tmp.l = 0;
+ bcf_hrec_format(hrec, &tmp);
+ bcf_hdr_append(args->hdr_out, tmp.s);
+ bcf_hdr_sync(args->hdr_out);
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = -1;
+ col->replace = replace;
+ col->hdr_key = strdup(hrec->vals[k]);
+ if ( !strcasecmp("GT",col->hdr_key) ) col->setter = vcf_setter_format_gt;
+ else
+ switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
+ {
+ case BCF_HT_INT: col->setter = vcf_setter_format_int; break;
+ case BCF_HT_REAL: col->setter = vcf_setter_format_real; break;
+ case BCF_HT_STR: col->setter = vcf_setter_format_str; has_fmt_str = 1; break;
+ default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id));
+ }
+ }
+ }
+ else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) )
+ {
+ if ( !args->tgts_is_vcf )
+ error("Error: FORMAT fields can be carried over from a VCF file only.\n");
+
+ char *key = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
+ if ( force_samples<0 ) force_samples = replace;
+ if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL);
+ tmp.l = 0;
+ bcf_hrec_format(hrec, &tmp);
+ bcf_hdr_append(args->hdr_out, tmp.s);
+ bcf_hdr_sync(args->hdr_out);
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key);
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = -1;
+ col->replace = replace;
+ col->hdr_key = strdup(key);
+ if ( !strcasecmp("GT",key) ) col->setter = vcf_setter_format_gt;
+ else
+ switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
+ {
+ case BCF_HT_INT: col->setter = vcf_setter_format_int; break;
+ case BCF_HT_REAL: col->setter = vcf_setter_format_real; break;
+ case BCF_HT_STR: col->setter = vcf_setter_format_str; has_fmt_str = 1; break;
+ default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id));
+ }
+ }
+ else
+ {
+ if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
+ if ( !strncasecmp("INFO/",str.s,5) ) { memmove(str.s,str.s+5,str.l-4); }
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s);
+ if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) )
+ {
+ if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line
+ {
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", str.s, NULL);
+ if ( !hrec ) error("The tag \"%s\" is not defined in %s\n", str.s,args->files->readers[1].fname);
+ tmp.l = 0;
+ bcf_hrec_format(hrec, &tmp);
+ bcf_hdr_append(args->hdr_out, tmp.s);
+ bcf_hdr_sync(args->hdr_out);
+ hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s);
+ }
+ else
+ error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
+ assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
+ }
+
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ col->icol = i;
+ col->replace = replace;
+ col->hdr_key = strdup(str.s);
+ col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
+ switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
+ {
+ case BCF_HT_FLAG: col->setter = args->tgts_is_vcf ? vcf_setter_info_flag : setter_info_flag; break;
+ case BCF_HT_INT: col->setter = args->tgts_is_vcf ? vcf_setter_info_int : setter_info_int; break;
+ case BCF_HT_REAL: col->setter = args->tgts_is_vcf ? vcf_setter_info_real : setter_info_real; break;
+ case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_info_str : setter_info_str; break;
+ default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
+ }
+ }
+ if ( !*se ) break;
+ ss = ++se;
+ }
+ free(str.s);
+ free(tmp.s);
+ if ( args->to_idx==-1 ) args->to_idx = args->from_idx;
+ free(args->columns);
+ if ( skip_info ) khash_str2int_destroy_free(skip_info);
+ if ( skip_fmt ) khash_str2int_destroy_free(skip_fmt);
+ if ( has_fmt_str )
+ {
+ int n = bcf_hdr_nsamples(args->hdr_out) > bcf_hdr_nsamples(args->files->readers[1].header) ? bcf_hdr_nsamples(args->hdr_out) : bcf_hdr_nsamples(args->files->readers[1].header);
+ args->tmpp = (char**)malloc(sizeof(char*)*n);
+ args->tmpp2 = (char**)malloc(sizeof(char*)*n);
+ }
+ if ( force_samples>=0 )
+ set_samples(args, args->files->readers[1].header, args->hdr, force_samples==REPLACE_ALL ? 0 : 1);
+}
+
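+// --rename-chrs: each line of the map file is an "old new" pair separated by
+// whitespace; matching contig header records are renamed in place and
+// sequences not present in the header are silently skipped.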
+static void rename_chrs(args_t *args, char *fname)
+{
+ int n, i;
+ char **map = hts_readlist(fname, 1, &n);
+ if ( !map ) error("Could not read: %s\n", fname);
+ for (i=0; i<n; i++)
+ {
+ char *ss = map[i];
+ while ( *ss && !isspace(*ss) ) ss++;
+ if ( !*ss ) error("Could not parse: %s\n", fname);
+ *ss = 0;
+ int rid = bcf_hdr_name2id(args->hdr_out, map[i]);
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_CTG, "ID", map[i], NULL);
+ if ( !hrec ) continue; // the sequence not present
+ int j = bcf_hrec_find_key(hrec, "ID");
+ assert( j>=0 );
+ free(hrec->vals[j]);
+ ss++;
+ while ( *ss && isspace(*ss) ) ss++;
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ *se = 0;
+ hrec->vals[j] = strdup(ss);
+ args->hdr_out->id[BCF_DT_CTG][rid].key = hrec->vals[j];
+ }
+ for (i=0; i<n; i++) free(map[i]);
+ free(map);
+}
+
+static void init_data(args_t *args)
+{
+ args->hdr = args->files->readers[0].header;
+ args->hdr_out = bcf_hdr_dup(args->hdr);
+
+ if ( args->remove_annots ) init_remove_annots(args);
+ if ( args->header_fname ) init_header_lines(args);
+ if ( args->targets_fname && args->tgts_is_vcf )
+ {
+ // reading annots from a VCF
+ if ( !bcf_sr_add_reader(args->files, args->targets_fname) )
+ error("Failed to open %s: %s\n", args->targets_fname,bcf_sr_strerror(args->files->errnum));
+ }
+ if ( args->columns ) init_columns(args);
+ if ( args->targets_fname && !args->tgts_is_vcf )
+ {
+ if ( !args->columns ) error("The -c option not given\n");
+ if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n");
+ if ( args->from_idx==-1 ) error("The -c POS option not given\n");
+ if ( args->to_idx==-1 ) args->to_idx = -args->from_idx - 1;
+
+ args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->from_idx,args->to_idx);
+ if ( !args->tgts ) error("Could not initialize the annotation file: %s\n", args->targets_fname);
+ if ( !args->tgts->tbx ) error("Expected tabix-indexed annotation file: %s\n", args->targets_fname);
+ }
+ args->vcmp = vcmp_init();
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
+
+ if ( args->set_ids_fmt )
+ {
+ if ( args->set_ids_fmt[0]=='+' ) { args->set_ids_replace = 0; args->set_ids_fmt++; }
+ args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt);
+ }
+
+ if ( args->mark_sites )
+ {
+ if ( !args->targets_fname ) error("The -a option not given\n");
+ if ( args->tgts_is_vcf ) error("Apologies, this has not been implemented yet: -a is a VCF\n"); // very easy to add..
+ bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
+ args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
+ }
+
+ bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
+ if ( !args->drop_header )
+ {
+ if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs);
+
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ bcf_hdr_write(args->out_fh, args->hdr_out);
+ }
+}
+
+static void destroy_data(args_t *args)
+{
+ int i;
+ for (i=0; i<args->nrm; i++) free(args->rm[i].key);
+ free(args->rm);
+ if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
+ if (args->vcmp) vcmp_destroy(args->vcmp);
+ for (i=0; i<args->ncols; i++)
+ free(args->cols[i].hdr_key);
+ free(args->cols);
+ for (i=0; i<args->malines; i++)
+ {
+ free(args->alines[i].cols);
+ free(args->alines[i].als);
+ free(args->alines[i].line.s);
+ }
+ free(args->alines);
+ if ( args->tgts ) bcf_sr_regions_destroy(args->tgts);
+ free(args->tmpks.s);
+ free(args->tmpi);
+ free(args->tmpf);
+ free(args->tmps);
+ free(args->tmpp);
+ free(args->tmpi2);
+ free(args->tmpf2);
+ free(args->tmps2);
+ free(args->tmpp2);
+ free(args->tmpi3);
+ free(args->tmpf3);
+ if ( args->set_ids )
+ convert_destroy(args->set_ids);
+ if ( args->filter )
+ filter_destroy(args->filter);
+ if (args->out_fh) hts_close(args->out_fh);
+ free(args->sample_map);
+}
+
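+// Keep a buffer of annotation lines from the tab-delimited -a file overlapping
+// the current record: lines ending before the record are dropped, new
+// overlapping lines are read in and split into columns and, when REF/ALT
+// columns were given, into alleles.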
+static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int end_pos)
+{
+ if ( args->nalines && args->alines[0].rid != line->rid ) args->nalines = 0;
+
+ int i = 0;
+ while ( i<args->nalines )
+ {
+ if ( line->pos > args->alines[i].end )
+ {
+ args->nalines--;
+ if ( args->nalines && i<args->nalines )
+ {
+ annot_line_t tmp = args->alines[i];
+ memmove(&args->alines[i],&args->alines[i+1],(args->nalines-i)*sizeof(annot_line_t));
+ args->alines[args->nalines] = tmp;
+ }
+ }
+ else i++;
+ }
+
+ if ( args->ref_idx==-1 && args->nalines ) return;
+
+ while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) )
+ {
+ args->nalines++;
+ hts_expand0(annot_line_t,args->nalines,args->malines,args->alines);
+ annot_line_t *tmp = &args->alines[args->nalines-1];
+ tmp->rid = line->rid;
+ tmp->start = args->tgts->start;
+ tmp->end = args->tgts->end;
+ tmp->line.l = 0;
+ kputs(args->tgts->line.s, &tmp->line);
+ char *s = tmp->line.s;
+ tmp->ncols = 1;
+ hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols);
+ tmp->cols[0] = s;
+ while ( *s )
+ {
+ if ( *s=='\t' )
+ {
+ tmp->ncols++;
+ hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols);
+ tmp->cols[tmp->ncols-1] = s+1;
+ *s = 0;
+ }
+ s++;
+ }
+ if ( args->ref_idx != -1 )
+ {
+ assert( args->ref_idx < tmp->ncols );
+ assert( args->alt_idx < tmp->ncols );
+ tmp->nals = 2;
+ hts_expand(char*,tmp->nals,tmp->mals,tmp->als);
+ tmp->als[0] = tmp->cols[args->ref_idx];
+ tmp->als[1] = s = tmp->cols[args->alt_idx];
+ while ( *s )
+ {
+ if ( *s==',' )
+ {
+ tmp->nals++;
+ hts_expand(char*,tmp->nals,tmp->mals,tmp->als);
+ tmp->als[tmp->nals-1] = s+1;
+ *s = 0;
+ }
+ s++;
+ }
+ int iseq = args->tgts->iseq;
+ if ( bcf_sr_regions_next(args->tgts)<0 || args->tgts->iseq!=iseq ) break;
+ }
+ else break;
+ }
+}
+
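+// Annotate a single record: apply the removal handlers, then either pick an
+// overlapping line from the tab-delimited buffer (when REF/ALT columns are
+// given, at least one annotation ALT must match a VCF ALT) or take the
+// matching record from the annotation VCF, run all column setters, optionally
+// set the -m flag, and finally regenerate the ID when -I was given.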
+static void annotate(args_t *args, bcf1_t *line)
+{
+ int i, j;
+ for (i=0; i<args->nrm; i++)
+ args->rm[i].handler(args, line, &args->rm[i]);
+
+ if ( args->tgts )
+ {
+ // Buffer annotation lines. When multiple ALT alleles are present in the
+ // annotation file, at least one must match one of the VCF alleles.
+ int len = 0;
+ bcf_get_variant_types(line);
+ for (i=1; i<line->n_allele; i++)
+ if ( len > line->d.var[i].n ) len = line->d.var[i].n;
+ int end_pos = len<0 ? line->pos - len : line->pos;
+ buffer_annot_lines(args, line, line->pos, end_pos);
+ for (i=0; i<args->nalines; i++)
+ {
+ if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue;
+ if ( args->ref_idx != -1 )
+ {
+ if ( vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs not compatible
+ for (j=1; j<args->alines[i].nals; j++)
+ {
+ if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) break; // no ALT allele in VCF and annot file has "."
+ if ( vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]) >= 0 ) break;
+ }
+ if ( j==args->alines[i].nals ) continue; // none of the annot alleles present in VCF's ALT
+ }
+ break;
+ }
+
+ if ( i<args->nalines )
+ {
+ // there is a matching line
+ for (j=0; j<args->ncols; j++)
+ if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) )
+ error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+
+ }
+
+ if ( args->mark_sites )
+ {
+ // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87
+ if ( args->mark_sites_logic==MARK_LISTED )
+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,i<args->nalines?1:0);
+ else
+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,i<args->nalines?0:1);
+ }
+ }
+ else if ( args->files->nreaders == 2 && bcf_sr_has_line(args->files,1) )
+ {
+ bcf1_t *aline = bcf_sr_get_line(args->files,1);
+ for (j=0; j<args->ncols; j++)
+ if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
+ error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ }
+ if ( args->set_ids )
+ {
+ args->tmpks.l = 0;
+ convert_line(args->set_ids, line, &args->tmpks);
+ if ( args->tmpks.l )
+ {
+ int replace = 0;
+ if ( args->set_ids_replace ) replace = 1;
+ else if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) ) replace = 1;
+ if ( replace )
+ bcf_update_id(args->hdr_out,line,args->tmpks.s);
+ }
+ }
+}
+
+static void usage(args_t *args)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Annotate and edit VCF/BCF files.\n");
+ fprintf(pysamerr, "Usage: bcftools annotate [options] <in.vcf.gz>\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Options:\n");
+ fprintf(pysamerr, " -a, --annotations <file> VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n");
+ fprintf(pysamerr, " -c, --columns <list> list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
+ fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(pysamerr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
+ fprintf(pysamerr, " -I, --set-id [+]<format> set ID column, see man pagee for details\n");
+ fprintf(pysamerr, " -i, --include <expr> select sites for which the expression is true (see man pagee for details)\n");
+ fprintf(pysamerr, " -m, --mark-sites [+-]<tag> add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
+ fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysamerr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " --rename-chrs <file> rename sequences according to map file: from\\tto\n");
+ fprintf(pysamerr, " -s, --samples [^]<list> comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(pysamerr, " -S, --samples-file [^]<file> file of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(pysamerr, " -x, --remove <list> list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n");
+ fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_vcfannotate(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->files = bcf_sr_init();
+ args->output_fname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1;
+ args->set_ids_replace = 1;
+ int regions_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"mark-sites",required_argument,NULL,'m'},
+ {"set-id",required_argument,NULL,'I'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"annotations",required_argument,NULL,'a'},
+ {"include",required_argument,NULL,'i'},
+ {"exclude",required_argument,NULL,'e'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"remove",required_argument,NULL,'x'},
+ {"columns",required_argument,NULL,'c'},
+ {"rename-chrs",required_argument,NULL,1},
+ {"header-lines",required_argument,NULL,'h'},
+ {"samples",required_argument,NULL,'s'},
+ {"samples-file",required_argument,NULL,'S'},
+ {NULL,0,NULL,0}
+ };
+ while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0)
+ {
+ switch (c) {
+ case 'm':
+ args->mark_sites_logic = MARK_LISTED;
+ if ( optarg[0]=='+' ) args->mark_sites = optarg+1;
+ else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; }
+ else args->mark_sites = optarg;
+ break;
+ case 'I': args->set_ids_fmt = optarg; break;
+ case 's': args->sample_names = optarg; break;
+ case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
+ case 'c': args->columns = strdup(optarg); break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ };
+ break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'x': args->remove_annots = optarg; break;
+ case 'a': args->targets_fname = optarg; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 'h': args->header_fname = optarg; break;
+ case 1 : args->rename_chrs = optarg; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case '?': usage(args); break;
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(args);
+ }
+ else fname = argv[optind];
+
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ if ( args->targets_fname )
+ {
+ htsFile *fp = hts_open(args->targets_fname,"r");
+ htsFormat type = *hts_get_format(fp);
+ hts_close(fp);
+
+ if ( type.format==vcf || type.format==bcf )
+ {
+ args->tgts_is_vcf = 1;
+ args->files->require_index = 1;
+ args->files->collapse |= COLLAPSE_SOME;
+ }
+ }
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+
+ init_data(args);
+ while ( bcf_sr_next_line(args->files) )
+ {
+ if ( !bcf_sr_has_line(args->files,0) ) continue;
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( line->errcode ) error("Encountered error, cannot proceed. Please check the error output above.\n");
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) continue;
+ }
+ annotate(args, line);
+ bcf_write1(args->out_fh, args->hdr_out, line);
+ }
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c
new file mode 100644
index 0000000..a28caee
--- /dev/null
+++ b/bcftools/vcfcall.c
@@ -0,0 +1,822 @@
+/* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <time.h>
+#include <zlib.h>
+#include <htslib/kfunc.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/khash_str2int.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "call.h"
+#include "prob1.h"
+#include "ploidy.h"
+#include "gvcf.h"
+
+void error(const char *format, ...);
+
+#ifdef _WIN32
+#define srand48(x) srand(x)
+#define lrand48() rand()
+#endif
+
+#define CF_NO_GENO 1
+#define CF_INS_MISSED (1<<1)
+#define CF_CCALL (1<<2)
+// (1<<3)
+// (1<<4)
+// (1<<5)
+#define CF_ACGT_ONLY (1<<6)
+#define CF_QCALL (1<<7)
+#define CF_ADJLD (1<<8)
+#define CF_NO_INDEL (1<<9)
+#define CF_ANNO_MAX (1<<10)
+#define CF_MCALL (1<<11)
+#define CF_PAIRCALL (1<<12)
+#define CF_QCNT (1<<13)
+#define CF_INDEL_ONLY (1<<14)
+
+typedef struct
+{
+ int flag; // combination of CF_* flags above
+ int output_type, n_threads;
+ htsFile *bcf_in, *out_fh;
+ char *bcf_fname, *output_fname;
+ char **samples; // for subsampling and ploidy
+ int nsamples, *samples_map; // mapping from output sample names to original VCF
+ char *regions, *targets; // regions to process
+ int regions_is_file, targets_is_file;
+
+ char *samples_fname;
+ int samples_is_file;
+ int *sample2sex; // mapping for ploidy. If negative, interpreted as -1*ploidy
+ int *sex2ploidy, *sex2ploidy_prev, nsex;
+ ploidy_t *ploidy;
+ gvcf_t *gvcf;
+
+ bcf1_t *missed_line;
+ call_t aux; // parameters and temporary data
+
+ int argc;
+ char **argv;
+
+ // int flag, prior_type, n1, n_sub, *sublist, n_perm;
+ // uint32_t *trio_aux;
+ // char *prior_file, **subsam;
+ // uint8_t *ploidy;
+ // double theta, pref, indel_frac, min_smpl_frac, min_lrt;
+ // Permutation tests
+ // int n_perm, *seeds;
+ // double min_perm_p;
+ // void *bed;
+}
+args_t;
+
+static char **add_sample(void *name2idx, char **lines, int *nlines, int *mlines, char *name, char sex, int *ith)
+{
+ int ret = khash_str2int_get(name2idx, name, ith);
+ if ( ret==0 ) return lines;
+
+ hts_expand(char*,(*nlines+1),*mlines,lines);
+ int len = strlen(name);
+ lines[*nlines] = (char*) malloc(len+3);
+ memcpy(lines[*nlines],name,len);
+ lines[*nlines][len] = ' ';
+ lines[*nlines][len+1] = sex;
+ lines[*nlines][len+2] = 0;
+ *ith = *nlines;
+ (*nlines)++;
+ khash_str2int_set(name2idx, strdup(name), *ith);
+ return lines;
+}
+
+typedef struct
+{
+ const char *alias, *about, *ploidy;
+}
+ploidy_predef_t;
+
+static ploidy_predef_t ploidy_predefs[] =
+{
+ { .alias = "GRCh37",
+ .about = "Human Genome reference assembly GRCh37 / hg19",
+ .ploidy =
+ "X 1 60000 M 1\n"
+ "X 2699521 154931043 M 1\n"
+ "Y 1 59373566 M 1\n"
+ "Y 1 59373566 F 0\n"
+ "MT 1 16569 M 1\n"
+ "MT 1 16569 F 1\n"
+ "chrX 1 60000 M 1\n"
+ "chrX 2699521 154931043 M 1\n"
+ "chrY 1 59373566 M 1\n"
+ "chrY 1 59373566 F 0\n"
+ "chrM 1 16569 M 1\n"
+ "chrM 1 16569 F 1\n"
+ "* * * M 2\n"
+ "* * * F 2\n"
+ },
+ { .alias = "GRCh38",
+ .about = "Human Genome reference assembly GRCh38 / hg38, plain chromosome naming (1,2,3,..)",
+ .ploidy =
+ "X 1 9999 M 1\n"
+ "X 2781480 155701381 M 1\n"
+ "Y 1 57227415 M 1\n"
+ "Y 1 57227415 F 0\n"
+ "MT 1 16569 M 1\n"
+ "MT 1 16569 F 1\n"
+ "chrX 1 9999 M 1\n"
+ "chrX 2781480 155701381 M 1\n"
+ "chrY 1 57227415 M 1\n"
+ "chrY 1 57227415 F 0\n"
+ "chrM 1 16569 M 1\n"
+ "chrM 1 16569 F 1\n"
+ "* * * M 2\n"
+ "* * * F 2\n"
+ },
+ { .alias = "X",
+ .about = "Treat male samples as haploid and female as diploid regardless of the chromosome name",
+ .ploidy =
+ "* * * M 1\n"
+ "* * * F 2\n"
+ },
+ { .alias = "Y",
+ .about = "Treat male samples as haploid and female as no-copy, regardless of the chromosome name",
+ .ploidy =
+ "* * * M 1\n"
+ "* * * F 0\n"
+ },
+ {
+ .alias = NULL,
+ .about = NULL,
+ .ploidy = NULL,
+ }
+};
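+
+// Note: each line in the predefined ploidy strings above follows the same
+// space/tab-delimited layout as a --ploidy-file entry, CHROM FROM TO SEX PLOIDY,
+// with '*' acting as a wildcard. For example, "Y 1 59373566 F 0" makes chrY
+// no-copy in female samples over that interval, while the trailing
+// "* * * M 2" / "* * * F 2" lines provide the diploid default everywhere else.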
+
+// only 5 columns are required and the first is ignored:
+// ignored,sample,father(or 0),mother(or 0),sex(1=M,2=F)
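+// For illustration only (hypothetical family and sample names), a standard
+// six-column PED fragment such as
+//   FAM1 CHILD1 FATHER1 MOTHER1 1 0
+//   FAM1 FATHER1 0       0       1 0
+//   FAM1 MOTHER1 0       0       2 0
+// yields the "name sex" entries "CHILD1 M", "FATHER1 M" and "MOTHER1 F"; the
+// first line, which names both parents, registers one trio family for use
+// with constrained trio calling (-C trio).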
+static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl)
+{
+ int i, j, mlines = 0, nlines = 0;
+ kstring_t str = {0,0,0}, fam_str = {0,0,0};
+ void *name2idx = khash_str2int_init();
+ char **lines = NULL;
+ for (i=0; i<nvals; i++)
+ {
+ str.l = 0;
+ kputs(vals[i], &str);
+ char *col_ends[5], *tmp = str.s;
+ j = 0;
+ while ( *tmp && j<5 )
+ {
+ if ( isspace(*tmp) )
+ {
+ *tmp = 0;
+ ++tmp;
+ while ( isspace(*tmp) ) tmp++; // allow multiple spaces
+ col_ends[j] = tmp-1;
+ j++;
+ continue;
+ }
+ tmp++;
+ }
+ if ( j!=5 ) break;
+
+ char sex = col_ends[3][1]=='1' ? 'M' : 'F';
+ lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[0]+1, sex, &j);
+ if ( strcmp(col_ends[1]+1,"0") && strcmp(col_ends[2]+1,"0") ) // father and mother
+ {
+ call->nfams++;
+ hts_expand(family_t, call->nfams, call->mfams, call->fams);
+ family_t *fam = &call->fams[call->nfams-1];
+ fam_str.l = 0;
+ ksprintf(&fam_str,"father=%s, mother=%s, child=%s", col_ends[1]+1,col_ends[2]+1,col_ends[0]+1);
+ fam->name = strdup(fam_str.s);
+
+ if ( !khash_str2int_has_key(name2idx, col_ends[1]+1) )
+ lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[1]+1, 'M', &fam->sample[FATHER]);
+ if ( !khash_str2int_has_key(name2idx, col_ends[2]+1) )
+ lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[2]+1, 'F', &fam->sample[MOTHER]);
+
+ khash_str2int_get(name2idx, col_ends[0]+1, &fam->sample[CHILD]);
+ khash_str2int_get(name2idx, col_ends[1]+1, &fam->sample[FATHER]);
+ khash_str2int_get(name2idx, col_ends[2]+1, &fam->sample[MOTHER]);
+ }
+ }
+ free(str.s);
+ free(fam_str.s);
+ khash_str2int_destroy_free(name2idx);
+
+ if ( i!=nvals ) // not a ped file
+ {
+ if ( i>0 ) error("Could not parse samples, not a PED format.\n");
+ return NULL;
+ }
+ *nsmpl = nlines;
+ return lines;
+}
+
+
+/*
+ * Reads sample names and their ploidy (optional) from a file.
+ * Alternatively, if no such file exists, the file name is interpreted
+ * as a comma-separated list of samples. When ploidy is not present,
+ * the default ploidy 2 is assumed.
+ */
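+// A minimal sketch of the accepted input (hypothetical sample names):
+//   SAMPLE_A M
+//   SAMPLE_B F
+//   SAMPLE_C 1
+//   SAMPLE_D
+// A bare 0, 1 or 2 in the second column is taken as a fixed ploidy, any other
+// token is registered as a sex for the --ploidy/--ploidy-file definitions, and
+// a missing second column falls back to the default ploidy of 2.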
+static void set_samples(args_t *args, const char *fn, int is_file)
+{
+ int i, nlines;
+ char **lines = hts_readlist(fn, is_file, &nlines);
+ if ( !lines ) error("Could not read the file: %s\n", fn);
+
+ int nsmpls;
+ char **smpls = parse_ped_samples(&args->aux, lines, nlines, &nsmpls);
+ if ( smpls )
+ {
+ for (i=0; i<nlines; i++) free(lines[i]);
+ free(lines);
+ lines = smpls;
+ nlines = nsmpls;
+ }
+
+ args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting
+ args->sample2sex = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
+ int dflt_sex_id = ploidy_add_sex(args->ploidy, "F");
+ for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) args->sample2sex[i] = dflt_sex_id;
+
+ int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
+ for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) old2new[i] = -1;
+
+ int nsmpl = 0, map_needed = 0;
+ for (i=0; i<nlines; i++)
+ {
+ char *ss = lines[i];
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) error("Could not parse: %s\n", lines[i]);
+ if ( *ss=='#' ) continue;
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ char x = *se, *xptr = se; *se = 0;
+
+ int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss);
+ if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
+
+ ss = se+1;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) ss = "2"; // default ploidy
+ se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); }
+
+ if ( ss[1]==0 && (ss[0]=='0' || ss[0]=='1' || ss[0]=='2') )
+ args->sample2sex[nsmpl] = -1*(ss[0]-'0');
+ else
+ args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy, ss);
+
+ if ( ismpl!=nsmpl ) map_needed = 1;
+ args->samples_map[nsmpl] = ismpl;
+ old2new[ismpl] = nsmpl;
+ nsmpl++;
+ }
+
+ for (i=0; i<args->aux.nfams; i++)
+ {
+ int j, nmiss = 0;
+ family_t *fam = &args->aux.fams[i];
+ for (j=0; j<3; j++)
+ {
+            fam->sample[j] = old2new[fam->sample[j]];
+            if ( fam->sample[j]<0 ) nmiss++;
+ }
+ assert( nmiss==0 || nmiss==3 );
+ }
+ free(old2new);
+
+ if ( !map_needed ) { free(args->samples_map); args->samples_map = NULL; }
+
+ args->nsamples = nsmpl;
+ args->samples = lines;
+}
+
+static void init_missed_line(args_t *args)
+{
+ int i;
+ for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++)
+ {
+ args->aux.gts[i*2] = bcf_gt_missing;
+ args->aux.gts[i*2+1] = bcf_int32_vector_end;
+ }
+ args->missed_line = bcf_init1();
+ bcf_update_genotypes(args->aux.hdr, args->missed_line, args->aux.gts, 2*bcf_hdr_nsamples(args->aux.hdr));
+ bcf_float_set_missing(args->missed_line->qual);
+}
+
+static void print_missed_line(bcf_sr_regions_t *regs, void *data)
+{
+ args_t *args = (args_t*) data;
+ call_t *call = &args->aux;
+ bcf1_t *missed = args->missed_line;
+
+ char *ss = regs->line.s;
+ int i = 0;
+ while ( i<args->aux.srs->targets_als-1 && *ss )
+ {
+ if ( *ss=='\t' ) i++;
+ ss++;
+ }
+ if ( !*ss ) error("Could not parse: [%s] (%d)\n", regs->line.s,args->aux.srs->targets_als);
+
+ missed->rid = bcf_hdr_name2id(call->hdr,regs->seq_names[regs->prev_seq]);
+ missed->pos = regs->start;
+ bcf_update_alleles_str(call->hdr, missed,ss);
+
+ bcf_write1(args->out_fh, call->hdr, missed);
+}
+
+static void init_data(args_t *args)
+{
+ args->aux.srs = bcf_sr_init();
+
+ // Open files for input and output, initialize structures
+ if ( args->targets )
+ {
+ if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets);
+
+ if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED )
+ {
+ args->aux.srs->targets->missed_reg_handler = print_missed_line;
+ args->aux.srs->targets->missed_reg_data = args;
+ }
+ }
+ if ( args->regions )
+ {
+ if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
+            error("Failed to read the regions: %s\n", args->regions);
+ }
+
+ if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum));
+ args->aux.hdr = bcf_sr_get_header(args->aux.srs,0);
+
+ int i;
+ if ( args->samples_fname )
+ {
+ set_samples(args, args->samples_fname, args->samples_is_file);
+ if ( args->aux.flag&CALL_CONSTR_TRIO )
+ {
+ if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname);
+ fprintf(stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams);
+ }
+ args->nsex = ploidy_nsex(args->ploidy);
+ args->sex2ploidy = (int*) calloc(args->nsex,sizeof(int));
+ args->sex2ploidy_prev = (int*) calloc(args->nsex,sizeof(int));
+ args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
+ for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = 2;
+ for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = 2;
+ }
+
+ if ( args->gvcf )
+ gvcf_update_header(args->gvcf, args->aux.hdr);
+
+ if ( args->samples_map )
+ {
+ args->aux.hdr = bcf_hdr_subset(bcf_sr_get_header(args->aux.srs,0), args->nsamples, args->samples, args->samples_map);
+ if ( !args->aux.hdr ) error("Error occurred while subsetting samples\n");
+ for (i=0; i<args->nsamples; i++)
+ if ( args->samples_map[i]<0 ) error("No such sample: %s\n", args->samples[i]);
+ if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n");
+ }
+ else
+ {
+ args->aux.hdr = bcf_hdr_dup(bcf_sr_get_header(args->aux.srs,0));
+ for (i=0; i<args->nsamples; i++)
+ if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
+ error("No such sample: %s\n", args->samples[i]);
+ }
+
+ args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+
+ if ( args->flag & CF_QCALL )
+ return;
+
+ if ( args->flag & CF_MCALL )
+ mcall_init(&args->aux);
+
+ if ( args->flag & CF_CCALL )
+ ccall_init(&args->aux);
+
+ bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
+ bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");
+
+ bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
+ bcf_hdr_write(args->out_fh, args->aux.hdr);
+
+ if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
+}
+
+static void destroy_data(args_t *args)
+{
+ if ( args->flag & CF_CCALL ) ccall_destroy(&args->aux);
+ else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux);
+ else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux);
+ int i;
+ for (i=0; i<args->nsamples; i++) free(args->samples[i]);
+ if ( args->aux.fams )
+ {
+ for (i=0; i<args->aux.nfams; i++) free(args->aux.fams[i].name);
+ free(args->aux.fams);
+ }
+ if ( args->missed_line ) bcf_destroy(args->missed_line);
+ ploidy_destroy(args->ploidy);
+ free(args->sex2ploidy);
+ free(args->sex2ploidy_prev);
+ free(args->samples);
+ free(args->samples_map);
+ free(args->sample2sex);
+ free(args->aux.ploidy);
+ if ( args->gvcf ) gvcf_destroy(args->gvcf);
+ bcf_hdr_destroy(args->aux.hdr);
+ hts_close(args->out_fh);
+ bcf_sr_destroy(args->aux.srs);
+}
+
+void parse_novel_rate(args_t *args, const char *str)
+{
+ if ( sscanf(str,"%le,%le,%le",&args->aux.trio_Pm_SNPs,&args->aux.trio_Pm_del,&args->aux.trio_Pm_ins)==3 ) // explicit for all
+ {
+ args->aux.trio_Pm_SNPs = 1 - args->aux.trio_Pm_SNPs;
+ args->aux.trio_Pm_del = 1 - args->aux.trio_Pm_del;
+ args->aux.trio_Pm_ins = 1 - args->aux.trio_Pm_ins;
+ }
+ else if ( sscanf(str,"%le,%le",&args->aux.trio_Pm_SNPs,&args->aux.trio_Pm_del)==2 ) // dynamic for indels
+ {
+ args->aux.trio_Pm_SNPs = 1 - args->aux.trio_Pm_SNPs;
+ args->aux.trio_Pm_ins = -1; // negative value for dynamic calculation
+ }
+ else if ( sscanf(str,"%le",&args->aux.trio_Pm_SNPs)==1 ) // same for all
+ {
+ args->aux.trio_Pm_SNPs = 1 - args->aux.trio_Pm_SNPs;
+ args->aux.trio_Pm_del = -1;
+ args->aux.trio_Pm_ins = -1;
+ }
+ else error("Could not parse --novel-rate %s\n", str);
+}
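+
+// Accepted --novel-rate forms, mirroring the sscanf branches above (values are
+// illustrative; the documented default is 1e-8,1e-9,1e-9):
+//   -n 1e-8,1e-9,1e-9   explicit rates for SNPs, deletions and insertions
+//   -n 1e-8,1e-9        two values parsed (SNPs, deletions); insertion rate computed dynamically
+//   -n 1e-8             a single rate applied to all three classes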
+
+static int parse_format_flag(const char *str)
+{
+ int flag = 0;
+ const char *ss = str;
+ while ( *ss )
+ {
+ const char *se = ss;
+ while ( *se && *se!=',' ) se++;
+ if ( !strncasecmp(ss,"GQ",se-ss) ) flag |= CALL_FMT_GQ;
+ else if ( !strncasecmp(ss,"GP",se-ss) ) flag |= CALL_FMT_GP;
+ else
+ {
+ fprintf(stderr,"Could not parse \"%s\"\n", str);
+ exit(1);
+ }
+ if ( !*se ) break;
+ ss = se + 1;
+ }
+ return flag;
+}
+
+static void set_ploidy(args_t *args, bcf1_t *rec)
+{
+ ploidy_query(args->ploidy,(char*)bcf_seqname(args->aux.hdr,rec),rec->pos,args->sex2ploidy,NULL,NULL);
+
+ int i;
+ for (i=0; i<args->nsex; i++)
+ if ( args->sex2ploidy[i]!=args->sex2ploidy_prev[i] ) break;
+
+ if ( i==args->nsex ) return; // ploidy same as previously
+
+ for (i=0; i<args->nsamples; i++)
+ {
+ if ( args->sample2sex[i]<0 )
+ args->aux.ploidy[i] = -1*args->sample2sex[i];
+ else
+ args->aux.ploidy[i] = args->sex2ploidy[args->sample2sex[i]];
+ }
+
+ int *tmp = args->sex2ploidy; args->sex2ploidy = args->sex2ploidy_prev; args->sex2ploidy_prev = tmp;
+}
+
+ploidy_t *init_ploidy(char *alias)
+{
+ const ploidy_predef_t *pld = ploidy_predefs;
+
+ int detailed = 0, len = strlen(alias);
+ if ( alias[len-1]=='?' ) { detailed = 1; alias[len-1] = 0; }
+
+ while ( pld->alias && strcasecmp(alias,pld->alias) ) pld++;
+
+ if ( !pld->alias )
+ {
+ fprintf(stderr,"Predefined ploidies:\n");
+ pld = ploidy_predefs;
+ while ( pld->alias )
+ {
+ fprintf(stderr,"%s\n .. %s\n\n", pld->alias,pld->about);
+ if ( detailed )
+ fprintf(stderr,"%s\n", pld->ploidy);
+ pld++;
+ }
+ fprintf(stderr,"Run as --ploidy <alias> (e.g. --ploidy GRCh37).\n");
+ fprintf(stderr,"To see the detailed ploidy definition, append a question mark (e.g. --ploidy GRCh37?).\n");
+ fprintf(stderr,"\n");
+ exit(-1);
+ }
+ else if ( detailed )
+ {
+ fprintf(stderr,"%s", pld->ploidy);
+ exit(-1);
+ }
+ return ploidy_init_string(pld->ploidy,2);
+}
+
+static void usage(args_t *args)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with samtools mpileup.\n");
+ fprintf(stderr, " This command replaces the former \"bcftools view\" caller. Some of the original\n");
+ fprintf(stderr, " functionality has been temporarily lost in the process of transition to htslib,\n");
+ fprintf(stderr, " but will be added back on popular demand. The original calling model can be\n");
+ fprintf(stderr, " invoked with the -c option.\n");
+ fprintf(stderr, "Usage: bcftools call [options] <in.vcf.gz>\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "File format options:\n");
+ fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type <b|u|z|v> output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(stderr, " --ploidy <assembly>[?] predefined ploidy, 'list' to print available settings, append '?' for details\n");
+ fprintf(stderr, " --ploidy-file <file> space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " -s, --samples <list> list of samples to include [all samples]\n");
+ fprintf(stderr, " -S, --samples-file <file> PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
+ fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Input/output options:\n");
+ fprintf(stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n");
+ fprintf(stderr, " -f, --format-fields <list> output format fields: GQ,GP (lowercase allowed) []\n");
+ fprintf(stderr, " -g, --gvcf <int>,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n");
+ fprintf(stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n");
+ fprintf(stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n");
+ fprintf(stderr, " -V, --skip-variants <type> skip indels/snps\n");
+ fprintf(stderr, " -v, --variants-only output variant sites only\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Consensus/variant calling options:\n");
+ fprintf(stderr, " -c, --consensus-caller the original calling method (conflicts with -m)\n");
+ fprintf(stderr, " -C, --constrain <str> one of: alleles, trio (see manual)\n");
+ fprintf(stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
+ fprintf(stderr, " -n, --novel-rate <float>,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
+ fprintf(stderr, " -p, --pval-threshold <float> variant if P(ref|D)<FLOAT with -c [0.5]\n");
+ fprintf(stderr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity) [1.1e-3]\n");
+
+ // todo (and more)
+ // fprintf(stderr, "\nContrast calling and association test options:\n");
+ // fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
+    // fprintf(stderr, "       -C FLOAT  posterior contrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", args->aux.min_lrt);
+ // fprintf(stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n");
+ // fprintf(stderr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", args->aux.min_perm_p);
+ fprintf(stderr, "\n");
+ exit(-1);
+}
+
+int main_vcfcall(int argc, char *argv[])
+{
+ char *ploidy_fname = NULL, *ploidy = NULL;
+ args_t args;
+ memset(&args, 0, sizeof(args_t));
+ args.argc = argc; args.argv = argv;
+ args.aux.prior_type = -1;
+ args.aux.indel_frac = -1;
+ args.aux.theta = 1.1e-3;
+ args.aux.pref = 0.5;
+ args.aux.min_perm_p = 0.01;
+ args.aux.min_lrt = 1;
+ args.flag = CF_ACGT_ONLY;
+ args.output_fname = "-";
+ args.output_type = FT_VCF;
+ args.n_threads = 0;
+ args.aux.trio_Pm_SNPs = 1 - 1e-8;
+ args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9;
+
+ int c;
+ static struct option loptions[] =
+ {
+ {"help",no_argument,NULL,'h'},
+ {"format-fields",required_argument,NULL,'f'},
+ {"gvcf",required_argument,NULL,'g'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"samples",required_argument,NULL,'s'},
+ {"samples-file",required_argument,NULL,'S'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {"threads",required_argument,NULL,9},
+ {"keep-alts",no_argument,NULL,'A'},
+ {"insert-missed",no_argument,NULL,'i'},
+ {"skip-Ns",no_argument,NULL,'N'}, // now the new default
+ {"keep-masked-refs",no_argument,NULL,'M'},
+ {"skip-variants",required_argument,NULL,'V'},
+ {"variants-only",no_argument,NULL,'v'},
+ {"consensus-caller",no_argument,NULL,'c'},
+ {"constrain",required_argument,NULL,'C'},
+ {"multiallelic-caller",no_argument,NULL,'m'},
+ {"pval-threshold",required_argument,NULL,'p'},
+ {"prior",required_argument,NULL,'P'},
+ {"novel-rate",required_argument,NULL,'n'},
+ {"ploidy",required_argument,NULL,1},
+ {"ploidy-file",required_argument,NULL,2},
+ {"chromosome-X",no_argument,NULL,'X'},
+ {"chromosome-Y",no_argument,NULL,'Y'},
+ {NULL,0,NULL,0}
+ };
+
+ char *tmp = NULL;
+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XY", loptions, NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 2 : ploidy_fname = optarg; break;
+ case 1 : ploidy = optarg; break;
+ case 'X': ploidy = "X"; fprintf(stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break;
+ case 'Y': ploidy = "Y"; fprintf(stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break;
+ case 'f': args.aux.output_tags |= parse_format_flag(optarg); break;
+ case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N
+ case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default)
+ case 'A': args.aux.flag |= CALL_KEEPALT; break;
+ case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method
+ case 'i': args.flag |= CF_INS_MISSED; break;
+ case 'v': args.aux.flag |= CALL_VARONLY; break;
+ case 'g':
+ args.gvcf = gvcf_init(optarg);
+ if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
+ break;
+ case 'o': args.output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args.output_type = FT_BCF_GZ; break;
+ case 'u': args.output_type = FT_BCF; break;
+ case 'z': args.output_type = FT_VCF_GZ; break;
+ case 'v': args.output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'C':
+ if ( !strcasecmp(optarg,"alleles") ) args.aux.flag |= CALL_CONSTR_ALLELES;
+ else if ( !strcasecmp(optarg,"trio") ) args.aux.flag |= CALL_CONSTR_TRIO;
+ else error("Unknown argument to -C: \"%s\"\n", optarg);
+ break;
+ case 'V':
+ if ( !strcasecmp(optarg,"snps") ) args.flag |= CF_INDEL_ONLY;
+ else if ( !strcasecmp(optarg,"indels") ) args.flag |= CF_NO_INDEL;
+                else error("Unknown skip category \"%s\" (-V argument must be \"snps\" or \"indels\")\n", optarg);
+ break;
+ case 'm': args.flag |= CF_MCALL; break; // multiallelic calling method
+ case 'p':
+ args.aux.pref = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: --pval-threshold %s\n", optarg);
+ break;
+ case 'P': args.aux.theta = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse, expected float argument: -P %s\n", optarg);
+ break;
+ case 'n': parse_novel_rate(&args,optarg); break;
+ case 'r': args.regions = optarg; break;
+ case 'R': args.regions = optarg; args.regions_is_file = 1; break;
+ case 't': args.targets = optarg; break;
+ case 'T': args.targets = optarg; args.targets_is_file = 1; break;
+ case 's': args.samples_fname = optarg; break;
+ case 'S': args.samples_fname = optarg; args.samples_is_file = 1; break;
+ case 9 : args.n_threads = strtol(optarg, 0, 0); break;
+ default: usage(&args);
+ }
+ }
+ // Sanity check options and initialize
+ if ( ploidy_fname ) args.ploidy = ploidy_init(ploidy_fname, 2);
+ else if ( ploidy ) args.ploidy = init_ploidy(ploidy);
+
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) args.bcf_fname = "-"; // reading from stdin
+ else usage(&args);
+ }
+ else args.bcf_fname = argv[optind++];
+
+ if ( !ploidy_fname && !ploidy )
+ {
+ fprintf(stderr,"Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid\n");
+ args.ploidy = ploidy_init_string("",2);
+ }
+
+ if ( !args.ploidy ) error("Could not initialize ploidy\n");
+ if ( (args.flag & CF_CCALL ? 1 : 0) + (args.flag & CF_MCALL ? 1 : 0) + (args.flag & CF_QCALL ? 1 : 0) > 1 ) error("Only one of -c or -m options can be given\n");
+ if ( !(args.flag & CF_CCALL) && !(args.flag & CF_MCALL) && !(args.flag & CF_QCALL) ) error("Expected -c or -m option\n");
+ if ( (args.flag & CF_CCALL ? 1: 0) && args.gvcf ) error("gvcf -g option not functional with -c calling mode yet\n");
+ if ( args.aux.n_perm && args.aux.ngrp1_samples<=0 ) error("Expected -1 with -U\n"); // not sure about this, please fix
+ if ( args.aux.flag & CALL_CONSTR_ALLELES )
+ {
+ if ( !args.targets ) error("Expected -t or -T with \"-C alleles\"\n");
+ if ( !(args.flag & CF_MCALL) ) error("The \"-C alleles\" mode requires -m\n");
+ }
+ if ( args.flag & CF_INS_MISSED && !(args.aux.flag&CALL_CONSTR_ALLELES) ) error("The -i option requires -C alleles\n");
+ if ( args.aux.flag&CALL_VARONLY && args.gvcf ) error("The two options cannot be combined: --variants-only and --gvcf\n");
+ init_data(&args);
+
+ while ( bcf_sr_next_line(args.aux.srs) )
+ {
+ bcf1_t *bcf_rec = args.aux.srs->readers[0].buffer[0];
+ if ( args.samples_map ) bcf_subset(args.aux.hdr, bcf_rec, args.nsamples, args.samples_map);
+ bcf_unpack(bcf_rec, BCF_UN_STR);
+
+ // Skip unwanted sites
+ int i, is_indel = bcf_is_snp(bcf_rec) ? 0 : 1;
+ if ( (args.flag & CF_INDEL_ONLY) && !is_indel ) continue;
+ if ( (args.flag & CF_NO_INDEL) && is_indel ) continue;
+ if ( (args.flag & CF_ACGT_ONLY) && (bcf_rec->d.allele[0][0]=='N' || bcf_rec->d.allele[0][0]=='n') ) continue; // REF[0] is 'N'
+
+ // Which allele is symbolic? All SNPs should have it, but not indels
+ args.aux.unseen = 0;
+ for (i=1; i<bcf_rec->n_allele; i++)
+ {
+ if ( bcf_rec->d.allele[i][0]=='X' ) { args.aux.unseen = i; break; } // old X
+ if ( bcf_rec->d.allele[i][0]=='<' )
+ {
+ if ( bcf_rec->d.allele[i][1]=='X' && bcf_rec->d.allele[i][2]=='>' ) { args.aux.unseen = i; break; } // old <X>
+ if ( bcf_rec->d.allele[i][1]=='*' && bcf_rec->d.allele[i][2]=='>' ) { args.aux.unseen = i; break; } // new <*>
+ }
+ }
+ int is_ref = (bcf_rec->n_allele==1 || (bcf_rec->n_allele==2 && args.aux.unseen>0)) ? 1 : 0;
+
+ if ( is_ref && args.aux.flag&CALL_VARONLY )
+ continue;
+
+ bcf_unpack(bcf_rec, BCF_UN_ALL);
+ if ( args.nsex ) set_ploidy(&args, bcf_rec);
+
+ // Various output modes: QCall output (todo)
+ if ( args.flag & CF_QCALL )
+ {
+ qcall(&args.aux, bcf_rec);
+ continue;
+ }
+
+ // Calling modes which output VCFs
+ int ret;
+ if ( args.flag & CF_MCALL )
+ ret = mcall(&args.aux, bcf_rec);
+ else
+ ret = ccall(&args.aux, bcf_rec);
+ if ( ret==-1 ) error("Something is wrong\n");
+
+ // Normal output
+ if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue; // not a variant
+ if ( args.gvcf )
+ bcf_rec = gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, bcf_rec, ret==1?1:0);
+ if ( bcf_rec )
+ bcf_write1(args.out_fh, args.aux.hdr, bcf_rec);
+ }
+ if ( args.gvcf ) gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, NULL, 0);
+ if ( args.flag & CF_INS_MISSED ) bcf_sr_regions_flush(args.aux.srs->targets);
+ destroy_data(&args);
+ return 0;
+}
+
diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c
new file mode 100644
index 0000000..9e8c1bb
--- /dev/null
+++ b/bcftools/vcfcall.c.pysam.c
@@ -0,0 +1,824 @@
+#include "pysam.h"
+
+/* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <time.h>
+#include <zlib.h>
+#include <stdarg.h>
+#include <htslib/kfunc.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/khash_str2int.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "call.h"
+#include "prob1.h"
+#include "ploidy.h"
+#include "gvcf.h"
+
+void error(const char *format, ...);
+
+#ifdef _WIN32
+#define srand48(x) srand(x)
+#define lrand48() rand()
+#endif
+
+#define CF_NO_GENO 1
+#define CF_INS_MISSED (1<<1)
+#define CF_CCALL (1<<2)
+// (1<<3)
+// (1<<4)
+// (1<<5)
+#define CF_ACGT_ONLY (1<<6)
+#define CF_QCALL (1<<7)
+#define CF_ADJLD (1<<8)
+#define CF_NO_INDEL (1<<9)
+#define CF_ANNO_MAX (1<<10)
+#define CF_MCALL (1<<11)
+#define CF_PAIRCALL (1<<12)
+#define CF_QCNT (1<<13)
+#define CF_INDEL_ONLY (1<<14)
+
+typedef struct
+{
+ int flag; // combination of CF_* flags above
+ int output_type, n_threads;
+ htsFile *bcf_in, *out_fh;
+ char *bcf_fname, *output_fname;
+ char **samples; // for subsampling and ploidy
+ int nsamples, *samples_map; // mapping from output sample names to original VCF
+ char *regions, *targets; // regions to process
+ int regions_is_file, targets_is_file;
+
+ char *samples_fname;
+ int samples_is_file;
+ int *sample2sex; // mapping for ploidy. If negative, interpreted as -1*ploidy
+ int *sex2ploidy, *sex2ploidy_prev, nsex;
+ ploidy_t *ploidy;
+ gvcf_t *gvcf;
+
+ bcf1_t *missed_line;
+ call_t aux; // parameters and temporary data
+
+ int argc;
+ char **argv;
+
+ // int flag, prior_type, n1, n_sub, *sublist, n_perm;
+ // uint32_t *trio_aux;
+ // char *prior_file, **subsam;
+ // uint8_t *ploidy;
+ // double theta, pref, indel_frac, min_smpl_frac, min_lrt;
+ // Permutation tests
+ // int n_perm, *seeds;
+ // double min_perm_p;
+ // void *bed;
+}
+args_t;
+
+static char **add_sample(void *name2idx, char **lines, int *nlines, int *mlines, char *name, char sex, int *ith)
+{
+ int ret = khash_str2int_get(name2idx, name, ith);
+ if ( ret==0 ) return lines;
+
+ hts_expand(char*,(*nlines+1),*mlines,lines);
+ int len = strlen(name);
+ lines[*nlines] = (char*) malloc(len+3);
+ memcpy(lines[*nlines],name,len);
+ lines[*nlines][len] = ' ';
+ lines[*nlines][len+1] = sex;
+ lines[*nlines][len+2] = 0;
+ *ith = *nlines;
+ (*nlines)++;
+ khash_str2int_set(name2idx, strdup(name), *ith);
+ return lines;
+}
+
+typedef struct
+{
+ const char *alias, *about, *ploidy;
+}
+ploidy_predef_t;
+
+static ploidy_predef_t ploidy_predefs[] =
+{
+ { .alias = "GRCh37",
+ .about = "Human Genome reference assembly GRCh37 / hg19",
+ .ploidy =
+ "X 1 60000 M 1\n"
+ "X 2699521 154931043 M 1\n"
+ "Y 1 59373566 M 1\n"
+ "Y 1 59373566 F 0\n"
+ "MT 1 16569 M 1\n"
+ "MT 1 16569 F 1\n"
+ "chrX 1 60000 M 1\n"
+ "chrX 2699521 154931043 M 1\n"
+ "chrY 1 59373566 M 1\n"
+ "chrY 1 59373566 F 0\n"
+ "chrM 1 16569 M 1\n"
+ "chrM 1 16569 F 1\n"
+ "* * * M 2\n"
+ "* * * F 2\n"
+ },
+ { .alias = "GRCh38",
+ .about = "Human Genome reference assembly GRCh38 / hg38, plain chromosome naming (1,2,3,..)",
+ .ploidy =
+ "X 1 9999 M 1\n"
+ "X 2781480 155701381 M 1\n"
+ "Y 1 57227415 M 1\n"
+ "Y 1 57227415 F 0\n"
+ "MT 1 16569 M 1\n"
+ "MT 1 16569 F 1\n"
+ "chrX 1 9999 M 1\n"
+ "chrX 2781480 155701381 M 1\n"
+ "chrY 1 57227415 M 1\n"
+ "chrY 1 57227415 F 0\n"
+ "chrM 1 16569 M 1\n"
+ "chrM 1 16569 F 1\n"
+ "* * * M 2\n"
+ "* * * F 2\n"
+ },
+ { .alias = "X",
+ .about = "Treat male samples as haploid and female as diploid regardless of the chromosome name",
+ .ploidy =
+ "* * * M 1\n"
+ "* * * F 2\n"
+ },
+ { .alias = "Y",
+ .about = "Treat male samples as haploid and female as no-copy, regardless of the chromosome name",
+ .ploidy =
+ "* * * M 1\n"
+ "* * * F 0\n"
+ },
+ {
+ .alias = NULL,
+ .about = NULL,
+ .ploidy = NULL,
+ }
+};
+
+// only 5 columns are required and the first is ignored:
+// ignored,sample,father(or 0),mother(or 0),sex(1=M,2=F)
+static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl)
+{
+ int i, j, mlines = 0, nlines = 0;
+ kstring_t str = {0,0,0}, fam_str = {0,0,0};
+ void *name2idx = khash_str2int_init();
+ char **lines = NULL;
+ for (i=0; i<nvals; i++)
+ {
+ str.l = 0;
+ kputs(vals[i], &str);
+ char *col_ends[5], *tmp = str.s;
+ j = 0;
+ while ( *tmp && j<5 )
+ {
+ if ( isspace(*tmp) )
+ {
+ *tmp = 0;
+ ++tmp;
+ while ( isspace(*tmp) ) tmp++; // allow multiple spaces
+ col_ends[j] = tmp-1;
+ j++;
+ continue;
+ }
+ tmp++;
+ }
+ if ( j!=5 ) break;
+
+ char sex = col_ends[3][1]=='1' ? 'M' : 'F';
+ lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[0]+1, sex, &j);
+ if ( strcmp(col_ends[1]+1,"0") && strcmp(col_ends[2]+1,"0") ) // father and mother
+ {
+ call->nfams++;
+ hts_expand(family_t, call->nfams, call->mfams, call->fams);
+ family_t *fam = &call->fams[call->nfams-1];
+ fam_str.l = 0;
+ ksprintf(&fam_str,"father=%s, mother=%s, child=%s", col_ends[1]+1,col_ends[2]+1,col_ends[0]+1);
+ fam->name = strdup(fam_str.s);
+
+ if ( !khash_str2int_has_key(name2idx, col_ends[1]+1) )
+ lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[1]+1, 'M', &fam->sample[FATHER]);
+ if ( !khash_str2int_has_key(name2idx, col_ends[2]+1) )
+ lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[2]+1, 'F', &fam->sample[MOTHER]);
+
+ khash_str2int_get(name2idx, col_ends[0]+1, &fam->sample[CHILD]);
+ khash_str2int_get(name2idx, col_ends[1]+1, &fam->sample[FATHER]);
+ khash_str2int_get(name2idx, col_ends[2]+1, &fam->sample[MOTHER]);
+ }
+ }
+ free(str.s);
+ free(fam_str.s);
+ khash_str2int_destroy_free(name2idx);
+
+ if ( i!=nvals ) // not a ped file
+ {
+ if ( i>0 ) error("Could not parse samples, not a PED format.\n");
+ return NULL;
+ }
+ *nsmpl = nlines;
+ return lines;
+}
+
+
+/*
+ * Reads sample names and their ploidy (optional) from a file.
+ * Alternatively, if no such file exists, the file name is interpreted
+ * as a comma-separated list of samples. When ploidy is not present,
+ * the default ploidy 2 is assumed.
+ */
+static void set_samples(args_t *args, const char *fn, int is_file)
+{
+ int i, nlines;
+ char **lines = hts_readlist(fn, is_file, &nlines);
+ if ( !lines ) error("Could not read the file: %s\n", fn);
+
+ int nsmpls;
+ char **smpls = parse_ped_samples(&args->aux, lines, nlines, &nsmpls);
+ if ( smpls )
+ {
+ for (i=0; i<nlines; i++) free(lines[i]);
+ free(lines);
+ lines = smpls;
+ nlines = nsmpls;
+ }
+
+ args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting
+ args->sample2sex = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
+ int dflt_sex_id = ploidy_add_sex(args->ploidy, "F");
+ for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) args->sample2sex[i] = dflt_sex_id;
+
+ int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
+ for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) old2new[i] = -1;
+
+ int nsmpl = 0, map_needed = 0;
+ for (i=0; i<nlines; i++)
+ {
+ char *ss = lines[i];
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) error("Could not parse: %s\n", lines[i]);
+ if ( *ss=='#' ) continue;
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ char x = *se, *xptr = se; *se = 0;
+
+ int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss);
+ if ( ismpl < 0 ) { fprintf(pysamerr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
+
+ ss = se+1;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) ss = "2"; // default ploidy
+ se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); }
+
+ if ( ss[1]==0 && (ss[0]=='0' || ss[0]=='1' || ss[0]=='2') )
+ args->sample2sex[nsmpl] = -1*(ss[0]-'0');
+ else
+ args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy, ss);
+
+ if ( ismpl!=nsmpl ) map_needed = 1;
+ args->samples_map[nsmpl] = ismpl;
+ old2new[ismpl] = nsmpl;
+ nsmpl++;
+ }
+
+ for (i=0; i<args->aux.nfams; i++)
+ {
+ int j, nmiss = 0;
+ family_t *fam = &args->aux.fams[i];
+ for (j=0; j<3; j++)
+ {
+            fam->sample[j] = old2new[fam->sample[j]];
+            if ( fam->sample[j]<0 ) nmiss++;
+ }
+ assert( nmiss==0 || nmiss==3 );
+ }
+ free(old2new);
+
+ if ( !map_needed ) { free(args->samples_map); args->samples_map = NULL; }
+
+ args->nsamples = nsmpl;
+ args->samples = lines;
+}
+
+static void init_missed_line(args_t *args)
+{
+ int i;
+ for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++)
+ {
+ args->aux.gts[i*2] = bcf_gt_missing;
+ args->aux.gts[i*2+1] = bcf_int32_vector_end;
+ }
+ args->missed_line = bcf_init1();
+ bcf_update_genotypes(args->aux.hdr, args->missed_line, args->aux.gts, 2*bcf_hdr_nsamples(args->aux.hdr));
+ bcf_float_set_missing(args->missed_line->qual);
+}
+
+static void print_missed_line(bcf_sr_regions_t *regs, void *data)
+{
+ args_t *args = (args_t*) data;
+ call_t *call = &args->aux;
+ bcf1_t *missed = args->missed_line;
+
+ char *ss = regs->line.s;
+ int i = 0;
+ while ( i<args->aux.srs->targets_als-1 && *ss )
+ {
+ if ( *ss=='\t' ) i++;
+ ss++;
+ }
+ if ( !*ss ) error("Could not parse: [%s] (%d)\n", regs->line.s,args->aux.srs->targets_als);
+
+ missed->rid = bcf_hdr_name2id(call->hdr,regs->seq_names[regs->prev_seq]);
+ missed->pos = regs->start;
+ bcf_update_alleles_str(call->hdr, missed,ss);
+
+ bcf_write1(args->out_fh, call->hdr, missed);
+}
+
+static void init_data(args_t *args)
+{
+ args->aux.srs = bcf_sr_init();
+
+ // Open files for input and output, initialize structures
+ if ( args->targets )
+ {
+ if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets);
+
+ if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED )
+ {
+ args->aux.srs->targets->missed_reg_handler = print_missed_line;
+ args->aux.srs->targets->missed_reg_data = args;
+ }
+ }
+ if ( args->regions )
+ {
+ if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
+            error("Failed to read the regions: %s\n", args->regions);
+ }
+
+ if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum));
+ args->aux.hdr = bcf_sr_get_header(args->aux.srs,0);
+
+ int i;
+ if ( args->samples_fname )
+ {
+ set_samples(args, args->samples_fname, args->samples_is_file);
+ if ( args->aux.flag&CALL_CONSTR_TRIO )
+ {
+ if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname);
+ fprintf(pysamerr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams);
+ }
+ args->nsex = ploidy_nsex(args->ploidy);
+ args->sex2ploidy = (int*) calloc(args->nsex,sizeof(int));
+ args->sex2ploidy_prev = (int*) calloc(args->nsex,sizeof(int));
+ args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
+ for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = 2;
+ for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = 2;
+ }
+
+ if ( args->gvcf )
+ gvcf_update_header(args->gvcf, args->aux.hdr);
+
+ if ( args->samples_map )
+ {
+ args->aux.hdr = bcf_hdr_subset(bcf_sr_get_header(args->aux.srs,0), args->nsamples, args->samples, args->samples_map);
+ if ( !args->aux.hdr ) error("Error occurred while subsetting samples\n");
+ for (i=0; i<args->nsamples; i++)
+ if ( args->samples_map[i]<0 ) error("No such sample: %s\n", args->samples[i]);
+ if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n");
+ }
+ else
+ {
+ args->aux.hdr = bcf_hdr_dup(bcf_sr_get_header(args->aux.srs,0));
+ for (i=0; i<args->nsamples; i++)
+ if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
+ error("No such sample: %s\n", args->samples[i]);
+ }
+
+ args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+
+ if ( args->flag & CF_QCALL )
+ return;
+
+ if ( args->flag & CF_MCALL )
+ mcall_init(&args->aux);
+
+ if ( args->flag & CF_CCALL )
+ ccall_init(&args->aux);
+
+ bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
+ bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");
+
+ bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
+ bcf_hdr_write(args->out_fh, args->aux.hdr);
+
+ if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
+}
+
+static void destroy_data(args_t *args)
+{
+ if ( args->flag & CF_CCALL ) ccall_destroy(&args->aux);
+ else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux);
+ else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux);
+ int i;
+ for (i=0; i<args->nsamples; i++) free(args->samples[i]);
+ if ( args->aux.fams )
+ {
+ for (i=0; i<args->aux.nfams; i++) free(args->aux.fams[i].name);
+ free(args->aux.fams);
+ }
+ if ( args->missed_line ) bcf_destroy(args->missed_line);
+ ploidy_destroy(args->ploidy);
+ free(args->sex2ploidy);
+ free(args->sex2ploidy_prev);
+ free(args->samples);
+ free(args->samples_map);
+ free(args->sample2sex);
+ free(args->aux.ploidy);
+ if ( args->gvcf ) gvcf_destroy(args->gvcf);
+ bcf_hdr_destroy(args->aux.hdr);
+ hts_close(args->out_fh);
+ bcf_sr_destroy(args->aux.srs);
+}
+
+void parse_novel_rate(args_t *args, const char *str)
+{
+ if ( sscanf(str,"%le,%le,%le",&args->aux.trio_Pm_SNPs,&args->aux.trio_Pm_del,&args->aux.trio_Pm_ins)==3 ) // explicit for all
+ {
+ args->aux.trio_Pm_SNPs = 1 - args->aux.trio_Pm_SNPs;
+ args->aux.trio_Pm_del = 1 - args->aux.trio_Pm_del;
+ args->aux.trio_Pm_ins = 1 - args->aux.trio_Pm_ins;
+ }
+ else if ( sscanf(str,"%le,%le",&args->aux.trio_Pm_SNPs,&args->aux.trio_Pm_del)==2 ) // dynamic for indels
+ {
+ args->aux.trio_Pm_SNPs = 1 - args->aux.trio_Pm_SNPs;
+ args->aux.trio_Pm_ins = -1; // negative value for dynamic calculation
+ }
+ else if ( sscanf(str,"%le",&args->aux.trio_Pm_SNPs)==1 ) // same for all
+ {
+ args->aux.trio_Pm_SNPs = 1 - args->aux.trio_Pm_SNPs;
+ args->aux.trio_Pm_del = -1;
+ args->aux.trio_Pm_ins = -1;
+ }
+ else error("Could not parse --novel-rate %s\n", str);
+}
+
+static int parse_format_flag(const char *str)
+{
+ int flag = 0;
+ const char *ss = str;
+ while ( *ss )
+ {
+ const char *se = ss;
+ while ( *se && *se!=',' ) se++;
+ if ( !strncasecmp(ss,"GQ",se-ss) ) flag |= CALL_FMT_GQ;
+ else if ( !strncasecmp(ss,"GP",se-ss) ) flag |= CALL_FMT_GP;
+ else
+ {
+ fprintf(pysamerr,"Could not parse \"%s\"\n", str);
+ exit(1);
+ }
+ if ( !*se ) break;
+ ss = se + 1;
+ }
+ return flag;
+}
+
+static void set_ploidy(args_t *args, bcf1_t *rec)
+{
+ ploidy_query(args->ploidy,(char*)bcf_seqname(args->aux.hdr,rec),rec->pos,args->sex2ploidy,NULL,NULL);
+
+ int i;
+ for (i=0; i<args->nsex; i++)
+ if ( args->sex2ploidy[i]!=args->sex2ploidy_prev[i] ) break;
+
+ if ( i==args->nsex ) return; // ploidy same as previously
+
+ for (i=0; i<args->nsamples; i++)
+ {
+ if ( args->sample2sex[i]<0 )
+ args->aux.ploidy[i] = -1*args->sample2sex[i];
+ else
+ args->aux.ploidy[i] = args->sex2ploidy[args->sample2sex[i]];
+ }
+
+ int *tmp = args->sex2ploidy; args->sex2ploidy = args->sex2ploidy_prev; args->sex2ploidy_prev = tmp;
+}
+
+ploidy_t *init_ploidy(char *alias)
+{
+ const ploidy_predef_t *pld = ploidy_predefs;
+
+ int detailed = 0, len = strlen(alias);
+ if ( alias[len-1]=='?' ) { detailed = 1; alias[len-1] = 0; }
+
+ while ( pld->alias && strcasecmp(alias,pld->alias) ) pld++;
+
+ if ( !pld->alias )
+ {
+ fprintf(pysamerr,"Predefined ploidies:\n");
+ pld = ploidy_predefs;
+ while ( pld->alias )
+ {
+ fprintf(pysamerr,"%s\n .. %s\n\n", pld->alias,pld->about);
+ if ( detailed )
+ fprintf(pysamerr,"%s\n", pld->ploidy);
+ pld++;
+ }
+ fprintf(pysamerr,"Run as --ploidy <alias> (e.g. --ploidy GRCh37).\n");
+ fprintf(pysamerr,"To see the detailed ploidy definition, append a question mark (e.g. --ploidy GRCh37?).\n");
+ fprintf(pysamerr,"\n");
+ exit(-1);
+ }
+ else if ( detailed )
+ {
+ fprintf(pysamerr,"%s", pld->ploidy);
+ exit(-1);
+ }
+ return ploidy_init_string(pld->ploidy,2);
+}
+
+static void usage(args_t *args)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with samtools mpileup.\n");
+ fprintf(pysamerr, " This command replaces the former \"bcftools view\" caller. Some of the original\n");
+ fprintf(pysamerr, " functionality has been temporarily lost in the process of transition to htslib,\n");
+ fprintf(pysamerr, " but will be added back on popular demand. The original calling model can be\n");
+ fprintf(pysamerr, " invoked with the -c option.\n");
+ fprintf(pysamerr, "Usage: bcftools call [options] <in.vcf.gz>\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "File format options:\n");
+ fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysamerr, " -O, --output-type <b|u|z|v> output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(pysamerr, " --ploidy <assembly>[?] predefined ploidy, 'list' to print available settings, append '?' for details\n");
+ fprintf(pysamerr, " --ploidy-file <file> space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " -s, --samples <list> list of samples to include [all samples]\n");
+ fprintf(pysamerr, " -S, --samples-file <file> PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
+ fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Input/output options:\n");
+ fprintf(pysamerr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n");
+ fprintf(pysamerr, " -f, --format-fields <list> output format fields: GQ,GP (lowercase allowed) []\n");
+ fprintf(pysamerr, " -g, --gvcf <int>,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n");
+ fprintf(pysamerr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n");
+ fprintf(pysamerr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n");
+ fprintf(pysamerr, " -V, --skip-variants <type> skip indels/snps\n");
+ fprintf(pysamerr, " -v, --variants-only output variant sites only\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Consensus/variant calling options:\n");
+ fprintf(pysamerr, " -c, --consensus-caller the original calling method (conflicts with -m)\n");
+ fprintf(pysamerr, " -C, --constrain <str> one of: alleles, trio (see manual)\n");
+ fprintf(pysamerr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
+ fprintf(pysamerr, " -n, --novel-rate <float>,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
+ fprintf(pysamerr, " -p, --pval-threshold <float> variant if P(ref|D)<FLOAT with -c [0.5]\n");
+ fprintf(pysamerr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity) [1.1e-3]\n");
+
+ // todo (and more)
+ // fprintf(pysamerr, "\nContrast calling and association test options:\n");
+ // fprintf(pysamerr, " -1 INT number of group-1 samples [0]\n");
+    // fprintf(pysamerr, "       -C FLOAT  posterior contrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", args->aux.min_lrt);
+ // fprintf(pysamerr, " -U INT number of permutations for association testing (effective with -1) [0]\n");
+ // fprintf(pysamerr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", args->aux.min_perm_p);
+ fprintf(pysamerr, "\n");
+ exit(-1);
+}
+
+int main_vcfcall(int argc, char *argv[])
+{
+ char *ploidy_fname = NULL, *ploidy = NULL;
+ args_t args;
+ memset(&args, 0, sizeof(args_t));
+ args.argc = argc; args.argv = argv;
+ args.aux.prior_type = -1;
+ args.aux.indel_frac = -1;
+ args.aux.theta = 1.1e-3;
+ args.aux.pref = 0.5;
+ args.aux.min_perm_p = 0.01;
+ args.aux.min_lrt = 1;
+ args.flag = CF_ACGT_ONLY;
+ args.output_fname = "-";
+ args.output_type = FT_VCF;
+ args.n_threads = 0;
+ args.aux.trio_Pm_SNPs = 1 - 1e-8;
+ args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9;
+
+ int c;
+ static struct option loptions[] =
+ {
+ {"help",no_argument,NULL,'h'},
+ {"format-fields",required_argument,NULL,'f'},
+ {"gvcf",required_argument,NULL,'g'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"samples",required_argument,NULL,'s'},
+ {"samples-file",required_argument,NULL,'S'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {"threads",required_argument,NULL,9},
+ {"keep-alts",no_argument,NULL,'A'},
+ {"insert-missed",no_argument,NULL,'i'},
+ {"skip-Ns",no_argument,NULL,'N'}, // now the new default
+ {"keep-masked-refs",no_argument,NULL,'M'},
+ {"skip-variants",required_argument,NULL,'V'},
+ {"variants-only",no_argument,NULL,'v'},
+ {"consensus-caller",no_argument,NULL,'c'},
+ {"constrain",required_argument,NULL,'C'},
+ {"multiallelic-caller",no_argument,NULL,'m'},
+ {"pval-threshold",required_argument,NULL,'p'},
+ {"prior",required_argument,NULL,'P'},
+ {"novel-rate",required_argument,NULL,'n'},
+ {"ploidy",required_argument,NULL,1},
+ {"ploidy-file",required_argument,NULL,2},
+ {"chromosome-X",no_argument,NULL,'X'},
+ {"chromosome-Y",no_argument,NULL,'Y'},
+ {NULL,0,NULL,0}
+ };
+
+ char *tmp = NULL;
+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XY", loptions, NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 2 : ploidy_fname = optarg; break;
+ case 1 : ploidy = optarg; break;
+ case 'X': ploidy = "X"; fprintf(pysamerr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break;
+ case 'Y': ploidy = "Y"; fprintf(pysamerr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break;
+ case 'f': args.aux.output_tags |= parse_format_flag(optarg); break;
+ case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N
+ case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default)
+ case 'A': args.aux.flag |= CALL_KEEPALT; break;
+ case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method
+ case 'i': args.flag |= CF_INS_MISSED; break;
+ case 'v': args.aux.flag |= CALL_VARONLY; break;
+ case 'g':
+ args.gvcf = gvcf_init(optarg);
+ if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
+ break;
+ case 'o': args.output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args.output_type = FT_BCF_GZ; break;
+ case 'u': args.output_type = FT_BCF; break;
+ case 'z': args.output_type = FT_VCF_GZ; break;
+ case 'v': args.output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'C':
+ if ( !strcasecmp(optarg,"alleles") ) args.aux.flag |= CALL_CONSTR_ALLELES;
+ else if ( !strcasecmp(optarg,"trio") ) args.aux.flag |= CALL_CONSTR_TRIO;
+ else error("Unknown argument to -C: \"%s\"\n", optarg);
+ break;
+ case 'V':
+ if ( !strcasecmp(optarg,"snps") ) args.flag |= CF_INDEL_ONLY;
+ else if ( !strcasecmp(optarg,"indels") ) args.flag |= CF_NO_INDEL;
+                else error("Unknown skip category \"%s\" (-V argument must be \"snps\" or \"indels\")\n", optarg);
+ break;
+ case 'm': args.flag |= CF_MCALL; break; // multiallelic calling method
+ case 'p':
+ args.aux.pref = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: --pval-threshold %s\n", optarg);
+ break;
+ case 'P': args.aux.theta = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse, expected float argument: -P %s\n", optarg);
+ break;
+ case 'n': parse_novel_rate(&args,optarg); break;
+ case 'r': args.regions = optarg; break;
+ case 'R': args.regions = optarg; args.regions_is_file = 1; break;
+ case 't': args.targets = optarg; break;
+ case 'T': args.targets = optarg; args.targets_is_file = 1; break;
+ case 's': args.samples_fname = optarg; break;
+ case 'S': args.samples_fname = optarg; args.samples_is_file = 1; break;
+ case 9 : args.n_threads = strtol(optarg, 0, 0); break;
+ default: usage(&args);
+ }
+ }
+ // Sanity check options and initialize
+ if ( ploidy_fname ) args.ploidy = ploidy_init(ploidy_fname, 2);
+ else if ( ploidy ) args.ploidy = init_ploidy(ploidy);
+
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) args.bcf_fname = "-"; // reading from stdin
+ else usage(&args);
+ }
+ else args.bcf_fname = argv[optind++];
+
+ if ( !ploidy_fname && !ploidy )
+ {
+ fprintf(pysamerr,"Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid\n");
+ args.ploidy = ploidy_init_string("",2);
+ }
+
+ if ( !args.ploidy ) error("Could not initialize ploidy\n");
+ if ( (args.flag & CF_CCALL ? 1 : 0) + (args.flag & CF_MCALL ? 1 : 0) + (args.flag & CF_QCALL ? 1 : 0) > 1 ) error("Only one of -c or -m options can be given\n");
+ if ( !(args.flag & CF_CCALL) && !(args.flag & CF_MCALL) && !(args.flag & CF_QCALL) ) error("Expected -c or -m option\n");
+ if ( (args.flag & CF_CCALL ? 1: 0) && args.gvcf ) error("gvcf -g option not functional with -c calling mode yet\n");
+ if ( args.aux.n_perm && args.aux.ngrp1_samples<=0 ) error("Expected -1 with -U\n"); // not sure about this, please fix
+ if ( args.aux.flag & CALL_CONSTR_ALLELES )
+ {
+ if ( !args.targets ) error("Expected -t or -T with \"-C alleles\"\n");
+ if ( !(args.flag & CF_MCALL) ) error("The \"-C alleles\" mode requires -m\n");
+ }
+ if ( args.flag & CF_INS_MISSED && !(args.aux.flag&CALL_CONSTR_ALLELES) ) error("The -i option requires -C alleles\n");
+ if ( args.aux.flag&CALL_VARONLY && args.gvcf ) error("The two options cannot be combined: --variants-only and --gvcf\n");
+ init_data(&args);
+
+ while ( bcf_sr_next_line(args.aux.srs) )
+ {
+ bcf1_t *bcf_rec = args.aux.srs->readers[0].buffer[0];
+ if ( args.samples_map ) bcf_subset(args.aux.hdr, bcf_rec, args.nsamples, args.samples_map);
+ bcf_unpack(bcf_rec, BCF_UN_STR);
+
+ // Skip unwanted sites
+ int i, is_indel = bcf_is_snp(bcf_rec) ? 0 : 1;
+ if ( (args.flag & CF_INDEL_ONLY) && !is_indel ) continue;
+ if ( (args.flag & CF_NO_INDEL) && is_indel ) continue;
+ if ( (args.flag & CF_ACGT_ONLY) && (bcf_rec->d.allele[0][0]=='N' || bcf_rec->d.allele[0][0]=='n') ) continue; // REF[0] is 'N'
+
+ // Which allele is symbolic? All SNPs should have it, but not indels
+ args.aux.unseen = 0;
+ for (i=1; i<bcf_rec->n_allele; i++)
+ {
+ if ( bcf_rec->d.allele[i][0]=='X' ) { args.aux.unseen = i; break; } // old X
+ if ( bcf_rec->d.allele[i][0]=='<' )
+ {
+ if ( bcf_rec->d.allele[i][1]=='X' && bcf_rec->d.allele[i][2]=='>' ) { args.aux.unseen = i; break; } // old <X>
+ if ( bcf_rec->d.allele[i][1]=='*' && bcf_rec->d.allele[i][2]=='>' ) { args.aux.unseen = i; break; } // new <*>
+ }
+ }
+ int is_ref = (bcf_rec->n_allele==1 || (bcf_rec->n_allele==2 && args.aux.unseen>0)) ? 1 : 0;
+
+ if ( is_ref && args.aux.flag&CALL_VARONLY )
+ continue;
+
+ bcf_unpack(bcf_rec, BCF_UN_ALL);
+ if ( args.nsex ) set_ploidy(&args, bcf_rec);
+
+ // Various output modes: QCall output (todo)
+ if ( args.flag & CF_QCALL )
+ {
+ qcall(&args.aux, bcf_rec);
+ continue;
+ }
+
+ // Calling modes which output VCFs
+ int ret;
+ if ( args.flag & CF_MCALL )
+ ret = mcall(&args.aux, bcf_rec);
+ else
+ ret = ccall(&args.aux, bcf_rec);
+ if ( ret==-1 ) error("Something is wrong\n");
+
+ // Normal output
+ if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue; // not a variant
+ if ( args.gvcf )
+ bcf_rec = gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, bcf_rec, ret==1?1:0);
+ if ( bcf_rec )
+ bcf_write1(args.out_fh, args.aux.hdr, bcf_rec);
+ }
+ if ( args.gvcf ) gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, NULL, 0);
+ if ( args.flag & CF_INS_MISSED ) bcf_sr_regions_flush(args.aux.srs->targets);
+ destroy_data(&args);
+ return 0;
+}
+
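A note on the vcfcall.c hunk that ends above: the per-record loop treats one ALT allele as a symbolic "unseen" placeholder (the old "X", the old "<X>", or the new "<*>"), and a record counts as reference-only when its only ALT is that placeholder. The following standalone sketch, which assumes nothing beyond the C standard library (find_unseen_allele and is_ref_only are hypothetical helpers, not bcftools or htslib functions), mirrors those string tests:

#include <stdio.h>

/* Hypothetical helper mirroring the string tests in vcfcall.c: returns the
 * index of the symbolic "unseen" allele (old "X", old "<X>", new "<*>"),
 * or 0 when no such allele is present (index 0 is always REF). */
static int find_unseen_allele(char **allele, int n_allele)
{
    int i;
    for (i = 1; i < n_allele; i++)
    {
        if (allele[i][0] == 'X') return i;                      /* old X      */
        if (allele[i][0] == '<' &&
            (allele[i][1] == 'X' || allele[i][1] == '*') &&
            allele[i][2] == '>') return i;                      /* <X> or <*> */
    }
    return 0;
}

/* A record is reference-only when it has a single allele, or exactly two
 * alleles where the second one is the unseen placeholder. */
static int is_ref_only(int n_allele, int unseen)
{
    return (n_allele == 1 || (n_allele == 2 && unseen > 0)) ? 1 : 0;
}

int main(void)
{
    char *ref_site[] = { "A", "<*>" };       /* no real ALT observed   */
    char *var_site[] = { "A", "C", "<*>" };  /* one real ALT plus <*>  */
    int u1 = find_unseen_allele(ref_site, 2);
    int u2 = find_unseen_allele(var_site, 3);
    printf("ref_site: unseen=%d is_ref=%d\n", u1, is_ref_only(2, u1));
    printf("var_site: unseen=%d is_ref=%d\n", u2, is_ref_only(3, u2));
    return 0;
}

Compiled on its own, this prints unseen=1, is_ref=1 for the first site and unseen=2, is_ref=0 for the second, matching how the loop above decides whether mcall/ccall see a variant or a reference-only record.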
diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c
new file mode 100644
index 0000000..e4b9372
--- /dev/null
+++ b/bcftools/vcfcnv.c
@@ -0,0 +1,1386 @@
+/* The MIT License
+
+ Copyright (c) 2014-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+/*
+ Known issues:
+ - The --AF-file option behaves like --targets-file, sites not listed in the AFs
+ are skipped.
+*/
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/kstring.h>
+#include <htslib/kfunc.h>
+#include <htslib/khash_str2int.h>
+#include "bcftools.h"
+#include "HMM.h"
+#include "rbuf.h"
+
+#define DBG0 0
+
+#define N_STATES 4
+#define CN0 0
+#define CN1 1
+#define CN2 2
+#define CN3 3
+
+typedef struct
+{
+ float mean, dev2, norm;
+}
+gauss_param_t;
+
+typedef struct
+{
+ char *name;
+ int idx; // VCF sample index
+ float *lrr,*baf, baf_dev2, baf_dev2_dflt, lrr_dev2;
+ float cell_frac, cell_frac_dflt;
+ gauss_param_t gauss_param[18];
+ double pobs[N_STATES];
+ FILE *dat_fh, *cn_fh, *summary_fh;
+ char *dat_fname, *cn_fname, *summary_fname;
+}
+sample_t;
+
+typedef struct _args_t
+{
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr;
+ int prev_rid, ntot, nused;
+ sample_t query_sample, control_sample;
+
+ int nstates; // number of states: N_STATES for one sample, N_STATES^2 for two samples
+ double lrr_bias, baf_bias; // LRR/BAF weights
+ double same_prob, ij_prob; // prior of both samples being the same and the transition probability P(i|j)
+ double err_prob; // constant probability of erroneous measurement
+ float *nonref_afs, nonref_af, nonref_af_dflt, fRR, fRA, fAA, *tmpf;
+ unsigned long int nRR, nRA, nAA;
+ int mtmpf;
+
+ double *tprob, *tprob_arr; // array of transition matrices, precalculated up to ntprob_arr positions
+ double *iprobs; // states' initial probabilities
+ int ntprob_arr;
+
+ hmm_t *hmm;
+ double *eprob; // emission probs [nstates*nsites,msites]
+ uint32_t *sites; // positions [nsites,msites]
+ int nsites, msites;
+
+ double baum_welch_th, optimize_frac;
+ float plot_th;
+ FILE *summary_fh;
+ char **argv, *regions_list, *summary_fname, *output_dir;
+ char *targets_list, *af_fname;
+ int argc, verbose, lrr_smooth_win;
+}
+args_t;
+
+FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
+
+static inline void hmm2cn_state(int nstates, int i, int *a, int *b)
+{
+ *a = i / N_STATES;
+ *b = i - (*a)*N_STATES;
+}
+static double *init_tprob_matrix(int ndim, double ij_prob, double same_prob)
+{
+ int i,j;
+ double *mat = (double*) malloc(sizeof(double)*ndim*ndim);
+
+ assert( ndim==N_STATES || ndim==N_STATES*N_STATES);
+
+ if ( ndim==N_STATES ) // one sample
+ {
+ double pii = 1 - ij_prob*(N_STATES-1);
+ if ( pii < ij_prob ) error("Error: -x set a bit too high, P(x|x) < P(x|y): %e vs %e\n", pii,ij_prob);
+ for (j=0; j<ndim; j++)
+ {
+ double sum = 0;
+ for (i=0; i<ndim; i++)
+ {
+ // transition from j-th to i-th state
+ if ( i==j )
+ MAT(mat,ndim,i,j) = pii;
+ else
+ MAT(mat,ndim,i,j) = ij_prob;
+
+ sum += MAT(mat,ndim,i,j);
+ }
+ assert( fabs(sum - 1.0)<1e-15 );
+ }
+ }
+ else // two samples
+ {
+ // interpret ij_prob differently, as ii_prob in fact, so that for two
+ // samples the behaviour is somewhat closer to single sample calling
+ // with s=0.
+ double pii = 1 - ij_prob*(N_STATES-1);
+ ij_prob = (1 - pii) / (ndim - 1);
+ for (j=0; j<ndim; j++)
+ {
+ int ja,jb;
+ hmm2cn_state(ndim, j, &ja, &jb);
+
+ double sum = 0;
+ for (i=0; i<ndim; i++)
+ {
+ int ia,ib;
+ hmm2cn_state(ndim, i, &ia, &ib);
+
+ // transition from (ja,jb)-th to (ia,ib)-th state
+ double pa = ja==ia ? pii : ij_prob;
+ double pb = jb==ib ? pii : ij_prob;
+
+ if ( ia==ib && ja==jb )
+ MAT(mat,ndim,i,j) = pa*pb - pa*pb*same_prob + sqrt(pa*pb)*same_prob;
+ else if ( ia==ib )
+ MAT(mat,ndim,i,j) = pa*pb;
+ else
+ MAT(mat,ndim,i,j) = pa*pb*(1-same_prob);
+
+ sum += MAT(mat,ndim,i,j);
+ }
+ for (i=0; i<ndim; i++) MAT(mat,ndim,i,j) /= sum;
+ }
+ }
+ return mat;
+}
+
+static double *init_iprobs(int ndim, double same_prob)
+{
+ int i;
+ double *probs = (double*) malloc(sizeof(double)*ndim);
+
+ assert( ndim==N_STATES || ndim==N_STATES*N_STATES);
+
+ if ( ndim==N_STATES )
+ {
+ // one sample: prior on CN2
+ for (i=0; i<ndim; i++)
+ probs[i] = i==CN2 ? 0.5 : 0.5/3;
+ }
+ else
+ {
+ // two samples
+ double norm = 0;
+ for (i=0; i<ndim; i++)
+ {
+ int ia,ib;
+ hmm2cn_state(ndim, i, &ia, &ib);
+
+ double pa = ia==CN2 ? 0.5 : 0.5/3;
+ double pb = ib==CN2 ? 0.5 : 0.5/3;
+
+ probs[i] = pa*pb;
+ if ( ia!=ib ) probs[i] *= 1-same_prob;
+
+ norm += probs[i];
+ }
+ for (i=0; i<ndim; i++) probs[i] /= norm;
+ }
+ return probs;
+}
+
+static void init_sample_files(sample_t *smpl, char *dir)
+{
+ smpl->dat_fh = open_file(&smpl->dat_fname,"w","%s/dat.%s.tab",dir,smpl->name);
+ smpl->cn_fh = open_file(&smpl->cn_fname,"w","%s/cn.%s.tab",dir,smpl->name);
+ smpl->summary_fh = open_file(&smpl->summary_fname,"w","%s/summary.%s.tab",dir,smpl->name);
+ fprintf(smpl->dat_fh,"# [1]Chromosome\t[2]Position\t[3]BAF\t[4]LRR\n");
+ fprintf(smpl->cn_fh,"# [1]Chromosome\t[2]Position\t[3]CN\t[4]P(CN0)\t[5]P(CN1)\t[6]P(CN2)\t[7]P(CN3)\n");
+ fprintf(smpl->summary_fh,"# RG, Regions [2]Chromosome\t[3]Start\t[4]End\t[5]Copy Number state\t[6]Quality\t[7]nSites\t[8]nHETs\n");
+}
+static void close_sample_files(sample_t *smpl)
+{
+ fclose(smpl->dat_fh);
+ fclose(smpl->cn_fh);
+ fclose(smpl->summary_fh);
+}
+
+static double norm_cdf(double mean, double dev);
+static void init_data(args_t *args)
+{
+ args->prev_rid = -1;
+ args->hdr = args->files->readers[0].header;
+
+ if ( !args->query_sample.name )
+ {
+ if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Multi-sample VCF, missing the -s option\n");
+ args->query_sample.name = strdup(args->hdr->samples[0]);
+ }
+ else
+ if ( bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->query_sample.name)<0 ) error("The sample \"%s\" not found\n", args->query_sample.name);
+ if ( !args->files->readers[0].file->is_bin )
+ {
+ int ret;
+ kstring_t tmp = {0,0,0};
+ if ( args->control_sample.name )
+ {
+ ksprintf(&tmp, "%s,%s", args->query_sample.name,args->control_sample.name);
+ ret = bcf_hdr_set_samples(args->hdr, tmp.s, 0);
+ }
+ else
+ {
+ ret = bcf_hdr_set_samples(args->hdr, args->query_sample.name, 0);
+ tmp.s = args->query_sample.name;
+ }
+ if ( ret<0 ) error("Error parsing the list of samples: %s\n", tmp.s);
+ else if ( ret>0 ) error("The sample not found in the VCF: %s\n", ret==1 ? args->query_sample.name : args->control_sample.name);
+
+ if ( args->control_sample.name ) free(tmp.s);
+ }
+ args->query_sample.idx = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->query_sample.name);
+ args->control_sample.idx = args->control_sample.name ? bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->control_sample.name) : -1;
+ args->nstates = args->control_sample.name ? N_STATES*N_STATES : N_STATES;
+ args->tprob = init_tprob_matrix(args->nstates, args->ij_prob, args->same_prob);
+ args->iprobs = init_iprobs(args->nstates, args->same_prob);
+ args->hmm = hmm_init(args->nstates, args->tprob, 10000);
+ hmm_init_states(args->hmm, args->iprobs);
+
+ args->summary_fh = stdout;
+ if ( args->output_dir )
+ {
+ init_sample_files(&args->query_sample, args->output_dir);
+ if ( args->control_sample.name )
+ {
+ init_sample_files(&args->control_sample, args->output_dir);
+ args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir);
+ }
+ else
+ args->summary_fh = NULL; // one sample only, no two-file summary
+ }
+
+ int i;
+ FILE *fh = args->summary_fh ? args->summary_fh : args->query_sample.summary_fh;
+
+ fprintf(fh, "# This file was produced by: bcftools cnv(%s+htslib-%s)\n", bcftools_version(),hts_version());
+ fprintf(fh, "# The command line was:\tbcftools %s", args->argv[0]);
+ for (i=1; i<args->argc; i++) fprintf(fh, " %s",args->argv[i]);
+ if ( args->control_sample.name )
+ fprintf(fh, "\n#\n"
+ "# RG, Regions\t[2]Chromosome\t[3]Start\t[4]End\t[5]Copy number:%s\t[6]Copy number:%s\t[7]Quality"
+ "\t[8]nSites in (5)\t[9]nHETs in (5)\t[10]nSites in (6)\t[11]nHETs in(6)\n",
+ args->query_sample.name,args->control_sample.name
+ );
+ else
+ fprintf(fh, "\n#\n"
+ "# RG, Regions\t[2]Chromosome\t[3]Start\t[4]End\t[5]Copy number:%s\t[6]Quality\t[7]nSites\t[8]nHETs\n",
+ args->query_sample.name
+ );
+}
+
+char *msprintf(const char *fmt, ...);
+static void py_plot_cnv(char *script, float th)
+{
+ if ( th>100 ) return; // create no plots
+
+ char *cmd = msprintf("python %s -p %f", script, th);
+ int ret = system(cmd);
+ if ( ret) fprintf(stderr, "The command returned non-zero status %d: %s\n", ret, cmd);
+ free(cmd);
+}
+
+static void plot_sample(args_t *args, sample_t *smpl)
+{
+ char *fname;
+ FILE *fp = open_file(&fname,"w","%s/plot.%s.py",args->output_dir,smpl->name);
+ fprintf(fp,
+ "import matplotlib as mpl\n"
+ "mpl.use('Agg')\n"
+ "import matplotlib.pyplot as plt\n"
+ "import csv\n"
+ "import numpy as np\n"
+ "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
+ "\n"
+ "dat = {}\n"
+ "with open('%s', 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " for row in reader:\n"
+ " chr = row[0]\n"
+ " if chr[0]=='#': continue\n"
+ " if chr not in dat: dat[chr] = []\n"
+ " dat[chr].append([row[1], float(row[2]), float(row[3])])\n"
+ "\n"
+ "cnv = {}\n"
+ "with open('%s', 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " for row in reader:\n"
+ " chr = row[0]\n"
+ " if chr[0]=='#': continue\n"
+ " if chr not in cnv: cnv[chr] = []\n"
+ " row[2] = int(row[2]) + 0.5\n"
+ " cnv[chr].append(row[1:])\n"
+ "\n"
+ "for chr in dat:\n"
+ " fig,(ax1, ax2, ax3) = plt.subplots(3,1,figsize=(10,8),sharex=True)\n"
+ " ax1.plot([x[0] for x in dat[chr]],[x[2] for x in dat[chr]], '.', ms=3)\n"
+ " ax2.plot([x[0] for x in dat[chr]],[x[1] for x in dat[chr]], '.', ms=3)\n"
+ " cn_dat = cnv[chr]\n"
+ " xgrid = [float(x[0]) for x in cn_dat]\n"
+ " ygrid = np.linspace(0,5,6)\n"
+ " xgrid, ygrid = np.meshgrid(xgrid, ygrid)\n"
+ " heat = np.zeros_like(xgrid)\n"
+ " for x in range(len(heat[0])-1):\n"
+ " heat[0][x] = cn_dat[x][2]\n"
+ " heat[1][x] = cn_dat[x][3]\n"
+ " heat[2][x] = cn_dat[x][4]\n"
+ " heat[3][x] = cn_dat[x][5]\n"
+ " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr_r')\n"
+ " mesh.set_clim(vmin=-1,vmax=1)\n"
+ " ax3.plot([x[0] for x in cn_dat],[x[1] for x in cn_dat],'.-',ms=3,color='black')\n"
+ " fig.suptitle('%s (chr '+chr+')')\n"
+ " ax1.set_title('Log-R intensities Ratio',fontsize=10)\n"
+ " ax2.set_title('B-Allele Frequency',fontsize=10)\n"
+ " ax3.set_title('Copy Number Variation',fontsize=10)\n"
+ " ax1.set_ylabel('LRR')\n"
+ " ax2.set_ylabel('BAF')\n"
+ " ax3.set_ylabel('CN')\n"
+ " ax3.set_xlabel('Coordinate (chrom '+chr+')',fontsize=10)\n"
+ " ax3.set_ylim(-0.1,4.1)\n"
+ " ax3.set_yticks([0.5,1.5,2.5,3.5])\n"
+ " ax3.set_yticklabels(['CN0','CN1','CN2','CN3'])\n"
+ " plt.subplots_adjust(left=0.08,right=0.95,bottom=0.08,top=0.92)\n"
+ " plt.savefig('%s/plot.%s.chr'+chr+'.png')\n"
+ " plt.close()\n"
+ "\n",
+ smpl->dat_fname,smpl->cn_fname,smpl->name,args->output_dir,smpl->name
+ );
+ fclose(fp);
+
+ py_plot_cnv(fname, args->plot_th);
+ free(fname);
+}
+
+static void create_plots(args_t *args)
+{
+ close_sample_files(&args->query_sample);
+ if ( args->control_sample.name ) close_sample_files(&args->control_sample);
+ if ( args->summary_fh ) fclose(args->summary_fh);
+
+ if ( !args->control_sample.name )
+ {
+ plot_sample(args, &args->query_sample);
+ return;
+ }
+
+ char *fname;
+ FILE *fp = open_file(&fname,"w","%s/plot.%s.%s.py",args->output_dir,args->control_sample.name,args->query_sample.name);
+ fprintf(fp,
+ "import matplotlib as mpl\n"
+ "mpl.use('Agg')\n"
+ "import matplotlib.pyplot as plt\n"
+ "import csv,argparse\n"
+ "import numpy as np\n"
+ "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
+ "\n"
+ "control_sample = '%s'\n"
+ "query_sample = '%s'\n"
+ "\n"
+ "parser = argparse.ArgumentParser()\n"
+ "parser.add_argument('-p', '--plot-threshold', type=float)\n"
+ "parser.add_argument('-c', '--chromosome')\n"
+ "args = parser.parse_args()\n"
+ "if args.plot_threshold==None: args.plot_threshold = 0\n"
+ "\n"
+ "def chroms_to_plot(th):\n"
+ " dat = {}\n"
+ " with open('%s/summary.tab', 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " for row in reader:\n"
+ " if row[0]!='RG': continue\n"
+ " chr = row[1]\n"
+ " start = row[2]\n"
+ " end = row[3]\n"
+ " qual = float(row[6])\n"
+ " if row[4]==row[5] and args.plot_threshold!=0: continue\n"
+ " if chr not in dat: dat[chr] = 0.0\n"
+ " if qual > dat[chr]: dat[chr] = qual\n"
+ " out = {}\n"
+ " for chr in dat:\n"
+ " if (chr not in dat) or dat[chr]<th: continue\n"
+ " out[chr] = 1\n"
+ " return out\n"
+ "if args.chromosome!=None:\n"
+ " plot_chroms = { args.chromosome:1 }\n"
+ "else:\n"
+ " plot_chroms = chroms_to_plot(args.plot_threshold)\n"
+ "\n"
+ "def read_dat(file,dat,plot_chr):\n"
+ " with open(file, 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " for row in reader:\n"
+ " chr = row[0]\n"
+ " if chr != plot_chr: continue\n"
+ " dat.append([row[1], float(row[2]), float(row[3])])\n"
+ "def read_cnv(file,cnv,plot_chr):\n"
+ " with open(file, 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " for row in reader:\n"
+ " chr = row[0]\n"
+ " if chr != plot_chr: continue\n"
+ " row[2] = int(row[2]) + 0.5\n"
+ " cnv.append(row[1:])\n"
+ "def find_diffs(a,b):\n"
+ " out = []\n"
+ " diff = []\n"
+ " for i in range(len(a)):\n"
+ " if a[i][1]!=b[i][1]:\n"
+ " if i>0: diff.append([b[i-1][0],b[i-1][1],a[i-1][1]])\n"
+ " diff.append([b[i][0],b[i][1],a[i][1]])\n"
+ " elif len(diff):\n"
+ " diff.append([b[i][0],b[i][1],a[i][1]])\n"
+ " out.append(diff)\n"
+ " diff = []\n"
+ " if len(diff): out.append(diff)\n"
+ " return out\n"
+ "\n"
+ "for chr in sorted(plot_chroms.keys()):\n"
+ " control_dat = []\n"
+ " control_cnv = []\n"
+ " query_dat = []\n"
+ " query_cnv = []\n"
+ " read_dat('%s',control_dat,chr)\n"
+ " read_dat('%s',query_dat,chr)\n"
+ " read_cnv('%s',control_cnv,chr)\n"
+ " read_cnv('%s',query_cnv,chr)\n"
+ "\n"
+ " fig,(ax1,ax2,ax3,ax4,ax5,ax6) = plt.subplots(6,1,figsize=(10,8),sharex=True)\n"
+ " ax1.plot([x[0] for x in control_dat],[x[2] for x in control_dat], '.', ms=3,color='red')\n"
+ " ax2.plot([x[0] for x in control_dat],[x[1] for x in control_dat], '.', ms=3,color='red')\n"
+ " cn_dat = control_cnv\n"
+ " xgrid = [float(x[0]) for x in cn_dat]\n"
+ " ygrid = np.linspace(0,5,6)\n"
+ " xgrid, ygrid = np.meshgrid(xgrid, ygrid)\n"
+ " heat = np.zeros_like(xgrid)\n"
+ " for x in range(len(heat[0])-1):\n"
+ " heat[0][x] = cn_dat[x][2]\n"
+ " heat[1][x] = cn_dat[x][3]\n"
+ " heat[2][x] = cn_dat[x][4]\n"
+ " heat[3][x] = cn_dat[x][5]\n"
+ " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr')\n"
+ " mesh.set_clim(vmin=-1,vmax=1)\n"
+ " ax3.plot([x[0] for x in cn_dat],[x[1] for x in cn_dat],'-',ms=3,color='black',lw=1.7)\n"
+ "\n"
+ " ax6.plot([x[0] for x in query_dat],[x[2] for x in query_dat], '.', ms=3)\n"
+ " ax5.plot([x[0] for x in query_dat],[x[1] for x in query_dat], '.', ms=3)\n"
+ " cn_dat = query_cnv\n"
+ " xgrid = [float(x[0]) for x in cn_dat]\n"
+ " ygrid = np.linspace(0,5,6)\n"
+ " xgrid, ygrid = np.meshgrid(xgrid, ygrid)\n"
+ " heat = np.zeros_like(xgrid)\n"
+ " for x in range(len(heat[0])-1):\n"
+ " heat[0][x] = cn_dat[x][2]\n"
+ " heat[1][x] = cn_dat[x][3]\n"
+ " heat[2][x] = cn_dat[x][4]\n"
+ " heat[3][x] = cn_dat[x][5]\n"
+ " mesh = ax4.pcolormesh(xgrid, ygrid, heat, cmap='bwr_r')\n"
+ " mesh.set_clim(vmin=-1,vmax=1)\n"
+ " ax4.plot([x[0] for x in cn_dat],[x[1] for x in cn_dat],'-',ms=3,color='black',lw=1.7)\n"
+ " ax3.annotate(control_sample, xy=(0.02,0.1), xycoords='axes fraction', color='red',fontsize=12, va='bottom',ha='left')\n"
+ " ax4.annotate(query_sample, xy=(0.02,0.9), xycoords='axes fraction', color='blue',fontsize=12, va='top',ha='left')\n"
+ "\n"
+ " diffs = find_diffs(control_cnv,query_cnv)\n"
+ " for diff in diffs:\n"
+ " ax3.plot([x[0] for x in diff],[x[1] for x in diff],'-',ms=3,color='blue',lw=1.7)\n"
+ " ax4.plot([x[0] for x in diff],[x[2] for x in diff],'-',ms=3,color='red',lw=1.7)\n"
+ "\n"
+ " fig.suptitle('chr '+chr+', '+control_sample+' vs '+query_sample)\n"
+ " ax1.tick_params(axis='both', labelsize=8)\n"
+ " ax2.tick_params(axis='both', labelsize=8)\n"
+ " ax3.tick_params(axis='both', labelsize=8)\n"
+ " ax4.tick_params(axis='both', labelsize=8)\n"
+ " ax5.tick_params(axis='both', labelsize=8)\n"
+ " ax6.tick_params(axis='both', labelsize=8)\n"
+ " ax6.set_xlabel('Coordinate (chrom '+chr+')',fontsize=8)\n"
+ " ax1.set_ylabel('LRR')\n"
+ " ax2.set_ylabel('BAF')\n"
+ " ax3.set_ylabel('CN')\n"
+ " ax6.set_ylabel('LRR')\n"
+ " ax5.set_ylabel('BAF')\n"
+ " ax4.set_ylabel('CN')\n"
+ " ax3.set_ylim(-0.1,4.1)\n"
+ " ax3.set_yticks([0.5,1.5,2.5,3.5])\n"
+ " ax3.set_yticklabels(['CN0','CN1','CN2','CN3'])\n"
+ " ax4.set_ylim(-0.1,4.1)\n"
+ " ax4.set_yticks([0.5,1.5,2.5,3.5])\n"
+ " ax4.set_yticklabels(['CN0','CN1','CN2','CN3'])\n"
+ " plt.subplots_adjust(left=0.08,right=0.95,bottom=0.08,top=0.92,hspace=0)\n"
+ " plt.savefig('%s/plot.%s.%s.chr'+chr+'.png')\n"
+ " plt.close()\n"
+ "\n",
+ args->control_sample.name,args->query_sample.name,
+ args->output_dir,
+ args->control_sample.dat_fname,args->query_sample.dat_fname,
+ args->control_sample.cn_fname,args->query_sample.cn_fname,
+ args->output_dir,args->control_sample.name,args->query_sample.name
+ );
+ fclose(fp);
+
+ py_plot_cnv(fname,args->plot_th);
+ free(fname);
+}
+
+static void destroy_data(args_t *args)
+{
+ bcf_sr_destroy(args->files);
+ hmm_destroy(args->hmm);
+ free(args->tmpf);
+ free(args->sites);
+ free(args->eprob);
+ free(args->tprob);
+ free(args->summary_fname);
+ free(args->nonref_afs);
+ free(args->query_sample.baf);
+ free(args->query_sample.lrr);
+ free(args->control_sample.baf);
+ free(args->control_sample.lrr);
+ free(args->query_sample.name);
+ free(args->query_sample.dat_fname);
+ free(args->query_sample.cn_fname);
+ free(args->query_sample.summary_fname);
+ free(args->control_sample.dat_fname);
+ free(args->control_sample.cn_fname);
+ free(args->control_sample.summary_fname);
+}
+
+static inline char copy_number_state(args_t *args, int istate, int ismpl)
+{
+ char code[] = "01234";
+ if ( !args->control_sample.name ) return code[istate];
+ int idx = ismpl ? istate - (istate/N_STATES)*N_STATES : istate/N_STATES;
+ return code[idx];
+}
+
+static double avg_ii_prob(int n, double *mat)
+{
+ int i;
+ double avg = 0;
+ for (i=0; i<n; i++) avg += MAT(mat,n,i,i);
+ return avg/n;
+}
+
+#define GAUSS_CN1_PK_R(smpl) (&((smpl)->gauss_param[0]))
+#define GAUSS_CN1_PK_A(smpl) (&((smpl)->gauss_param[1]))
+#define GAUSS_CN2_PK_RR(smpl) (&((smpl)->gauss_param[2]))
+#define GAUSS_CN2_PK_RA(smpl) (&((smpl)->gauss_param[3]))
+#define GAUSS_CN2_PK_AA(smpl) (&((smpl)->gauss_param[4]))
+#define GAUSS_CN3_PK_RRR(smpl) (&((smpl)->gauss_param[5]))
+#define GAUSS_CN3_PK_RRA(smpl) (&((smpl)->gauss_param[6]))
+#define GAUSS_CN3_PK_RAA(smpl) (&((smpl)->gauss_param[7]))
+#define GAUSS_CN3_PK_AAA(smpl) (&((smpl)->gauss_param[8]))
+
+static inline double norm_prob(double baf, gauss_param_t *param)
+{
+ return exp(-(baf-param->mean)*(baf-param->mean)*0.5/param->dev2) / param->norm / sqrt(2*M_PI*param->dev2);
+}
+
+static int set_observed_prob(args_t *args, sample_t *smpl, int isite)
+{
+ float baf = smpl->baf[isite];
+ float lrr = args->lrr_bias>0 ? smpl->lrr[isite] : 0;
+
+ float fRR = args->fRR;
+ float fRA = args->fRA;
+ float fAA = args->fAA;
+
+ if ( baf<0 )
+ {
+ // no call: either some technical issue or the call could not be made because it is CN0
+ int i;
+ smpl->pobs[CN0] = 0.5;
+ for (i=1; i<N_STATES; i++) smpl->pobs[i] = (1.0-smpl->pobs[CN0])/(N_STATES-1);
+ return 0;
+ }
+
+ double cn1_baf =
+ norm_prob(baf,GAUSS_CN1_PK_R(smpl)) * (fRR + fRA*0.5) +
+ norm_prob(baf,GAUSS_CN1_PK_A(smpl)) * (fAA + fRA*0.5) ;
+ double cn2_baf =
+ norm_prob(baf,GAUSS_CN2_PK_RR(smpl)) * fRR +
+ norm_prob(baf,GAUSS_CN2_PK_RA(smpl)) * fRA +
+ norm_prob(baf,GAUSS_CN2_PK_AA(smpl)) * fAA;
+ double cn3_baf =
+ norm_prob(baf,GAUSS_CN3_PK_RRR(smpl)) * fRR +
+ norm_prob(baf,GAUSS_CN3_PK_RRA(smpl)) * fRA*0.5 +
+ norm_prob(baf,GAUSS_CN3_PK_RAA(smpl)) * fRA*0.5 +
+ norm_prob(baf,GAUSS_CN3_PK_AAA(smpl)) * fAA;
+
+ double norm = cn1_baf + cn2_baf + cn3_baf;
+ cn1_baf /= norm;
+ cn2_baf /= norm;
+ cn3_baf /= norm;
+
+ #if DBG0
+ if ( args->verbose ) fprintf(stderr,"%f\t%f %f %f\n", baf,cn1_baf,cn2_baf,cn3_baf);
+ #endif
+
+ double cn1_lrr = exp(-(lrr + 0.45)*(lrr + 0.45)/smpl->lrr_dev2);
+ double cn2_lrr = exp(-(lrr - 0.00)*(lrr - 0.00)/smpl->lrr_dev2);
+ double cn3_lrr = exp(-(lrr - 0.30)*(lrr - 0.30)/smpl->lrr_dev2);
+
+ smpl->pobs[CN0] = 0;
+ smpl->pobs[CN1] = args->err_prob + (1 - args->baf_bias + args->baf_bias*cn1_baf)*(1 - args->lrr_bias + args->lrr_bias*cn1_lrr);
+ smpl->pobs[CN2] = args->err_prob + (1 - args->baf_bias + args->baf_bias*cn2_baf)*(1 - args->lrr_bias + args->lrr_bias*cn2_lrr);
+ smpl->pobs[CN3] = args->err_prob + (1 - args->baf_bias + args->baf_bias*cn3_baf)*(1 - args->lrr_bias + args->lrr_bias*cn3_lrr);
+
+ return 0;
+}
+
+static void set_emission_prob(args_t *args, int isite)
+{
+ double *eprob = &args->eprob[args->nstates*isite];
+ int i;
+ for (i=0; i<N_STATES; i++)
+ eprob[i] = args->query_sample.pobs[i];
+}
+
+static void set_emission_prob2(args_t *args, int isite)
+{
+ double *eprob = &args->eprob[args->nstates*isite];
+ int i, j;
+ for (i=0; i<N_STATES; i++)
+ {
+ for (j=0; j<N_STATES; j++)
+ {
+ eprob[i*N_STATES+j] = args->query_sample.pobs[i]*args->control_sample.pobs[j];
+ }
+ }
+}
+
+static void set_gauss_params(args_t *args, sample_t *smpl);
+static double norm_cdf(double mean, double dev)
+{
+ double bot = 0, top = 1;
+ top = 1 - 0.5*erfc((top-mean)/(dev*sqrt(2)));
+ bot = 1 - 0.5*erfc((bot-mean)/(dev*sqrt(2)));
+ return top-bot;
+}
+
+static void set_emission_probs(args_t *args)
+{
+ if ( !args->af_fname )
+ {
+ args->fRR = 0.76;
+ args->fRA = 0.14;
+ args->fAA = 0.098;
+ }
+
+ set_gauss_params(args, &args->query_sample);
+ if ( args->control_sample.name ) set_gauss_params(args, &args->control_sample);
+
+ #if DBG0
+ args->verbose = 1;
+ args->query_sample.baf[0] = 0; set_observed_prob(args,&args->query_sample,0);
+ args->query_sample.baf[0] = 1/3.; set_observed_prob(args,&args->query_sample,0);
+ args->query_sample.baf[0] = 1/2.; set_observed_prob(args,&args->query_sample,0);
+ args->query_sample.baf[0] = 2/3.; set_observed_prob(args,&args->query_sample,0);
+ args->query_sample.baf[0] = 1; set_observed_prob(args,&args->query_sample,0);
+ args->verbose = 0;
+ #endif
+
+ int i;
+ for (i=0; i<args->nsites; i++)
+ {
+ if ( args->af_fname )
+ {
+ args->fRR = (1-args->nonref_afs[i])*(1-args->nonref_afs[i]);
+ args->fRA = 2*args->nonref_afs[i]*(1-args->nonref_afs[i]);
+ args->fAA = args->nonref_afs[i]*args->nonref_afs[i];
+ }
+ set_observed_prob(args,&args->query_sample,i);
+ if ( args->control_sample.name )
+ {
+ set_observed_prob(args,&args->control_sample,i);
+ set_emission_prob2(args,i);
+ }
+ else
+ set_emission_prob(args,i);
+ }
+}
+
+static void smooth_data(float *dat, int ndat, int win)
+{
+ if ( win<=1 ) return;
+
+ int i,j, k1 = win/2, k2 = win-k1;
+ rbuf_t rbuf;
+ rbuf_init(&rbuf,win);
+ float sum = 0, *buf = (float*)malloc(sizeof(float)*win);
+ for (i=0; i<k2; i++)
+ {
+ sum += dat[i];
+ int j = rbuf_append(&rbuf);
+ buf[j] = dat[i];
+ }
+ for (i=0; i<ndat; i++)
+ {
+ dat[i] = sum/rbuf.n;
+ if ( i>=k1 )
+ {
+ j = rbuf_shift(&rbuf);
+ sum -= buf[j];
+ }
+ if ( i+k2<ndat )
+ {
+ sum += dat[i+k2];
+ j = rbuf_append(&rbuf);
+ buf[j] = dat[i+k2];
+ }
+ }
+ free(buf);
+}
+
+static void set_gauss_params(args_t *args, sample_t *smpl)
+{
+ int i;
+ for (i=0; i<18; i++) smpl->gauss_param[i].dev2 = smpl->baf_dev2;
+
+ double dev = sqrt(smpl->baf_dev2);
+
+ GAUSS_CN1_PK_R(smpl)->mean = 0;
+ GAUSS_CN1_PK_A(smpl)->mean = 1;
+ GAUSS_CN1_PK_R(smpl)->norm = norm_cdf(GAUSS_CN1_PK_R(smpl)->mean,dev);
+ GAUSS_CN1_PK_A(smpl)->norm = norm_cdf(GAUSS_CN1_PK_A(smpl)->mean,dev);
+
+ GAUSS_CN2_PK_RR(smpl)->mean = 0;
+ GAUSS_CN2_PK_RA(smpl)->mean = 0.5;
+ GAUSS_CN2_PK_AA(smpl)->mean = 1;
+ GAUSS_CN2_PK_RR(smpl)->norm = norm_cdf(GAUSS_CN2_PK_RR(smpl)->mean,dev);
+ GAUSS_CN2_PK_RA(smpl)->norm = norm_cdf(GAUSS_CN2_PK_RA(smpl)->mean,dev);
+ GAUSS_CN2_PK_AA(smpl)->norm = norm_cdf(GAUSS_CN2_PK_AA(smpl)->mean,dev);
+
+ GAUSS_CN3_PK_RRR(smpl)->mean = 0;
+ GAUSS_CN3_PK_RRA(smpl)->mean = 1.0/(2+smpl->cell_frac);
+ GAUSS_CN3_PK_RAA(smpl)->mean = (1.0+smpl->cell_frac)/(2+smpl->cell_frac);
+ GAUSS_CN3_PK_AAA(smpl)->mean = 1;
+ GAUSS_CN3_PK_RRR(smpl)->norm = norm_cdf(GAUSS_CN3_PK_RRR(smpl)->mean,dev);
+ GAUSS_CN3_PK_RRA(smpl)->norm = norm_cdf(GAUSS_CN3_PK_RRA(smpl)->mean,dev);
+ GAUSS_CN3_PK_RAA(smpl)->norm = norm_cdf(GAUSS_CN3_PK_RAA(smpl)->mean,dev);
+ GAUSS_CN3_PK_AAA(smpl)->norm = norm_cdf(GAUSS_CN3_PK_AAA(smpl)->mean,dev);
+}
+
+static int update_sample_args(args_t *args, sample_t *smpl, int ismpl)
+{
+ hmm_t *hmm = args->hmm;
+ double *fwd = hmm_get_fwd_bwd_prob(hmm);
+ int nstates = hmm_get_nstates(hmm);
+
+ // estimate the BAF mean and deviation for CN3
+ double mean_cn3 = 0, norm_cn3 = 0;
+ double baf_dev2 = 0, baf_AA_dev2 = 0, norm_baf_AA_dev2 = 0;
+
+ // experimental: smooth CN3 probs to bias toward bigger events; this lowers
+ // the FP rate when the data is noisy
+ hts_expand(float,args->nsites,args->mtmpf,args->tmpf);
+ int i, j, k = 0;
+ for (i=0; i<args->nsites; i++)
+ {
+ float baf = smpl->baf[i];
+ if ( baf>4/5.) continue; // skip AA genotypes
+ if ( baf>0.5 ) baf = 1 - baf; // the bands should be symmetric
+ if ( baf<1/5.) continue; // skip RR genotypes
+
+ double prob_cn3 = 0, *probs = fwd + i*nstates;
+ if ( !args->control_sample.name )
+ {
+ prob_cn3 = probs[CN3];
+ }
+ else if ( ismpl==0 )
+ {
+ // query sample: CN3 probability must be recovered from all states of the control sample
+ for (j=0; j<N_STATES; j++) prob_cn3 += probs[CN3*N_STATES+j];
+ }
+ else
+ {
+ // same as above but for control sample
+ for (j=0; j<N_STATES; j++) prob_cn3 += probs[CN3+j*N_STATES];
+ }
+ args->tmpf[k++] = prob_cn3;
+ }
+ smooth_data(args->tmpf, k, 50);
+ k = 0;
+ for (i=0; i<args->nsites; i++)
+ {
+ float baf = smpl->baf[i];
+ if ( baf>4/5.) { baf_AA_dev2 += (1.0-baf)*(1.0-baf); norm_baf_AA_dev2++; continue; } // skip AA genotypes
+ if ( baf>0.5 ) baf = 1 - baf; // the bands should be symmetric
+ if ( baf<1/5.) continue; // skip RR genotypes
+
+ double prob_cn3 = args->tmpf[k++];
+ mean_cn3 += prob_cn3 * baf;
+ norm_cn3 += prob_cn3;
+ }
+ if ( !norm_cn3 )
+ {
+ smpl->cell_frac = 1.0;
+ return 1;
+ }
+ mean_cn3 /= norm_cn3;
+ k = 0;
+ for (i=0; i<args->nsites; i++)
+ {
+ float baf = smpl->baf[i];
+ if ( baf>0.5 ) baf = 1 - baf; // the bands should be symmetric
+ if ( baf<1/5.) continue; // skip RR,AA genotypes
+
+ double prob_cn3 = args->tmpf[k++];
+ baf_dev2 += prob_cn3 * (baf - mean_cn3)*(baf - mean_cn3);
+ }
+
+ /*
+ A noisy CN2 band is hard to distinguish from two CN3 bands which are
+ close to each other. Set a threshold on the minimum separation based
+ on the BAF deviation at p=0.95
+ */
+ baf_dev2 /= norm_cn3;
+ baf_AA_dev2 /= norm_baf_AA_dev2;
+ if ( baf_dev2 < baf_AA_dev2 ) baf_dev2 = baf_AA_dev2;
+ double max_mean_cn3 = 0.5 - sqrt(baf_dev2)*1.644854; // R: qnorm(0.95)=1.644854
+ //fprintf(stderr,"dev=%f AA_dev=%f max_mean_cn3=%f mean_cn3=%f\n", baf_dev2,baf_AA_dev2,max_mean_cn3,mean_cn3);
+ assert( max_mean_cn3>0 );
+
+ double new_frac = 1./mean_cn3 - 2;
+ if ( mean_cn3 > max_mean_cn3 || new_frac < args->optimize_frac )
+ {
+ // out of bounds, beyond our detection limits. Give up and say it converged
+ smpl->cell_frac = 1.0;
+ return 1;
+ }
+ if ( new_frac>1 ) new_frac = 1;
+ int converged = fabs(new_frac - smpl->cell_frac) < 1e-1 ? 1 : 0;
+
+ // Update dev2, but stay within safe limits
+ if ( baf_dev2 > 3*smpl->baf_dev2_dflt ) baf_dev2 = 3*smpl->baf_dev2_dflt;
+ else if ( baf_dev2 < 0.5*smpl->baf_dev2_dflt ) baf_dev2 = 0.5*smpl->baf_dev2_dflt;
+
+ smpl->cell_frac = new_frac;
+ smpl->baf_dev2 = baf_dev2;
+
+ return converged;
+}
+
+// Update parameters which depend on the estimated fraction of aberrant cells
+// in CN3. Returns 0 if the current estimate did not need to be updated or 1
+// if there was a change.
+static int update_args(args_t *args)
+{
+ int converged = update_sample_args(args, &args->query_sample, 0);
+ if ( args->control_sample.name )
+ {
+ converged += update_sample_args(args, &args->control_sample, 1);
+ return converged==2 ? 0 : 1;
+ }
+ return converged ? 0 : 1;
+}
+
+// for an approximate estimate of the number of het genotypes in a region
+#define BAF_LIKELY_HET(val) ((val)>0.25 && (val)<0.75)
+
+static void cnv_flush_viterbi(args_t *args)
+{
+ if ( !args->nsites ) return;
+
+ // Set the HMM transition matrix for the new chromosome again. This is for the case
+ // Baum-Welch was used, which is experimental, largely unsupported and not
+ // done by default.
+ hmm_t *hmm = args->hmm;
+ hmm_set_tprob(args->hmm, args->tprob, 10000);
+
+ // Smooth LRR values to reduce noise
+ if ( args->lrr_bias > 0 )
+ {
+ smooth_data(args->query_sample.lrr,args->nsites, args->lrr_smooth_win);
+ if ( args->control_sample.name ) smooth_data(args->control_sample.lrr,args->nsites, args->lrr_smooth_win);
+ }
+
+ // Set the BAF peak likelihoods, such as P(RRR|CN3), taking into account the
+ // estimated fraction of aberrant cells in the mixture. With the new chromosome,
+ // reset the fraction to the default value.
+ args->query_sample.cell_frac = args->query_sample.cell_frac_dflt;
+ args->control_sample.cell_frac = args->control_sample.cell_frac_dflt;
+ args->query_sample.baf_dev2 = args->query_sample.baf_dev2_dflt;
+ args->control_sample.baf_dev2 = args->control_sample.baf_dev2_dflt;
+ set_gauss_params(args, &args->query_sample);
+ if ( args->control_sample.name ) set_gauss_params(args, &args->control_sample);
+
+ if ( args->optimize_frac )
+ {
+ int niter = 0;
+ fprintf(stderr,"Attempting to estimate the fraction of aberrant cells (chr %s):\n", bcf_hdr_id2name(args->hdr,args->prev_rid));
+ do
+ {
+ fprintf(stderr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2);
+ if ( args->control_sample.name )
+ fprintf(stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2);
+ fprintf(stderr,"\n");
+ set_emission_probs(args);
+ hmm_run_fwd_bwd(hmm, args->nsites, args->eprob, args->sites);
+ }
+ while ( update_args(args) && ++niter<20 );
+ if ( niter>=20 )
+ {
+ // no convergence
+ args->query_sample.cell_frac = args->query_sample.cell_frac_dflt;
+ args->control_sample.cell_frac = args->control_sample.cell_frac_dflt;
+ args->query_sample.baf_dev2 = args->query_sample.baf_dev2_dflt;
+ args->control_sample.baf_dev2 = args->control_sample.baf_dev2_dflt;
+ set_gauss_params(args, &args->query_sample);
+ if ( args->control_sample.name ) set_gauss_params(args, &args->control_sample);
+ }
+
+ fprintf(stderr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2);
+ if ( args->control_sample.name )
+ fprintf(stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2);
+ fprintf(stderr,"\n");
+ }
+ set_emission_probs(args);
+
+ while ( args->baum_welch_th!=0 )
+ {
+ int nstates = hmm_get_nstates(hmm);
+ double ori_ii = avg_ii_prob(nstates,hmm_get_tprob(hmm));
+ hmm_run_baum_welch(hmm, args->nsites, args->eprob, args->sites);
+ double new_ii = avg_ii_prob(nstates,hmm_get_tprob(hmm));
+ fprintf(stderr,"%e\t%e\t%e\n", ori_ii,new_ii,new_ii-ori_ii);
+ double *tprob = init_tprob_matrix(nstates, 1-new_ii, args->same_prob);
+ hmm_set_tprob(args->hmm, tprob, 10000);
+ double *tprob_arr = hmm_get_tprob(hmm);
+ free(tprob);
+ if ( fabs(new_ii - ori_ii) < args->baum_welch_th )
+ {
+ int i,j;
+ for (i=0; i<nstates; i++)
+ {
+ for (j=0; j<nstates; j++)
+ {
+ printf(" %.15f", MAT(tprob_arr,nstates,j,i));
+ }
+ printf("\n");
+ }
+ break;
+ }
+ }
+ hmm_run_viterbi(hmm, args->nsites, args->eprob, args->sites);
+ hmm_run_fwd_bwd(hmm, args->nsites, args->eprob, args->sites);
+
+
+ // Output the results
+ uint8_t *vpath = hmm_get_viterbi_path(hmm);
+ double qual = 0, *fwd = hmm_get_fwd_bwd_prob(hmm);
+ int i,j, isite, start_cn = vpath[0], start_pos = args->sites[0], istart_pos = 0;
+ int ctrl_ntot = 0, smpl_ntot = 0, ctrl_nhet = 0, smpl_nhet = 0;
+ for (isite=0; isite<args->nsites; isite++)
+ {
+ int state = vpath[args->nstates*isite];
+ double *pval = fwd + isite*args->nstates;
+
+ qual += pval[start_cn];
+
+ // output CN and fwd-bwd likelihood for each site
+ if ( args->query_sample.cn_fh )
+ {
+ fprintf(args->query_sample.cn_fh, "%s\t%d\t%c", bcf_hdr_id2name(args->hdr,args->prev_rid), args->sites[isite]+1, copy_number_state(args,state,0));
+ if ( !args->control_sample.cn_fh )
+ for (i=0; i<args->nstates; i++) fprintf(args->query_sample.cn_fh, "\t%f", pval[i]);
+ else
+ for (i=0; i<N_STATES; i++)
+ {
+ double sum = 0;
+ for (j=0; j<N_STATES; j++) sum += pval[i*N_STATES+j];
+ fprintf(args->query_sample.cn_fh, "\t%f", sum);
+ }
+ fprintf(args->query_sample.cn_fh, "\n");
+ if ( args->query_sample.baf[isite]>=0 ) // if non-missing
+ {
+ if ( BAF_LIKELY_HET(args->query_sample.baf[isite]) ) smpl_nhet++;
+ smpl_ntot++;
+ }
+ }
+ if ( args->control_sample.cn_fh )
+ {
+ fprintf(args->control_sample.cn_fh, "%s\t%d\t%c", bcf_hdr_id2name(args->hdr,args->prev_rid), args->sites[isite]+1, copy_number_state(args,state,1));
+ for (i=0; i<N_STATES; i++)
+ {
+ double sum = 0;
+ for (j=0; j<N_STATES; j++) sum += pval[i+N_STATES*j];
+ fprintf(args->control_sample.cn_fh, "\t%f", sum);
+ }
+ fprintf(args->control_sample.cn_fh, "\n");
+ if ( args->control_sample.baf[isite]>=0 ) // if non-missing
+ {
+ if ( BAF_LIKELY_HET(args->control_sample.baf[isite]) ) ctrl_nhet++;
+ ctrl_ntot++;
+ }
+ }
+
+ if ( start_cn != state )
+ {
+ char start_cn_query = copy_number_state(args,start_cn,0);
+ qual = phred_score(1 - qual/(isite - istart_pos));
+ fprintf(args->query_sample.summary_fh,"RG\t%s\t%d\t%d\t%c\t%.1f\t%d\t%d\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid), start_pos+1, args->sites[isite],start_cn_query,qual,smpl_ntot,smpl_nhet);
+
+ if ( args->control_sample.name )
+ {
+ // regions 0-based, half-open
+ char start_cn_ctrl = copy_number_state(args,start_cn,1);
+ fprintf(args->control_sample.summary_fh,"RG\t%s\t%d\t%d\t%c\t%.1f\t%d\t%d\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid), start_pos+1, args->sites[isite],start_cn_ctrl,qual,ctrl_ntot,ctrl_nhet);
+ fprintf(args->summary_fh,"RG\t%s\t%d\t%d\t%c\t%c\t%.1f\t%d\t%d\t%d\t%d\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid), start_pos+1, args->sites[isite],start_cn_query,start_cn_ctrl,qual,smpl_ntot,smpl_nhet,ctrl_ntot,ctrl_nhet);
+ }
+
+ istart_pos = isite;
+ start_pos = args->sites[isite];
+ start_cn = state;
+ qual = 0;
+ smpl_ntot = smpl_nhet = ctrl_ntot = ctrl_nhet = 0;
+ }
+ }
+ qual = phred_score(1 - qual/(isite - istart_pos));
+ char start_cn_query = copy_number_state(args,start_cn,0);
+ fprintf(args->query_sample.summary_fh,"RG\t%s\t%d\t%d\t%c\t%.1f\t%d\t%d\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid), start_pos+1, args->sites[isite-1]+1,start_cn_query,qual,smpl_ntot,smpl_nhet);
+ if ( args->control_sample.name )
+ {
+ char start_cn_ctrl = copy_number_state(args,start_cn,1);
+ fprintf(args->control_sample.summary_fh,"RG\t%s\t%d\t%d\t%c\t%.1f\t%d\t%d\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid), start_pos+1, args->sites[isite-1]+1,start_cn_ctrl,qual,ctrl_ntot,ctrl_nhet);
+ fprintf(args->summary_fh,"RG\t%s\t%d\t%d\t%c\t%c\t%.1f\t%d\t%d\t%d\t%d\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid), start_pos+1, args->sites[isite-1]+1,start_cn_query,start_cn_ctrl,qual,smpl_ntot,smpl_nhet,ctrl_ntot,ctrl_nhet);
+ }
+}
+
+static int parse_lrr_baf(sample_t *smpl, bcf_fmt_t *baf_fmt, bcf_fmt_t *lrr_fmt, float *baf, float *lrr)
+{
+ *baf = ((float*)(baf_fmt->p + baf_fmt->size*smpl->idx))[0];
+ if ( bcf_float_is_missing(*baf) || isnan(*baf) ) *baf = -0.1; // arbitrary negative value == missing value
+
+ if ( lrr_fmt )
+ {
+ *lrr = ((float*)(lrr_fmt->p + lrr_fmt->size*smpl->idx))[0];
+ if ( bcf_float_is_missing(*lrr) || isnan(*lrr) ) { *lrr = 0; *baf = -0.1; }
+ }
+ else
+ *lrr = 0;
+
+ return *baf<0 ? 0 : 1;
+}
+
+int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq);
+
+static void cnv_next_line(args_t *args, bcf1_t *line)
+{
+ if ( !line )
+ {
+ // Done, flush viterbi
+ cnv_flush_viterbi(args);
+ return;
+ }
+
+ if ( line->rid!=args->prev_rid )
+ {
+ // New chromosome
+ cnv_flush_viterbi(args);
+ args->prev_rid = line->rid;
+ args->nsites = 0;
+ args->nRR = args->nAA = args->nRA = 0;
+ }
+
+ // Process line
+ args->ntot++;
+
+ bcf_fmt_t *baf_fmt, *lrr_fmt = NULL;
+ if ( !(baf_fmt = bcf_get_fmt(args->hdr, line, "BAF")) ) return;
+ if ( args->lrr_bias>0 && !(lrr_fmt = bcf_get_fmt(args->hdr, line, "LRR")) ) return;
+
+ float baf1,lrr1,baf2,lrr2;
+ int ret = 0;
+ ret += parse_lrr_baf(&args->query_sample, baf_fmt,lrr_fmt,&baf1,&lrr1);
+ ret += parse_lrr_baf(&args->control_sample,baf_fmt,lrr_fmt,&baf2,&lrr2);
+ if ( !ret ) return;
+
+ // Realloc buffers needed to store observed data and used by viterbi and fwd-bwd
+ args->nsites++;
+ int m = args->msites;
+ hts_expand(uint32_t,args->nsites,args->msites,args->sites);
+ if ( args->msites!=m )
+ {
+ args->eprob = (double*) realloc(args->eprob,sizeof(double)*args->msites*args->nstates);
+ if ( args->control_sample.name )
+ {
+ args->control_sample.lrr = (float*) realloc(args->control_sample.lrr,sizeof(float)*args->msites);
+ args->control_sample.baf = (float*) realloc(args->control_sample.baf,sizeof(float)*args->msites);
+ }
+ args->query_sample.lrr = (float*) realloc(args->query_sample.lrr,sizeof(float)*args->msites);
+ args->query_sample.baf = (float*) realloc(args->query_sample.baf,sizeof(float)*args->msites);
+ if ( args->af_fname )
+ args->nonref_afs = (float*) realloc(args->nonref_afs,sizeof(float)*args->msites);
+ }
+ args->sites[args->nsites-1] = line->pos;
+ args->query_sample.lrr[args->nsites-1] = lrr1;
+ args->query_sample.baf[args->nsites-1] = baf1;
+ if ( args->af_fname )
+ {
+ double alt_freq;
+ args->nonref_afs[args->nsites-1] = read_AF(args->files->targets,line,&alt_freq)<0 ? args->nonref_af_dflt : alt_freq;
+ }
+ if ( args->control_sample.name )
+ {
+ args->control_sample.lrr[args->nsites-1] = lrr2;
+ args->control_sample.baf[args->nsites-1] = baf2;
+ if ( baf2>=0 ) // skip missing values
+ fprintf(args->control_sample.dat_fh,"%s\t%d\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), line->pos+1,baf2,lrr2);
+ }
+ if ( baf1>=0 ) // skip missing values
+ fprintf(args->query_sample.dat_fh,"%s\t%d\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), line->pos+1,baf1,lrr1);
+
+ if ( baf1>=0 )
+ {
+ if ( baf1<1/5. ) args->nRR++;
+ else if ( baf1>4/5. ) args->nAA++;
+ else args->nRA++;
+ }
+ args->nused++;
+}
+
+static void usage(args_t *args)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Copy number variation caller, requires Illumina's B-allele frequency (BAF) and Log R\n");
+ fprintf(stderr, " Ratio intensity (LRR). The HMM considers the following copy number states: CN 2\n");
+ fprintf(stderr, " (normal), 1 (single-copy loss), 0 (complete loss), 3 (single-copy gain)\n");
+ fprintf(stderr, "Usage: bcftools cnv [OPTIONS] <file.vcf>\n");
+ fprintf(stderr, "General Options:\n");
+ fprintf(stderr, " -c, --control-sample <string> optional control sample name to highlight differences\n");
+ fprintf(stderr, " -f, --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
+ fprintf(stderr, " -o, --output-dir <path> \n");
+ fprintf(stderr, " -p, --plot-threshold <float> plot aberrant chromosomes with quality at least 'float'\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " -s, --query-sample <string> query samply name\n");
+ fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, "HMM Options:\n");
+ fprintf(stderr, " -a, --aberrant <float[,float]> fraction of aberrant cells in query and control [1.0,1.0]\n");
+ fprintf(stderr, " -b, --BAF-weight <float> relative contribution from BAF [1]\n");
+ fprintf(stderr, " -d, --BAF-dev <float[,float]> expected BAF deviation in query and control [0.04,0.04]\n"); // experimental
+ fprintf(stderr, " -e, --err-prob <float> uniform error probability [1e-4]\n");
+ fprintf(stderr, " -k, --LRR-dev <float[,float]> expected LRR deviation [0.2,0.2]\n"); // experimental
+ fprintf(stderr, " -l, --LRR-weight <float> relative contribution from LRR [0.2]\n");
+ fprintf(stderr, " -L, --LRR-smooth-win <int> window of LRR moving average smoothing [10]\n");
+ fprintf(stderr, " -O, --optimize <float> estimate fraction of aberrant cells down to <float> [1.0]\n");
+ fprintf(stderr, " -P, --same-prob <float> prior probability of -s/-c being the same [0.5]\n");
+ fprintf(stderr, " -x, --xy-prob <float> P(x|y) transition probability [1e-9]\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+int main_vcfcnv(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->files = bcf_sr_init();
+ args->plot_th = 1e9; // by default plot none
+ args->nonref_af_dflt = 0.1;
+ args->lrr_smooth_win = 10;
+
+ args->query_sample.cell_frac_dflt = 1;
+ args->control_sample.cell_frac_dflt = 1;
+
+ // How much FORMAT/LRR and FORMAT/BAF matter
+ args->lrr_bias = 0.2;
+ args->baf_bias = 1.0;
+ args->err_prob = 1e-4;
+
+ // Transition probability to a different state and the prior of both samples being the same
+ args->ij_prob = 1e-9;
+ args->same_prob = 0.5;
+
+ // Squared std dev of BAF and LRR values (gaussian noise), estimated from real data (hets, one sample, one chr)
+ args->query_sample.baf_dev2_dflt = args->control_sample.baf_dev2_dflt = 0.04*0.04; // illumina: 0.03
+ args->query_sample.lrr_dev2 = args->control_sample.lrr_dev2 = 0.2*0.2; //0.20*0.20; // illumina: 0.18
+
+ int regions_is_file = 0, targets_is_file = 0;
+ static struct option loptions[] =
+ {
+ {"BAF-dev",1,0,'d'},
+ {"LRR-dev",1,0,'k'},
+ {"LRR-smooth-win",1,0,'L'},
+ {"AF-file",1,0,'f'},
+ {"baum-welch",1,0,'W'}, // hidden
+ {"optimize",1,0,'O'},
+ {"aberrant",1,0,'a'},
+ {"err-prob",1,0,'e'},
+ {"BAF-weight",1,0,'b'},
+ {"LRR-weight",1,0,'l'},
+ {"same-prob",1,0,'P'},
+ {"xy-prob",1,0,'x'},
+ {"sample",1,0,'s'},
+ {"control",1,0,'c'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"plot",1,0,'p'},
+ {"output-dir",1,0,'o'},
+ {0,0,0,0}
+ };
+ char *tmp = NULL;
+ while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W:f:a:L:d:k:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'L':
+ args->lrr_smooth_win = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse: --LRR-smooth-win %s\n", optarg);
+ break;
+ case 'f': args->af_fname = optarg; break;
+ case 'O':
+ args->optimize_frac = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -O %s\n", optarg);
+ break;
+ case 'd':
+ args->query_sample.baf_dev2_dflt = strtod(optarg,&tmp);
+ if ( *tmp )
+ {
+ if ( *tmp!=',') error("Could not parse: -d %s\n", optarg);
+ args->control_sample.baf_dev2_dflt = strtod(tmp+1,&tmp);
+ if ( *tmp ) error("Could not parse: -d %s\n", optarg);
+ }
+ else
+ args->control_sample.baf_dev2_dflt = args->query_sample.baf_dev2_dflt;
+ args->query_sample.baf_dev2_dflt *= args->query_sample.baf_dev2_dflt;
+ args->control_sample.baf_dev2_dflt *= args->control_sample.baf_dev2_dflt;
+ break;
+ case 'k':
+ args->query_sample.lrr_dev2 = strtod(optarg,&tmp);
+ if ( *tmp )
+ {
+ if ( *tmp!=',') error("Could not parse: -k %s\n", optarg);
+ args->control_sample.lrr_dev2 = strtod(tmp+1,&tmp);
+ if ( *tmp ) error("Could not parse: -d %s\n", optarg);
+ }
+ else
+ args->control_sample.lrr_dev2 = args->query_sample.lrr_dev2;
+ args->query_sample.lrr_dev2 *= args->query_sample.lrr_dev2;
+ args->control_sample.lrr_dev2 *= args->control_sample.lrr_dev2;
+ break;
+ case 'a':
+ args->query_sample.cell_frac_dflt = strtod(optarg,&tmp);
+ if ( *tmp )
+ {
+ if ( *tmp!=',') error("Could not parse: -a %s\n", optarg);
+ args->control_sample.cell_frac_dflt = strtod(tmp+1,&tmp);
+ if ( *tmp ) error("Could not parse: -a %s\n", optarg);
+ }
+ break;
+ case 'W':
+ args->baum_welch_th = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -W %s\n", optarg);
+ break;
+ case 'e':
+ args->err_prob = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -e %s\n", optarg);
+ break;
+ case 'b':
+ args->baf_bias = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -b %s\n", optarg);
+ break;
+ case 'x':
+ args->ij_prob = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -x %s\n", optarg);
+ break;
+ case 'P':
+ args->same_prob = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -P %s\n", optarg);
+ break;
+ case 'l':
+ args->lrr_bias = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -l %s\n", optarg);
+ break;
+ case 'p':
+ args->plot_th = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -p %s\n", optarg);
+ break;
+ case 'o': args->output_dir = optarg; break;
+ case 's': args->query_sample.name = strdup(optarg); break;
+ case 'c': args->control_sample.name = optarg; break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 'h':
+ case '?': usage(args); break;
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";
+ }
+ else fname = argv[optind];
+ if ( !fname ) usage(args);
+
+ if ( args->plot_th<=100 && !args->output_dir ) error("Expected -o option with -p\n");
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( args->af_fname )
+ {
+ if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 )
+ error("Failed to read the targets: %s\n", args->af_fname);
+ }
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+
+ init_data(args);
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ cnv_next_line(args, line);
+ }
+ cnv_next_line(args, NULL);
+ create_plots(args);
+ fprintf(stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused);
+ destroy_data(args);
+ free(args);
+ return 0;
+}
+
+
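A note on the emission model in the vcfcnv.c hunk above: set_gauss_params() places BAF peaks at genotype-specific means, and norm_prob()/norm_cdf() evaluate a Gaussian truncated to [0,1] around each peak. For CN3 the heterozygous bands sit at 1/(2+f) and (1+f)/(2+f), where f is the fraction of aberrant cells, so f=1 gives the familiar 1/3 and 2/3 bands. The sketch below, assuming only the C standard math library (trunc_norm and peak_density are illustrative names, not bcftools functions), reproduces those two pieces:

#include <stdio.h>
#include <math.h>

/* Probability mass of N(mean, dev^2) restricted to [0,1]; mirrors norm_cdf(). */
static double trunc_norm(double mean, double dev)
{
    double top = 1 - 0.5*erfc((1.0 - mean)/(dev*sqrt(2)));
    double bot = 1 - 0.5*erfc((0.0 - mean)/(dev*sqrt(2)));
    return top - bot;
}

/* Gaussian density at baf, renormalized for the [0,1] truncation; mirrors norm_prob(). */
static double peak_density(double baf, double mean, double dev2, double norm)
{
    return exp(-(baf-mean)*(baf-mean)*0.5/dev2) / (norm * sqrt(2*M_PI*dev2));
}

int main(void)
{
    double dev = 0.04, dev2 = dev*dev;   /* default BAF deviation, see -d */
    double f;
    printf("# cell_frac  RRA_band  RAA_band  peak_height\n");
    for (f = 0.2; f <= 1.001; f += 0.2)
    {
        double rra = 1.0/(2.0 + f);          /* CN3 RRA peak mean */
        double raa = (1.0 + f)/(2.0 + f);    /* CN3 RAA peak mean */
        double norm = trunc_norm(rra, dev);
        printf("%10.1f  %8.3f  %8.3f  %11.3f\n",
               f, rra, raa, peak_density(rra, rra, dev2, norm));
    }
    return 0;
}

This also shows why update_sample_args() bounds the estimate with max_mean_cn3 = 0.5 - 1.644854*dev: the lower CN3 band must sit at least qnorm(0.95) standard deviations below 0.5 before it can be told apart from a noisy CN2 band centred on 0.5.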
diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c
new file mode 100644
index 0000000..d8a1ca5
--- /dev/null
+++ b/bcftools/vcfcnv.c.pysam.c
@@ -0,0 +1,1388 @@
+#include "pysam.h"
+
+/* The MIT License
+
+ Copyright (c) 2014-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+/*
+ Known issues:
+ - The --AF-file option behaves like --targets-file, sites not listed in the AFs
+ are skipped.
+*/
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/kstring.h>
+#include <htslib/kfunc.h>
+#include <htslib/khash_str2int.h>
+#include "bcftools.h"
+#include "HMM.h"
+#include "rbuf.h"
+
+#define DBG0 0
+
+#define N_STATES 4
+#define CN0 0
+#define CN1 1
+#define CN2 2
+#define CN3 3
+
+typedef struct
+{
+ float mean, dev2, norm;
+}
+gauss_param_t;
+
+typedef struct
+{
+ char *name;
+ int idx; // VCF sample index
+ float *lrr,*baf, baf_dev2, baf_dev2_dflt, lrr_dev2;
+ float cell_frac, cell_frac_dflt;
+ gauss_param_t gauss_param[18];
+ double pobs[N_STATES];
+ FILE *dat_fh, *cn_fh, *summary_fh;
+ char *dat_fname, *cn_fname, *summary_fname;
+}
+sample_t;
+
+typedef struct _args_t
+{
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr;
+ int prev_rid, ntot, nused;
+ sample_t query_sample, control_sample;
+
+ int nstates; // number of states: N_STATES for one sample, N_STATES^2 for two samples
+ double lrr_bias, baf_bias; // LRR/BAF weights
+ double same_prob, ij_prob; // prior of both samples being the same and the transition probability P(i|j)
+ double err_prob; // constant probability of erroneous measurement
+ float *nonref_afs, nonref_af, nonref_af_dflt, fRR, fRA, fAA, *tmpf;
+ unsigned long int nRR, nRA, nAA;
+ int mtmpf;
+
+ double *tprob, *tprob_arr; // array of transition matrices, precalculated up to ntprob_arr positions
+ double *iprobs; // states' initial probabilities
+ int ntprob_arr;
+
+ hmm_t *hmm;
+ double *eprob; // emission probs [nstates*nsites,msites]
+ uint32_t *sites; // positions [nsites,msites]
+ int nsites, msites;
+
+ double baum_welch_th, optimize_frac;
+ float plot_th;
+ FILE *summary_fh;
+ char **argv, *regions_list, *summary_fname, *output_dir;
+ char *targets_list, *af_fname;
+ int argc, verbose, lrr_smooth_win;
+}
+args_t;
+
+FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
+
+static inline void hmm2cn_state(int nstates, int i, int *a, int *b)
+{
+ *a = i / N_STATES;
+ *b = i - (*a)*N_STATES;
+}
+static double *init_tprob_matrix(int ndim, double ij_prob, double same_prob)
+{
+ int i,j;
+ double *mat = (double*) malloc(sizeof(double)*ndim*ndim);
+
+ assert( ndim==N_STATES || ndim==N_STATES*N_STATES);
+
+ if ( ndim==N_STATES ) // one sample
+ {
+ double pii = 1 - ij_prob*(N_STATES-1);
+ if ( pii < ij_prob ) error("Error: -x set a bit too high, P(x|x) < P(x|y): %e vs %e\n", pii,ij_prob);
+ for (j=0; j<ndim; j++)
+ {
+ double sum = 0;
+ for (i=0; i<ndim; i++)
+ {
+ // transition from j-th to i-th state
+ if ( i==j )
+ MAT(mat,ndim,i,j) = pii;
+ else
+ MAT(mat,ndim,i,j) = ij_prob;
+
+ sum += MAT(mat,ndim,i,j);
+ }
+ assert( fabs(sum - 1.0)<1e-15 );
+ }
+ }
+ else // two samples
+ {
+ // interpret ij_prob differently, as ii_prob in fact, so that for two
+ // samples the behaviour is somewhat closer to single sample calling
+ // with s=0.
+ double pii = 1 - ij_prob*(N_STATES-1);
+ ij_prob = (1 - pii) / (ndim - 1);
+ for (j=0; j<ndim; j++)
+ {
+ int ja,jb;
+ hmm2cn_state(ndim, j, &ja, &jb);
+
+ double sum = 0;
+ for (i=0; i<ndim; i++)
+ {
+ int ia,ib;
+ hmm2cn_state(ndim, i, &ia, &ib);
+
+ // transition from (ja,jb)-th to (ia,ib)-th state
+ double pa = ja==ia ? pii : ij_prob;
+ double pb = jb==ib ? pii : ij_prob;
+
+ if ( ia==ib && ja==jb )
+ MAT(mat,ndim,i,j) = pa*pb - pa*pb*same_prob + sqrt(pa*pb)*same_prob;
+ else if ( ia==ib )
+ MAT(mat,ndim,i,j) = pa*pb;
+ else
+ MAT(mat,ndim,i,j) = pa*pb*(1-same_prob);
+
+ sum += MAT(mat,ndim,i,j);
+ }
+ for (i=0; i<ndim; i++) MAT(mat,ndim,i,j) /= sum;
+ }
+ }
+ return mat;
+}
+
+static double *init_iprobs(int ndim, double same_prob)
+{
+ int i;
+ double *probs = (double*) malloc(sizeof(double)*ndim);
+
+ assert( ndim==N_STATES || ndim==N_STATES*N_STATES);
+
+ if ( ndim==N_STATES )
+ {
+ // one sample: prior on CN2
+ for (i=0; i<ndim; i++)
+ probs[i] = i==CN2 ? 0.5 : 0.5/3;
+ }
+ else
+ {
+ // two samples
+ double norm = 0;
+ for (i=0; i<ndim; i++)
+ {
+ int ia,ib;
+ hmm2cn_state(ndim, i, &ia, &ib);
+
+ double pa = ia==CN2 ? 0.5 : 0.5/3;
+ double pb = ib==CN2 ? 0.5 : 0.5/3;
+
+ probs[i] = pa*pb;
+ if ( ia!=ib ) probs[i] *= 1-same_prob;
+
+ norm += probs[i];
+ }
+ for (i=0; i<ndim; i++) probs[i] /= norm;
+ }
+ return probs;
+}
+
+static void init_sample_files(sample_t *smpl, char *dir)
+{
+ smpl->dat_fh = open_file(&smpl->dat_fname,"w","%s/dat.%s.tab",dir,smpl->name);
+ smpl->cn_fh = open_file(&smpl->cn_fname,"w","%s/cn.%s.tab",dir,smpl->name);
+ smpl->summary_fh = open_file(&smpl->summary_fname,"w","%s/summary.%s.tab",dir,smpl->name);
+ fprintf(smpl->dat_fh,"# [1]Chromosome\t[2]Position\t[3]BAF\t[4]LRR\n");
+ fprintf(smpl->cn_fh,"# [1]Chromosome\t[2]Position\t[3]CN\t[4]P(CN0)\t[5]P(CN1)\t[6]P(CN2)\t[7]P(CN3)\n");
+ fprintf(smpl->summary_fh,"# RG, Regions [2]Chromosome\t[3]Start\t[4]End\t[5]Copy Number state\t[6]Quality\t[7]nSites\t[8]nHETs\n");
+}
+static void close_sample_files(sample_t *smpl)
+{
+ fclose(smpl->dat_fh);
+ fclose(smpl->cn_fh);
+ fclose(smpl->summary_fh);
+}
+
+static double norm_cdf(double mean, double dev);
+static void init_data(args_t *args)
+{
+ args->prev_rid = -1;
+ args->hdr = args->files->readers[0].header;
+
+ if ( !args->query_sample.name )
+ {
+ if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Multi-sample VCF, missing the -s option\n");
+ args->query_sample.name = strdup(args->hdr->samples[0]);
+ }
+ else
+ if ( bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->query_sample.name)<0 ) error("The sample \"%s\" not found\n", args->query_sample.name);
+ if ( !args->files->readers[0].file->is_bin )
+ {
+ int ret;
+ kstring_t tmp = {0,0,0};
+ if ( args->control_sample.name )
+ {
+ ksprintf(&tmp, "%s,%s", args->query_sample.name,args->control_sample.name);
+ ret = bcf_hdr_set_samples(args->hdr, tmp.s, 0);
+ }
+ else
+ {
+ ret = bcf_hdr_set_samples(args->hdr, args->query_sample.name, 0);
+ tmp.s = args->query_sample.name;
+ }
+ if ( ret<0 ) error("Error parsing the list of samples: %s\n", tmp.s);
+ else if ( ret>0 ) error("The sample not found in the VCF: %s\n", ret==1 ? args->query_sample.name : args->control_sample.name);
+
+ if ( args->control_sample.name ) free(tmp.s);
+ }
+ args->query_sample.idx = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->query_sample.name);
+ args->control_sample.idx = args->control_sample.name ? bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->control_sample.name) : -1;
+ args->nstates = args->control_sample.name ? N_STATES*N_STATES : N_STATES;
+ args->tprob = init_tprob_matrix(args->nstates, args->ij_prob, args->same_prob);
+ args->iprobs = init_iprobs(args->nstates, args->same_prob);
+ args->hmm = hmm_init(args->nstates, args->tprob, 10000);
+ hmm_init_states(args->hmm, args->iprobs);
+
+ args->summary_fh = stdout;
+ if ( args->output_dir )
+ {
+ init_sample_files(&args->query_sample, args->output_dir);
+ if ( args->control_sample.name )
+ {
+ init_sample_files(&args->control_sample, args->output_dir);
+ args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir);
+ }
+ else
+ args->summary_fh = NULL; // one sample only, no two-file summary
+ }
+
+ int i;
+ FILE *fh = args->summary_fh ? args->summary_fh : args->query_sample.summary_fh;
+
+ fprintf(fh, "# This file was produced by: bcftools cnv(%s+htslib-%s)\n", bcftools_version(),hts_version());
+ fprintf(fh, "# The command line was:\tbcftools %s", args->argv[0]);
+ for (i=1; i<args->argc; i++) fprintf(fh, " %s",args->argv[i]);
+ if ( args->control_sample.name )
+ fprintf(fh, "\n#\n"
+ "# RG, Regions\t[2]Chromosome\t[3]Start\t[4]End\t[5]Copy number:%s\t[6]Copy number:%s\t[7]Quality"
+ "\t[8]nSites in (5)\t[9]nHETs in (5)\t[10]nSites in (6)\t[11]nHETs in(6)\n",
+ args->query_sample.name,args->control_sample.name
+ );
+ else
+ fprintf(fh, "\n#\n"
+ "# RG, Regions\t[2]Chromosome\t[3]Start\t[4]End\t[5]Copy number:%s\t[6]Quality\t[7]nSites\t[8]nHETs\n",
+ args->query_sample.name
+ );
+}
+
+char *msprintf(const char *fmt, ...);
+static void py_plot_cnv(char *script, float th)
+{
+ if ( th>100 ) return; // create no plots
+
+ char *cmd = msprintf("python %s -p %f", script, th);
+ int ret = system(cmd);
+ if ( ret) fprintf(pysamerr, "The command returned non-zero status %d: %s\n", ret, cmd);
+ free(cmd);
+}
+
+static void plot_sample(args_t *args, sample_t *smpl)
+{
+ char *fname;
+ FILE *fp = open_file(&fname,"w","%s/plot.%s.py",args->output_dir,smpl->name);
+ fprintf(fp,
+ "import matplotlib as mpl\n"
+ "mpl.use('Agg')\n"
+ "import matplotlib.pyplot as plt\n"
+ "import csv\n"
+ "import numpy as np\n"
+ "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
+ "\n"
+ "dat = {}\n"
+ "with open('%s', 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " for row in reader:\n"
+ " chr = row[0]\n"
+ " if chr[0]=='#': continue\n"
+ " if chr not in dat: dat[chr] = []\n"
+ " dat[chr].append([row[1], float(row[2]), float(row[3])])\n"
+ "\n"
+ "cnv = {}\n"
+ "with open('%s', 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " for row in reader:\n"
+ " chr = row[0]\n"
+ " if chr[0]=='#': continue\n"
+ " if chr not in cnv: cnv[chr] = []\n"
+ " row[2] = int(row[2]) + 0.5\n"
+ " cnv[chr].append(row[1:])\n"
+ "\n"
+ "for chr in dat:\n"
+ " fig,(ax1, ax2, ax3) = plt.subplots(3,1,figsize=(10,8),sharex=True)\n"
+ " ax1.plot([x[0] for x in dat[chr]],[x[2] for x in dat[chr]], '.', ms=3)\n"
+ " ax2.plot([x[0] for x in dat[chr]],[x[1] for x in dat[chr]], '.', ms=3)\n"
+ " cn_dat = cnv[chr]\n"
+ " xgrid = [float(x[0]) for x in cn_dat]\n"
+ " ygrid = np.linspace(0,5,6)\n"
+ " xgrid, ygrid = np.meshgrid(xgrid, ygrid)\n"
+ " heat = np.zeros_like(xgrid)\n"
+ " for x in range(len(heat[0])-1):\n"
+ " heat[0][x] = cn_dat[x][2]\n"
+ " heat[1][x] = cn_dat[x][3]\n"
+ " heat[2][x] = cn_dat[x][4]\n"
+ " heat[3][x] = cn_dat[x][5]\n"
+ " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr_r')\n"
+ " mesh.set_clim(vmin=-1,vmax=1)\n"
+ " ax3.plot([x[0] for x in cn_dat],[x[1] for x in cn_dat],'.-',ms=3,color='black')\n"
+ " fig.suptitle('%s (chr '+chr+')')\n"
+ " ax1.set_title('Log-R intensities Ratio',fontsize=10)\n"
+ " ax2.set_title('B-Allele Frequency',fontsize=10)\n"
+ " ax3.set_title('Copy Number Variation',fontsize=10)\n"
+ " ax1.set_ylabel('LRR')\n"
+ " ax2.set_ylabel('BAF')\n"
+ " ax3.set_ylabel('CN')\n"
+ " ax3.set_xlabel('Coordinate (chrom '+chr+')',fontsize=10)\n"
+ " ax3.set_ylim(-0.1,4.1)\n"
+ " ax3.set_yticks([0.5,1.5,2.5,3.5])\n"
+ " ax3.set_yticklabels(['CN0','CN1','CN2','CN3'])\n"
+ " plt.subplots_adjust(left=0.08,right=0.95,bottom=0.08,top=0.92)\n"
+ " plt.savefig('%s/plot.%s.chr'+chr+'.png')\n"
+ " plt.close()\n"
+ "\n",
+ smpl->dat_fname,smpl->cn_fname,smpl->name,args->output_dir,smpl->name
+ );
+ fclose(fp);
+
+ py_plot_cnv(fname, args->plot_th);
+ free(fname);
+}
+
+static void create_plots(args_t *args)
+{
+ close_sample_files(&args->query_sample);
+ if ( args->control_sample.name ) close_sample_files(&args->control_sample);
+ if ( args->summary_fh ) fclose(args->summary_fh);
+
+ if ( !args->control_sample.name )
+ {
+ plot_sample(args, &args->query_sample);
+ return;
+ }
+
+ char *fname;
+ FILE *fp = open_file(&fname,"w","%s/plot.%s.%s.py",args->output_dir,args->control_sample.name,args->query_sample.name);
+ fprintf(fp,
+ "import matplotlib as mpl\n"
+ "mpl.use('Agg')\n"
+ "import matplotlib.pyplot as plt\n"
+ "import csv,argparse\n"
+ "import numpy as np\n"
+ "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
+ "\n"
+ "control_sample = '%s'\n"
+ "query_sample = '%s'\n"
+ "\n"
+ "parser = argparse.ArgumentParser()\n"
+ "parser.add_argument('-p', '--plot-threshold', type=float)\n"
+ "parser.add_argument('-c', '--chromosome')\n"
+ "args = parser.parse_args()\n"
+ "if args.plot_threshold==None: args.plot_threshold = 0\n"
+ "\n"
+ "def chroms_to_plot(th):\n"
+ " dat = {}\n"
+ " with open('%s/summary.tab', 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " for row in reader:\n"
+ " if row[0]!='RG': continue\n"
+ " chr = row[1]\n"
+ " start = row[2]\n"
+ " end = row[3]\n"
+ " qual = float(row[6])\n"
+ " if row[4]==row[5] and args.plot_threshold!=0: continue\n"
+ " if chr not in dat: dat[chr] = 0.0\n"
+ " if qual > dat[chr]: dat[chr] = qual\n"
+ " out = {}\n"
+ " for chr in dat:\n"
+ " if (chr not in dat) or dat[chr]<th: continue\n"
+ " out[chr] = 1\n"
+ " return out\n"
+ "if args.chromosome!=None:\n"
+ " plot_chroms = { args.chromosome:1 }\n"
+ "else:\n"
+ " plot_chroms = chroms_to_plot(args.plot_threshold)\n"
+ "\n"
+ "def read_dat(file,dat,plot_chr):\n"
+ " with open(file, 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " for row in reader:\n"
+ " chr = row[0]\n"
+ " if chr != plot_chr: continue\n"
+ " dat.append([row[1], float(row[2]), float(row[3])])\n"
+ "def read_cnv(file,cnv,plot_chr):\n"
+ " with open(file, 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " for row in reader:\n"
+ " chr = row[0]\n"
+ " if chr != plot_chr: continue\n"
+ " row[2] = int(row[2]) + 0.5\n"
+ " cnv.append(row[1:])\n"
+ "def find_diffs(a,b):\n"
+ " out = []\n"
+ " diff = []\n"
+ " for i in range(len(a)):\n"
+ " if a[i][1]!=b[i][1]:\n"
+ " if i>0: diff.append([b[i-1][0],b[i-1][1],a[i-1][1]])\n"
+ " diff.append([b[i][0],b[i][1],a[i][1]])\n"
+ " elif len(diff):\n"
+ " diff.append([b[i][0],b[i][1],a[i][1]])\n"
+ " out.append(diff)\n"
+ " diff = []\n"
+ " if len(diff): out.append(diff)\n"
+ " return out\n"
+ "\n"
+ "for chr in sorted(plot_chroms.keys()):\n"
+ " control_dat = []\n"
+ " control_cnv = []\n"
+ " query_dat = []\n"
+ " query_cnv = []\n"
+ " read_dat('%s',control_dat,chr)\n"
+ " read_dat('%s',query_dat,chr)\n"
+ " read_cnv('%s',control_cnv,chr)\n"
+ " read_cnv('%s',query_cnv,chr)\n"
+ "\n"
+ " fig,(ax1,ax2,ax3,ax4,ax5,ax6) = plt.subplots(6,1,figsize=(10,8),sharex=True)\n"
+ " ax1.plot([x[0] for x in control_dat],[x[2] for x in control_dat], '.', ms=3,color='red')\n"
+ " ax2.plot([x[0] for x in control_dat],[x[1] for x in control_dat], '.', ms=3,color='red')\n"
+ " cn_dat = control_cnv\n"
+ " xgrid = [float(x[0]) for x in cn_dat]\n"
+ " ygrid = np.linspace(0,5,6)\n"
+ " xgrid, ygrid = np.meshgrid(xgrid, ygrid)\n"
+ " heat = np.zeros_like(xgrid)\n"
+ " for x in range(len(heat[0])-1):\n"
+ " heat[0][x] = cn_dat[x][2]\n"
+ " heat[1][x] = cn_dat[x][3]\n"
+ " heat[2][x] = cn_dat[x][4]\n"
+ " heat[3][x] = cn_dat[x][5]\n"
+ " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr')\n"
+ " mesh.set_clim(vmin=-1,vmax=1)\n"
+ " ax3.plot([x[0] for x in cn_dat],[x[1] for x in cn_dat],'-',ms=3,color='black',lw=1.7)\n"
+ "\n"
+ " ax6.plot([x[0] for x in query_dat],[x[2] for x in query_dat], '.', ms=3)\n"
+ " ax5.plot([x[0] for x in query_dat],[x[1] for x in query_dat], '.', ms=3)\n"
+ " cn_dat = query_cnv\n"
+ " xgrid = [float(x[0]) for x in cn_dat]\n"
+ " ygrid = np.linspace(0,5,6)\n"
+ " xgrid, ygrid = np.meshgrid(xgrid, ygrid)\n"
+ " heat = np.zeros_like(xgrid)\n"
+ " for x in range(len(heat[0])-1):\n"
+ " heat[0][x] = cn_dat[x][2]\n"
+ " heat[1][x] = cn_dat[x][3]\n"
+ " heat[2][x] = cn_dat[x][4]\n"
+ " heat[3][x] = cn_dat[x][5]\n"
+ " mesh = ax4.pcolormesh(xgrid, ygrid, heat, cmap='bwr_r')\n"
+ " mesh.set_clim(vmin=-1,vmax=1)\n"
+ " ax4.plot([x[0] for x in cn_dat],[x[1] for x in cn_dat],'-',ms=3,color='black',lw=1.7)\n"
+ " ax3.annotate(control_sample, xy=(0.02,0.1), xycoords='axes fraction', color='red',fontsize=12, va='bottom',ha='left')\n"
+ " ax4.annotate(query_sample, xy=(0.02,0.9), xycoords='axes fraction', color='blue',fontsize=12, va='top',ha='left')\n"
+ "\n"
+ " diffs = find_diffs(control_cnv,query_cnv)\n"
+ " for diff in diffs:\n"
+ " ax3.plot([x[0] for x in diff],[x[1] for x in diff],'-',ms=3,color='blue',lw=1.7)\n"
+ " ax4.plot([x[0] for x in diff],[x[2] for x in diff],'-',ms=3,color='red',lw=1.7)\n"
+ "\n"
+ " fig.suptitle('chr '+chr+', '+control_sample+' vs '+query_sample)\n"
+ " ax1.tick_params(axis='both', labelsize=8)\n"
+ " ax2.tick_params(axis='both', labelsize=8)\n"
+ " ax3.tick_params(axis='both', labelsize=8)\n"
+ " ax4.tick_params(axis='both', labelsize=8)\n"
+ " ax5.tick_params(axis='both', labelsize=8)\n"
+ " ax6.tick_params(axis='both', labelsize=8)\n"
+ " ax6.set_xlabel('Coordinate (chrom '+chr+')',fontsize=8)\n"
+ " ax1.set_ylabel('LRR')\n"
+ " ax2.set_ylabel('BAF')\n"
+ " ax3.set_ylabel('CN')\n"
+ " ax6.set_ylabel('LRR')\n"
+ " ax5.set_ylabel('BAF')\n"
+ " ax4.set_ylabel('CN')\n"
+ " ax3.set_ylim(-0.1,4.1)\n"
+ " ax3.set_yticks([0.5,1.5,2.5,3.5])\n"
+ " ax3.set_yticklabels(['CN0','CN1','CN2','CN3'])\n"
+ " ax4.set_ylim(-0.1,4.1)\n"
+ " ax4.set_yticks([0.5,1.5,2.5,3.5])\n"
+ " ax4.set_yticklabels(['CN0','CN1','CN2','CN3'])\n"
+ " plt.subplots_adjust(left=0.08,right=0.95,bottom=0.08,top=0.92,hspace=0)\n"
+ " plt.savefig('%s/plot.%s.%s.chr'+chr+'.png')\n"
+ " plt.close()\n"
+ "\n",
+ args->control_sample.name,args->query_sample.name,
+ args->output_dir,
+ args->control_sample.dat_fname,args->query_sample.dat_fname,
+ args->control_sample.cn_fname,args->query_sample.cn_fname,
+ args->output_dir,args->control_sample.name,args->query_sample.name
+ );
+ fclose(fp);
+
+ py_plot_cnv(fname,args->plot_th);
+ free(fname);
+}
+
+static void destroy_data(args_t *args)
+{
+ bcf_sr_destroy(args->files);
+ hmm_destroy(args->hmm);
+ free(args->tmpf);
+ free(args->sites);
+ free(args->eprob);
+ free(args->tprob);
+ free(args->summary_fname);
+ free(args->nonref_afs);
+ free(args->query_sample.baf);
+ free(args->query_sample.lrr);
+ free(args->control_sample.baf);
+ free(args->control_sample.lrr);
+ free(args->query_sample.name);
+ free(args->query_sample.dat_fname);
+ free(args->query_sample.cn_fname);
+ free(args->query_sample.summary_fname);
+ free(args->control_sample.dat_fname);
+ free(args->control_sample.cn_fname);
+ free(args->control_sample.summary_fname);
+}
+
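+// Map an HMM state index to the copy-number character of one sample. With a
+// control sample the joint state is encoded as query_CN*N_STATES + control_CN,
+// so ismpl=0 recovers the query CN and ismpl=1 the control CN.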
+static inline char copy_number_state(args_t *args, int istate, int ismpl)
+{
+ char code[] = "01234";
+ if ( !args->control_sample.name ) return code[istate];
+ int idx = ismpl ? istate - (istate/N_STATES)*N_STATES : istate/N_STATES;
+ return code[idx];
+}
+
+static double avg_ii_prob(int n, double *mat)
+{
+ int i;
+ double avg = 0;
+ for (i=0; i<n; i++) avg += MAT(mat,n,i,i);
+ return avg/n;
+}
+
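+// Named accessors for smpl->gauss_param[]: one Gaussian BAF peak per genotype
+// at each copy number (CN1: R,A; CN2: RR,RA,AA; CN3: RRR,RRA,RAA,AAA).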
+#define GAUSS_CN1_PK_R(smpl) (&((smpl)->gauss_param[0]))
+#define GAUSS_CN1_PK_A(smpl) (&((smpl)->gauss_param[1]))
+#define GAUSS_CN2_PK_RR(smpl) (&((smpl)->gauss_param[2]))
+#define GAUSS_CN2_PK_RA(smpl) (&((smpl)->gauss_param[3]))
+#define GAUSS_CN2_PK_AA(smpl) (&((smpl)->gauss_param[4]))
+#define GAUSS_CN3_PK_RRR(smpl) (&((smpl)->gauss_param[5]))
+#define GAUSS_CN3_PK_RRA(smpl) (&((smpl)->gauss_param[6]))
+#define GAUSS_CN3_PK_RAA(smpl) (&((smpl)->gauss_param[7]))
+#define GAUSS_CN3_PK_AAA(smpl) (&((smpl)->gauss_param[8]))
+
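+// Density of the Gaussian BAF peak at `baf`, renormalised by `param->norm`, the
+// probability mass of the peak inside the [0,1] BAF interval (see norm_cdf).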
+static inline double norm_prob(double baf, gauss_param_t *param)
+{
+ return exp(-(baf-param->mean)*(baf-param->mean)*0.5/param->dev2) / param->norm / sqrt(2*M_PI*param->dev2);
+}
+
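+// Per-site observation likelihoods P(BAF,LRR|CN) for one sample: each CN state
+// mixes its genotype BAF peaks weighted by the genotype frequencies fRR/fRA/fAA,
+// blends in an LRR term according to the weights baf_bias and lrr_bias, and adds
+// the uniform error probability err_prob. A missing BAF puts half the mass on CN0.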
+static int set_observed_prob(args_t *args, sample_t *smpl, int isite)
+{
+ float baf = smpl->baf[isite];
+ float lrr = args->lrr_bias>0 ? smpl->lrr[isite] : 0;
+
+ float fRR = args->fRR;
+ float fRA = args->fRA;
+ float fAA = args->fAA;
+
+ if ( baf<0 )
+ {
+ // no call: either some technical issue or the call could not be made because it is CN0
+ int i;
+ smpl->pobs[CN0] = 0.5;
+ for (i=1; i<N_STATES; i++) smpl->pobs[i] = (1.0-smpl->pobs[CN0])/(N_STATES-1);
+ return 0;
+ }
+
+ double cn1_baf =
+ norm_prob(baf,GAUSS_CN1_PK_R(smpl)) * (fRR + fRA*0.5) +
+ norm_prob(baf,GAUSS_CN1_PK_A(smpl)) * (fAA + fRA*0.5) ;
+ double cn2_baf =
+ norm_prob(baf,GAUSS_CN2_PK_RR(smpl)) * fRR +
+ norm_prob(baf,GAUSS_CN2_PK_RA(smpl)) * fRA +
+ norm_prob(baf,GAUSS_CN2_PK_AA(smpl)) * fAA;
+ double cn3_baf =
+ norm_prob(baf,GAUSS_CN3_PK_RRR(smpl)) * fRR +
+ norm_prob(baf,GAUSS_CN3_PK_RRA(smpl)) * fRA*0.5 +
+ norm_prob(baf,GAUSS_CN3_PK_RAA(smpl)) * fRA*0.5 +
+ norm_prob(baf,GAUSS_CN3_PK_AAA(smpl)) * fAA;
+
+ double norm = cn1_baf + cn2_baf + cn3_baf;
+ cn1_baf /= norm;
+ cn2_baf /= norm;
+ cn3_baf /= norm;
+
+ #if DBG0
+ if ( args->verbose ) fprintf(pysamerr,"%f\t%f %f %f\n", baf,cn1_baf,cn2_baf,cn3_baf);
+ #endif
+
+ double cn1_lrr = exp(-(lrr + 0.45)*(lrr + 0.45)/smpl->lrr_dev2);
+ double cn2_lrr = exp(-(lrr - 0.00)*(lrr - 0.00)/smpl->lrr_dev2);
+ double cn3_lrr = exp(-(lrr - 0.30)*(lrr - 0.30)/smpl->lrr_dev2);
+
+ smpl->pobs[CN0] = 0;
+ smpl->pobs[CN1] = args->err_prob + (1 - args->baf_bias + args->baf_bias*cn1_baf)*(1 - args->lrr_bias + args->lrr_bias*cn1_lrr);
+ smpl->pobs[CN2] = args->err_prob + (1 - args->baf_bias + args->baf_bias*cn2_baf)*(1 - args->lrr_bias + args->lrr_bias*cn2_lrr);
+ smpl->pobs[CN3] = args->err_prob + (1 - args->baf_bias + args->baf_bias*cn3_baf)*(1 - args->lrr_bias + args->lrr_bias*cn3_lrr);
+
+ return 0;
+}
+
+static void set_emission_prob(args_t *args, int isite)
+{
+ double *eprob = &args->eprob[args->nstates*isite];
+ int i;
+ for (i=0; i<N_STATES; i++)
+ eprob[i] = args->query_sample.pobs[i];
+}
+
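+// Two-sample emission probabilities: the joint state (i,j) gets the product of
+// the per-sample observation likelihoods, i.e. the samples are treated as
+// conditionally independent given their copy numbers.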
+static void set_emission_prob2(args_t *args, int isite)
+{
+ double *eprob = &args->eprob[args->nstates*isite];
+ int i, j;
+ for (i=0; i<N_STATES; i++)
+ {
+ for (j=0; j<N_STATES; j++)
+ {
+ eprob[i*N_STATES+j] = args->query_sample.pobs[i]*args->control_sample.pobs[j];
+ }
+ }
+}
+
+static void set_gauss_params(args_t *args, sample_t *smpl);
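+// Probability mass of a N(mean,dev) distribution restricted to the [0,1] BAF
+// interval; used to renormalise the truncated Gaussian peaks in norm_prob().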
+static double norm_cdf(double mean, double dev)
+{
+ double bot = 0, top = 1;
+ top = 1 - 0.5*erfc((top-mean)/(dev*sqrt(2)));
+ bot = 1 - 0.5*erfc((bot-mean)/(dev*sqrt(2)));
+ return top-bot;
+}
+
+static void set_emission_probs(args_t *args)
+{
+ if ( !args->af_fname )
+ {
+ args->fRR = 0.76;
+ args->fRA = 0.14;
+ args->fAA = 0.098;
+ }
+
+ set_gauss_params(args, &args->query_sample);
+ if ( args->control_sample.name ) set_gauss_params(args, &args->control_sample);
+
+ #if DBG0
+ args->verbose = 1;
+ args->query_sample.baf[0] = 0; set_observed_prob(args,&args->query_sample,0);
+ args->query_sample.baf[0] = 1/3.; set_observed_prob(args,&args->query_sample,0);
+ args->query_sample.baf[0] = 1/2.; set_observed_prob(args,&args->query_sample,0);
+ args->query_sample.baf[0] = 2/3.; set_observed_prob(args,&args->query_sample,0);
+ args->query_sample.baf[0] = 1; set_observed_prob(args,&args->query_sample,0);
+ args->verbose = 0;
+ #endif
+
+ int i;
+ for (i=0; i<args->nsites; i++)
+ {
+ if ( args->af_fname )
+ {
+ args->fRR = (1-args->nonref_afs[i])*(1-args->nonref_afs[i]);
+ args->fRA = 2*args->nonref_afs[i]*(1-args->nonref_afs[i]);
+ args->fAA = args->nonref_afs[i]*args->nonref_afs[i];
+ }
+ set_observed_prob(args,&args->query_sample,i);
+ if ( args->control_sample.name )
+ {
+ set_observed_prob(args,&args->control_sample,i);
+ set_emission_prob2(args,i);
+ }
+ else
+ set_emission_prob(args,i);
+ }
+}
+
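+// In-place moving-average smoothing of `dat` with window `win`, implemented with
+// a ring buffer holding the values of the running sum; near the edges the window
+// shrinks accordingly.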
+static void smooth_data(float *dat, int ndat, int win)
+{
+ if ( win<=1 ) return;
+
+ int i,j, k1 = win/2, k2 = win-k1;
+ rbuf_t rbuf;
+ rbuf_init(&rbuf,win);
+ float sum = 0, *buf = (float*)malloc(sizeof(float)*win);
+ for (i=0; i<k2; i++)
+ {
+ sum += dat[i];
+ int j = rbuf_append(&rbuf);
+ buf[j] = dat[i];
+ }
+ for (i=0; i<ndat; i++)
+ {
+ dat[i] = sum/rbuf.n;
+ if ( i>=k1 )
+ {
+ j = rbuf_shift(&rbuf);
+ sum -= buf[j];
+ }
+ if ( i+k2<ndat )
+ {
+ sum += dat[i+k2];
+ j = rbuf_append(&rbuf);
+ buf[j] = dat[i+k2];
+ }
+ }
+ free(buf);
+}
+
+static void set_gauss_params(args_t *args, sample_t *smpl)
+{
+ int i;
+ for (i=0; i<18; i++) smpl->gauss_param[i].dev2 = smpl->baf_dev2;
+
+ double dev = sqrt(smpl->baf_dev2);
+
+ GAUSS_CN1_PK_R(smpl)->mean = 0;
+ GAUSS_CN1_PK_A(smpl)->mean = 1;
+ GAUSS_CN1_PK_R(smpl)->norm = norm_cdf(GAUSS_CN1_PK_R(smpl)->mean,dev);
+ GAUSS_CN1_PK_A(smpl)->norm = norm_cdf(GAUSS_CN1_PK_A(smpl)->mean,dev);
+
+ GAUSS_CN2_PK_RR(smpl)->mean = 0;
+ GAUSS_CN2_PK_RA(smpl)->mean = 0.5;
+ GAUSS_CN2_PK_AA(smpl)->mean = 1;
+ GAUSS_CN2_PK_RR(smpl)->norm = norm_cdf(GAUSS_CN2_PK_RR(smpl)->mean,dev);
+ GAUSS_CN2_PK_RA(smpl)->norm = norm_cdf(GAUSS_CN2_PK_RA(smpl)->mean,dev);
+ GAUSS_CN2_PK_AA(smpl)->norm = norm_cdf(GAUSS_CN2_PK_AA(smpl)->mean,dev);
+
+ GAUSS_CN3_PK_RRR(smpl)->mean = 0;
+ GAUSS_CN3_PK_RRA(smpl)->mean = 1.0/(2+smpl->cell_frac);
+ GAUSS_CN3_PK_RAA(smpl)->mean = (1.0+smpl->cell_frac)/(2+smpl->cell_frac);
+ GAUSS_CN3_PK_AAA(smpl)->mean = 1;
+ GAUSS_CN3_PK_RRR(smpl)->norm = norm_cdf(GAUSS_CN3_PK_RRR(smpl)->mean,dev);
+ GAUSS_CN3_PK_RRA(smpl)->norm = norm_cdf(GAUSS_CN3_PK_RRA(smpl)->mean,dev);
+ GAUSS_CN3_PK_RAA(smpl)->norm = norm_cdf(GAUSS_CN3_PK_RAA(smpl)->mean,dev);
+ GAUSS_CN3_PK_AAA(smpl)->norm = norm_cdf(GAUSS_CN3_PK_AAA(smpl)->mean,dev);
+}
+
+static int update_sample_args(args_t *args, sample_t *smpl, int ismpl)
+{
+ hmm_t *hmm = args->hmm;
+ double *fwd = hmm_get_fwd_bwd_prob(hmm);
+ int nstates = hmm_get_nstates(hmm);
+
+ // estimate the BAF mean and deviation for CN3
+ double mean_cn3 = 0, norm_cn3 = 0;
+ double baf_dev2 = 0, baf_AA_dev2 = 0, norm_baf_AA_dev2 = 0;
+
+ // Experimental: smooth CN3 probs to bias toward bigger events; this lowers
+ // the FP rate when the data is noisy
+ hts_expand(float,args->nsites,args->mtmpf,args->tmpf);
+ int i, j, k = 0;
+ for (i=0; i<args->nsites; i++)
+ {
+ float baf = smpl->baf[i];
+ if ( baf>4/5.) continue; // skip AA genotypes
+ if ( baf>0.5 ) baf = 1 - baf; // the bands should be symmetric
+ if ( baf<1/5.) continue; // skip RR genotypes
+
+ double prob_cn3 = 0, *probs = fwd + i*nstates;
+ if ( !args->control_sample.name )
+ {
+ prob_cn3 = probs[CN3];
+ }
+ else if ( ismpl==0 )
+ {
+ // query sample: CN3 probability must be recovered from all states of the control sample
+ for (j=0; j<N_STATES; j++) prob_cn3 += probs[CN3*N_STATES+j];
+ }
+ else
+ {
+ // same as above but for control sample
+ for (j=0; j<N_STATES; j++) prob_cn3 += probs[CN3+j*N_STATES];
+ }
+ args->tmpf[k++] = prob_cn3;
+ }
+ smooth_data(args->tmpf, k, 50);
+ k = 0;
+ for (i=0; i<args->nsites; i++)
+ {
+ float baf = smpl->baf[i];
+ if ( baf>4/5.) { baf_AA_dev2 += (1.0-baf)*(1.0-baf); norm_baf_AA_dev2++; continue; } // skip AA genotypes
+ if ( baf>0.5 ) baf = 1 - baf; // the bands should be symmetric
+ if ( baf<1/5.) continue; // skip RR genotypes
+
+ double prob_cn3 = args->tmpf[k++];
+ mean_cn3 += prob_cn3 * baf;
+ norm_cn3 += prob_cn3;
+ }
+ if ( !norm_cn3 )
+ {
+ smpl->cell_frac = 1.0;
+ return 1;
+ }
+ mean_cn3 /= norm_cn3;
+ k = 0;
+ for (i=0; i<args->nsites; i++)
+ {
+ float baf = smpl->baf[i];
+ if ( baf>0.5 ) baf = 1 - baf; // the bands should be symmetric
+ if ( baf<1/5.) continue; // skip RR,AA genotypes
+
+ double prob_cn3 = args->tmpf[k++];
+ baf_dev2 += prob_cn3 * (baf - mean_cn3)*(baf - mean_cn3);
+ }
+
+ /*
+ A noisy CN2 band is hard to distinguish from two CN3 bands which are
+ close to each other. Set a threshold on the minimum separation based
+ on the BAF deviation at p=0.95.
+ */
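+ // For example, with the default BAF deviation of 0.04 the CN3 band mean must
+ // lie below roughly 0.5 - 0.04*1.644854 ~ 0.434 to be considered separable
+ // from the CN2 band at 0.5.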
+ baf_dev2 /= norm_cn3;
+ baf_AA_dev2 /= norm_baf_AA_dev2;
+ if ( baf_dev2 < baf_AA_dev2 ) baf_dev2 = baf_AA_dev2;
+ double max_mean_cn3 = 0.5 - sqrt(baf_dev2)*1.644854; // R: qnorm(0.95)=1.644854
+ //fprintf(pysamerr,"dev=%f AA_dev=%f max_mean_cn3=%f mean_cn3=%f\n", baf_dev2,baf_AA_dev2,max_mean_cn3,mean_cn3);
+ assert( max_mean_cn3>0 );
+
+ double new_frac = 1./mean_cn3 - 2;
+ if ( mean_cn3 > max_mean_cn3 || new_frac < args->optimize_frac )
+ {
+ // out of bounds, beyond our detection limits. Give up and say it converged
+ smpl->cell_frac = 1.0;
+ return 1;
+ }
+ if ( new_frac>1 ) new_frac = 1;
+ int converged = fabs(new_frac - smpl->cell_frac) < 1e-1 ? 1 : 0;
+
+ // Update dev2, but stay within safe limits
+ if ( baf_dev2 > 3*smpl->baf_dev2_dflt ) baf_dev2 = 3*smpl->baf_dev2_dflt;
+ else if ( baf_dev2 < 0.5*smpl->baf_dev2_dflt ) baf_dev2 = 0.5*smpl->baf_dev2_dflt;
+
+ smpl->cell_frac = new_frac;
+ smpl->baf_dev2 = baf_dev2;
+
+ return converged;
+}
+
+// Update parameters which depend on the estimated fraction of aberrant cells
+// in CN3. Returns 0 if the current estimate did not need to be updated or 1
+// if there was a change.
+static int update_args(args_t *args)
+{
+ int converged = update_sample_args(args, &args->query_sample, 0);
+ if ( args->control_sample.name )
+ {
+ converged += update_sample_args(args, &args->control_sample, 1);
+ return converged==2 ? 0 : 1;
+ }
+ return converged ? 0 : 1;
+}
+
+// for an approximate estimate of the number of het genotypes in a region
+#define BAF_LIKELY_HET(val) (val)>0.25 && (val)<0.75
+
+static void cnv_flush_viterbi(args_t *args)
+{
+ if ( !args->nsites ) return;
+
+ // Set the HMM transition matrix for the new chromosome again. This is needed in
+ // case Baum-Welch was used, which is experimental, largely unsupported and not
+ // done by default.
+ hmm_t *hmm = args->hmm;
+ hmm_set_tprob(args->hmm, args->tprob, 10000);
+
+ // Smooth LRR values to reduce noise
+ if ( args->lrr_bias > 0 )
+ {
+ smooth_data(args->query_sample.lrr,args->nsites, args->lrr_smooth_win);
+ if ( args->control_sample.name ) smooth_data(args->control_sample.lrr,args->nsites, args->lrr_smooth_win);
+ }
+
+ // Set the BAF peak likelihoods, such as P(RRR|CN3), taking into account the
+ // estimated fraction of aberrant cells in the mixture. With the new chromosome,
+ // reset the fraction to the default value.
+ args->query_sample.cell_frac = args->query_sample.cell_frac_dflt;
+ args->control_sample.cell_frac = args->control_sample.cell_frac_dflt;
+ args->query_sample.baf_dev2 = args->query_sample.baf_dev2_dflt;
+ args->control_sample.baf_dev2 = args->control_sample.baf_dev2_dflt;
+ set_gauss_params(args, &args->query_sample);
+ if ( args->control_sample.name ) set_gauss_params(args, &args->control_sample);
+
+ if ( args->optimize_frac )
+ {
+ int niter = 0;
+ fprintf(pysamerr,"Attempting to estimate the fraction of aberrant cells (chr %s):\n", bcf_hdr_id2name(args->hdr,args->prev_rid));
+ do
+ {
+ fprintf(pysamerr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2);
+ if ( args->control_sample.name )
+ fprintf(pysamerr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2);
+ fprintf(pysamerr,"\n");
+ set_emission_probs(args);
+ hmm_run_fwd_bwd(hmm, args->nsites, args->eprob, args->sites);
+ }
+ while ( update_args(args) && ++niter<20 );
+ if ( niter>=20 )
+ {
+ // no convergence
+ args->query_sample.cell_frac = args->query_sample.cell_frac_dflt;
+ args->control_sample.cell_frac = args->control_sample.cell_frac_dflt;
+ args->query_sample.baf_dev2 = args->query_sample.baf_dev2_dflt;
+ args->control_sample.baf_dev2 = args->control_sample.baf_dev2_dflt;
+ set_gauss_params(args, &args->query_sample);
+ if ( args->control_sample.name ) set_gauss_params(args, &args->control_sample);
+ }
+
+ fprintf(pysamerr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2);
+ if ( args->control_sample.name )
+ fprintf(pysamerr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2);
+ fprintf(pysamerr,"\n");
+ }
+ set_emission_probs(args);
+
+ while ( args->baum_welch_th!=0 )
+ {
+ int nstates = hmm_get_nstates(hmm);
+ double ori_ii = avg_ii_prob(nstates,hmm_get_tprob(hmm));
+ hmm_run_baum_welch(hmm, args->nsites, args->eprob, args->sites);
+ double new_ii = avg_ii_prob(nstates,hmm_get_tprob(hmm));
+ fprintf(pysamerr,"%e\t%e\t%e\n", ori_ii,new_ii,new_ii-ori_ii);
+ double *tprob = init_tprob_matrix(nstates, 1-new_ii, args->same_prob);
+ hmm_set_tprob(args->hmm, tprob, 10000);
+ double *tprob_arr = hmm_get_tprob(hmm);
+ free(tprob);
+ if ( fabs(new_ii - ori_ii) < args->baum_welch_th )
+ {
+ int i,j;
+ for (i=0; i<nstates; i++)
+ {
+ for (j=0; j<nstates; j++)
+ {
+ printf(" %.15f", MAT(tprob_arr,nstates,j,i));
+ }
+ printf("\n");
+ }
+ break;
+ }
+ }
+ hmm_run_viterbi(hmm, args->nsites, args->eprob, args->sites);
+ hmm_run_fwd_bwd(hmm, args->nsites, args->eprob, args->sites);
+
+
+ // Output the results
+ uint8_t *vpath = hmm_get_viterbi_path(hmm);
+ double qual = 0, *fwd = hmm_get_fwd_bwd_prob(hmm);
+ int i,j, isite, start_cn = vpath[0], start_pos = args->sites[0], istart_pos = 0;
+ int ctrl_ntot = 0, smpl_ntot = 0, ctrl_nhet = 0, smpl_nhet = 0;
+ for (isite=0; isite<args->nsites; isite++)
+ {
+ int state = vpath[args->nstates*isite];
+ double *pval = fwd + isite*args->nstates;
+
+ qual += pval[start_cn];
+
+ // output CN and fwd-bwd likelihood for each site
+ if ( args->query_sample.cn_fh )
+ {
+ fprintf(args->query_sample.cn_fh, "%s\t%d\t%c", bcf_hdr_id2name(args->hdr,args->prev_rid), args->sites[isite]+1, copy_number_state(args,state,0));
+ if ( !args->control_sample.cn_fh )
+ for (i=0; i<args->nstates; i++) fprintf(args->query_sample.cn_fh, "\t%f", pval[i]);
+ else
+ for (i=0; i<N_STATES; i++)
+ {
+ double sum = 0;
+ for (j=0; j<N_STATES; j++) sum += pval[i*N_STATES+j];
+ fprintf(args->query_sample.cn_fh, "\t%f", sum);
+ }
+ fprintf(args->query_sample.cn_fh, "\n");
+ if ( args->query_sample.baf[isite]>=0 ) // if non-missing
+ {
+ if ( BAF_LIKELY_HET(args->query_sample.baf[isite]) ) smpl_nhet++;
+ smpl_ntot++;
+ }
+ }
+ if ( args->control_sample.cn_fh )
+ {
+ fprintf(args->control_sample.cn_fh, "%s\t%d\t%c", bcf_hdr_id2name(args->hdr,args->prev_rid), args->sites[isite]+1, copy_number_state(args,state,1));
+ for (i=0; i<N_STATES; i++)
+ {
+ double sum = 0;
+ for (j=0; j<N_STATES; j++) sum += pval[i+N_STATES*j];
+ fprintf(args->control_sample.cn_fh, "\t%f", sum);
+ }
+ fprintf(args->control_sample.cn_fh, "\n");
+ if ( args->control_sample.baf[isite]>=0 ) // if non-missing
+ {
+ if ( BAF_LIKELY_HET(args->control_sample.baf[isite]) ) ctrl_nhet++;
+ ctrl_ntot++;
+ }
+ }
+
+ if ( start_cn != state )
+ {
+ char start_cn_query = copy_number_state(args,start_cn,0);
+ qual = phred_score(1 - qual/(isite - istart_pos));
+ fprintf(args->query_sample.summary_fh,"RG\t%s\t%d\t%d\t%c\t%.1f\t%d\t%d\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid), start_pos+1, args->sites[isite],start_cn_query,qual,smpl_ntot,smpl_nhet);
+
+ if ( args->control_sample.name )
+ {
+ // regions 0-based, half-open
+ char start_cn_ctrl = copy_number_state(args,start_cn,1);
+ fprintf(args->control_sample.summary_fh,"RG\t%s\t%d\t%d\t%c\t%.1f\t%d\t%d\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid), start_pos+1, args->sites[isite],start_cn_ctrl,qual,ctrl_ntot,ctrl_nhet);
+ fprintf(args->summary_fh,"RG\t%s\t%d\t%d\t%c\t%c\t%.1f\t%d\t%d\t%d\t%d\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid), start_pos+1, args->sites[isite],start_cn_query,start_cn_ctrl,qual,smpl_ntot,smpl_nhet,ctrl_ntot,ctrl_nhet);
+ }
+
+ istart_pos = isite;
+ start_pos = args->sites[isite];
+ start_cn = state;
+ qual = 0;
+ smpl_ntot = smpl_nhet = ctrl_ntot = ctrl_nhet = 0;
+ }
+ }
+ qual = phred_score(1 - qual/(isite - istart_pos));
+ char start_cn_query = copy_number_state(args,start_cn,0);
+ fprintf(args->query_sample.summary_fh,"RG\t%s\t%d\t%d\t%c\t%.1f\t%d\t%d\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid), start_pos+1, args->sites[isite-1]+1,start_cn_query,qual,smpl_ntot,smpl_nhet);
+ if ( args->control_sample.name )
+ {
+ char start_cn_ctrl = copy_number_state(args,start_cn,1);
+ fprintf(args->control_sample.summary_fh,"RG\t%s\t%d\t%d\t%c\t%.1f\t%d\t%d\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid), start_pos+1, args->sites[isite-1]+1,start_cn_ctrl,qual,ctrl_ntot,ctrl_nhet);
+ fprintf(args->summary_fh,"RG\t%s\t%d\t%d\t%c\t%c\t%.1f\t%d\t%d\t%d\t%d\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid), start_pos+1, args->sites[isite-1]+1,start_cn_query,start_cn_ctrl,qual,smpl_ntot,smpl_nhet,ctrl_ntot,ctrl_nhet);
+ }
+}
+
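+// Extract one sample's BAF (and LRR, if requested) from the FORMAT fields.
+// Missing values are encoded as a negative BAF (-0.1); returns 1 if the site
+// is usable for this sample, 0 otherwise.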
+static int parse_lrr_baf(sample_t *smpl, bcf_fmt_t *baf_fmt, bcf_fmt_t *lrr_fmt, float *baf, float *lrr)
+{
+ *baf = ((float*)(baf_fmt->p + baf_fmt->size*smpl->idx))[0];
+ if ( bcf_float_is_missing(*baf) || isnan(*baf) ) *baf = -0.1; // arbitrary negative value == missing value
+
+ if ( lrr_fmt )
+ {
+ *lrr = ((float*)(lrr_fmt->p + lrr_fmt->size*smpl->idx))[0];
+ if ( bcf_float_is_missing(*lrr) || isnan(*lrr) ) { *lrr = 0; *baf = -0.1; }
+ }
+ else
+ *lrr = 0;
+
+ return *baf<0 ? 0 : 1;
+}
+
+int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq);
+
+static void cnv_next_line(args_t *args, bcf1_t *line)
+{
+ if ( !line )
+ {
+ // Done, flush viterbi
+ cnv_flush_viterbi(args);
+ return;
+ }
+
+ if ( line->rid!=args->prev_rid )
+ {
+ // New chromosome
+ cnv_flush_viterbi(args);
+ args->prev_rid = line->rid;
+ args->nsites = 0;
+ args->nRR = args->nAA = args->nRA = 0;
+ }
+
+ // Process line
+ args->ntot++;
+
+ bcf_fmt_t *baf_fmt, *lrr_fmt = NULL;
+ if ( !(baf_fmt = bcf_get_fmt(args->hdr, line, "BAF")) ) return;
+ if ( args->lrr_bias>0 && !(lrr_fmt = bcf_get_fmt(args->hdr, line, "LRR")) ) return;
+
+ float baf1,lrr1,baf2,lrr2;
+ int ret = 0;
+ ret += parse_lrr_baf(&args->query_sample, baf_fmt,lrr_fmt,&baf1,&lrr1);
+ ret += parse_lrr_baf(&args->control_sample,baf_fmt,lrr_fmt,&baf2,&lrr2);
+ if ( !ret ) return;
+
+ // Realloc buffers needed to store observed data and used by viterbi and fwd-bwd
+ args->nsites++;
+ int m = args->msites;
+ hts_expand(uint32_t,args->nsites,args->msites,args->sites);
+ if ( args->msites!=m )
+ {
+ args->eprob = (double*) realloc(args->eprob,sizeof(double)*args->msites*args->nstates);
+ if ( args->control_sample.name )
+ {
+ args->control_sample.lrr = (float*) realloc(args->control_sample.lrr,sizeof(float)*args->msites);
+ args->control_sample.baf = (float*) realloc(args->control_sample.baf,sizeof(float)*args->msites);
+ }
+ args->query_sample.lrr = (float*) realloc(args->query_sample.lrr,sizeof(float)*args->msites);
+ args->query_sample.baf = (float*) realloc(args->query_sample.baf,sizeof(float)*args->msites);
+ if ( args->af_fname )
+ args->nonref_afs = (float*) realloc(args->nonref_afs,sizeof(float)*args->msites);
+ }
+ args->sites[args->nsites-1] = line->pos;
+ args->query_sample.lrr[args->nsites-1] = lrr1;
+ args->query_sample.baf[args->nsites-1] = baf1;
+ if ( args->af_fname )
+ {
+ double alt_freq;
+ args->nonref_afs[args->nsites-1] = read_AF(args->files->targets,line,&alt_freq)<0 ? args->nonref_af_dflt : alt_freq;
+ }
+ if ( args->control_sample.name )
+ {
+ args->control_sample.lrr[args->nsites-1] = lrr2;
+ args->control_sample.baf[args->nsites-1] = baf2;
+ if ( baf2>=0 ) // skip missing values
+ fprintf(args->control_sample.dat_fh,"%s\t%d\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), line->pos+1,baf2,lrr2);
+ }
+ if ( baf1>=0 ) // skip missing values
+ fprintf(args->query_sample.dat_fh,"%s\t%d\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), line->pos+1,baf1,lrr1);
+
+ if ( baf1>=0 )
+ {
+ if ( baf1<1/5. ) args->nRR++;
+ else if ( baf1>4/5. ) args->nAA++;
+ else args->nRA++;
+ }
+ args->nused++;
+}
+
+static void usage(args_t *args)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Copy number variation caller, requires Illumina's B-allele frequency (BAF) and Log R\n");
+ fprintf(pysamerr, " Ratio intensity (LRR). The HMM considers the following copy number states: CN 2\n");
+ fprintf(pysamerr, " (normal), 1 (single-copy loss), 0 (complete loss), 3 (single-copy gain)\n");
+ fprintf(pysamerr, "Usage: bcftools cnv [OPTIONS] <file.vcf>\n");
+ fprintf(pysamerr, "General Options:\n");
+ fprintf(pysamerr, " -c, --control-sample <string> optional control sample name to highlight differences\n");
+ fprintf(pysamerr, " -f, --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
+ fprintf(pysamerr, " -o, --output-dir <path> \n");
+ fprintf(pysamerr, " -p, --plot-threshold <float> plot aberrant chromosomes with quality at least 'float'\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " -s, --query-sample <string> query samply name\n");
+ fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysamerr, "HMM Options:\n");
+ fprintf(pysamerr, " -a, --aberrant <float[,float]> fraction of aberrant cells in query and control [1.0,1.0]\n");
+ fprintf(pysamerr, " -b, --BAF-weight <float> relative contribution from BAF [1]\n");
+ fprintf(pysamerr, " -d, --BAF-dev <float[,float]> expected BAF deviation in query and control [0.04,0.04]\n"); // experimental
+ fprintf(pysamerr, " -e, --err-prob <float> uniform error probability [1e-4]\n");
+ fprintf(pysamerr, " -k, --LRR-dev <float[,float]> expected LRR deviation [0.2,0.2]\n"); // experimental
+ fprintf(pysamerr, " -l, --LRR-weight <float> relative contribution from LRR [0.2]\n");
+ fprintf(pysamerr, " -L, --LRR-smooth-win <int> window of LRR moving average smoothing [10]\n");
+ fprintf(pysamerr, " -O, --optimize <float> estimate fraction of aberrant cells down to <float> [1.0]\n");
+ fprintf(pysamerr, " -P, --same-prob <float> prior probability of -s/-c being the same [0.5]\n");
+ fprintf(pysamerr, " -x, --xy-prob <float> P(x|y) transition probability [1e-9]\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_vcfcnv(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->files = bcf_sr_init();
+ args->plot_th = 1e9; // by default plot none
+ args->nonref_af_dflt = 0.1;
+ args->lrr_smooth_win = 10;
+
+ args->query_sample.cell_frac_dflt = 1;
+ args->control_sample.cell_frac_dflt = 1;
+
+ // How much FORMAT/LRR and FORMAT/BAF matter
+ args->lrr_bias = 0.2;
+ args->baf_bias = 1.0;
+ args->err_prob = 1e-4;
+
+ // Transition probability to a different state and the prior of both samples being the same
+ args->ij_prob = 1e-9;
+ args->same_prob = 0.5;
+
+ // Squared std dev of BAF and LRR values (gaussian noise), estimated from real data (hets, one sample, one chr)
+ args->query_sample.baf_dev2_dflt = args->control_sample.baf_dev2_dflt = 0.04*0.04; // illumina: 0.03
+ args->query_sample.lrr_dev2 = args->control_sample.lrr_dev2 = 0.2*0.2; //0.20*0.20; // illumina: 0.18
+
+ int regions_is_file = 0, targets_is_file = 0;
+ static struct option loptions[] =
+ {
+ {"BAF-dev",1,0,'d'},
+ {"LRR-dev",1,0,'k'},
+ {"LRR-smooth-win",1,0,'L'},
+ {"AF-file",1,0,'f'},
+ {"baum-welch",1,0,'W'}, // hidden
+ {"optimize",1,0,'O'},
+ {"aberrant",1,0,'a'},
+ {"err-prob",1,0,'e'},
+ {"BAF-weight",1,0,'b'},
+ {"LRR-weight",1,0,'l'},
+ {"same-prob",1,0,'P'},
+ {"xy-prob",1,0,'x'},
+ {"sample",1,0,'s'},
+ {"control",1,0,'c'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"plot",1,0,'p'},
+ {"output-dir",1,0,'o'},
+ {0,0,0,0}
+ };
+ char *tmp = NULL;
+ while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W:f:a:L:d:k:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'L':
+ args->lrr_smooth_win = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse: --LRR-smooth-win %s\n", optarg);
+ break;
+ case 'f': args->af_fname = optarg; break;
+ case 'O':
+ args->optimize_frac = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -O %s\n", optarg);
+ break;
+ case 'd':
+ args->query_sample.baf_dev2_dflt = strtod(optarg,&tmp);
+ if ( *tmp )
+ {
+ if ( *tmp!=',') error("Could not parse: -d %s\n", optarg);
+ args->control_sample.baf_dev2_dflt = strtod(tmp+1,&tmp);
+ if ( *tmp ) error("Could not parse: -d %s\n", optarg);
+ }
+ else
+ args->control_sample.baf_dev2_dflt = args->query_sample.baf_dev2_dflt;
+ args->query_sample.baf_dev2_dflt *= args->query_sample.baf_dev2_dflt;
+ args->control_sample.baf_dev2_dflt *= args->control_sample.baf_dev2_dflt;
+ break;
+ case 'k':
+ args->query_sample.lrr_dev2 = strtod(optarg,&tmp);
+ if ( *tmp )
+ {
+ if ( *tmp!=',') error("Could not parse: -k %s\n", optarg);
+ args->control_sample.lrr_dev2 = strtod(tmp+1,&tmp);
+ if ( *tmp ) error("Could not parse: -d %s\n", optarg);
+ }
+ else
+ args->control_sample.lrr_dev2 = args->query_sample.lrr_dev2;
+ args->query_sample.lrr_dev2 *= args->query_sample.lrr_dev2;
+ args->control_sample.lrr_dev2 *= args->control_sample.lrr_dev2;
+ break;
+ case 'a':
+ args->query_sample.cell_frac_dflt = strtod(optarg,&tmp);
+ if ( *tmp )
+ {
+ if ( *tmp!=',') error("Could not parse: -a %s\n", optarg);
+ args->control_sample.cell_frac_dflt = strtod(tmp+1,&tmp);
+ if ( *tmp ) error("Could not parse: -a %s\n", optarg);
+ }
+ break;
+ case 'W':
+ args->baum_welch_th = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -W %s\n", optarg);
+ break;
+ case 'e':
+ args->err_prob = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -e %s\n", optarg);
+ break;
+ case 'b':
+ args->baf_bias = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -b %s\n", optarg);
+ break;
+ case 'x':
+ args->ij_prob = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -x %s\n", optarg);
+ break;
+ case 'P':
+ args->same_prob = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -P %s\n", optarg);
+ break;
+ case 'l':
+ args->lrr_bias = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -l %s\n", optarg);
+ break;
+ case 'p':
+ args->plot_th = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -p %s\n", optarg);
+ break;
+ case 'o': args->output_dir = optarg; break;
+ case 's': args->query_sample.name = strdup(optarg); break;
+ case 'c': args->control_sample.name = optarg; break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 'h':
+ case '?': usage(args); break;
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";
+ }
+ else fname = argv[optind];
+ if ( !fname ) usage(args);
+
+ if ( args->plot_th<=100 && !args->output_dir ) error("Expected -o option with -p\n");
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( args->af_fname )
+ {
+ if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 )
+ error("Failed to read the targets: %s\n", args->af_fname);
+ }
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+
+ init_data(args);
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ cnv_next_line(args, line);
+ }
+ cnv_next_line(args, NULL);
+ create_plots(args);
+ fprintf(pysamerr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused);
+ destroy_data(args);
+ free(args);
+ return 0;
+}
+
+
diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c
new file mode 100644
index 0000000..cfec7c0
--- /dev/null
+++ b/bcftools/vcfconcat.c
@@ -0,0 +1,662 @@
+/* vcfconcat.c -- Concatenate or combine VCF/BCF files.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <string.h>
+#include <errno.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/kseq.h>
+#include "bcftools.h"
+
+typedef struct _args_t
+{
+ bcf_srs_t *files;
+ htsFile *out_fh;
+ int output_type, n_threads;
+ bcf_hdr_t *out_hdr;
+ int *seen_seq;
+
+ // phasing
+ int *start_pos, start_tid, ifname;
+ int *swap_phase, nswap, *nmatch, *nmism;
+ bcf1_t **buf;
+ int nbuf, mbuf, prev_chr, min_PQ, prev_pos_check;
+ int32_t *GTa, *GTb, mGTa, mGTb, *phase_qual, *phase_set;
+
+ char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list;
+ int argc, nfnames, allow_overlaps, phased_concat, regions_is_file;
+ int compact_PS, phase_set_changed;
+}
+args_t;
+
+static void init_data(args_t *args)
+{
+ bcf1_t *line = NULL;
+
+ // With phased concat, the chunks overlap and come in the right order. To
+ // avoid opening all files at once, store start positions to recognise when
+ // the next one is needed. This way we keep only two chunks open at once.
+ if ( args->phased_concat )
+ {
+ args->start_pos = (int*) malloc(sizeof(int)*args->nfnames);
+ line = bcf_init();
+ }
+
+ kstring_t str = {0,0,0};
+ int i, prev_chrid = -1;
+ for (i=0; i<args->nfnames; i++)
+ {
+ htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
+ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
+ args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
+ if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
+ error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);
+
+ int j;
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++)
+ if ( strcmp(args->out_hdr->samples[j],hdr->samples[j]) )
+ error("Different sample names in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);
+
+ if ( args->phased_concat )
+ {
+ int ret = bcf_read(fp, hdr, line);
+ if ( ret!=0 ) args->start_pos[i] = -2; // empty file
+ else
+ {
+ int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line));
+ args->start_pos[i] = chrid==prev_chrid ? line->pos : -1;
+ prev_chrid = chrid;
+ }
+ }
+ bcf_hdr_destroy(hdr);
+ hts_close(fp);
+ }
+ free(str.s);
+ if ( line ) bcf_destroy(line);
+
+ args->seen_seq = (int*) calloc(args->out_hdr->n[BCF_DT_CTG],sizeof(int));
+
+ if ( args->phased_concat )
+ {
+ bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">");
+ bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
+ }
+ bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+
+ bcf_hdr_write(args->out_fh, args->out_hdr);
+
+ if ( args->allow_overlaps )
+ {
+ args->files = bcf_sr_init();
+ args->files->require_index = 1;
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ if ( args->remove_dups )
+ {
+ if ( !strcmp(args->remove_dups,"snps") ) args->files->collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(args->remove_dups,"indels") ) args->files->collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(args->remove_dups,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
+ else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE;
+ else error("The -D string \"%s\" not recognised.\n", args->remove_dups);
+ }
+ for (i=0; i<args->nfnames; i++)
+ if ( !bcf_sr_add_reader(args->files,args->fnames[i]) ) error("Failed to open %s: %s\n", args->fnames[i],bcf_sr_strerror(args->files->errnum));
+ }
+ else if ( args->phased_concat )
+ {
+ // Remove empty files from the list
+ int nok = 0;
+ while (1)
+ {
+ while ( nok<args->nfnames && args->start_pos[nok]!=-2 ) nok++;
+ if ( nok==args->nfnames ) break;
+
+ i = nok;
+ while ( i<args->nfnames && args->start_pos[i]==-2 ) i++;
+ if ( i==args->nfnames ) break;
+
+ int tmp = args->start_pos[nok]; args->start_pos[nok] = args->start_pos[i]; args->start_pos[i] = tmp;
+ char *str = args->fnames[nok]; args->fnames[nok] = args->fnames[i]; args->fnames[i] = str;
+ }
+ for (i=nok; i<args->nfnames; i++) free(args->fnames[i]);
+ args->nfnames = nok;
+
+ for (i=1; i<args->nfnames; i++)
+ if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]<args->start_pos[i-1] )
+ error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]);
+
+ args->prev_chr = -1;
+ args->swap_phase = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
+ args->nmatch = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
+ args->nmism = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
+ args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
+ args->phase_set = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
+ args->files = bcf_sr_init();
+ args->files->require_index = 1;
+ args->ifname = 0;
+ }
+}
+
+static void destroy_data(args_t *args)
+{
+ int i;
+ for (i=0; i<args->nfnames; i++) free(args->fnames[i]);
+ free(args->fnames);
+ if ( args->files ) bcf_sr_destroy(args->files);
+ if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
+ bcf_hdr_destroy(args->out_hdr);
+ free(args->seen_seq);
+ free(args->start_pos);
+ free(args->swap_phase);
+ for (i=0; i<args->mbuf; i++) bcf_destroy(args->buf[i]);
+ free(args->buf);
+ free(args->GTa);
+ free(args->GTb);
+ free(args->nmatch);
+ free(args->nmism);
+ free(args->phase_qual);
+ free(args->phase_set);
+}
+
+int vcf_write_line(htsFile *fp, kstring_t *line);
+
+#define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
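+// Flip the phase of every sample flagged in swap_phase by swapping its two GT
+// alleles in place, keeping the genotype marked as phased.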
+static void phase_update(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec)
+{
+ int i, nGTs = bcf_get_genotypes(hdr, rec, &args->GTa, &args->mGTa);
+ if ( nGTs <= 0 ) return; // GT field is not present
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ {
+ if ( !args->swap_phase[i] ) continue;
+ int *gt = &args->GTa[i*2];
+ if ( bcf_gt_is_missing(gt[0]) || gt[1]==bcf_int32_vector_end ) continue;
+ SWAP(int, gt[0], gt[1]);
+ gt[1] |= 1;
+ }
+ bcf_update_genotypes(hdr,rec,args->GTa,nGTs);
+}
+
+static void phased_flush(args_t *args)
+{
+ if ( !args->nbuf ) return;
+
+ bcf_hdr_t *ahdr = args->files->readers[0].header;
+ bcf_hdr_t *bhdr = args->files->readers[1].header;
+
+ int i, j, nsmpl = bcf_hdr_nsamples(args->out_hdr);
+ static int gt_absent_warned = 0;
+
+ for (i=0; i<args->nbuf; i+=2)
+ {
+ bcf1_t *arec = args->buf[i];
+ bcf1_t *brec = args->buf[i+1];
+
+ int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa);
+ if ( nGTs < 0 )
+ {
+ if ( !gt_absent_warned )
+ {
+ fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1);
+ gt_absent_warned = 1;
+ }
+ continue;
+ }
+ if ( nGTs != 2*nsmpl ) continue; // not diploid
+ nGTs = bcf_get_genotypes(bhdr, brec, &args->GTb, &args->mGTb);
+ if ( nGTs < 0 )
+ {
+ if ( !gt_absent_warned )
+ {
+ fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1);
+ gt_absent_warned = 1;
+ }
+ continue;
+ }
+ if ( nGTs != 2*nsmpl ) continue; // not diploid
+
+ for (j=0; j<nsmpl; j++)
+ {
+ int *gta = &args->GTa[j*2];
+ int *gtb = &args->GTb[j*2];
+ if ( gta[1]==bcf_int32_vector_end || gtb[1]==bcf_int32_vector_end ) continue;
+ if ( bcf_gt_is_missing(gta[0]) || bcf_gt_is_missing(gta[1]) || bcf_gt_is_missing(gtb[0]) || bcf_gt_is_missing(gtb[1]) ) continue;
+ if ( !bcf_gt_is_phased(gta[1]) || !bcf_gt_is_phased(gtb[1]) ) continue;
+ if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gta[1]) || bcf_gt_allele(gtb[0])==bcf_gt_allele(gtb[1]) ) continue;
+ if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[0]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[1]) )
+ {
+ if ( args->swap_phase[j] ) args->nmism[j]++; else args->nmatch[j]++;
+ }
+ if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[1]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[0]) )
+ {
+ if ( args->swap_phase[j] ) args->nmatch[j]++; else args->nmism[j]++;
+ }
+ }
+ }
+ for (i=0; i<args->nbuf/2; i+=2)
+ {
+ bcf1_t *arec = args->buf[i];
+ bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
+ if ( args->nswap )
+ phase_update(args, args->out_hdr, arec);
+ if ( !args->compact_PS || args->phase_set_changed )
+ {
+ bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl);
+ args->phase_set_changed = 0;
+ }
+ bcf_write(args->out_fh, args->out_hdr, arec);
+
+ if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1,args->prev_pos_check+1);
+ args->prev_pos_check = arec->pos;
+ }
+ args->nswap = 0;
+ for (j=0; j<nsmpl; j++)
+ {
+ if ( args->nmatch[j] >= args->nmism[j] )
+ args->swap_phase[j] = 0;
+ else
+ {
+ args->swap_phase[j] = 1;
+ args->nswap++;
+ }
+ if ( args->nmatch[j] && args->nmism[j] )
+ {
+ // Entropy-inspired quality. The factor 0.7 shifts and scales to (0,1)
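+ // f is the fraction of het sites supporting the current phase; the quality
+ // tends to 99 as f approaches 0 or 1 and to 0 at f=0.5 (no phase information).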
+ double f = (double)args->nmatch[j]/(args->nmatch[j]+args->nmism[j]);
+ args->phase_qual[j] = 99*(0.7 + f*log(f) + (1-f)*log(1-f))/0.7;
+ }
+ else
+ args->phase_qual[j] = 99;
+ args->nmatch[j] = 0;
+ args->nmism[j] = 0;
+ }
+ int PQ_printed = 0;
+ for (; i<args->nbuf; i+=2)
+ {
+ bcf1_t *brec = args->buf[i+1];
+ bcf_translate(args->out_hdr, args->files->readers[1].header, brec);
+ if ( !PQ_printed )
+ {
+ bcf_update_format_int32(args->out_hdr,brec,"PQ",args->phase_qual,nsmpl);
+ PQ_printed = 1;
+ for (j=0; j<nsmpl; j++)
+ if ( args->phase_qual[j] < args->min_PQ )
+ {
+ args->phase_set[j] = brec->pos+1;
+ args->phase_set_changed = 1;
+ }
+ else if ( args->compact_PS ) args->phase_set[j] = bcf_int32_missing;
+ }
+ if ( args->nswap )
+ phase_update(args, args->out_hdr, brec);
+ if ( !args->compact_PS || args->phase_set_changed )
+ {
+ bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl);
+ args->phase_set_changed = 0;
+ }
+ bcf_write(args->out_fh, args->out_hdr, brec);
+
+ if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1,args->prev_pos_check+1);
+ args->prev_pos_check = brec->pos;
+ }
+ args->nbuf = 0;
+}
+
+static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec)
+{
+ if ( arec && arec->errcode )
+ error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1, args->files->readers[0].fname);
+ if ( brec && brec->errcode )
+ error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1, args->files->readers[1].fname);
+
+ int i, nsmpl = bcf_hdr_nsamples(args->out_hdr);
+ int chr_id = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,arec));
+ if ( args->prev_chr<0 || args->prev_chr!=chr_id )
+ {
+ if ( args->prev_chr>=0 ) phased_flush(args);
+
+ for (i=0; i<nsmpl; i++)
+ args->phase_set[i] = arec->pos+1;
+ args->phase_set_changed = 1;
+
+ if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", bcf_seqname(args->files->readers[0].header,arec));
+ args->seen_seq[chr_id] = 1;
+ args->prev_chr = chr_id;
+ args->prev_pos_check = -1;
+ }
+
+ if ( !brec )
+ {
+ bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
+ if ( args->nswap )
+ phase_update(args, args->out_hdr, arec);
+ if ( !args->compact_PS || args->phase_set_changed )
+ {
+ bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl);
+ args->phase_set_changed = 0;
+ }
+ bcf_write(args->out_fh, args->out_hdr, arec);
+
+ if ( arec->pos < args->prev_pos_check )
+ error("FIXME, disorder: %s:%d in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1);
+ args->prev_pos_check = arec->pos;
+ return;
+ }
+
+ int m = args->mbuf;
+ args->nbuf += 2;
+ hts_expand(bcf1_t*,args->nbuf,args->mbuf,args->buf);
+ for (i=m; i<args->mbuf; i++)
+ args->buf[i] = bcf_init1();
+
+ SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->buf[args->nbuf-2]);
+ SWAP(bcf1_t*, args->files->readers[1].buffer[0], args->buf[args->nbuf-1]);
+}
+
+static void concat(args_t *args)
+{
+ int i;
+ if ( args->phased_concat ) // phased concat
+ {
+ // keep only two open files at a time
+ while ( args->ifname < args->nfnames )
+ {
+ int new_file = 0;
+ while ( args->files->nreaders < 2 && args->ifname < args->nfnames )
+ {
+ if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
+ new_file = 1;
+
+ args->ifname++;
+ if ( args->start_pos[args->ifname-1]==-1 ) break; // new chromosome, start with only one file open
+ if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome
+ }
+
+ // is there a line from the previous run? Seek the newly opened reader to that position
+ int seek_pos = -1;
+ int seek_chr = -1;
+ if ( bcf_sr_has_line(args->files,0) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos);
+ seek_pos = line->pos;
+ seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line));
+ }
+ else if ( new_file )
+ bcf_sr_seek(args->files,NULL,0); // set to start
+
+ int nret;
+ while ( (nret = bcf_sr_next_line(args->files)) )
+ {
+ if ( !bcf_sr_has_line(args->files,0) ) // no input from the first reader
+ {
+ // We are assuming that there is a perfect overlap; sites which are not present in both files are dropped
+ if ( ! bcf_sr_region_done(args->files,0) ) continue;
+
+ phased_flush(args);
+ bcf_sr_remove_reader(args->files, 0);
+ }
+
+ // Get a line to learn about current position
+ for (i=0; i<args->files->nreaders; i++)
+ if ( bcf_sr_has_line(args->files,i) ) break;
+ bcf1_t *line = bcf_sr_get_line(args->files,i);
+
+ // This can happen after bcf_sr_seek: an indel may start before the coordinate we seek to.
+ if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
+ seek_pos = seek_chr = -1;
+
+ // Check if the position overlaps with the next, yet unopened, reader
+ int must_seek = 0;
+ while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
+ {
+ must_seek = 1;
+ if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
+ args->ifname++;
+ }
+ if ( must_seek )
+ {
+ bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
+ seek_pos = line->pos;
+ seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));
+ continue;
+ }
+
+ // We are assuming that there is a perfect overlap; sites which are not present in both files are dropped
+ if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;
+
+ phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);
+ }
+
+ if ( args->files->nreaders )
+ {
+ phased_flush(args);
+ while ( args->files->nreaders )
+ bcf_sr_remove_reader(args->files, 0);
+ }
+ }
+ }
+ else if ( args->files ) // combining overlapping files, using synced reader
+ {
+ while ( bcf_sr_next_line(args->files) )
+ {
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,i);
+ if ( !line ) continue;
+ bcf_translate(args->out_hdr, args->files->readers[i].header, line);
+ bcf_write1(args->out_fh, args->out_hdr, line);
+ if ( args->remove_dups ) break;
+ }
+ }
+ }
+ else // concatenating
+ {
+ kstring_t tmp = {0,0,0};
+ int prev_chr_id = -1, prev_pos;
+ bcf1_t *line = bcf_init();
+ for (i=0; i<args->nfnames; i++)
+ {
+ htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
+ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
+ if ( !fp->is_bin && args->output_type&FT_VCF )
+ {
+ line->max_unpack = BCF_UN_STR;
+ // if VCF is on both input and output, avoid VCF to BCF conversion
+ while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
+ {
+ char *str = fp->line.s;
+ while ( *str && *str!='\t' ) str++;
+ tmp.l = 0;
+ kputsn(fp->line.s,str-fp->line.s,&tmp);
+ int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s);
+ if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
+ if ( prev_chr_id!=chr_id )
+ {
+ prev_pos = -1;
+ if ( args->seen_seq[chr_id] )
+ error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
+ }
+ char *end;
+ int pos = strtol(str+1,&end,10) - 1;
+ if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s);
+ if ( prev_pos > pos )
+ error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s);
+ args->seen_seq[chr_id] = 1;
+ prev_chr_id = chr_id;
+
+ if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %d bytes\n", fp->line.l);
+ }
+ }
+ else
+ {
+ // BCF conversion is required
+ line->max_unpack = 0;
+ while ( bcf_read(fp, hdr, line)==0 )
+ {
+ bcf_translate(args->out_hdr, hdr, line);
+
+ if ( prev_chr_id!=line->rid )
+ {
+ prev_pos = -1;
+ if ( args->seen_seq[line->rid] )
+ error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
+ }
+ if ( prev_pos > line->pos )
+ error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
+ args->seen_seq[line->rid] = 1;
+ prev_chr_id = line->rid;
+
+ if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
+ }
+ }
+ bcf_hdr_destroy(hdr);
+ hts_close(fp);
+ }
+ bcf_destroy(line);
+ free(tmp.s);
+ }
+}
+
+static void usage(args_t *args)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Concatenate or combine VCF/BCF files. All source files must have the same sample\n");
+ fprintf(stderr, " columns appearing in the same order. The program can be used, for example, to\n");
+ fprintf(stderr, " concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel\n");
+ fprintf(stderr, " VCF into one. The input files must be sorted by chr and position. The files\n");
+ fprintf(stderr, " must be given in the correct order to produce sorted VCF on output unless\n");
+ fprintf(stderr, " the -a, --allow-overlaps option is specified.\n");
+ fprintf(stderr, "Usage: bcftools concat [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n");
+ fprintf(stderr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n");
+ fprintf(stderr, " -d, --rm-dups <string> Output duplicate records present in multiple files only once: <snps|indels|both|all|none>\n");
+ fprintf(stderr, " -D, --remove-duplicates Alias for -d none\n");
+ fprintf(stderr, " -f, --file-list <file> Read the list of files from a file.\n");
+ fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
+ fprintf(stderr, " -o, --output <file> Write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(stderr, " -q, --min-PQ <int> Break phase set if phasing quality is lower than <int> [30]\n");
+ fprintf(stderr, " -r, --regions <region> Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> Restrict to regions listed in a file\n");
+ fprintf(stderr, " --threads <int> Number of extra output compression threads [0]\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+int main_vcfconcat(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->output_fname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ args->min_PQ = 30;
+
+ static struct option loptions[] =
+ {
+ {"compact-PS",no_argument,NULL,'c'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"remove-duplicates",no_argument,NULL,'D'},
+ {"rm-dups",required_argument,NULL,'d'},
+ {"allow-overlaps",no_argument,NULL,'a'},
+ {"ligate",no_argument,NULL,'l'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"file-list",required_argument,NULL,'f'},
+ {"min-PQ",required_argument,NULL,'q'},
+ {NULL,0,NULL,0}
+ };
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:c",loptions,NULL)) >= 0)
+ {
+ switch (c) {
+ case 'c': args->compact_PS = 1; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
+ case 'd': args->remove_dups = optarg; break;
+ case 'D': args->remove_dups = "none"; break;
+ case 'q':
+ args->min_PQ = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
+ break;
+ case 'a': args->allow_overlaps = 1; break;
+ case 'l': args->phased_concat = 1; break;
+ case 'f': args->file_list = optarg; break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ };
+ break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'h':
+ case '?': usage(args); break;
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ while ( optind<argc )
+ {
+ args->nfnames++;
+ args->fnames = (char **)realloc(args->fnames,sizeof(char*)*args->nfnames);
+ args->fnames[args->nfnames-1] = strdup(argv[optind]);
+ optind++;
+ }
+ if ( args->allow_overlaps && args->phased_concat ) args->allow_overlaps = 0;
+ if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n");
+ if ( args->file_list )
+ {
+ if ( args->nfnames ) error("Cannot combine -f with file names on command line.\n");
+ args->fnames = hts_readlines(args->file_list, &args->nfnames);
+ if ( !args->fnames ) error("Could not read the file: %s\n", args->file_list);
+ }
+ if ( !args->nfnames ) usage(args);
+ if ( args->remove_dups && !args->allow_overlaps ) error("The -D option is supported only with -a\n");
+ if ( args->regions_list && !args->allow_overlaps ) error("The -r/-R option is supported only with -a\n");
+ init_data(args);
+ concat(args);
+ destroy_data(args);
+ free(args);
+ return 0;
+}
diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c
new file mode 100644
index 0000000..40db3f7
--- /dev/null
+++ b/bcftools/vcfconcat.c.pysam.c
@@ -0,0 +1,664 @@
+#include "pysam.h"
+
+/* vcfconcat.c -- Concatenate or combine VCF/BCF files.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <string.h>
+#include <errno.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/kseq.h>
+#include "bcftools.h"
+
+typedef struct _args_t
+{
+ bcf_srs_t *files;
+ htsFile *out_fh;
+ int output_type, n_threads;
+ bcf_hdr_t *out_hdr;
+ int *seen_seq;
+
+ // phasing
+ int *start_pos, start_tid, ifname;
+ int *swap_phase, nswap, *nmatch, *nmism;
+ bcf1_t **buf;
+ int nbuf, mbuf, prev_chr, min_PQ, prev_pos_check;
+ int32_t *GTa, *GTb, mGTa, mGTb, *phase_qual, *phase_set;
+
+ char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list;
+ int argc, nfnames, allow_overlaps, phased_concat, regions_is_file;
+ int compact_PS, phase_set_changed;
+}
+args_t;
+
+static void init_data(args_t *args)
+{
+ bcf1_t *line = NULL;
+
+ // With phased concat, the chunks overlap and come in the right order. To
+ // avoid opening all files at once, store start positions to recognise need
+ // for the next one. This way we can keep only two open chunks at once.
+ if ( args->phased_concat )
+ {
+ args->start_pos = (int*) malloc(sizeof(int)*args->nfnames);
+ line = bcf_init();
+ }
+
+ kstring_t str = {0,0,0};
+ int i, prev_chrid = -1;
+ for (i=0; i<args->nfnames; i++)
+ {
+ htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
+ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
+ args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
+ if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
+ error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);
+
+ int j;
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++)
+ if ( strcmp(args->out_hdr->samples[j],hdr->samples[j]) )
+ error("Different sample names in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);
+
+ if ( args->phased_concat )
+ {
+ int ret = bcf_read(fp, hdr, line);
+ if ( ret!=0 ) args->start_pos[i] = -2; // empty file
+ else
+ {
+ int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line));
+ args->start_pos[i] = chrid==prev_chrid ? line->pos : -1;
+ prev_chrid = chrid;
+ }
+ }
+ bcf_hdr_destroy(hdr);
+ hts_close(fp);
+ }
+ free(str.s);
+ if ( line ) bcf_destroy(line);
+
+ args->seen_seq = (int*) calloc(args->out_hdr->n[BCF_DT_CTG],sizeof(int));
+
+ if ( args->phased_concat )
+ {
+ bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">");
+ bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
+ }
+ bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+
+ bcf_hdr_write(args->out_fh, args->out_hdr);
+
+ if ( args->allow_overlaps )
+ {
+ args->files = bcf_sr_init();
+ args->files->require_index = 1;
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ if ( args->remove_dups )
+ {
+ if ( !strcmp(args->remove_dups,"snps") ) args->files->collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(args->remove_dups,"indels") ) args->files->collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(args->remove_dups,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
+ else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE;
+ else error("The -D string \"%s\" not recognised.\n", args->remove_dups);
+ }
+ for (i=0; i<args->nfnames; i++)
+ if ( !bcf_sr_add_reader(args->files,args->fnames[i]) ) error("Failed to open %s: %s\n", args->fnames[i],bcf_sr_strerror(args->files->errnum));
+ }
+ else if ( args->phased_concat )
+ {
+ // Remove empty files from the list
+ int nok = 0;
+ while (1)
+ {
+ while ( nok<args->nfnames && args->start_pos[nok]!=-2 ) nok++;
+ if ( nok==args->nfnames ) break;
+
+ i = nok;
+ while ( i<args->nfnames && args->start_pos[i]==-2 ) i++;
+ if ( i==args->nfnames ) break;
+
+ int tmp = args->start_pos[nok]; args->start_pos[nok] = args->start_pos[i]; args->start_pos[i] = tmp;
+ char *str = args->fnames[nok]; args->fnames[nok] = args->fnames[i]; args->fnames[i] = str;
+ }
+ for (i=nok; i<args->nfnames; i++) free(args->fnames[i]);
+ args->nfnames = nok;
+
+ for (i=1; i<args->nfnames; i++)
+ if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]<args->start_pos[i-1] )
+ error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]);
+
+ args->prev_chr = -1;
+ args->swap_phase = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
+ args->nmatch = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
+ args->nmism = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
+ args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
+ args->phase_set = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
+ args->files = bcf_sr_init();
+ args->files->require_index = 1;
+ args->ifname = 0;
+ }
+}
+
+static void destroy_data(args_t *args)
+{
+ int i;
+ for (i=0; i<args->nfnames; i++) free(args->fnames[i]);
+ free(args->fnames);
+ if ( args->files ) bcf_sr_destroy(args->files);
+ if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
+ bcf_hdr_destroy(args->out_hdr);
+ free(args->seen_seq);
+ free(args->start_pos);
+ free(args->swap_phase);
+ for (i=0; i<args->mbuf; i++) bcf_destroy(args->buf[i]);
+ free(args->buf);
+ free(args->GTa);
+ free(args->GTb);
+ free(args->nmatch);
+ free(args->nmism);
+ free(args->phase_qual);
+ free(args->phase_set);
+}
+
+int vcf_write_line(htsFile *fp, kstring_t *line);
+
+#define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
+static void phase_update(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec)
+{
+ int i, nGTs = bcf_get_genotypes(hdr, rec, &args->GTa, &args->mGTa);
+ if ( nGTs <= 0 ) return; // GT field is not present
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ {
+ if ( !args->swap_phase[i] ) continue;
+ int *gt = &args->GTa[i*2];
+ if ( bcf_gt_is_missing(gt[0]) || gt[1]==bcf_int32_vector_end ) continue;
+ SWAP(int, gt[0], gt[1]);
+ gt[1] |= 1;
+ }
+ bcf_update_genotypes(hdr,rec,args->GTa,nGTs);
+}
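
The flip above relies on htslib's packed GT encoding: each allele is stored as (index+1)<<1 with bit 0 acting as the phase flag, so swapping the two values and re-setting bit 0 on the second one turns 0|1 into 1|0. A minimal stand-alone sketch of that arithmetic (hypothetical example, not part of the upstream source):

    #include <stdio.h>
    #include <htslib/vcf.h>

    int main(void)
    {
        /* "0|1" as stored by htslib: the first allele is unphased-encoded,
           the phase flag is carried on the second allele */
        int32_t gt[2] = { bcf_gt_unphased(0), bcf_gt_phased(1) };

        /* the same swap phase_update() performs for flipped samples */
        int32_t t = gt[0]; gt[0] = gt[1]; gt[1] = t;
        gt[1] |= 1;                         /* keep the pair phased */

        printf("%d%c%d\n", bcf_gt_allele(gt[0]),
               bcf_gt_is_phased(gt[1]) ? '|' : '/',
               bcf_gt_allele(gt[1]));       /* prints 1|0 */
        return 0;
    }
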
+
+static void phased_flush(args_t *args)
+{
+ if ( !args->nbuf ) return;
+
+ bcf_hdr_t *ahdr = args->files->readers[0].header;
+ bcf_hdr_t *bhdr = args->files->readers[1].header;
+
+ int i, j, nsmpl = bcf_hdr_nsamples(args->out_hdr);
+ static int gt_absent_warned = 0;
+
+ for (i=0; i<args->nbuf; i+=2)
+ {
+ bcf1_t *arec = args->buf[i];
+ bcf1_t *brec = args->buf[i+1];
+
+ int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa);
+ if ( nGTs < 0 )
+ {
+ if ( !gt_absent_warned )
+ {
+ fprintf(pysamerr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1);
+ gt_absent_warned = 1;
+ }
+ continue;
+ }
+ if ( nGTs != 2*nsmpl ) continue; // not diploid
+ nGTs = bcf_get_genotypes(bhdr, brec, &args->GTb, &args->mGTb);
+ if ( nGTs < 0 )
+ {
+ if ( !gt_absent_warned )
+ {
+ fprintf(pysamerr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1);
+ gt_absent_warned = 1;
+ }
+ continue;
+ }
+ if ( nGTs != 2*nsmpl ) continue; // not diploid
+
+ for (j=0; j<nsmpl; j++)
+ {
+ int *gta = &args->GTa[j*2];
+ int *gtb = &args->GTb[j*2];
+ if ( gta[1]==bcf_int32_vector_end || gtb[1]==bcf_int32_vector_end ) continue;
+ if ( bcf_gt_is_missing(gta[0]) || bcf_gt_is_missing(gta[1]) || bcf_gt_is_missing(gtb[0]) || bcf_gt_is_missing(gtb[1]) ) continue;
+ if ( !bcf_gt_is_phased(gta[1]) || !bcf_gt_is_phased(gtb[1]) ) continue;
+ if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gta[1]) || bcf_gt_allele(gtb[0])==bcf_gt_allele(gtb[1]) ) continue;
+ if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[0]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[1]) )
+ {
+ if ( args->swap_phase[j] ) args->nmism[j]++; else args->nmatch[j]++;
+ }
+ if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[1]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[0]) )
+ {
+ if ( args->swap_phase[j] ) args->nmatch[j]++; else args->nmism[j]++;
+ }
+ }
+ }
+ for (i=0; i<args->nbuf/2; i+=2)
+ {
+ bcf1_t *arec = args->buf[i];
+ bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
+ if ( args->nswap )
+ phase_update(args, args->out_hdr, arec);
+ if ( !args->compact_PS || args->phase_set_changed )
+ {
+ bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl);
+ args->phase_set_changed = 0;
+ }
+ bcf_write(args->out_fh, args->out_hdr, arec);
+
+ if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1,args->prev_pos_check+1);
+ args->prev_pos_check = arec->pos;
+ }
+ args->nswap = 0;
+ for (j=0; j<nsmpl; j++)
+ {
+ if ( args->nmatch[j] >= args->nmism[j] )
+ args->swap_phase[j] = 0;
+ else
+ {
+ args->swap_phase[j] = 1;
+ args->nswap++;
+ }
+ if ( args->nmatch[j] && args->nmism[j] )
+ {
+ // Entropy-inspired quality. The factor 0.7 shifts and scales to (0,1)
+ double f = (double)args->nmatch[j]/(args->nmatch[j]+args->nmism[j]);
+ args->phase_qual[j] = 99*(0.7 + f*log(f) + (1-f)*log(1-f))/0.7;
+ }
+ else
+ args->phase_qual[j] = 99;
+ args->nmatch[j] = 0;
+ args->nmism[j] = 0;
+ }
+ int PQ_printed = 0;
+ for (; i<args->nbuf; i+=2)
+ {
+ bcf1_t *brec = args->buf[i+1];
+ bcf_translate(args->out_hdr, args->files->readers[1].header, brec);
+ if ( !PQ_printed )
+ {
+ bcf_update_format_int32(args->out_hdr,brec,"PQ",args->phase_qual,nsmpl);
+ PQ_printed = 1;
+ for (j=0; j<nsmpl; j++)
+ if ( args->phase_qual[j] < args->min_PQ )
+ {
+ args->phase_set[j] = brec->pos+1;
+ args->phase_set_changed = 1;
+ }
+ else if ( args->compact_PS ) args->phase_set[j] = bcf_int32_missing;
+ }
+ if ( args->nswap )
+ phase_update(args, args->out_hdr, brec);
+ if ( !args->compact_PS || args->phase_set_changed )
+ {
+ bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl);
+ args->phase_set_changed = 0;
+ }
+ bcf_write(args->out_fh, args->out_hdr, brec);
+
+ if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1,args->prev_pos_check+1);
+ args->prev_pos_check = brec->pos;
+ }
+ args->nbuf = 0;
+}
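
The PQ values written above come from the entropy-inspired expression in the per-sample loop: with f = nmatch/(nmatch+nmism), PQ = 99*(0.7 + f*ln(f) + (1-f)*ln(1-f))/0.7, and PQ = 99 whenever one of the two counts is zero. A small stand-alone sketch of the same computation (hypothetical example, not part of the upstream source):

    #include <math.h>
    #include <stdio.h>

    /* Phasing quality as in phased_flush(): f is the fraction of
       informative het sites agreeing with the current orientation;
       a clear majority gives PQ close to 99, a 50/50 split gives 0. */
    static int phase_qual(int nmatch, int nmism)
    {
        if ( !nmatch || !nmism ) return 99;
        double f = (double)nmatch/(nmatch+nmism);
        return 99*(0.7 + f*log(f) + (1-f)*log(1-f))/0.7;
    }

    int main(void)
    {
        printf("%d %d %d\n", phase_qual(10,0), phase_qual(9,1), phase_qual(5,5));
        /* expected output: 99 53 0 */
        return 0;
    }
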
+
+static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec)
+{
+ if ( arec && arec->errcode )
+ error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1, args->files->readers[0].fname);
+ if ( brec && brec->errcode )
+ error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1, args->files->readers[1].fname);
+
+ int i, nsmpl = bcf_hdr_nsamples(args->out_hdr);
+ int chr_id = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,arec));
+ if ( args->prev_chr<0 || args->prev_chr!=chr_id )
+ {
+ if ( args->prev_chr>=0 ) phased_flush(args);
+
+ for (i=0; i<nsmpl; i++)
+ args->phase_set[i] = arec->pos+1;
+ args->phase_set_changed = 1;
+
+ if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", bcf_seqname(args->files->readers[0].header,arec));
+ args->seen_seq[chr_id] = 1;
+ args->prev_chr = chr_id;
+ args->prev_pos_check = -1;
+ }
+
+ if ( !brec )
+ {
+ bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
+ if ( args->nswap )
+ phase_update(args, args->out_hdr, arec);
+ if ( !args->compact_PS || args->phase_set_changed )
+ {
+ bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl);
+ args->phase_set_changed = 0;
+ }
+ bcf_write(args->out_fh, args->out_hdr, arec);
+
+ if ( arec->pos < args->prev_pos_check )
+ error("FIXME, disorder: %s:%d in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1);
+ args->prev_pos_check = arec->pos;
+ return;
+ }
+
+ int m = args->mbuf;
+ args->nbuf += 2;
+ hts_expand(bcf1_t*,args->nbuf,args->mbuf,args->buf);
+ for (i=m; i<args->mbuf; i++)
+ args->buf[i] = bcf_init1();
+
+ SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->buf[args->nbuf-2]);
+ SWAP(bcf1_t*, args->files->readers[1].buffer[0], args->buf[args->nbuf-1]);
+}
+
+static void concat(args_t *args)
+{
+ int i;
+ if ( args->phased_concat ) // phased concat
+ {
+ // keep only two open files at a time
+ while ( args->ifname < args->nfnames )
+ {
+ int new_file = 0;
+ while ( args->files->nreaders < 2 && args->ifname < args->nfnames )
+ {
+ if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
+ new_file = 1;
+
+ args->ifname++;
+ if ( args->start_pos[args->ifname-1]==-1 ) break; // new chromosome, start with only one file open
+ if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome
+ }
+
+ // is there a line from the previous run? Seek the newly opened reader to that position
+ int seek_pos = -1;
+ int seek_chr = -1;
+ if ( bcf_sr_has_line(args->files,0) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos);
+ seek_pos = line->pos;
+ seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line));
+ }
+ else if ( new_file )
+ bcf_sr_seek(args->files,NULL,0); // set to start
+
+ int nret;
+ while ( (nret = bcf_sr_next_line(args->files)) )
+ {
+ if ( !bcf_sr_has_line(args->files,0) ) // no input from the first reader
+ {
+ // We are assuming a perfect overlap; sites which are not present in both files are dropped
+ if ( ! bcf_sr_region_done(args->files,0) ) continue;
+
+ phased_flush(args);
+ bcf_sr_remove_reader(args->files, 0);
+ }
+
+ // Get a line to learn about current position
+ for (i=0; i<args->files->nreaders; i++)
+ if ( bcf_sr_has_line(args->files,i) ) break;
+ bcf1_t *line = bcf_sr_get_line(args->files,i);
+
+ // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
+ if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
+ seek_pos = seek_chr = -1;
+
+ // Check if the position overlaps with the next, yet unopened, reader
+ int must_seek = 0;
+ while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
+ {
+ must_seek = 1;
+ if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
+ args->ifname++;
+ }
+ if ( must_seek )
+ {
+ bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
+ seek_pos = line->pos;
+ seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));
+ continue;
+ }
+
+ // We are assuming a perfect overlap; sites which are not present in both files are dropped
+ if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;
+
+ phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);
+ }
+
+ if ( args->files->nreaders )
+ {
+ phased_flush(args);
+ while ( args->files->nreaders )
+ bcf_sr_remove_reader(args->files, 0);
+ }
+ }
+ }
+ else if ( args->files ) // combining overlapping files, using synced reader
+ {
+ while ( bcf_sr_next_line(args->files) )
+ {
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,i);
+ if ( !line ) continue;
+ bcf_translate(args->out_hdr, args->files->readers[i].header, line);
+ bcf_write1(args->out_fh, args->out_hdr, line);
+ if ( args->remove_dups ) break;
+ }
+ }
+ }
+ else // concatenating
+ {
+ kstring_t tmp = {0,0,0};
+ int prev_chr_id = -1, prev_pos;
+ bcf1_t *line = bcf_init();
+ for (i=0; i<args->nfnames; i++)
+ {
+ htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
+ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
+ if ( !fp->is_bin && args->output_type&FT_VCF )
+ {
+ line->max_unpack = BCF_UN_STR;
+ // if VCF is on both input and output, avoid VCF to BCF conversion
+ while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
+ {
+ char *str = fp->line.s;
+ while ( *str && *str!='\t' ) str++;
+ tmp.l = 0;
+ kputsn(fp->line.s,str-fp->line.s,&tmp);
+ int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s);
+ if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
+ if ( prev_chr_id!=chr_id )
+ {
+ prev_pos = -1;
+ if ( args->seen_seq[chr_id] )
+ error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
+ }
+ char *end;
+ int pos = strtol(str+1,&end,10) - 1;
+ if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s);
+ if ( prev_pos > pos )
+ error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s);
+ args->seen_seq[chr_id] = 1;
+ prev_chr_id = chr_id;
+
+ if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %d bytes\n", fp->line.l);
+ }
+ }
+ else
+ {
+ // BCF conversion is required
+ line->max_unpack = 0;
+ while ( bcf_read(fp, hdr, line)==0 )
+ {
+ bcf_translate(args->out_hdr, hdr, line);
+
+ if ( prev_chr_id!=line->rid )
+ {
+ prev_pos = -1;
+ if ( args->seen_seq[line->rid] )
+ error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
+ }
+ if ( prev_pos > line->pos )
+ error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
+ args->seen_seq[line->rid] = 1;
+ prev_chr_id = line->rid;
+
+ if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
+ }
+ }
+ bcf_hdr_destroy(hdr);
+ hts_close(fp);
+ }
+ bcf_destroy(line);
+ free(tmp.s);
+ }
+}
+
+static void usage(args_t *args)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Concatenate or combine VCF/BCF files. All source files must have the same sample\n");
+ fprintf(pysamerr, " columns appearing in the same order. The program can be used, for example, to\n");
+ fprintf(pysamerr, " concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel\n");
+ fprintf(pysamerr, " VCF into one. The input files must be sorted by chr and position. The files\n");
+ fprintf(pysamerr, " must be given in the correct order to produce sorted VCF on output unless\n");
+ fprintf(pysamerr, " the -a, --allow-overlaps option is specified.\n");
+ fprintf(pysamerr, "Usage: bcftools concat [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Options:\n");
+ fprintf(pysamerr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n");
+ fprintf(pysamerr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n");
+ fprintf(pysamerr, " -d, --rm-dups <string> Output duplicate records present in multiple files only once: <snps|indels|both|all|none>\n");
+ fprintf(pysamerr, " -D, --remove-duplicates Alias for -d none\n");
+ fprintf(pysamerr, " -f, --file-list <file> Read the list of files from a file.\n");
+ fprintf(pysamerr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
+ fprintf(pysamerr, " -o, --output <file> Write output to a file [standard output]\n");
+ fprintf(pysamerr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(pysamerr, " -q, --min-PQ <int> Break phase set if phasing quality is lower than <int> [30]\n");
+ fprintf(pysamerr, " -r, --regions <region> Restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> Restrict to regions listed in a file\n");
+ fprintf(pysamerr, " --threads <int> Number of extra output compression threads [0]\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_vcfconcat(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->output_fname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ args->min_PQ = 30;
+
+ static struct option loptions[] =
+ {
+ {"compact-PS",no_argument,NULL,'c'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"remove-duplicates",no_argument,NULL,'D'},
+ {"rm-dups",required_argument,NULL,'d'},
+ {"allow-overlaps",no_argument,NULL,'a'},
+ {"ligate",no_argument,NULL,'l'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"file-list",required_argument,NULL,'f'},
+ {"min-PQ",required_argument,NULL,'q'},
+ {NULL,0,NULL,0}
+ };
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:c",loptions,NULL)) >= 0)
+ {
+ switch (c) {
+ case 'c': args->compact_PS = 1; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
+ case 'd': args->remove_dups = optarg; break;
+ case 'D': args->remove_dups = "none"; break;
+ case 'q':
+ args->min_PQ = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
+ break;
+ case 'a': args->allow_overlaps = 1; break;
+ case 'l': args->phased_concat = 1; break;
+ case 'f': args->file_list = optarg; break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ };
+ break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'h':
+ case '?': usage(args); break;
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ while ( optind<argc )
+ {
+ args->nfnames++;
+ args->fnames = (char **)realloc(args->fnames,sizeof(char*)*args->nfnames);
+ args->fnames[args->nfnames-1] = strdup(argv[optind]);
+ optind++;
+ }
+ if ( args->allow_overlaps && args->phased_concat ) args->allow_overlaps = 0;
+ if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n");
+ if ( args->file_list )
+ {
+ if ( args->nfnames ) error("Cannot combine -f with file names on command line.\n");
+ args->fnames = hts_readlines(args->file_list, &args->nfnames);
+ if ( !args->fnames ) error("Could not read the file: %s\n", args->file_list);
+ }
+ if ( !args->nfnames ) usage(args);
+ if ( args->remove_dups && !args->allow_overlaps ) error("The -D option is supported only with -a\n");
+ if ( args->regions_list && !args->allow_overlaps ) error("The -r/-R option is supported only with -a\n");
+ init_data(args);
+ concat(args);
+ destroy_data(args);
+ free(args);
+ return 0;
+}
diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c
new file mode 100644
index 0000000..26166df
--- /dev/null
+++ b/bcftools/vcfconvert.c
@@ -0,0 +1,1448 @@
+/* vcfconvert.c -- convert between VCF/BCF and related formats.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <htslib/faidx.h>
+#include <htslib/vcf.h>
+#include <htslib/bgzf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include <htslib/kseq.h>
+#include "bcftools.h"
+#include "filter.h"
+#include "convert.h"
+#include "tsv2vcf.h"
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+typedef struct _args_t args_t;
+struct _args_t
+{
+ faidx_t *ref;
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+ convert_t *convert;
+ bcf_srs_t *files;
+ bcf_hdr_t *header;
+ void (*convert_func)(struct _args_t *);
+ struct {
+ int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing;
+ } n;
+ kstring_t str;
+ int32_t *gts;
+ float *flt;
+ int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col;
+ int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
+ char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
+ char *outfname, *infname, *ref_fname;
+ int argc, n_threads;
+};
+
+static void destroy_data(args_t *args)
+{
+ if ( args->ref ) fai_destroy(args->ref);
+ if ( args->convert) convert_destroy(args->convert);
+ if ( args->filter ) filter_destroy(args->filter);
+ free(args->samples);
+ if ( args->files ) bcf_sr_destroy(args->files);
+}
+
+static void open_vcf(args_t *args, const char *format_str)
+{
+ args->files = bcf_sr_init();
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list, args->targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( !bcf_sr_add_reader(args->files, args->infname) )
+ error("Failed to open %s: %s\n", args->infname,bcf_sr_strerror(args->files->errnum));
+
+ args->header = args->files->readers[0].header;
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->header, args->filter_str);
+
+ int i, nsamples = 0, *samples = NULL;
+ if ( args->sample_list && strcmp("-",args->sample_list) )
+ {
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file);
+ if ( ret<0 ) error("Error parsing the sample list\n");
+ else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret);
+ }
+
+ if ( args->sample_list[0]!='^' )
+ {
+ // the sample ordering may be different if not negated
+ int n;
+ char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
+ if ( !smpls ) error("Could not parse %s\n", args->sample_list);
+ if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) )
+ error("The number of samples does not match, perhaps some are present multiple times?\n");
+ nsamples = bcf_hdr_nsamples(args->files->readers[0].header);
+ samples = (int*) malloc(sizeof(int)*nsamples);
+ for (i=0; i<n; i++)
+ {
+ samples[i] = bcf_hdr_id2int(args->files->readers[0].header, BCF_DT_SAMPLE,smpls[i]);
+ free(smpls[i]);
+ }
+ free(smpls);
+ }
+ }
+ if ( format_str ) args->convert = convert_init(args->header, samples, nsamples, format_str);
+ free(samples);
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->header, args->filter_str);
+}
+
+static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ args_t *args = (args_t*) usr;
+
+ char tmp, *se = tsv->ss, *ss = tsv->ss;
+ while ( se < tsv->se && *se!=':' ) se++;
+ if ( *se!=':' ) error("Could not parse CHROM in CHROM:POS_REF_ALT id: %s\n", tsv->ss);
+ tmp = *se; *se = 0;
+ rec->rid = bcf_hdr_name2id(args->header,ss);
+ if ( rec->rid<0 ) error("Could not determine sequence name or multiple sequences present: %s\n", tsv->ss);
+ *se = tmp;
+
+ // POS
+ rec->pos = strtol(se+1,&ss,10);
+ if ( ss==se+1 ) error("Could not parse POS in CHROM:POS_REF_ALT: %s\n", tsv->ss);
+ rec->pos--;
+
+ // REF,ALT
+ args->str.l = 0;
+ se = ++ss;
+ while ( se < tsv->se && *se!='_' ) se++;
+ if ( *se!='_' ) error("Could not parse REF in CHROM:POS_REF_ALT id: %s\n", tsv->ss);
+ kputsn(ss,se-ss,&args->str);
+ ss = ++se;
+ while ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) se++;
+ if ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) error("Could not parse ALT in CHROM:POS_REF_ALT id: %s\n", tsv->ss);
+ kputc(',',&args->str);
+ kputsn(ss,se-ss,&args->str);
+ bcf_update_alleles_str(args->header, rec, args->str.s);
+
+ // END - optional
+ if (*se && *se=='_') {
+ long end = strtol(se+1,&ss,10);
+ if ( ss==se+1 ) error("Could not parse END in CHROM:POS_REF_ALT_END: %s\n", tsv->ss);
+ bcf_update_info_int32(args->header, rec, "END", &end, 1);
+ }
+
+ return 0;
+}
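
tsv_setter_chrom_pos_ref_alt() expects ids of the form CHROM:POS_REF_ALT, optionally followed by _END; for example "20:62116619_C_T" yields chromosome "20", 1-based position 62116619 (stored 0-based in rec->pos), REF "C" and ALT "T". A simplified stand-alone sketch of the same decomposition (hypothetical example using sscanf, whereas the setter above walks pointers so it can also handle the optional _END suffix):

    #include <stdio.h>

    int main(void)
    {
        const char *id = "20:62116619_C_T";   /* hypothetical input */
        char chrom[32], ref[64], alt[64];
        long pos;

        if ( sscanf(id, "%31[^:]:%ld_%63[^_]_%63[^_ ]", chrom, &pos, ref, alt) != 4 )
        {
            fprintf(stderr, "Could not parse CHROM:POS_REF_ALT id: %s\n", id);
            return 1;
        }
        printf("chrom=%s pos=%ld ref=%s alt=%s\n", chrom, pos, ref, alt);
        return 0;
    }
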
+static int tsv_setter_verify_pos(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ char *se;
+ int pos = strtol(tsv->ss,&se,10);
+ if ( tsv->ss==se ) error("Could not parse POS: %s\n", tsv->ss);
+ if ( rec->pos != pos-1 ) error("POS mismatch: %s\n", tsv->ss);
+ return 0;
+}
+static int tsv_setter_verify_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ args_t *args = (args_t*) usr;
+ args->rev_als = 0;
+ char tmp = *tsv->se; *tsv->se = 0;
+ if ( strcmp(tsv->ss,rec->d.allele[0]) )
+ {
+ if ( strcmp(tsv->ss,rec->d.allele[1]) ) { *tsv->se = tmp; error("REF/ALT mismatch: [%s][%s]\n", tsv->ss,rec->d.allele[1]); }
+ args->rev_als = 1;
+ }
+ *tsv->se = tmp;
+ while ( *tsv->se && isspace(*tsv->se) ) tsv->se++;
+ tsv->ss = tsv->se;
+ while ( *tsv->se && !isspace(*tsv->se) ) tsv->se++;
+ tmp = *tsv->se; *tsv->se = 0;
+ if ( !args->rev_als && strcmp(tsv->ss,rec->d.allele[1]) ) { *tsv->se = tmp; error("REF/ALT mismatch: [%s][%s]\n", tsv->ss,rec->d.allele[1]); }
+ else if ( args->rev_als && strcmp(tsv->ss,rec->d.allele[0]) ) { *tsv->se = tmp; error("REF/ALT mismatch: [%s][%s]\n", tsv->ss,rec->d.allele[0]); }
+ *tsv->se = tmp;
+ return 0;
+}
+static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ args_t *args = (args_t*) usr;
+ int i, nsamples = bcf_hdr_nsamples(args->header);
+ for (i=0; i<nsamples; i++)
+ {
+ float aa,ab,bb;
+ aa = strtod(tsv->ss, &tsv->se);
+ if ( tsv->ss==tsv->se ) { fprintf(stderr,"Could not parse first value of %d-th sample\n", i+1); return -1; }
+ tsv->ss = tsv->se+1;
+ ab = strtod(tsv->ss, &tsv->se);
+ if ( tsv->ss==tsv->se ) { fprintf(stderr,"Could not parse second value of %d-th sample\n", i+1); return -1; }
+ tsv->ss = tsv->se+1;
+ bb = strtod(tsv->ss, &tsv->se);
+ if ( tsv->ss==tsv->se ) { fprintf(stderr,"Could not parse third value of %d-th sample\n", i+1); return -1; }
+ tsv->ss = tsv->se+1;
+
+ if ( args->rev_als ) { float tmp = bb; bb = aa; aa = tmp; }
+ args->flt[3*i+0] = aa;
+ args->flt[3*i+1] = ab;
+ args->flt[3*i+2] = bb;
+
+ if ( aa >= ab )
+ {
+ if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0);
+ else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
+ }
+ else if ( ab >= bb )
+ {
+ args->gts[2*i+0] = bcf_gt_unphased(0);
+ args->gts[2*i+1] = bcf_gt_unphased(1);
+ }
+ else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
+ }
+ if ( *tsv->se ) error("Could not parse: %s\n", tsv->ss);
+ if ( bcf_update_genotypes(args->header,rec,args->gts,nsamples*2) ) error("Could not update GT field\n");
+ if ( bcf_update_format_float(args->header,rec,"GP",args->flt,nsamples*3) ) error("Could not update GP field\n");
+ return 0;
+}
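
The genotype written alongside GP above is a hard call: the most probable of the three genotypes wins, with ties broken in favour of the earlier one (hom-ref over het over hom-alt). A stand-alone rendering of that rule (hypothetical example, not part of the upstream source):

    #include <stdio.h>

    /* Same decision tree as tsv_setter_gt_gp(): aa/ab/bb are the
       probabilities of 0/0, 0/1 and 1/1 read from the gen file. */
    static const char *hard_call(double aa, double ab, double bb)
    {
        if ( aa >= ab ) return aa >= bb ? "0/0" : "1/1";
        return ab >= bb ? "0/1" : "1/1";
    }

    int main(void)
    {
        printf("%s\n", hard_call(0.969, 0.031, 0.0));  /* 0/0 */
        printf("%s\n", hard_call(0.100, 0.700, 0.2));  /* 0/1 */
        printf("%s\n", hard_call(0.050, 0.150, 0.8));  /* 1/1 */
        return 0;
    }
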
+static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ args_t *args = (args_t*) usr;
+ int i, nsamples = bcf_hdr_nsamples(args->header);
+
+ int32_t a0, a1;
+ if ( args->rev_als ) { a0 = bcf_gt_phased(1); a1 = bcf_gt_phased(0); }
+ else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); }
+
+ // up is short for "unphased"
+ int nup = 0;
+ for (i=0; i<nsamples; i++)
+ {
+ char *ss = tsv->ss + 4*i + nup;
+ int up = 0, all;
+
+ for (all=0; all < 2; all++){
+ // checking for premature ending
+ if ( !ss[0] || !ss[1] || !ss[2] ||
+ (up && (!ss[3] || !ss[4]) ) )
+ {
+ fprintf(stderr,"Wrong number of fields at %d-th sample ([%c][%c][%c]). ",i+1,ss[0],ss[1],ss[2]);
+ return -1;
+ }
+
+ switch(ss[all*2+up]){
+ case '0':
+ args->gts[2*i+all] = a0;
+ break;
+ case '1' :
+ args->gts[2*i+all] = a1;
+ break;
+ case '?' :
+ // there is no macro to express phased missing allele
+ args->gts[2*i+all] = bcf_gt_phased(-1);
+ break;
+ case '-' :
+ args->gts[2*i+all] = bcf_int32_vector_end;
+ break;
+ default :
+ fprintf(stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss);
+ return -1;
+ }
+ if( ss[all*2+up+1]=='*' ) up = up + 1;
+ }
+
+ if(up && up != 2)
+ {
+ fprintf(stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss);
+ return -1;
+ }
+
+ // re-encode both alleles as unphased when the '*' marker flagged the pair as unphased
+ if ( up )
+ {
+ args->gts[2*i] = bcf_gt_unphased(bcf_gt_allele(args->gts[2*i]));
+ args->gts[2*i+1] = bcf_gt_unphased(bcf_gt_allele(args->gts[2*i+1]));
+ }
+ nup = nup + up;
+ }
+ if ( tsv->ss[(nsamples-1)*4+3+nup] )
+ {
+ fprintf(stderr,"nup: %d", nup);
+ fprintf(stderr,"Wrong number of fields (%d-th column = [%c]). ", nsamples*2,tsv->ss[(nsamples-1)*4+nup]);
+ return -1;
+ }
+
+ if ( bcf_update_genotypes(args->header,rec,args->gts,nsamples*2) ) error("Could not update GT field\n");
+ return 0;
+}
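
tsv_setter_haps() reads two allele characters per sample: '0' and '1' select the REF and ALT haplotype, '?' stands for a missing allele, '-' ends the genotype vector (haploid samples), and a trailing '*' downgrades the pair to unphased. A minimal sketch of the per-character mapping onto htslib genotype values (hypothetical example; the real setter additionally tracks the '*' markers and reports parse errors):

    #include <stdio.h>
    #include <htslib/vcf.h>

    static int32_t hap_value(char c)
    {
        switch (c)
        {
            case '0': return bcf_gt_phased(0);       /* REF haplotype */
            case '1': return bcf_gt_phased(1);       /* ALT haplotype */
            case '?': return bcf_gt_phased(-1);      /* missing allele */
            case '-': return bcf_int32_vector_end;   /* haploid: end of vector */
            default : return bcf_gt_missing;         /* the real code errors out here */
        }
    }

    int main(void)
    {
        const char *alleles = "01?-";
        int i;
        for (i=0; alleles[i]; i++)
            printf("'%c' -> %d\n", alleles[i], hap_value(alleles[i]));
        return 0;
    }
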
+static void gensample_to_vcf(args_t *args)
+{
+ /*
+ * Input: IMPUTE2 output (indentation changed here for clarity):
+ *
+ * 20:62116619_C_T 20:62116619 62116619 C T 0.969 0.031 0 ...
+ * --- 20:62116698_C_A 62116698 C A 1 0 0 ...
+ *
+ * The second column is expected in the form CHROM:POS_REF_ALT. We use the second
+ * column because the first can be empty ("--") when filling sites from the
+ * reference panel.
+ *
+ * Output: VCF with filled GT,GP
+ *
+ */
+ kstring_t line = {0,0,0};
+
+ char *gen_fname = NULL, *sample_fname = NULL;
+ sample_fname = strchr(args->infname,',');
+ if ( !sample_fname )
+ {
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.gen.gz", args->infname);
+ gen_fname = strdup(args->str.s);
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.samples", args->infname);
+ sample_fname = strdup(args->str.s);
+ }
+ else
+ {
+ *sample_fname = 0;
+ gen_fname = strdup(args->infname);
+ sample_fname = strdup(sample_fname+1);
+ }
+ htsFile *gen_fh = hts_open(gen_fname, "r");
+ if ( !gen_fh ) error("Could not read: %s\n", gen_fname);
+ if ( hts_getline(gen_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", gen_fname);
+
+ // Find out the chromosome name, sample names, init and print the VCF header
+ args->str.l = 0;
+ char *ss, *se = line.s;
+ while ( *se && !isspace(*se) ) se++;
+ if ( !*se ) error("Could not parse %s: %s\n", gen_fname,line.s);
+ ss = se+1;
+ se = strchr(ss,':');
+ if ( !se ) error("Expected CHROM:POS_REF_ALT in second column of %s\n", gen_fname);
+ kputsn(ss, se-ss, &args->str);
+
+ tsv_t *tsv = tsv_init("-,CHROM_POS_REF_ALT,POS,REF_ALT,GT_GP");
+ tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args);
+ tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL);
+ tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args);
+ tsv_register(tsv, "GT_GP", tsv_setter_gt_gp, args);
+
+ args->header = bcf_hdr_init("w");
+ bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
+ bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+ bcf_hdr_append(args->header, "##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Genotype Probabilities\">");
+ bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
+ bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+
+ int i, nsamples;
+ char **samples = hts_readlist(sample_fname, 1, &nsamples);
+ for (i=2; i<nsamples; i++)
+ {
+ se = samples[i]; while ( *se && !isspace(*se) ) se++;
+ *se = 0;
+ bcf_hdr_add_sample(args->header,samples[i]);
+ }
+ for (i=0; i<nsamples; i++) free(samples[i]);
+ free(samples);
+
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+ bcf_hdr_write(out_fh,args->header);
+ bcf1_t *rec = bcf_init();
+
+ nsamples -= 2;
+ args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
+ args->flt = (float *) malloc(sizeof(float)*nsamples*3);
+
+ do
+ {
+ bcf_clear(rec);
+ args->n.total++;
+ if ( !tsv_parse(tsv, rec, line.s) )
+ bcf_write(out_fh, args->header, rec);
+ else
+ error("Error occurred while parsing: %s\n", line.s);
+ }
+ while ( hts_getline(gen_fh, KS_SEP_LINE, &line)>0 );
+
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ if ( hts_close(gen_fh) ) error("Close failed: %s\n", gen_fname);
+ bcf_hdr_destroy(args->header);
+ bcf_destroy(rec);
+ free(sample_fname);
+ free(gen_fname);
+ free(args->str.s);
+ free(line.s);
+ free(args->gts);
+ free(args->flt);
+ tsv_destroy(tsv);
+
+ fprintf(stderr,"Number of processed rows: \t%d\n", args->n.total);
+}
+
+static void haplegendsample_to_vcf(args_t *args)
+{
+ /*
+ * Convert from IMPUTE2 hap/legend/sample output files to VCF
+ *
+ * hap:
+ * 0 1 0 1
+ * legend:
+ * id position a0 a1
+ * 1:186946386_G_T 186946386 G T
+ * sample:
+ * sample population group sex
+ * sample1 sample1 sample1 2
+ * sample2 sample2 sample2 2
+ *
+ * Output: VCF with filled GT
+ */
+ kstring_t line = {0,0,0};
+
+ char *hap_fname = NULL, *leg_fname = NULL, *sample_fname = NULL;
+ sample_fname = strchr(args->infname,',');
+ if ( !sample_fname )
+ {
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.hap.gz", args->infname);
+ hap_fname = strdup(args->str.s);
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.samples", args->infname);
+ sample_fname = strdup(args->str.s);
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.legend.gz", args->infname);
+ leg_fname = strdup(args->str.s);
+ }
+ else
+ {
+ char *ss = sample_fname, *se = strchr(ss+1,',');
+ if ( !se ) error("Could not parse hap/legend/sample file names: %s\n", args->infname);
+ *ss = 0;
+ *se = 0;
+ hap_fname = strdup(args->infname);
+ leg_fname = strdup(ss+1);
+ sample_fname = strdup(se+1);
+ }
+ htsFile *hap_fh = hts_open(hap_fname, "r");
+ if ( !hap_fh ) error("Could not read: %s\n", hap_fname);
+
+ htsFile *leg_fh = hts_open(leg_fname,"r");
+ if ( !leg_fh ) error("Could not read: %s\n", leg_fname);
+
+ // Eat up first legend line, then determine chromosome name
+ if ( hts_getline(leg_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", leg_fname);
+ if ( hts_getline(leg_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", leg_fname);
+
+ // Find out the chromosome name, sample names, init and print the VCF header
+ args->str.l = 0;
+ char *se = strchr(line.s,':');
+ if ( !se ) error("Expected CHROM:POS_REF_ALT in first column of %s\n", leg_fname);
+ kputsn(line.s, se-line.s, &args->str);
+
+ tsv_t *leg_tsv = tsv_init("CHROM_POS_REF_ALT,POS,REF_ALT");
+ tsv_register(leg_tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args);
+ tsv_register(leg_tsv, "POS", tsv_setter_verify_pos, NULL);
+ tsv_register(leg_tsv, "REF_ALT", tsv_setter_verify_ref_alt, args);
+
+ tsv_t *hap_tsv = tsv_init("HAPS");
+ tsv_register(hap_tsv, "HAPS", tsv_setter_haps, args);
+
+ args->header = bcf_hdr_init("w");
+ bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
+ bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+ bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
+ bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+
+ int i, nrows, nsamples;
+ char **samples = hts_readlist(sample_fname, 1, &nrows);
+ nsamples = nrows - 1;
+
+ // sample_fname should contain a header line, so we need to ignore the first row
+ // returned from hts_readlist (i=1, not i=0)
+ for (i=1; i<nrows; i++)
+ {
+ se = samples[i]; while ( *se && !isspace(*se) ) se++;
+ *se = 0;
+ bcf_hdr_add_sample(args->header,samples[i]);
+ }
+ bcf_hdr_add_sample(args->header,NULL);
+ for (i=0; i<nrows; i++) free(samples[i]);
+ free(samples);
+
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+ bcf_hdr_write(out_fh,args->header);
+ bcf1_t *rec = bcf_init();
+
+ args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
+
+ while (1)
+ {
+ bcf_clear(rec);
+ args->n.total++;
+ if ( tsv_parse(leg_tsv, rec, line.s) )
+ error("Error occurred while parsing %s: %s\n", leg_fname,line.s);
+
+ if ( hts_getline(hap_fh, KS_SEP_LINE, &line)<=0 )
+ error("Different number of records in %s and %s?\n", leg_fname,hap_fname);
+
+ if ( tsv_parse(hap_tsv, rec, line.s) )
+ error("Error occurred while parsing %s: %s\n", hap_fname,line.s);
+
+ bcf_write(out_fh, args->header, rec);
+
+ if ( hts_getline(leg_fh, KS_SEP_LINE, &line)<=0 )
+ {
+ if ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 )
+ error("Different number of records in %s and %s?\n", leg_fname,hap_fname);
+ break;
+ }
+ }
+
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
+ if ( hts_close(leg_fh) ) error("Close failed: %s\n", leg_fname);
+ bcf_hdr_destroy(args->header);
+ bcf_destroy(rec);
+ free(sample_fname);
+ free(hap_fname);
+ free(leg_fname);
+ free(args->str.s);
+ free(line.s);
+ free(args->gts);
+ tsv_destroy(hap_tsv);
+ tsv_destroy(leg_tsv);
+
+ fprintf(stderr,"Number of processed rows: \t%d\n", args->n.total);
+}
+
+static void hapsample_to_vcf(args_t *args)
+{
+ /*
+ * Input: SHAPEIT output
+ *
+ * 20:19995888_A_G 20:19995888 19995888 A G 0 0 0 0 ...
+ *
+ * First column is expected in the form of CHROM:POS_REF_ALT
+ *
+ * Output: VCF with filled GT
+ *
+ */
+ kstring_t line = {0,0,0};
+
+ char *hap_fname = NULL, *sample_fname = NULL;
+ sample_fname = strchr(args->infname,',');
+ if ( !sample_fname )
+ {
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.hap.gz", args->infname);
+ hap_fname = strdup(args->str.s);
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.samples", args->infname);
+ sample_fname = strdup(args->str.s);
+ }
+ else
+ {
+ *sample_fname = 0;
+ hap_fname = strdup(args->infname);
+ sample_fname = strdup(sample_fname+1);
+ }
+ htsFile *hap_fh = hts_open(hap_fname, "r");
+ if ( !hap_fh ) error("Could not read: %s\n", hap_fname);
+ if ( hts_getline(hap_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", hap_fname);
+
+ // Find out the chromosome name, sample names, init and print the VCF header
+ args->str.l = 0;
+ char *se = strchr(line.s,':');
+ if ( !se ) error("Expected CHROM:POS_REF_ALT in first column of %s\n", hap_fname);
+ kputsn(line.s, se-line.s, &args->str);
+
+ tsv_t *tsv = tsv_init("CHROM_POS_REF_ALT,-,POS,REF_ALT,HAPS");
+ tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args);
+ tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL);
+ tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args);
+ tsv_register(tsv, "HAPS", tsv_setter_haps, args);
+
+ args->header = bcf_hdr_init("w");
+ bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
+ bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+ bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
+ bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+
+ int i, nsamples;
+ char **samples = hts_readlist(sample_fname, 1, &nsamples);
+ for (i=2; i<nsamples; i++)
+ {
+ se = samples[i]; while ( *se && !isspace(*se) ) se++;
+ *se = 0;
+ bcf_hdr_add_sample(args->header,samples[i]);
+ }
+ bcf_hdr_add_sample(args->header,NULL);
+ for (i=0; i<nsamples; i++) free(samples[i]);
+ free(samples);
+
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+ bcf_hdr_write(out_fh,args->header);
+ bcf1_t *rec = bcf_init();
+
+ nsamples -= 2;
+ args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
+
+ do
+ {
+ bcf_clear(rec);
+ args->n.total++;
+ if ( !tsv_parse(tsv, rec, line.s) )
+ bcf_write(out_fh, args->header, rec);
+ else
+ error("Error occurred while parsing: %s\n", line.s);
+ }
+ while ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 );
+
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
+ bcf_hdr_destroy(args->header);
+ bcf_destroy(rec);
+ free(sample_fname);
+ free(hap_fname);
+ free(args->str.s);
+ free(line.s);
+ free(args->gts);
+ tsv_destroy(tsv);
+
+ fprintf(stderr,"Number of processed rows: \t%d\n", args->n.total);
+}
+
+static void vcf_to_gensample(args_t *args)
+{
+ kstring_t str = {0,0,0};
+
+ // insert chrom as first column if needed
+ if(args->output_chrom_first_col)
+ kputs("%CHROM ", &str);
+ else
+ kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str);
+
+ // insert rsid as second column if needed
+ if(args->output_vcf_ids)
+ kputs("%ID ", &str);
+ else
+ kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str);
+
+ kputs("%POS %REF %FIRST_ALT", &str);
+ if ( !args->tag || !strcmp(args->tag,"GT") ) kputs("%_GT_TO_PROB3",&str);
+ else if ( !strcmp(args->tag,"PL") ) kputs("%_PL_TO_PROB3",&str);
+ else if ( !strcmp(args->tag,"GP") ) kputs("%_GP_TO_PROB3",&str);
+ else error("todo: --tag %s\n", args->tag);
+ kputs("\n", &str);
+ open_vcf(args,str.s);
+
+ int ret, gen_compressed = 1, sample_compressed = 0;
+ char *gen_fname = NULL, *sample_fname = NULL;
+ str.l = 0;
+ kputs(args->outfname,&str);
+ int n_files, i;
+ char **files = hts_readlist(str.s, 0, &n_files);
+ if ( n_files==1 )
+ {
+ int l = str.l;
+ kputs(".samples",&str);
+ sample_fname = strdup(str.s);
+ str.l = l;
+ kputs(".gen.gz",&str);
+ gen_fname = strdup(str.s);
+ }
+ else if ( n_files==2 )
+ {
+ if (strlen(files[0]) && strcmp(files[0],".")!=0) gen_fname = strdup(files[0]);
+ if (strlen(files[1]) && strcmp(files[1],".")!=0) sample_fname = strdup(files[1]);
+ }
+ else
+ {
+ error("Error parsing --gensample filenames: %s\n", args->outfname);
+ }
+ for (i=0; i<n_files; i++) free(files[i]);
+ free(files);
+
+ if ( gen_fname && (strlen(gen_fname)<3 || strcasecmp(".gz",gen_fname+strlen(gen_fname)-3)) ) gen_compressed = 0;
+ if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
+
+ if (gen_fname) fprintf(stderr, "Gen file: %s\n", gen_fname);
+ if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
+
+ // write samples file
+ if (sample_fname) {
+ int i;
+ BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
+ str.l = 0;
+ kputs("ID_1 ID_2 missing\n0 0 0\n", &str);
+ ret = bgzf_write(sout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
+ for (i=0; i<bcf_hdr_nsamples(args->header); i++)
+ {
+ str.l = 0;
+ ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
+ ret = bgzf_write(sout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
+ }
+ if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
+ free(sample_fname);
+ }
+ if (!gen_fname) {
+ if ( str.m ) free(str.s);
+ return;
+ }
+
+ int prev_rid = -1, prev_pos = -1;
+ int no_alt = 0, non_biallelic = 0, filtered = 0, ndup = 0, nok = 0;
+ BGZF *gout = bgzf_open(gen_fname, gen_compressed ? "wg" : "wu");
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) { filtered++; continue; }
+ }
+
+ // ALT allele is required
+ if ( line->n_allele<2 ) { no_alt++; continue; }
+
+ // biallelic required
+ if ( line->n_allele>2 ) {
+ if (!non_biallelic)
+ fprintf(stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
+ non_biallelic++;
+ continue;
+ }
+
+        // skip duplicate lines, otherwise shapeit complains
+ if ( prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; }
+ prev_rid = line->rid;
+ prev_pos = line->pos;
+
+ str.l = 0;
+ convert_line(args->convert, line, &str);
+ if ( str.l )
+ {
+ int ret = bgzf_write(gout, str.s, str.l);
+ if ( ret!= str.l ) error("Error writing %s: %s\n", gen_fname,strerror(errno));
+ nok++;
+ }
+ }
+ fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
+ nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup);
+
+ if ( str.m ) free(str.s);
+ if ( bgzf_close(gout)!=0 ) error("Error closing %s: %s\n", gen_fname,strerror(errno));
+ free(gen_fname);
+}
+
+static void vcf_to_haplegendsample(args_t *args)
+{
+ kstring_t str = {0,0,0};
+ if ( args->hap2dip )
+ kputs("%_GT_TO_HAP2\n", &str);
+ else
+ kputs("%_GT_TO_HAP\n", &str);
+ open_vcf(args,str.s);
+
+ int ret, hap_compressed = 1, legend_compressed = 1, sample_compressed = 0;
+ char *hap_fname = NULL, *legend_fname = NULL, *sample_fname = NULL;
+ str.l = 0;
+ kputs(args->outfname,&str);
+ int n_files, i;
+ char **files = hts_readlist(str.s, 0, &n_files);
+ if ( n_files==1 )
+ {
+ int l = str.l;
+ kputs(".samples",&str);
+ sample_fname = strdup(str.s);
+ str.l = l;
+ kputs(".legend.gz",&str);
+ legend_fname = strdup(str.s);
+ str.l = l;
+ kputs(".hap.gz",&str);
+ hap_fname = strdup(str.s);
+ }
+ else if ( n_files==3 )
+ {
+ if (strlen(files[0]) && strcmp(files[0],".")!=0) hap_fname = strdup(files[0]);
+ if (strlen(files[1]) && strcmp(files[1],".")!=0) legend_fname = strdup(files[1]);
+ if (strlen(files[2]) && strcmp(files[2],".")!=0) sample_fname = strdup(files[2]);
+ }
+ else
+ {
+ error("Error parsing --hapslegendsample filenames: %s\n", args->outfname);
+ }
+ for (i=0; i<n_files; i++) free(files[i]);
+ free(files);
+
+ if ( hap_fname && (strlen(hap_fname)<3 || strcasecmp(".gz",hap_fname+strlen(hap_fname)-3)) ) hap_compressed = 0;
+ if ( legend_fname && (strlen(legend_fname)<3 || strcasecmp(".gz",legend_fname+strlen(legend_fname)-3)) ) legend_compressed = 0;
+ if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
+
+ if (hap_fname) fprintf(stderr, "Haps file: %s\n", hap_fname);
+ if (legend_fname) fprintf(stderr, "Legend file: %s\n", legend_fname);
+ if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
+
+ // write samples file
+ if (sample_fname) {
+ int i;
+ BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
+ str.l = 0;
+ kputs("sample population group sex\n", &str);
+ ret = bgzf_write(sout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
+ for (i=0; i<bcf_hdr_nsamples(args->header); i++)
+ {
+ str.l = 0;
+ ksprintf(&str, "%s %s %s 2\n", args->header->samples[i], args->header->samples[i], args->header->samples[i]);
+ ret = bgzf_write(sout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
+ }
+ if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
+ free(sample_fname);
+ }
+ if (!hap_fname && !legend_fname) {
+ if ( str.m ) free(str.s);
+ return;
+ }
+
+ // open haps and legend outputs
+ BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
+ BGZF *lout = legend_fname ? bgzf_open(legend_fname, legend_compressed ? "wg" : "wu") : NULL;
+ if (legend_fname) {
+ str.l = 0;
+ kputs("id position a0 a1\n", &str);
+ ret = bgzf_write(lout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", legend_fname, strerror(errno));
+ }
+
+ int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0;
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) { filtered++; continue; }
+ }
+
+ // ALT allele is required
+ if ( line->n_allele<2 ) { no_alt++; continue; }
+ // biallelic required
+ if ( line->n_allele>2 ) {
+ if (!non_biallelic)
+ fprintf(stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
+ non_biallelic++;
+ continue;
+ }
+
+ str.l = 0;
+ convert_line(args->convert, line, &str);
+ if ( !str.l ) continue;
+
+ // write haps file
+ if (hap_fname) {
+ ret = bgzf_write(hout, str.s, str.l); // write hap file
+ if ( ret != str.l ) error("Error writing %s: %s\n", hap_fname, strerror(errno));
+ }
+ if (legend_fname) {
+ str.l = 0;
+ if ( args->output_vcf_ids && (line->d.id[0]!='.' || line->d.id[1]!=0) )
+ ksprintf(&str, "%s %d %s %s\n", line->d.id, line->pos+1, line->d.allele[0], line->d.allele[1]);
+ else
+ ksprintf(&str, "%s:%d_%s_%s %d %s %s\n", bcf_seqname(args->header, line), line->pos+1, line->d.allele[0], line->d.allele[1], line->pos+1, line->d.allele[0], line->d.allele[1]);
+
+ // write legend file
+ ret = bgzf_write(lout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", legend_fname, strerror(errno));
+ }
+ nok++;
+ }
+ fprintf(stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok,no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered);
+ if ( str.m ) free(str.s);
+ if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno));
+ if ( lout && bgzf_close(lout)!=0 ) error("Error closing %s: %s\n", legend_fname, strerror(errno));
+ if (hap_fname) free(hap_fname);
+ if (legend_fname) free(legend_fname);
+}
+
+static void vcf_to_hapsample(args_t *args)
+{
+ /*
+ * WTCCC style haplotypes file
+ * see https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html#hapsample
+ *
+ * These are essentially the haplotypes from the impute2 format with some
+ * legend info tacked on to the first 5 columns
+ *
+ */
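+    /*
+     * Illustrative output line (two haplotype columns per sample):
+     *   20 20:19995888_A_G 19995888 A G 0 1 0 0 ...
+     */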
+ kstring_t str = {0,0,0};
+
+ // print ID instead of CHROM:POS_REF_ALT1
+ if ( args->output_vcf_ids )
+ kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str);
+ else
+ kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
+
+ if ( args->hap2dip )
+ kputs("%_GT_TO_HAP2\n", &str);
+ else
+ kputs("%_GT_TO_HAP\n", &str);
+ open_vcf(args,str.s);
+
+ int ret, hap_compressed = 1, sample_compressed = 0;
+ char *hap_fname = NULL, *sample_fname = NULL;
+ str.l = 0;
+ kputs(args->outfname,&str);
+ int n_files, i;
+ char **files = hts_readlist(str.s, 0, &n_files);
+ if ( n_files==1 )
+ {
+ int l = str.l;
+ kputs(".sample",&str);
+ sample_fname = strdup(str.s);
+ str.l = l;
+ kputs(".hap.gz",&str);
+ hap_fname = strdup(str.s);
+ }
+ else if ( n_files==2 )
+ {
+ if (strlen(files[0]) && strcmp(files[0],".")!=0) hap_fname = strdup(files[0]);
+ if (strlen(files[1]) && strcmp(files[1],".")!=0) sample_fname = strdup(files[1]);
+ }
+ else
+ {
+ error("Error parsing --hapsample filenames: %s\n", args->outfname);
+ }
+ for (i=0; i<n_files; i++) free(files[i]);
+ free(files);
+
+ if ( hap_fname && (strlen(hap_fname)<3 || strcasecmp(".gz",hap_fname+strlen(hap_fname)-3)) ) hap_compressed = 0;
+ if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
+
+ if (hap_fname) fprintf(stderr, "Haps file: %s\n", hap_fname);
+ if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
+
+ // write samples file
+ if (sample_fname) {
+ int i;
+ BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
+ str.l = 0;
+ kputs("ID_1 ID_2 missing\n0 0 0\n", &str);
+ ret = bgzf_write(sout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
+ for (i=0; i<bcf_hdr_nsamples(args->header); i++)
+ {
+ str.l = 0;
+ ksprintf(&str, "%s %s 0\n", args->header->samples[i], args->header->samples[i]);
+ ret = bgzf_write(sout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
+ }
+ if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
+ free(sample_fname);
+ }
+ if (!hap_fname) {
+ if ( str.m ) free(str.s);
+ return;
+ }
+
+ // open haps output
+ BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
+
+ int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0;
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) { filtered++; continue; }
+ }
+
+ // ALT allele is required
+ if ( line->n_allele<2 ) { no_alt++; continue; }
+ // biallelic required
+ if ( line->n_allele>2 ) {
+ if (!non_biallelic)
+ fprintf(stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
+ non_biallelic++;
+ continue;
+ }
+
+ str.l = 0;
+ convert_line(args->convert, line, &str);
+ if ( !str.l ) continue;
+
+ // write haps file
+ if (hap_fname) {
+ ret = bgzf_write(hout, str.s, str.l); // write hap file
+ if ( ret != str.l ) error("Error writing %s: %s\n", hap_fname, strerror(errno));
+ }
+ nok++;
+ }
+ fprintf(stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok, no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered);
+ if ( str.m ) free(str.s);
+ if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno));
+ if (hap_fname) free(hap_fname);
+}
+
+static void bcf_hdr_set_chrs(bcf_hdr_t *hdr, faidx_t *fai)
+{
+ int i, n = faidx_nseq(fai);
+ for (i=0; i<n; i++)
+ {
+ const char *seq = faidx_iseq(fai,i);
+ int len = faidx_seq_len(fai, seq);
+ bcf_hdr_printf(hdr, "##contig=<ID=%s,length=%d>", seq,len);
+ }
+}
+static inline int acgt_to_5(char base)
+{
+ if ( base=='A' ) return 0;
+ if ( base=='C' ) return 1;
+ if ( base=='G' ) return 2;
+ if ( base=='T' ) return 3;
+ return 4;
+}
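+/*
+ * Parse one genotype from the AA column of --tsv2vcf input: one or two bases
+ * such as "AG" or "CC"; "-" denotes a missing genotype and "I"/"D" (indels)
+ * cause the record to be skipped. Alleles are numbered in order of first
+ * appearance, with the reference allele as 0.
+ */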
+static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[], int *nals, int ref, int32_t *gts)
+{
+ if ( se - ss > 2 ) return -1; // currently only SNPs
+
+ if ( ss[0]=='-' )
+ {
+ // missing GT
+ gts[0] = bcf_gt_missing;
+ gts[1] = bcf_int32_vector_end;
+ args->n.missing++;
+ return 0;
+ }
+ if ( ss[0]=='I' ) return -2; // skip insertions/deletions for now
+ if ( ss[0]=='D' ) return -2;
+
+ int a0 = acgt_to_5(toupper(ss[0]));
+ int a1 = ss[1] ? acgt_to_5(toupper(ss[1])) : a0;
+ if ( alleles[a0]<0 ) alleles[a0] = (*nals)++;
+ if ( alleles[a1]<0 ) alleles[a1] = (*nals)++;
+
+ gts[0] = bcf_gt_unphased(alleles[a0]);
+ gts[1] = ss[1] ? bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end;
+
+ if ( ref==a0 && ref==a1 ) args->n.hom_rr++; // hom ref: RR
+ else if ( ref==a0 ) args->n.het_ra++; // het: RA
+ else if ( ref==a1 ) args->n.het_ra++; // het: AR
+ else if ( a0==a1 ) args->n.hom_aa++; // hom-alt: AA
+ else args->n.het_aa++; // non-ref het: AA
+
+ return 0;
+}
+static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ args_t *args = (args_t*) usr;
+
+ int len;
+ char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(args->header,rec->rid), rec->pos, rec->pos, &len);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(args->header,rec->rid), rec->pos+1);
+
+ int nals = 1, alleles[5] = { -1, -1, -1, -1, -1 }; // a,c,g,t,n
+ ref[0] = toupper(ref[0]);
+ int iref = acgt_to_5(ref[0]);
+ alleles[iref] = 0;
+
+ rec->n_sample = bcf_hdr_nsamples(args->header);
+
+ int i, ret;
+ for (i=0; i<rec->n_sample; i++)
+ {
+ if ( i>0 )
+ {
+ ret = tsv_next(tsv);
+ if ( ret==-1 ) error("Too few columns for %d samples at %s:%d\n", rec->n_sample,bcf_hdr_id2name(args->header,rec->rid), rec->pos+1);
+ }
+ ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2);
+ if ( ret==-1 ) error("Error parsing the site %s:%d, expected two characters\n", bcf_hdr_id2name(args->header,rec->rid), rec->pos+1);
+ if ( ret==-2 )
+ {
+            // something other than a SNP
+ free(ref);
+ return -1;
+ }
+ }
+
+ args->str.l = 0;
+ kputc(ref[0], &args->str);
+ for (i=0; i<5; i++)
+ {
+ if ( alleles[i]>0 )
+ {
+ kputc(',', &args->str);
+ kputc("ACGTN"[i], &args->str);
+ }
+ }
+ bcf_update_alleles_str(args->header, rec, args->str.s);
+ if ( bcf_update_genotypes(args->header,rec,args->gts,rec->n_sample*2) ) error("Could not update the GT field\n");
+
+ free(ref);
+ return 0;
+}
+
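+/*
+ * tsv_to_vcf converts a tab-separated file with columns ID,CHROM,POS,AA by
+ * default (configurable with --columns) into VCF, e.g. an illustrative row:
+ *   rs123   20   62116619   AG
+ * REF is taken from --fasta-ref and GT is built from the AA genotype column.
+ */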
+static void tsv_to_vcf(args_t *args)
+{
+ if ( !args->ref_fname ) error("--tsv2vcf requires the --fasta-ref option\n");
+ if ( !args->sample_list ) error("--tsv2vcf requires the --samples option\n");
+
+ args->ref = fai_load(args->ref_fname);
+ if ( !args->ref ) error("Could not load the reference %s\n", args->ref_fname);
+
+ args->header = bcf_hdr_init("w");
+ bcf_hdr_set_chrs(args->header, args->ref);
+ bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+ bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+
+ int i, n;
+ char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
+ if ( !smpls ) error("Could not parse %s\n", args->sample_list);
+ for (i=0; i<n; i++)
+ {
+ bcf_hdr_add_sample(args->header, smpls[i]);
+ free(smpls[i]);
+ }
+ free(smpls);
+ bcf_hdr_add_sample(args->header, NULL);
+ args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
+
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+ bcf_hdr_write(out_fh,args->header);
+
+ tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA");
+ if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n");
+ if ( tsv_register(tsv, "POS", tsv_setter_pos, NULL) < 0 ) error("Expected POS column\n");
+ if ( tsv_register(tsv, "ID", tsv_setter_id, args->header) < 0 && !args->columns ) error("Expected ID column\n");
+ if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) error("Expected AA column\n");
+
+ bcf1_t *rec = bcf_init();
+ bcf_float_set_missing(rec->qual);
+
+ kstring_t line = {0,0,0};
+ htsFile *in_fh = hts_open(args->infname, "r");
+ if ( !in_fh ) error("Could not read: %s\n", args->infname);
+ while ( hts_getline(in_fh, KS_SEP_LINE, &line) > 0 )
+ {
+ if ( line.s[0]=='#' ) continue; // skip comments
+ bcf_clear(rec);
+
+ args->n.total++;
+ if ( !tsv_parse(tsv, rec, line.s) )
+ bcf_write(out_fh, args->header, rec);
+ else
+ args->n.skipped++;
+ }
+ if ( hts_close(in_fh) ) error("Close failed: %s\n", args->infname);
+ free(line.s);
+
+ bcf_hdr_destroy(args->header);
+ hts_close(out_fh);
+ tsv_destroy(tsv);
+ bcf_destroy(rec);
+ free(args->str.s);
+ free(args->gts);
+
+ fprintf(stderr,"Rows total: \t%d\n", args->n.total);
+ fprintf(stderr,"Rows skipped: \t%d\n", args->n.skipped);
+ fprintf(stderr,"Missing GTs: \t%d\n", args->n.missing);
+ fprintf(stderr,"Hom RR: \t%d\n", args->n.hom_rr);
+ fprintf(stderr,"Het RA: \t%d\n", args->n.het_ra);
+ fprintf(stderr,"Hom AA: \t%d\n", args->n.hom_aa);
+ fprintf(stderr,"Het AA: \t%d\n", args->n.het_aa);
+}
+
+static void vcf_to_vcf(args_t *args)
+{
+ open_vcf(args,NULL);
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+
+ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
+ bcf_hdr_write(out_fh,hdr);
+
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) continue;
+ }
+ bcf_write(out_fh,hdr,line);
+ }
+ hts_close(out_fh);
+}
+
+static void gvcf_to_vcf(args_t *args)
+{
+ if ( !args->ref_fname ) error("--gvcf2vcf requires the --fasta-ref option\n");
+
+ args->ref = fai_load(args->ref_fname);
+ if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname);
+
+ open_vcf(args,NULL);
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+
+ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
+ bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
+ bcf_hdr_write(out_fh,hdr);
+
+ int32_t *itmp = NULL, nitmp = 0;
+
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) continue;
+ }
+
+ if ( line->n_allele!=1 || !bcf_has_filter(hdr,line,"PASS") )
+ {
+            // Only ALT='.' sites with a PASS filter are treated as gVCF blocks; everything else is written unchanged
+ bcf_write(out_fh,hdr,line);
+ continue;
+ }
+
+ int nend = bcf_get_info_int32(hdr,line,"END",&itmp,&nitmp);
+ if ( nend!=1 )
+ {
+            // No END tag, write the record as is
+ bcf_write(out_fh,hdr,line);
+ continue;
+ }
+ bcf_update_info_int32(hdr,line,"END",NULL,0);
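+        // Expand the gVCF block: emit one record per position up to END,
+        // taking REF from the fasta reference at each position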
+ int pos, len;
+ for (pos=line->pos; pos<itmp[0]; pos++)
+ {
+ line->pos = pos;
+ char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(hdr,line->rid), line->pos+1);
+ // we have already checked above that there is only one allele,
+ // so fine to just update alleles with the ref allele from the fasta
+ bcf_update_alleles_str(hdr, line, &ref[0]);
+ bcf_write(out_fh,hdr,line);
+ }
+ }
+ free(itmp);
+ hts_close(out_fh);
+}
+
+static void usage(void)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Converts VCF/BCF to other formats and back. See man page for file\n");
+ fprintf(stderr, " formats details. When specifying output files explicitly instead\n");
+ fprintf(stderr, " of with <prefix>, one can use '-' for stdout and '.' to suppress.\n");
+ fprintf(stderr, "Usage: bcftools convert [OPTIONS] <input_file>\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "VCF input options:\n");
+ fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
+ fprintf(stderr, " -i, --include <expr> select sites for which the expression is true\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " -s, --samples <list> list of samples to include\n");
+ fprintf(stderr, " -S, --samples-file <file> file of samples to include\n");
+ fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "VCF output options:\n");
+ fprintf(stderr, " -o, --output <file> output file name [stdout]\n");
+ fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
+ fprintf(stderr, " -G, --gensample2vcf <...> <prefix>|<gen-file>,<sample-file>\n");
+ fprintf(stderr, " -g, --gensample <...> <prefix>|<gen-file>,<sample-file>\n");
+ fprintf(stderr, " --tag <string> tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
+ fprintf(stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n");
+ fprintf(stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "gVCF conversion:\n");
+ fprintf(stderr, " --gvcf2vcf expand gVCF reference blocks\n");
+ fprintf(stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "HAP/SAMPLE conversion (output from SHAPEIT):\n");
+ fprintf(stderr, " --hapsample2vcf <...> <prefix>|<haps-file>,<sample-file>\n");
+ fprintf(stderr, " --hapsample <...> <prefix>|<haps-file>,<sample-file>\n");
+ fprintf(stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "HAP/LEGEND/SAMPLE conversion:\n");
+ fprintf(stderr, " -H, --haplegendsample2vcf <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
+ fprintf(stderr, " -h, --haplegendsample <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
+ fprintf(stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "TSV conversion:\n");
+ fprintf(stderr, " --tsv2vcf <file> \n");
+ fprintf(stderr, " -c, --columns <string> columns of the input tsv file [ID,CHROM,POS,AA]\n");
+ fprintf(stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
+ fprintf(stderr, " -s, --samples <list> list of sample names\n");
+ fprintf(stderr, " -S, --samples-file <file> file of sample names\n");
+ fprintf(stderr, "\n");
+ // fprintf(stderr, "PLINK options:\n");
+ // fprintf(stderr, " -p, --plink <prefix>|<ped>,<map>,<fam>|<bed>,<bim>,<fam>|<tped>,<tfam>\n");
+ // fprintf(stderr, " --tped make tped file instead\n");
+ // fprintf(stderr, " --bin make binary bed/fam/bim files\n");
+ // fprintf(stderr, "\n");
+ // fprintf(stderr, "PBWT options:\n");
+ // fprintf(stderr, " -b, --pbwt <prefix> or <pbwt>,<sites>,<sample>,<missing>\n");
+ // fprintf(stderr, "\n");
+ exit(1);
+}
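+
+/*
+ * Illustrative invocations (file names are placeholders):
+ *   bcftools convert -G old -Ob -o new.bcf            # IMPUTE2 gen/sample -> BCF
+ *   bcftools convert -g out --tag GT in.vcf.gz        # VCF -> out.gen.gz + out.samples
+ *   bcftools convert --hapsample2vcf in.hap.gz,in.samples -Oz -o out.vcf.gz
+ */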
+
+int main_vcfconvert(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->outfname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+
+ static struct option loptions[] =
+ {
+ {"include",required_argument,NULL,'i'},
+ {"exclude",required_argument,NULL,'e'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {"samples",required_argument,NULL,'s'},
+ {"samples-file",required_argument,NULL,'S'},
+ {"gensample",required_argument,NULL,'g'},
+ {"gensample2vcf",required_argument,NULL,'G'},
+ {"tag",required_argument,NULL,1},
+ {"chrom",no_argument,NULL,8},
+ {"tsv2vcf",required_argument,NULL,2},
+ {"hapsample",required_argument,NULL,7},
+ {"hapsample2vcf",required_argument,NULL,3},
+ {"vcf-ids",no_argument,NULL,4},
+ {"haploid2diploid",no_argument,NULL,5},
+ {"gvcf2vcf",no_argument,NULL,6},
+ {"haplegendsample",required_argument,NULL,'h'},
+ {"haplegendsample2vcf",required_argument,NULL,'H'},
+ {"columns",required_argument,NULL,'c'},
+ {"fasta-ref",required_argument,NULL,'f'},
+ {NULL,0,NULL,0}
+ };
+ while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; args->targets_is_file = 1; break;
+ case 's': args->sample_list = optarg; break;
+ case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
+ case 'g': args->convert_func = vcf_to_gensample; args->outfname = optarg; break;
+ case 'G': args->convert_func = gensample_to_vcf; args->infname = optarg; break;
+ case 1 : args->tag = optarg; break;
+ case 2 : args->convert_func = tsv_to_vcf; args->infname = optarg; break;
+ case 3 : args->convert_func = hapsample_to_vcf; args->infname = optarg; break;
+ case 4 : args->output_vcf_ids = 1; break;
+ case 5 : args->hap2dip = 1; break;
+ case 6 : args->convert_func = gvcf_to_vcf; break;
+ case 7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break;
+ case 8 : args->output_chrom_first_col = 1; break;
+ case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break;
+ case 'f': args->ref_fname = optarg; break;
+ case 'c': args->columns = optarg; break;
+ case 'o': args->outfname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ if ( !args->infname )
+ {
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) args->infname = "-";
+ }
+ else args->infname = argv[optind];
+ }
+ if ( !args->infname ) usage();
+
+ if ( args->convert_func ) args->convert_func(args);
+ else vcf_to_vcf(args);
+
+ destroy_data(args);
+ free(args);
+ return 0;
+}
diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c
new file mode 100644
index 0000000..03b24b4
--- /dev/null
+++ b/bcftools/vcfconvert.c.pysam.c
@@ -0,0 +1,1450 @@
+#include "pysam.h"
+
+/* vcfconvert.c -- convert between VCF/BCF and related formats.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <htslib/faidx.h>
+#include <htslib/vcf.h>
+#include <htslib/bgzf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include <htslib/kseq.h>
+#include "bcftools.h"
+#include "filter.h"
+#include "convert.h"
+#include "tsv2vcf.h"
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+typedef struct _args_t args_t;
+struct _args_t
+{
+ faidx_t *ref;
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+ convert_t *convert;
+ bcf_srs_t *files;
+ bcf_hdr_t *header;
+ void (*convert_func)(struct _args_t *);
+ struct {
+ int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing;
+ } n;
+ kstring_t str;
+ int32_t *gts;
+ float *flt;
+ int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col;
+ int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
+ char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
+ char *outfname, *infname, *ref_fname;
+ int argc, n_threads;
+};
+
+static void destroy_data(args_t *args)
+{
+ if ( args->ref ) fai_destroy(args->ref);
+ if ( args->convert) convert_destroy(args->convert);
+ if ( args->filter ) filter_destroy(args->filter);
+ free(args->samples);
+ if ( args->files ) bcf_sr_destroy(args->files);
+}
+
+static void open_vcf(args_t *args, const char *format_str)
+{
+ args->files = bcf_sr_init();
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list, args->targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( !bcf_sr_add_reader(args->files, args->infname) )
+ error("Failed to open %s: %s\n", args->infname,bcf_sr_strerror(args->files->errnum));
+
+ args->header = args->files->readers[0].header;
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->header, args->filter_str);
+
+ int i, nsamples = 0, *samples = NULL;
+ if ( args->sample_list && strcmp("-",args->sample_list) )
+ {
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file);
+ if ( ret<0 ) error("Error parsing the sample list\n");
+ else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret);
+ }
+
+ if ( args->sample_list[0]!='^' )
+ {
+ // the sample ordering may be different if not negated
+ int n;
+ char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
+ if ( !smpls ) error("Could not parse %s\n", args->sample_list);
+ if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) )
+ error("The number of samples does not match, perhaps some are present multiple times?\n");
+ nsamples = bcf_hdr_nsamples(args->files->readers[0].header);
+ samples = (int*) malloc(sizeof(int)*nsamples);
+ for (i=0; i<n; i++)
+ {
+ samples[i] = bcf_hdr_id2int(args->files->readers[0].header, BCF_DT_SAMPLE,smpls[i]);
+ free(smpls[i]);
+ }
+ free(smpls);
+ }
+ }
+ if ( format_str ) args->convert = convert_init(args->header, samples, nsamples, format_str);
+ free(samples);
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->header, args->filter_str);
+}
+
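+/*
+ * Parse a CHROM:POS_REF_ALT id (optionally CHROM:POS_REF_ALT_END), such as
+ * "20:62116619_C_T", and fill CHROM, POS and the REF,ALT alleles of the record.
+ */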
+static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ args_t *args = (args_t*) usr;
+
+ char tmp, *se = tsv->ss, *ss = tsv->ss;
+ while ( se < tsv->se && *se!=':' ) se++;
+ if ( *se!=':' ) error("Could not parse CHROM in CHROM:POS_REF_ALT id: %s\n", tsv->ss);
+ tmp = *se; *se = 0;
+ rec->rid = bcf_hdr_name2id(args->header,ss);
+ if ( rec->rid<0 ) error("Could not determine sequence name or multiple sequences present: %s\n", tsv->ss);
+ *se = tmp;
+
+ // POS
+ rec->pos = strtol(se+1,&ss,10);
+ if ( ss==se+1 ) error("Could not parse POS in CHROM:POS_REF_ALT: %s\n", tsv->ss);
+ rec->pos--;
+
+ // REF,ALT
+ args->str.l = 0;
+ se = ++ss;
+ while ( se < tsv->se && *se!='_' ) se++;
+ if ( *se!='_' ) error("Could not parse REF in CHROM:POS_REF_ALT id: %s\n", tsv->ss);
+ kputsn(ss,se-ss,&args->str);
+ ss = ++se;
+ while ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) se++;
+ if ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) error("Could not parse ALT in CHROM:POS_REF_ALT id: %s\n", tsv->ss);
+ kputc(',',&args->str);
+ kputsn(ss,se-ss,&args->str);
+ bcf_update_alleles_str(args->header, rec, args->str.s);
+
+ // END - optional
+ if (*se && *se=='_') {
+        int32_t end = strtol(se+1,&ss,10);  // int32_t so that &end matches what bcf_update_info_int32 expects
+ if ( ss==se+1 ) error("Could not parse END in CHROM:POS_REF_ALT_END: %s\n", tsv->ss);
+ bcf_update_info_int32(args->header, rec, "END", &end, 1);
+ }
+
+ return 0;
+}
+static int tsv_setter_verify_pos(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ char *se;
+ int pos = strtol(tsv->ss,&se,10);
+ if ( tsv->ss==se ) error("Could not parse POS: %s\n", tsv->ss);
+ if ( rec->pos != pos-1 ) error("POS mismatch: %s\n", tsv->ss);
+ return 0;
+}
+static int tsv_setter_verify_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ args_t *args = (args_t*) usr;
+ args->rev_als = 0;
+ char tmp = *tsv->se; *tsv->se = 0;
+ if ( strcmp(tsv->ss,rec->d.allele[0]) )
+ {
+ if ( strcmp(tsv->ss,rec->d.allele[1]) ) { *tsv->se = tmp; error("REF/ALT mismatch: [%s][%s]\n", tsv->ss,rec->d.allele[1]); }
+ args->rev_als = 1;
+ }
+ *tsv->se = tmp;
+ while ( *tsv->se && isspace(*tsv->se) ) tsv->se++;
+ tsv->ss = tsv->se;
+ while ( *tsv->se && !isspace(*tsv->se) ) tsv->se++;
+ tmp = *tsv->se; *tsv->se = 0;
+ if ( !args->rev_als && strcmp(tsv->ss,rec->d.allele[1]) ) { *tsv->se = tmp; error("REF/ALT mismatch: [%s][%s]\n", tsv->ss,rec->d.allele[1]); }
+ else if ( args->rev_als && strcmp(tsv->ss,rec->d.allele[0]) ) { *tsv->se = tmp; error("REF/ALT mismatch: [%s][%s]\n", tsv->ss,rec->d.allele[0]); }
+ *tsv->se = tmp;
+ return 0;
+}
+static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ args_t *args = (args_t*) usr;
+ int i, nsamples = bcf_hdr_nsamples(args->header);
+ for (i=0; i<nsamples; i++)
+ {
+ float aa,ab,bb;
+ aa = strtod(tsv->ss, &tsv->se);
+ if ( tsv->ss==tsv->se ) { fprintf(pysamerr,"Could not parse first value of %d-th sample\n", i+1); return -1; }
+ tsv->ss = tsv->se+1;
+ ab = strtod(tsv->ss, &tsv->se);
+ if ( tsv->ss==tsv->se ) { fprintf(pysamerr,"Could not parse second value of %d-th sample\n", i+1); return -1; }
+ tsv->ss = tsv->se+1;
+ bb = strtod(tsv->ss, &tsv->se);
+ if ( tsv->ss==tsv->se ) { fprintf(pysamerr,"Could not parse third value of %d-th sample\n", i+1); return -1; }
+ tsv->ss = tsv->se+1;
+
+ if ( args->rev_als ) { float tmp = bb; bb = aa; aa = tmp; }
+ args->flt[3*i+0] = aa;
+ args->flt[3*i+1] = ab;
+ args->flt[3*i+2] = bb;
+
+ if ( aa >= ab )
+ {
+ if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0);
+ else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
+ }
+ else if ( ab >= bb )
+ {
+ args->gts[2*i+0] = bcf_gt_unphased(0);
+ args->gts[2*i+1] = bcf_gt_unphased(1);
+ }
+ else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
+ }
+ if ( *tsv->se ) error("Could not parse: %s\n", tsv->ss);
+ if ( bcf_update_genotypes(args->header,rec,args->gts,nsamples*2) ) error("Could not update GT field\n");
+ if ( bcf_update_format_float(args->header,rec,"GP",args->flt,nsamples*3) ) error("Could not update GP field\n");
+ return 0;
+}
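+/*
+ * tsv_setter_haps parses two haplotype columns per sample ("0 1 0 1 ...").
+ * A '*' suffix on both alleles marks the pair as unphased, '?' is a missing
+ * allele and '-' ends a haploid genotype; allele order is swapped when
+ * rev_als is set.
+ */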
+static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ args_t *args = (args_t*) usr;
+ int i, nsamples = bcf_hdr_nsamples(args->header);
+
+ int32_t a0, a1;
+ if ( args->rev_als ) { a0 = bcf_gt_phased(1); a1 = bcf_gt_phased(0); }
+ else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); }
+
+ // up is short for "unphased"
+ int nup = 0;
+ for (i=0; i<nsamples; i++)
+ {
+ char *ss = tsv->ss + 4*i + nup;
+ int up = 0, all;
+
+ for (all=0; all < 2; all++){
+ // checking for premature ending
+ if ( !ss[0] || !ss[1] || !ss[2] ||
+ (up && (!ss[3] || !ss[4]) ) )
+ {
+ fprintf(pysamerr,"Wrong number of fields at %d-th sample ([%c][%c][%c]). ",i+1,ss[0],ss[1],ss[2]);
+ return -1;
+ }
+
+ switch(ss[all*2+up]){
+ case '0':
+ args->gts[2*i+all] = a0;
+ break;
+ case '1' :
+ args->gts[2*i+all] = a1;
+ break;
+ case '?' :
+ // there is no macro to express phased missing allele
+ args->gts[2*i+all] = bcf_gt_phased(-1);
+ break;
+ case '-' :
+ args->gts[2*i+all] = bcf_int32_vector_end;
+ break;
+ default :
+ fprintf(pysamerr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss);
+ return -1;
+ }
+ if( ss[all*2+up+1]=='*' ) up = up + 1;
+ }
+
+ if(up && up != 2)
+ {
+ fprintf(pysamerr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss);
+ return -1;
+ }
+
+ // change alleles to unphased if the alleles are unphased
+ if ( up )
+ {
+ args->gts[2*i] = bcf_gt_unphased(bcf_gt_allele(args->gts[2*i]));
+ args->gts[2*i+1] = bcf_gt_unphased(bcf_gt_allele(args->gts[2*i+1]));
+ }
+ nup = nup + up;
+ }
+ if ( tsv->ss[(nsamples-1)*4+3+nup] )
+ {
+ fprintf(pysamerr,"nup: %d", nup);
+ fprintf(pysamerr,"Wrong number of fields (%d-th column = [%c]). ", nsamples*2,tsv->ss[(nsamples-1)*4+nup]);
+ return -1;
+ }
+
+ if ( bcf_update_genotypes(args->header,rec,args->gts,nsamples*2) ) error("Could not update GT field\n");
+ return 0;
+}
+static void gensample_to_vcf(args_t *args)
+{
+ /*
+ * Input: IMPUTE2 output (indentation changed here for clarity):
+ *
+ * 20:62116619_C_T 20:62116619 62116619 C T 0.969 0.031 0 ...
+ * --- 20:62116698_C_A 62116698 C A 1 0 0 ...
+ *
+ * Second column is expected in the form of CHROM:POS_REF_ALT. We use second
+ * column because the first can be empty ("--") when filling sites from reference
+ * panel.
+ *
+ * Output: VCF with filled GT,GP
+ *
+ */
+ kstring_t line = {0,0,0};
+
+ char *gen_fname = NULL, *sample_fname = NULL;
+ sample_fname = strchr(args->infname,',');
+ if ( !sample_fname )
+ {
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.gen.gz", args->infname);
+ gen_fname = strdup(args->str.s);
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.samples", args->infname);
+ sample_fname = strdup(args->str.s);
+ }
+ else
+ {
+ *sample_fname = 0;
+ gen_fname = strdup(args->infname);
+ sample_fname = strdup(sample_fname+1);
+ }
+ htsFile *gen_fh = hts_open(gen_fname, "r");
+ if ( !gen_fh ) error("Could not read: %s\n", gen_fname);
+ if ( hts_getline(gen_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", gen_fname);
+
+ // Find out the chromosome name, sample names, init and print the VCF header
+ args->str.l = 0;
+ char *ss, *se = line.s;
+ while ( *se && !isspace(*se) ) se++;
+ if ( !*se ) error("Could not parse %s: %s\n", gen_fname,line.s);
+ ss = se+1;
+ se = strchr(ss,':');
+ if ( !se ) error("Expected CHROM:POS_REF_ALT in second column of %s\n", gen_fname);
+ kputsn(ss, se-ss, &args->str);
+
+ tsv_t *tsv = tsv_init("-,CHROM_POS_REF_ALT,POS,REF_ALT,GT_GP");
+ tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args);
+ tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL);
+ tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args);
+ tsv_register(tsv, "GT_GP", tsv_setter_gt_gp, args);
+
+ args->header = bcf_hdr_init("w");
+ bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
+ bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+ bcf_hdr_append(args->header, "##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Genotype Probabilities\">");
+ bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
+ bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+
+ int i, nsamples;
+ char **samples = hts_readlist(sample_fname, 1, &nsamples);
+ for (i=2; i<nsamples; i++)
+ {
+ se = samples[i]; while ( *se && !isspace(*se) ) se++;
+ *se = 0;
+ bcf_hdr_add_sample(args->header,samples[i]);
+ }
+ for (i=0; i<nsamples; i++) free(samples[i]);
+ free(samples);
+
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+ bcf_hdr_write(out_fh,args->header);
+ bcf1_t *rec = bcf_init();
+
+ nsamples -= 2;
+ args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
+ args->flt = (float *) malloc(sizeof(float)*nsamples*3);
+
+ do
+ {
+ bcf_clear(rec);
+ args->n.total++;
+ if ( !tsv_parse(tsv, rec, line.s) )
+ bcf_write(out_fh, args->header, rec);
+ else
+ error("Error occurred while parsing: %s\n", line.s);
+ }
+ while ( hts_getline(gen_fh, KS_SEP_LINE, &line)>0 );
+
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ if ( hts_close(gen_fh) ) error("Close failed: %s\n", gen_fname);
+ bcf_hdr_destroy(args->header);
+ bcf_destroy(rec);
+ free(sample_fname);
+ free(gen_fname);
+ free(args->str.s);
+ free(line.s);
+ free(args->gts);
+ free(args->flt);
+ tsv_destroy(tsv);
+
+ fprintf(pysamerr,"Number of processed rows: \t%d\n", args->n.total);
+}
+
+static void haplegendsample_to_vcf(args_t *args)
+{
+ /*
+ * Convert from IMPUTE2 hap/legend/sample output files to VCF
+ *
+ * hap:
+ * 0 1 0 1
+ * legend:
+ * id position a0 a1
+ * 1:186946386_G_T 186946386 G T
+ * sample:
+ * sample population group sex
+ * sample1 sample1 sample1 2
+ * sample2 sample2 sample2 2
+ *
+ * Output: VCF with filled GT
+ */
+ kstring_t line = {0,0,0};
+
+ char *hap_fname = NULL, *leg_fname = NULL, *sample_fname = NULL;
+ sample_fname = strchr(args->infname,',');
+ if ( !sample_fname )
+ {
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.hap.gz", args->infname);
+ hap_fname = strdup(args->str.s);
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.samples", args->infname);
+ sample_fname = strdup(args->str.s);
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.legend.gz", args->infname);
+ leg_fname = strdup(args->str.s);
+ }
+ else
+ {
+ char *ss = sample_fname, *se = strchr(ss+1,',');
+ if ( !se ) error("Could not parse hap/legend/sample file names: %s\n", args->infname);
+ *ss = 0;
+ *se = 0;
+ hap_fname = strdup(args->infname);
+ leg_fname = strdup(ss+1);
+ sample_fname = strdup(se+1);
+ }
+ htsFile *hap_fh = hts_open(hap_fname, "r");
+ if ( !hap_fh ) error("Could not read: %s\n", hap_fname);
+
+ htsFile *leg_fh = hts_open(leg_fname,"r");
+ if ( !leg_fh ) error("Could not read: %s\n", leg_fname);
+
+ // Eat up first legend line, then determine chromosome name
+ if ( hts_getline(leg_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", leg_fname);
+ if ( hts_getline(leg_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", leg_fname);
+
+ // Find out the chromosome name, sample names, init and print the VCF header
+ args->str.l = 0;
+ char *se = strchr(line.s,':');
+ if ( !se ) error("Expected CHROM:POS_REF_ALT in first column of %s\n", leg_fname);
+ kputsn(line.s, se-line.s, &args->str);
+
+ tsv_t *leg_tsv = tsv_init("CHROM_POS_REF_ALT,POS,REF_ALT");
+ tsv_register(leg_tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args);
+ tsv_register(leg_tsv, "POS", tsv_setter_verify_pos, NULL);
+ tsv_register(leg_tsv, "REF_ALT", tsv_setter_verify_ref_alt, args);
+
+ tsv_t *hap_tsv = tsv_init("HAPS");
+ tsv_register(hap_tsv, "HAPS", tsv_setter_haps, args);
+
+ args->header = bcf_hdr_init("w");
+ bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
+ bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+ bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
+ bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+
+ int i, nrows, nsamples;
+ char **samples = hts_readlist(sample_fname, 1, &nrows);
+ nsamples = nrows - 1;
+
+    // sample_fname is expected to contain a header line, so the first row
+    // returned from hts_readlist is ignored (i starts at 1, not 0)
+ for (i=1; i<nrows; i++)
+ {
+ se = samples[i]; while ( *se && !isspace(*se) ) se++;
+ *se = 0;
+ bcf_hdr_add_sample(args->header,samples[i]);
+ }
+ bcf_hdr_add_sample(args->header,NULL);
+ for (i=0; i<nrows; i++) free(samples[i]);
+ free(samples);
+
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+ bcf_hdr_write(out_fh,args->header);
+ bcf1_t *rec = bcf_init();
+
+ args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
+
+ while (1)
+ {
+ bcf_clear(rec);
+ args->n.total++;
+ if ( tsv_parse(leg_tsv, rec, line.s) )
+ error("Error occurred while parsing %s: %s\n", leg_fname,line.s);
+
+ if ( hts_getline(hap_fh, KS_SEP_LINE, &line)<=0 )
+ error("Different number of records in %s and %s?\n", leg_fname,hap_fname);
+
+ if ( tsv_parse(hap_tsv, rec, line.s) )
+ error("Error occurred while parsing %s: %s\n", hap_fname,line.s);
+
+ bcf_write(out_fh, args->header, rec);
+
+ if ( hts_getline(leg_fh, KS_SEP_LINE, &line)<=0 )
+ {
+ if ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 )
+ error("Different number of records in %s and %s?\n", leg_fname,hap_fname);
+ break;
+ }
+ }
+
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
+ if ( hts_close(leg_fh) ) error("Close failed: %s\n", leg_fname);
+ bcf_hdr_destroy(args->header);
+ bcf_destroy(rec);
+ free(sample_fname);
+ free(hap_fname);
+ free(leg_fname);
+ free(args->str.s);
+ free(line.s);
+ free(args->gts);
+ tsv_destroy(hap_tsv);
+ tsv_destroy(leg_tsv);
+
+ fprintf(pysamerr,"Number of processed rows: \t%d\n", args->n.total);
+}
+
+static void hapsample_to_vcf(args_t *args)
+{
+ /*
+ * Input: SHAPEIT output
+ *
+ * 20:19995888_A_G 20:19995888 19995888 A G 0 0 0 0 ...
+ *
+ * First column is expected in the form of CHROM:POS_REF_ALT
+ *
+ * Output: VCF with filled GT
+ *
+ */
+ kstring_t line = {0,0,0};
+
+ char *hap_fname = NULL, *sample_fname = NULL;
+ sample_fname = strchr(args->infname,',');
+ if ( !sample_fname )
+ {
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.hap.gz", args->infname);
+ hap_fname = strdup(args->str.s);
+ args->str.l = 0;
+ ksprintf(&args->str,"%s.samples", args->infname);
+ sample_fname = strdup(args->str.s);
+ }
+ else
+ {
+ *sample_fname = 0;
+ hap_fname = strdup(args->infname);
+ sample_fname = strdup(sample_fname+1);
+ }
+ htsFile *hap_fh = hts_open(hap_fname, "r");
+ if ( !hap_fh ) error("Could not read: %s\n", hap_fname);
+ if ( hts_getline(hap_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", hap_fname);
+
+ // Find out the chromosome name, sample names, init and print the VCF header
+ args->str.l = 0;
+ char *se = strchr(line.s,':');
+ if ( !se ) error("Expected CHROM:POS_REF_ALT in first column of %s\n", hap_fname);
+ kputsn(line.s, se-line.s, &args->str);
+
+ tsv_t *tsv = tsv_init("CHROM_POS_REF_ALT,-,POS,REF_ALT,HAPS");
+ tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args);
+ tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL);
+ tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args);
+ tsv_register(tsv, "HAPS", tsv_setter_haps, args);
+
+ args->header = bcf_hdr_init("w");
+ bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
+ bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+ bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
+ bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+
+ int i, nsamples;
+ char **samples = hts_readlist(sample_fname, 1, &nsamples);
+ for (i=2; i<nsamples; i++)
+ {
+ se = samples[i]; while ( *se && !isspace(*se) ) se++;
+ *se = 0;
+ bcf_hdr_add_sample(args->header,samples[i]);
+ }
+ bcf_hdr_add_sample(args->header,NULL);
+ for (i=0; i<nsamples; i++) free(samples[i]);
+ free(samples);
+
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+ bcf_hdr_write(out_fh,args->header);
+ bcf1_t *rec = bcf_init();
+
+ nsamples -= 2;
+ args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
+
+ do
+ {
+ bcf_clear(rec);
+ args->n.total++;
+ if ( !tsv_parse(tsv, rec, line.s) )
+ bcf_write(out_fh, args->header, rec);
+ else
+ error("Error occurred while parsing: %s\n", line.s);
+ }
+ while ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 );
+
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
+ bcf_hdr_destroy(args->header);
+ bcf_destroy(rec);
+ free(sample_fname);
+ free(hap_fname);
+ free(args->str.s);
+ free(line.s);
+ free(args->gts);
+ tsv_destroy(tsv);
+
+ fprintf(pysamerr,"Number of processed rows: \t%d\n", args->n.total);
+}
+
+static void vcf_to_gensample(args_t *args)
+{
+ kstring_t str = {0,0,0};
+
+ // insert chrom as first column if needed
+ if(args->output_chrom_first_col)
+ kputs("%CHROM ", &str);
+ else
+ kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str);
+
+ // insert rsid as second column if needed
+ if(args->output_vcf_ids)
+ kputs("%ID ", &str);
+ else
+ kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str);
+
+ kputs("%POS %REF %FIRST_ALT", &str);
+ if ( !args->tag || !strcmp(args->tag,"GT") ) kputs("%_GT_TO_PROB3",&str);
+ else if ( !strcmp(args->tag,"PL") ) kputs("%_PL_TO_PROB3",&str);
+ else if ( !strcmp(args->tag,"GP") ) kputs("%_GP_TO_PROB3",&str);
+ else error("todo: --tag %s\n", args->tag);
+ kputs("\n", &str);
+ open_vcf(args,str.s);
+
+ int ret, gen_compressed = 1, sample_compressed = 0;
+ char *gen_fname = NULL, *sample_fname = NULL;
+ str.l = 0;
+ kputs(args->outfname,&str);
+ int n_files, i;
+ char **files = hts_readlist(str.s, 0, &n_files);
+ if ( n_files==1 )
+ {
+ int l = str.l;
+ kputs(".samples",&str);
+ sample_fname = strdup(str.s);
+ str.l = l;
+ kputs(".gen.gz",&str);
+ gen_fname = strdup(str.s);
+ }
+ else if ( n_files==2 )
+ {
+ if (strlen(files[0]) && strcmp(files[0],".")!=0) gen_fname = strdup(files[0]);
+ if (strlen(files[1]) && strcmp(files[1],".")!=0) sample_fname = strdup(files[1]);
+ }
+ else
+ {
+ error("Error parsing --gensample filenames: %s\n", args->outfname);
+ }
+ for (i=0; i<n_files; i++) free(files[i]);
+ free(files);
+
+ if ( gen_fname && (strlen(gen_fname)<3 || strcasecmp(".gz",gen_fname+strlen(gen_fname)-3)) ) gen_compressed = 0;
+ if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
+
+ if (gen_fname) fprintf(pysamerr, "Gen file: %s\n", gen_fname);
+ if (sample_fname) fprintf(pysamerr, "Sample file: %s\n", sample_fname);
+
+ // write samples file
+ if (sample_fname) {
+ int i;
+ BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
+ str.l = 0;
+ kputs("ID_1 ID_2 missing\n0 0 0\n", &str);
+ ret = bgzf_write(sout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
+ for (i=0; i<bcf_hdr_nsamples(args->header); i++)
+ {
+ str.l = 0;
+ ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
+ ret = bgzf_write(sout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
+ }
+ if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
+ free(sample_fname);
+ }
+ if (!gen_fname) {
+ if ( str.m ) free(str.s);
+ return;
+ }
+
+ int prev_rid = -1, prev_pos = -1;
+ int no_alt = 0, non_biallelic = 0, filtered = 0, ndup = 0, nok = 0;
+ BGZF *gout = bgzf_open(gen_fname, gen_compressed ? "wg" : "wu");
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) { filtered++; continue; }
+ }
+
+ // ALT allele is required
+ if ( line->n_allele<2 ) { no_alt++; continue; }
+
+ // biallelic required
+ if ( line->n_allele>2 ) {
+ if (!non_biallelic)
+ fprintf(pysamerr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
+ non_biallelic++;
+ continue;
+ }
+
+        // skip duplicate lines, otherwise shapeit complains
+ if ( prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; }
+ prev_rid = line->rid;
+ prev_pos = line->pos;
+
+ str.l = 0;
+ convert_line(args->convert, line, &str);
+ if ( str.l )
+ {
+ int ret = bgzf_write(gout, str.s, str.l);
+ if ( ret!= str.l ) error("Error writing %s: %s\n", gen_fname,strerror(errno));
+ nok++;
+ }
+ }
+ fprintf(pysamerr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
+ nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup);
+
+ if ( str.m ) free(str.s);
+ if ( bgzf_close(gout)!=0 ) error("Error closing %s: %s\n", gen_fname,strerror(errno));
+ free(gen_fname);
+}
+
+static void vcf_to_haplegendsample(args_t *args)
+{
+ kstring_t str = {0,0,0};
+ if ( args->hap2dip )
+ kputs("%_GT_TO_HAP2\n", &str);
+ else
+ kputs("%_GT_TO_HAP\n", &str);
+ open_vcf(args,str.s);
+
+ int ret, hap_compressed = 1, legend_compressed = 1, sample_compressed = 0;
+ char *hap_fname = NULL, *legend_fname = NULL, *sample_fname = NULL;
+ str.l = 0;
+ kputs(args->outfname,&str);
+ int n_files, i;
+ char **files = hts_readlist(str.s, 0, &n_files);
+ if ( n_files==1 )
+ {
+ int l = str.l;
+ kputs(".samples",&str);
+ sample_fname = strdup(str.s);
+ str.l = l;
+ kputs(".legend.gz",&str);
+ legend_fname = strdup(str.s);
+ str.l = l;
+ kputs(".hap.gz",&str);
+ hap_fname = strdup(str.s);
+ }
+ else if ( n_files==3 )
+ {
+ if (strlen(files[0]) && strcmp(files[0],".")!=0) hap_fname = strdup(files[0]);
+ if (strlen(files[1]) && strcmp(files[1],".")!=0) legend_fname = strdup(files[1]);
+ if (strlen(files[2]) && strcmp(files[2],".")!=0) sample_fname = strdup(files[2]);
+ }
+ else
+ {
+ error("Error parsing --hapslegendsample filenames: %s\n", args->outfname);
+ }
+ for (i=0; i<n_files; i++) free(files[i]);
+ free(files);
+
+ if ( hap_fname && (strlen(hap_fname)<3 || strcasecmp(".gz",hap_fname+strlen(hap_fname)-3)) ) hap_compressed = 0;
+ if ( legend_fname && (strlen(legend_fname)<3 || strcasecmp(".gz",legend_fname+strlen(legend_fname)-3)) ) legend_compressed = 0;
+ if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
+
+ if (hap_fname) fprintf(pysamerr, "Haps file: %s\n", hap_fname);
+ if (legend_fname) fprintf(pysamerr, "Legend file: %s\n", legend_fname);
+ if (sample_fname) fprintf(pysamerr, "Sample file: %s\n", sample_fname);
+
+ // write samples file
+ if (sample_fname) {
+ int i;
+ BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
+ str.l = 0;
+ kputs("sample population group sex\n", &str);
+ ret = bgzf_write(sout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
+ for (i=0; i<bcf_hdr_nsamples(args->header); i++)
+ {
+ str.l = 0;
+ ksprintf(&str, "%s %s %s 2\n", args->header->samples[i], args->header->samples[i], args->header->samples[i]);
+ ret = bgzf_write(sout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
+ }
+ if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
+ free(sample_fname);
+ }
+ if (!hap_fname && !legend_fname) {
+ if ( str.m ) free(str.s);
+ return;
+ }
+
+ // open haps and legend outputs
+ BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
+ BGZF *lout = legend_fname ? bgzf_open(legend_fname, legend_compressed ? "wg" : "wu") : NULL;
+ if (legend_fname) {
+ str.l = 0;
+ kputs("id position a0 a1\n", &str);
+ ret = bgzf_write(lout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", legend_fname, strerror(errno));
+ }
+
+ int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0;
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) { filtered++; continue; }
+ }
+
+ // ALT allele is required
+ if ( line->n_allele<2 ) { no_alt++; continue; }
+ // biallelic required
+ if ( line->n_allele>2 ) {
+ if (!non_biallelic)
+ fprintf(pysamerr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
+ non_biallelic++;
+ continue;
+ }
+
+ str.l = 0;
+ convert_line(args->convert, line, &str);
+ if ( !str.l ) continue;
+
+ // write haps file
+ if (hap_fname) {
+ ret = bgzf_write(hout, str.s, str.l); // write hap file
+ if ( ret != str.l ) error("Error writing %s: %s\n", hap_fname, strerror(errno));
+ }
+ if (legend_fname) {
+ str.l = 0;
+ if ( args->output_vcf_ids && (line->d.id[0]!='.' || line->d.id[1]!=0) )
+ ksprintf(&str, "%s %d %s %s\n", line->d.id, line->pos+1, line->d.allele[0], line->d.allele[1]);
+ else
+ ksprintf(&str, "%s:%d_%s_%s %d %s %s\n", bcf_seqname(args->header, line), line->pos+1, line->d.allele[0], line->d.allele[1], line->pos+1, line->d.allele[0], line->d.allele[1]);
+
+ // write legend file
+ ret = bgzf_write(lout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", legend_fname, strerror(errno));
+ }
+ nok++;
+ }
+ fprintf(pysamerr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok,no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered);
+ if ( str.m ) free(str.s);
+ if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno));
+ if ( lout && bgzf_close(lout)!=0 ) error("Error closing %s: %s\n", legend_fname, strerror(errno));
+ if (hap_fname) free(hap_fname);
+ if (legend_fname) free(legend_fname);
+}
+
+static void vcf_to_hapsample(args_t *args)
+{
+ /*
+ * WTCCC style haplotypes file
+ * see https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html#hapsample
+ *
+ * These are essentially the haplotypes from the impute2 format with some
+ * legend info tacked on to the first 5 columns
+ *
+ */
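+ // Format sketch (illustrative values, assumed): each output line produced below is
+ // "CHROM ID POS REF ALT" followed by one 0/1 column per sample haplotype, e.g.
+ // 20 20:100_A_G 100 A G 0 1 1 0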
+ kstring_t str = {0,0,0};
+
+ // print ID instead of CHROM:POS_REF_ALT1
+ if ( args->output_vcf_ids )
+ kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str);
+ else
+ kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
+
+ if ( args->hap2dip )
+ kputs("%_GT_TO_HAP2\n", &str);
+ else
+ kputs("%_GT_TO_HAP\n", &str);
+ open_vcf(args,str.s);
+
+ int ret, hap_compressed = 1, sample_compressed = 0;
+ char *hap_fname = NULL, *sample_fname = NULL;
+ str.l = 0;
+ kputs(args->outfname,&str);
+ int n_files, i;
+ char **files = hts_readlist(str.s, 0, &n_files);
+ if ( n_files==1 )
+ {
+ int l = str.l;
+ kputs(".sample",&str);
+ sample_fname = strdup(str.s);
+ str.l = l;
+ kputs(".hap.gz",&str);
+ hap_fname = strdup(str.s);
+ }
+ else if ( n_files==2 )
+ {
+ if (strlen(files[0]) && strcmp(files[0],".")!=0) hap_fname = strdup(files[0]);
+ if (strlen(files[1]) && strcmp(files[1],".")!=0) sample_fname = strdup(files[1]);
+ }
+ else
+ {
+ error("Error parsing --hapsample filenames: %s\n", args->outfname);
+ }
+ for (i=0; i<n_files; i++) free(files[i]);
+ free(files);
+
+ if ( hap_fname && (strlen(hap_fname)<3 || strcasecmp(".gz",hap_fname+strlen(hap_fname)-3)) ) hap_compressed = 0;
+ if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 1; // honour an explicit .gz suffix
+
+ if (hap_fname) fprintf(pysamerr, "Haps file: %s\n", hap_fname);
+ if (sample_fname) fprintf(pysamerr, "Sample file: %s\n", sample_fname);
+
+ // write samples file
+ if (sample_fname) {
+ int i;
+ BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
+ str.l = 0;
+ kputs("ID_1 ID_2 missing\n0 0 0\n", &str);
+ ret = bgzf_write(sout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
+ for (i=0; i<bcf_hdr_nsamples(args->header); i++)
+ {
+ str.l = 0;
+ ksprintf(&str, "%s %s 0\n", args->header->samples[i], args->header->samples[i]);
+ ret = bgzf_write(sout, str.s, str.l);
+ if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
+ }
+ if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
+ free(sample_fname);
+ }
+ if (!hap_fname) {
+ if ( str.m ) free(str.s);
+ return;
+ }
+
+ // open haps output
+ BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
+
+ int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0;
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) { filtered++; continue; }
+ }
+
+ // ALT allele is required
+ if ( line->n_allele<2 ) { no_alt++; continue; }
+ // biallelic required
+ if ( line->n_allele>2 ) {
+ if (!non_biallelic)
+ fprintf(pysamerr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
+ non_biallelic++;
+ continue;
+ }
+
+ str.l = 0;
+ convert_line(args->convert, line, &str);
+ if ( !str.l ) continue;
+
+ // write haps file
+ if (hap_fname) {
+ ret = bgzf_write(hout, str.s, str.l); // write hap file
+ if ( ret != str.l ) error("Error writing %s: %s\n", hap_fname, strerror(errno));
+ }
+ nok++;
+ }
+ fprintf(pysamerr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok, no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered);
+ if ( str.m ) free(str.s);
+ if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno));
+ if (hap_fname) free(hap_fname);
+}
+
+static void bcf_hdr_set_chrs(bcf_hdr_t *hdr, faidx_t *fai)
+{
+ int i, n = faidx_nseq(fai);
+ for (i=0; i<n; i++)
+ {
+ const char *seq = faidx_iseq(fai,i);
+ int len = faidx_seq_len(fai, seq);
+ bcf_hdr_printf(hdr, "##contig=<ID=%s,length=%d>", seq,len);
+ }
+}
+static inline int acgt_to_5(char base)
+{
+ if ( base=='A' ) return 0;
+ if ( base=='C' ) return 1;
+ if ( base=='G' ) return 2;
+ if ( base=='T' ) return 3;
+ return 4;
+}
+static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[], int *nals, int ref, int32_t *gts)
+{
+ if ( se - ss > 2 ) return -1; // currently only SNPs
+
+ if ( ss[0]=='-' )
+ {
+ // missing GT
+ gts[0] = bcf_gt_missing;
+ gts[1] = bcf_int32_vector_end;
+ args->n.missing++;
+ return 0;
+ }
+ if ( ss[0]=='I' ) return -2; // skip insertions/deletions for now
+ if ( ss[0]=='D' ) return -2;
+
+ int a0 = acgt_to_5(toupper(ss[0]));
+ int a1 = ss[1] ? acgt_to_5(toupper(ss[1])) : a0;
+ if ( alleles[a0]<0 ) alleles[a0] = (*nals)++;
+ if ( alleles[a1]<0 ) alleles[a1] = (*nals)++;
+
+ gts[0] = bcf_gt_unphased(alleles[a0]);
+ gts[1] = ss[1] ? bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end;
+
+ if ( ref==a0 && ref==a1 ) args->n.hom_rr++; // hom ref: RR
+ else if ( ref==a0 ) args->n.het_ra++; // het: RA
+ else if ( ref==a1 ) args->n.het_ra++; // het: AR
+ else if ( a0==a1 ) args->n.hom_aa++; // hom-alt: AA
+ else args->n.het_aa++; // non-ref het: AA
+
+ return 0;
+}
+static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+ args_t *args = (args_t*) usr;
+
+ int len;
+ char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(args->header,rec->rid), rec->pos, rec->pos, &len);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(args->header,rec->rid), rec->pos+1);
+
+ int nals = 1, alleles[5] = { -1, -1, -1, -1, -1 }; // a,c,g,t,n
+ ref[0] = toupper(ref[0]);
+ int iref = acgt_to_5(ref[0]);
+ alleles[iref] = 0;
+
+ rec->n_sample = bcf_hdr_nsamples(args->header);
+
+ int i, ret;
+ for (i=0; i<rec->n_sample; i++)
+ {
+ if ( i>0 )
+ {
+ ret = tsv_next(tsv);
+ if ( ret==-1 ) error("Too few columns for %d samples at %s:%d\n", rec->n_sample,bcf_hdr_id2name(args->header,rec->rid), rec->pos+1);
+ }
+ ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2);
+ if ( ret==-1 ) error("Error parsing the site %s:%d, expected two characters\n", bcf_hdr_id2name(args->header,rec->rid), rec->pos+1);
+ if ( ret==-2 )
+ {
+ // something other than a SNP
+ free(ref);
+ return -1;
+ }
+ }
+
+ args->str.l = 0;
+ kputc(ref[0], &args->str);
+ for (i=0; i<5; i++)
+ {
+ if ( alleles[i]>0 )
+ {
+ kputc(',', &args->str);
+ kputc("ACGTN"[i], &args->str);
+ }
+ }
+ bcf_update_alleles_str(args->header, rec, args->str.s);
+ if ( bcf_update_genotypes(args->header,rec,args->gts,rec->n_sample*2) ) error("Could not update the GT field\n");
+
+ free(ref);
+ return 0;
+}
+
+static void tsv_to_vcf(args_t *args)
+{
+ if ( !args->ref_fname ) error("--tsv2vcf requires the --fasta-ref option\n");
+ if ( !args->sample_list ) error("--tsv2vcf requires the --samples option\n");
+
+ args->ref = fai_load(args->ref_fname);
+ if ( !args->ref ) error("Could not load the reference %s\n", args->ref_fname);
+
+ args->header = bcf_hdr_init("w");
+ bcf_hdr_set_chrs(args->header, args->ref);
+ bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+ bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+
+ int i, n;
+ char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
+ if ( !smpls ) error("Could not parse %s\n", args->sample_list);
+ for (i=0; i<n; i++)
+ {
+ bcf_hdr_add_sample(args->header, smpls[i]);
+ free(smpls[i]);
+ }
+ free(smpls);
+ bcf_hdr_add_sample(args->header, NULL);
+ args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
+
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+ bcf_hdr_write(out_fh,args->header);
+
+ tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA");
+ if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n");
+ if ( tsv_register(tsv, "POS", tsv_setter_pos, NULL) < 0 ) error("Expected POS column\n");
+ if ( tsv_register(tsv, "ID", tsv_setter_id, args->header) < 0 && !args->columns ) error("Expected ID column\n");
+ if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) error("Expected AA column\n");
+
+ bcf1_t *rec = bcf_init();
+ bcf_float_set_missing(rec->qual);
+
+ kstring_t line = {0,0,0};
+ htsFile *in_fh = hts_open(args->infname, "r");
+ if ( !in_fh ) error("Could not read: %s\n", args->infname);
+ while ( hts_getline(in_fh, KS_SEP_LINE, &line) > 0 )
+ {
+ if ( line.s[0]=='#' ) continue; // skip comments
+ bcf_clear(rec);
+
+ args->n.total++;
+ if ( !tsv_parse(tsv, rec, line.s) )
+ bcf_write(out_fh, args->header, rec);
+ else
+ args->n.skipped++;
+ }
+ if ( hts_close(in_fh) ) error("Close failed: %s\n", args->infname);
+ free(line.s);
+
+ bcf_hdr_destroy(args->header);
+ hts_close(out_fh);
+ tsv_destroy(tsv);
+ bcf_destroy(rec);
+ free(args->str.s);
+ free(args->gts);
+
+ fprintf(pysamerr,"Rows total: \t%d\n", args->n.total);
+ fprintf(pysamerr,"Rows skipped: \t%d\n", args->n.skipped);
+ fprintf(pysamerr,"Missing GTs: \t%d\n", args->n.missing);
+ fprintf(pysamerr,"Hom RR: \t%d\n", args->n.hom_rr);
+ fprintf(pysamerr,"Het RA: \t%d\n", args->n.het_ra);
+ fprintf(pysamerr,"Hom AA: \t%d\n", args->n.hom_aa);
+ fprintf(pysamerr,"Het AA: \t%d\n", args->n.het_aa);
+}
+
+static void vcf_to_vcf(args_t *args)
+{
+ open_vcf(args,NULL);
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+
+ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
+ bcf_hdr_write(out_fh,hdr);
+
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) continue;
+ }
+ bcf_write(out_fh,hdr,line);
+ }
+ hts_close(out_fh);
+}
+
+static void gvcf_to_vcf(args_t *args)
+{
+ if ( !args->ref_fname ) error("--gvcf2vcf requires the --fasta-ref option\n");
+
+ args->ref = fai_load(args->ref_fname);
+ if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname);
+
+ open_vcf(args,NULL);
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+
+ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
+ bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
+ bcf_hdr_write(out_fh,hdr);
+
+ int32_t *itmp = NULL, nitmp = 0;
+
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) continue;
+ }
+
+ if ( line->n_allele!=1 || !bcf_has_filter(hdr,line,"PASS") )
+ {
+ // Assuming that only ALT=. sites can be blocks and skipping sites which don't PASS
+ bcf_write(out_fh,hdr,line);
+ continue;
+ }
+
+ int nend = bcf_get_info_int32(hdr,line,"END",&itmp,&nitmp);
+ if ( nend!=1 )
+ {
+ // No END record
+ bcf_write(out_fh,hdr,line);
+ continue;
+ }
+ bcf_update_info_int32(hdr,line,"END",NULL,0);
+ int pos, len;
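+ // Worked example (coordinates assumed): a block at POS=100 with INFO/END=103 is
+ // expanded by the loop below into records at positions 100..103, each taking its
+ // REF base from the fasta reference.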
+ for (pos=line->pos; pos<itmp[0]; pos++)
+ {
+ line->pos = pos;
+ char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(hdr,line->rid), line->pos+1);
+ // we have already checked above that there is only one allele,
+ // so fine to just update alleles with the ref allele from the fasta
+ bcf_update_alleles_str(hdr, line, &ref[0]);
+ bcf_write(out_fh,hdr,line);
+ }
+ }
+ free(itmp);
+ hts_close(out_fh);
+}
+
+static void usage(void)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Converts VCF/BCF to other formats and back. See man page for file\n");
+ fprintf(pysamerr, " formats details. When specifying output files explicitly instead\n");
+ fprintf(pysamerr, " of with <prefix>, one can use '-' for stdout and '.' to suppress.\n");
+ fprintf(pysamerr, "Usage: bcftools convert [OPTIONS] <input_file>\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "VCF input options:\n");
+ fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
+ fprintf(pysamerr, " -i, --include <expr> select sites for which the expression is true\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " -s, --samples <list> list of samples to include\n");
+ fprintf(pysamerr, " -S, --samples-file <file> file of samples to include\n");
+ fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "VCF output options:\n");
+ fprintf(pysamerr, " -o, --output <file> output file name [stdout]\n");
+ fprintf(pysamerr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
+ fprintf(pysamerr, " -G, --gensample2vcf <...> <prefix>|<gen-file>,<sample-file>\n");
+ fprintf(pysamerr, " -g, --gensample <...> <prefix>|<gen-file>,<sample-file>\n");
+ fprintf(pysamerr, " --tag <string> tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
+ fprintf(pysamerr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n");
+ fprintf(pysamerr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "gVCF conversion:\n");
+ fprintf(pysamerr, " --gvcf2vcf expand gVCF reference blocks\n");
+ fprintf(pysamerr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "HAP/SAMPLE conversion (output from SHAPEIT):\n");
+ fprintf(pysamerr, " --hapsample2vcf <...> <prefix>|<haps-file>,<sample-file>\n");
+ fprintf(pysamerr, " --hapsample <...> <prefix>|<haps-file>,<sample-file>\n");
+ fprintf(pysamerr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(pysamerr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "HAP/LEGEND/SAMPLE conversion:\n");
+ fprintf(pysamerr, " -H, --haplegendsample2vcf <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
+ fprintf(pysamerr, " -h, --haplegendsample <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
+ fprintf(pysamerr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(pysamerr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "TSV conversion:\n");
+ fprintf(pysamerr, " --tsv2vcf <file> \n");
+ fprintf(pysamerr, " -c, --columns <string> columns of the input tsv file [ID,CHROM,POS,AA]\n");
+ fprintf(pysamerr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
+ fprintf(pysamerr, " -s, --samples <list> list of sample names\n");
+ fprintf(pysamerr, " -S, --samples-file <file> file of sample names\n");
+ fprintf(pysamerr, "\n");
+ // fprintf(pysamerr, "PLINK options:\n");
+ // fprintf(pysamerr, " -p, --plink <prefix>|<ped>,<map>,<fam>|<bed>,<bim>,<fam>|<tped>,<tfam>\n");
+ // fprintf(pysamerr, " --tped make tped file instead\n");
+ // fprintf(pysamerr, " --bin make binary bed/fam/bim files\n");
+ // fprintf(pysamerr, "\n");
+ // fprintf(pysamerr, "PBWT options:\n");
+ // fprintf(pysamerr, " -b, --pbwt <prefix> or <pbwt>,<sites>,<sample>,<missing>\n");
+ // fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_vcfconvert(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->outfname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+
+ static struct option loptions[] =
+ {
+ {"include",required_argument,NULL,'i'},
+ {"exclude",required_argument,NULL,'e'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {"samples",required_argument,NULL,'s'},
+ {"samples-file",required_argument,NULL,'S'},
+ {"gensample",required_argument,NULL,'g'},
+ {"gensample2vcf",required_argument,NULL,'G'},
+ {"tag",required_argument,NULL,1},
+ {"chrom",no_argument,NULL,8},
+ {"tsv2vcf",required_argument,NULL,2},
+ {"hapsample",required_argument,NULL,7},
+ {"hapsample2vcf",required_argument,NULL,3},
+ {"vcf-ids",no_argument,NULL,4},
+ {"haploid2diploid",no_argument,NULL,5},
+ {"gvcf2vcf",no_argument,NULL,6},
+ {"haplegendsample",required_argument,NULL,'h'},
+ {"haplegendsample2vcf",required_argument,NULL,'H'},
+ {"columns",required_argument,NULL,'c'},
+ {"fasta-ref",required_argument,NULL,'f'},
+ {NULL,0,NULL,0}
+ };
+ while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; args->targets_is_file = 1; break;
+ case 's': args->sample_list = optarg; break;
+ case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
+ case 'g': args->convert_func = vcf_to_gensample; args->outfname = optarg; break;
+ case 'G': args->convert_func = gensample_to_vcf; args->infname = optarg; break;
+ case 1 : args->tag = optarg; break;
+ case 2 : args->convert_func = tsv_to_vcf; args->infname = optarg; break;
+ case 3 : args->convert_func = hapsample_to_vcf; args->infname = optarg; break;
+ case 4 : args->output_vcf_ids = 1; break;
+ case 5 : args->hap2dip = 1; break;
+ case 6 : args->convert_func = gvcf_to_vcf; break;
+ case 7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break;
+ case 8 : args->output_chrom_first_col = 1; break;
+ case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break;
+ case 'f': args->ref_fname = optarg; break;
+ case 'c': args->columns = optarg; break;
+ case 'o': args->outfname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ if ( !args->infname )
+ {
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) args->infname = "-";
+ }
+ else args->infname = argv[optind];
+ }
+ if ( !args->infname ) usage();
+
+ if ( args->convert_func ) args->convert_func(args);
+ else vcf_to_vcf(args);
+
+ destroy_data(args);
+ free(args);
+ return 0;
+}
diff --git a/bcftools/vcffilter.c b/bcftools/vcffilter.c
new file mode 100644
index 0000000..ac4c3a3
--- /dev/null
+++ b/bcftools/vcffilter.c
@@ -0,0 +1,568 @@
+/* vcffilter.c -- Apply fixed-threshold filters.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+#include "filter.h"
+#include "rbuf.h"
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+// FILTER columns annotation: replace or add to existing FILTERs; set FILTER to PASS at good sites?
+#define ANNOT_ADD 1
+#define ANNOT_RESET 2
+
+// Set genotypes of filtered samples
+#define SET_GTS_MISSING 1
+#define SET_GTS_REF 2
+
+typedef struct _args_t
+{
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+ const uint8_t *smpl_pass;
+ int set_gts;
+ char *soft_filter; // drop failed sites or annotate FILTER column?
+ int annot_mode; // add to existing FILTER annotation or replace? Otherwise reset FILTER to PASS or leave as it is?
+ int flt_fail, flt_pass; // BCF ids of fail and pass filters
+ int snp_gap, indel_gap, IndelGap_id, SnpGap_id;
+ int32_t ntmpi, *tmpi, ntmp_ac, *tmp_ac;
+ rbuf_t rbuf;
+ bcf1_t **rbuf_lines;
+
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr;
+ htsFile *out_fh;
+ int output_type, n_threads;
+
+ char **argv, *output_fname, *targets_list, *regions_list;
+ int argc;
+}
+args_t;
+
+static void init_data(args_t *args)
+{
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+
+ args->hdr = args->files->readers[0].header;
+ args->flt_pass = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"PASS"); assert( !args->flt_pass ); // sanity check: required by BCF spec
+
+ // -i or -e: append FILTER line
+ if ( args->soft_filter && args->filter_logic )
+ {
+ kstring_t flt_name = {0,0,0};
+ if ( strcmp(args->soft_filter,"+") )
+ kputs(args->soft_filter, &flt_name);
+ else
+ {
+ // Make up a filter name
+ int i = 0, id = -1;
+ do
+ {
+ ksprintf(&flt_name,"Filter%d", ++i);
+ id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,flt_name.s);
+ }
+ while ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,id) );
+ }
+ // escape quotes
+ kstring_t tmp = {0,0,0};
+ char *t = args->filter_str;
+ while ( *t )
+ {
+ if ( *t=='"' ) kputc('\\',&tmp);
+ kputc(*t,&tmp);
+ t++;
+ }
+ int ret = bcf_hdr_printf(args->hdr, "##FILTER=<ID=%s,Description=\"Set if %s: %s\">", flt_name.s,args->filter_logic & FLT_INCLUDE ? "not true" : "true", tmp.s);
+ if ( ret!=0 )
+ error("Failed to append header line: ##FILTER=<ID=%s,Description=\"Set if %s: %s\">\n", flt_name.s,args->filter_logic & FLT_INCLUDE ? "not true" : "true", tmp.s);
+ args->flt_fail = bcf_hdr_id2int(args->hdr,BCF_DT_ID,flt_name.s); assert( args->flt_fail>=0 );
+ free(flt_name.s);
+ free(tmp.s);
+ }
+
+ if ( args->snp_gap || args->indel_gap )
+ {
+ if ( !args->filter_logic && args->soft_filter && strcmp(args->soft_filter,"+") )
+ {
+ kstring_t tmp = {0,0,0};
+ if ( args->snp_gap ) kputs("\"SnpGap\"", &tmp);
+ if ( args->indel_gap )
+ {
+ if ( tmp.s ) kputs(" and ", &tmp);
+ kputs("\"IndelGap\"", &tmp);
+ }
+ fprintf(stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
+ free(tmp.s);
+ }
+
+ rbuf_init(&args->rbuf, 64);
+ args->rbuf_lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
+ if ( args->snp_gap )
+ {
+ bcf_hdr_printf(args->hdr, "##FILTER=<ID=SnpGap,Description=\"SNP within %d bp of an indel\">", args->snp_gap);
+ args->SnpGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "SnpGap");
+ assert( args->SnpGap_id>=0 );
+ }
+ if ( args->indel_gap )
+ {
+ bcf_hdr_printf(args->hdr, "##FILTER=<ID=IndelGap,Description=\"Indel within %d bp of an indel\">", args->indel_gap);
+ args->IndelGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "IndelGap");
+ assert( args->IndelGap_id>=0 );
+ }
+ }
+
+ bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter");
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
+}
+
+static void destroy_data(args_t *args)
+{
+ if ( args->rbuf_lines )
+ {
+ int i;
+ for (i=0; i<args->rbuf.m; i++)
+ if ( args->rbuf_lines[i] ) bcf_destroy1(args->rbuf_lines[i]);
+ free(args->rbuf_lines);
+ }
+ if ( args->filter )
+ filter_destroy(args->filter);
+ free(args->tmpi);
+ free(args->tmp_ac);
+}
+
+static void flush_buffer(args_t *args, int n)
+{
+ int i, j;
+ for (i=0; i<n; i++)
+ {
+ int k = rbuf_shift(&args->rbuf);
+ bcf1_t *rec = args->rbuf_lines[k];
+
+ int pass = 1;
+ if ( !args->soft_filter )
+ {
+ for (j=0; j<rec->d.n_flt; j++)
+ {
+ if ( args->indel_gap && rec->d.flt[j]==args->IndelGap_id ) { pass = 0; break; }
+ if ( args->snp_gap && rec->d.flt[j]==args->SnpGap_id ) { pass = 0; break; }
+ }
+ }
+ if ( pass ) bcf_write1(args->out_fh, args->hdr, rec);
+ }
+}
+
+#define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
+static void buffered_filters(args_t *args, bcf1_t *line)
+{
+ /**
+ * The logic of SnpGap=3. The SNPs at positions 1 and 7 are filtered,
+ * positions 0 and 8 are not:
+ * 0123456789
+ * ref .G.GT..G..
+ * del .A.G-..A..
+ * Here the positions 1 and 6 are filtered, 0 and 7 are not:
+ * 0123-456789
+ * ref .G.G-..G..
+ * ins .A.GT..A..
+ *
+ * The logic of IndelGap=2. The second indel is filtered:
+ * 012345678901
+ * ref .GT.GT..GT..
+ * del .G-.G-..G-..
+ * And similarly here, the second is filtered:
+ * 01 23 456 78
+ * ref .A-.A-..A-..
+ * ins .AT.AT..AT..
+ */
+
+ // To avoid additional data structure, we abuse bcf1_t's var and var_type records.
+ const int SnpGap_set = VCF_OTHER<<1;
+ const int IndelGap_set = VCF_OTHER<<2;
+ const int IndelGap_flush = VCF_OTHER<<3;
+
+ int var_type = 0, i;
+ if ( line )
+ {
+ // Still on the same chromosome?
+ int ilast = rbuf_last(&args->rbuf);
+ if ( ilast>=0 && line->rid != args->rbuf_lines[ilast]->rid )
+ flush_buffer(args, args->rbuf.n); // new chromosome, flush everything
+
+ rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n,args->rbuf_lines);
+
+ // Insert the new record in the buffer. The line would be overwritten in
+ // the next bcf_sr_next_line call, therefore we need to swap it with an
+ // unused one
+ ilast = rbuf_append(&args->rbuf);
+ if ( !args->rbuf_lines[ilast] ) args->rbuf_lines[ilast] = bcf_init1();
+ SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->rbuf_lines[ilast]);
+
+ var_type = bcf_get_variant_types(line);
+
+ // Find out the size of an indel. The indel boundaries are based on REF
+ // (POS+1,POS+rlen-1). This is not entirely correct: mpileup likes to
+ // output REF=CAGAGAGAGA, ALT=CAGAGAGAGAGA where REF=C,ALT=CGA could be
+ // used. This filter is therefore more strict and may remove some valid
+ // SNPs.
+ int len = 1;
+ if ( var_type & VCF_INDEL )
+ {
+ for (i=1; i<line->n_allele; i++)
+ if ( len < 1-line->d.var[i].n ) len = 1-line->d.var[i].n;
+ }
+
+ // Set the REF allele's length to max deletion length or to 1 if a SNP or an insertion.
+ line->d.var[0].n = len;
+ }
+
+ int k_flush = 1;
+ if ( args->indel_gap )
+ {
+ k_flush = 0;
+ // Find indels which are too close to each other
+ int last_to = -1;
+ for (i=-1; rbuf_next(&args->rbuf,&i); )
+ {
+ bcf1_t *rec = args->rbuf_lines[i];
+ int rec_from = rec->pos;
+ if ( last_to!=-1 && last_to < rec_from ) break;
+
+ k_flush++;
+ if ( !(rec->d.var_type & VCF_INDEL) ) continue;
+
+ rec->d.var_type |= IndelGap_set;
+ last_to = args->indel_gap + rec->pos + rec->d.var[0].n - 1;
+ }
+ if ( i==args->rbuf.f && line && last_to!=-1 ) k_flush = 0;
+ if ( k_flush || !line )
+ {
+ // Select the best indel from the cluster of k_flush indels
+ int k = 0, max_ac = -1, imax_ac = -1;
+ for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; )
+ {
+ k++;
+ bcf1_t *rec = args->rbuf_lines[i];
+ if ( !(rec->d.var_type & IndelGap_set) ) continue;
+ hts_expand(int, rec->n_allele, args->ntmpi, args->tmpi);
+ int ret = bcf_calc_ac(args->hdr, rec, args->tmpi, BCF_UN_ALL);
+ if ( imax_ac==-1 || (ret && max_ac < args->tmpi[1]) ) { max_ac = args->tmpi[1]; imax_ac = i; }
+ }
+
+ // Filter all but the best indel (with max AF or first if AF not available)
+ k = 0;
+ for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; )
+ {
+ k++;
+ bcf1_t *rec = args->rbuf_lines[i];
+ if ( !(rec->d.var_type & IndelGap_set) ) continue;
+ rec->d.var_type |= IndelGap_flush;
+ if ( i!=imax_ac ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id);
+ }
+ }
+ }
+
+ if ( !line )
+ {
+ // Finished: flush everything
+ flush_buffer(args, args->rbuf.n);
+ return;
+ }
+
+ int j_flush = 1;
+ if ( args->snp_gap )
+ {
+ j_flush = 0;
+ int last_from = line->pos;
+ for (i=-1; rbuf_next(&args->rbuf,&i); )
+ {
+ bcf1_t *rec = args->rbuf_lines[i];
+ int rec_to = rec->pos + rec->d.var[0].n - 1; // last position affected by the variant
+ if ( rec_to + args->snp_gap < last_from )
+ j_flush++;
+ else if ( (var_type & VCF_INDEL) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) )
+ {
+ // this SNP has not been SnpGap-filtered yet
+ rec->d.var_type |= SnpGap_set;
+ bcf_add_filter(args->hdr, rec, args->SnpGap_id);
+ }
+ else if ( (var_type & VCF_SNP) && (rec->d.var_type & VCF_INDEL) )
+ {
+ // the line which we are adding is a SNP and needs to be filtered
+ line->d.var_type |= SnpGap_set;
+ bcf_add_filter(args->hdr, line, args->SnpGap_id);
+ break;
+ }
+ }
+ }
+ flush_buffer(args, j_flush < k_flush ? j_flush : k_flush);
+}
+
+static void set_genotypes(args_t *args, bcf1_t *line, int pass_site)
+{
+ int i,j;
+ if ( !bcf_hdr_nsamples(args->hdr) ) return;
+ if ( args->smpl_pass )
+ {
+ int npass = 0;
+ for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) npass += args->smpl_pass[i];
+
+ // return if all samples pass
+ if ( npass==bcf_hdr_nsamples(args->hdr) && (args->filter_logic & FLT_INCLUDE) ) return;
+ if ( npass==0 && (args->filter_logic & FLT_EXCLUDE) ) return;
+ }
+ else if ( pass_site ) return;
+
+ int an = 0, has_an = bcf_get_info_int32(args->hdr, line, "AN", &args->tmp_ac, &args->ntmp_ac);
+ if ( has_an==1 ) an = args->tmp_ac[0];
+ else has_an = 0;
+
+ int has_ac = bcf_get_info_int32(args->hdr, line, "AC", &args->tmp_ac, &args->ntmp_ac);
+ has_ac = has_ac==line->n_allele-1 ? 1 : 0;
+
+ int new_gt = 0, ngts = bcf_get_format_int32(args->hdr, line, "GT", &args->tmpi, &args->ntmpi);
+ ngts /= bcf_hdr_nsamples(args->hdr);
+ if ( args->set_gts==SET_GTS_MISSING ) new_gt = bcf_gt_missing;
+ else if ( args->set_gts==SET_GTS_REF ) new_gt = bcf_gt_unphased(0);
+ else error("todo: set_gts=%d\n", args->set_gts);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
+ {
+ if ( args->smpl_pass )
+ {
+ int pass = args->smpl_pass[i];
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( pass ) continue;
+ }
+ int32_t *gts = args->tmpi + ngts*i;
+ for (j=0; j<ngts; j++)
+ {
+ if ( gts[j]==bcf_int32_vector_end ) break;
+ if ( args->set_gts==SET_GTS_MISSING && !bcf_gt_is_missing(gts[j]) )
+ {
+ int ial = bcf_gt_allele(gts[j]);
+ if ( has_ac && ial>0 && ial<=line->n_allele ) args->tmp_ac[ ial-1 ]--;
+ an--;
+ }
+ else if ( args->set_gts==SET_GTS_REF )
+ {
+ int ial = bcf_gt_allele(gts[j]);
+ if ( bcf_gt_is_missing(gts[j]) ) an++;
+ else if ( has_ac && ial>0 && ial<=line->n_allele ) args->tmp_ac[ ial-1 ]--;
+ }
+ gts[j] = new_gt;
+ }
+ }
+ bcf_update_genotypes(args->hdr,line,args->tmpi,ngts*bcf_hdr_nsamples(args->hdr));
+ if ( has_an ) bcf_update_info_int32(args->hdr,line,"AN",&an,1);
+ if ( has_ac ) bcf_update_info_int32(args->hdr,line,"AC",args->tmp_ac,line->n_allele-1);
+}
+
+static void usage(args_t *args)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Apply fixed-threshold filters.\n");
+ fprintf(stderr, "Usage: bcftools filter [options] <in.vcf.gz>\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -g, --SnpGap <int> filter SNPs within <int> base pairs of an indel\n");
+ fprintf(stderr, " -G, --IndelGap <int> filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
+ fprintf(stderr, " -i, --include <expr> include only sites for which the expression is true (see man page for details\n");
+ fprintf(stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
+ fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " -s, --soft-filter <string> annotate FILTER column with <string> or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n");
+ fprintf(stderr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) or ref (0)\n");
+ fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+int main_vcffilter(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->files = bcf_sr_init();
+ args->output_fname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ int regions_is_file = 0, targets_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"set-GTs",required_argument,NULL,'S'},
+ {"mode",required_argument,NULL,'m'},
+ {"soft-filter",required_argument,NULL,'s'},
+ {"exclude",required_argument,NULL,'e'},
+ {"include",required_argument,NULL,'i'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"SnpGap",required_argument,NULL,'g'},
+ {"IndelGap",required_argument,NULL,'G'},
+ {NULL,0,NULL,0}
+ };
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'g':
+ args->snp_gap = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --SnpGap %s\n", optarg);
+ break;
+ case 'G':
+ args->indel_gap = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --IndelGap %s\n", optarg);
+ break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 's': args->soft_filter = optarg; break;
+ case 'm':
+ if ( strchr(optarg,'x') ) args->annot_mode |= ANNOT_RESET;
+ if ( strchr(optarg,'+') ) args->annot_mode |= ANNOT_ADD;
+ break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'S':
+ if ( !strcmp(".",optarg) ) args->set_gts = SET_GTS_MISSING;
+ else if ( !strcmp("0",optarg) ) args->set_gts = SET_GTS_REF;
+ else error("The argument to -S not recognised: %s\n", optarg);
+ break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'h':
+ case '?': usage(args);
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n");
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(args);
+ }
+ else fname = argv[optind];
+
+ // read in the regions from the command line
+ if ( args->regions_list )
+ {
+ args->files->require_index = 1;
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ else if ( optind+1 < argc )
+ {
+ int i;
+ kstring_t tmp = {0,0,0};
+ kputs(argv[optind+1],&tmp);
+ for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); }
+ args->files->require_index = 1;
+ if ( bcf_sr_set_regions(args->files, tmp.s, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ free(tmp.s);
+ }
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+
+ init_data(args);
+ bcf_hdr_write(args->out_fh, args->hdr);
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files, 0);
+ int pass = 1;
+ if ( args->filter )
+ {
+ pass = filter_test(args->filter, line, &args->smpl_pass);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ }
+ if ( args->soft_filter || args->set_gts || pass )
+ {
+ if ( pass )
+ {
+ bcf_unpack(line,BCF_UN_FLT);
+ if ( args->annot_mode & ANNOT_RESET || !line->d.n_flt ) bcf_add_filter(args->hdr, line, args->flt_pass);
+ }
+ else if ( args->soft_filter )
+ {
+ if ( (args->annot_mode & ANNOT_ADD) ) bcf_add_filter(args->hdr, line, args->flt_fail);
+ else bcf_update_filter(args->hdr, line, &args->flt_fail, 1);
+ }
+ if ( args->set_gts ) set_genotypes(args, line, pass);
+ if ( !args->rbuf_lines )
+ bcf_write1(args->out_fh, args->hdr, line);
+ else
+ buffered_filters(args, line);
+ }
+ }
+ buffered_filters(args, NULL);
+
+ hts_close(args->out_fh);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
diff --git a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c
new file mode 100644
index 0000000..c731ba3
--- /dev/null
+++ b/bcftools/vcffilter.c.pysam.c
@@ -0,0 +1,570 @@
+#include "pysam.h"
+
+/* vcffilter.c -- Apply fixed-threshold filters.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+#include "filter.h"
+#include "rbuf.h"
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+// FILTER columns annotation: replace or add to existing FILTERs; set FILTER to PASS at good sites?
+#define ANNOT_ADD 1
+#define ANNOT_RESET 2
+
+// Set genotypes of filtered samples
+#define SET_GTS_MISSING 1
+#define SET_GTS_REF 2
+
+typedef struct _args_t
+{
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+ const uint8_t *smpl_pass;
+ int set_gts;
+ char *soft_filter; // drop failed sites or annotate FILTER column?
+ int annot_mode; // add to existing FILTER annotation or replace? Otherwise reset FILTER to PASS or leave as it is?
+ int flt_fail, flt_pass; // BCF ids of fail and pass filters
+ int snp_gap, indel_gap, IndelGap_id, SnpGap_id;
+ int32_t ntmpi, *tmpi, ntmp_ac, *tmp_ac;
+ rbuf_t rbuf;
+ bcf1_t **rbuf_lines;
+
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr;
+ htsFile *out_fh;
+ int output_type, n_threads;
+
+ char **argv, *output_fname, *targets_list, *regions_list;
+ int argc;
+}
+args_t;
+
+static void init_data(args_t *args)
+{
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+
+ args->hdr = args->files->readers[0].header;
+ args->flt_pass = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"PASS"); assert( !args->flt_pass ); // sanity check: required by BCF spec
+
+ // -i or -e: append FILTER line
+ if ( args->soft_filter && args->filter_logic )
+ {
+ kstring_t flt_name = {0,0,0};
+ if ( strcmp(args->soft_filter,"+") )
+ kputs(args->soft_filter, &flt_name);
+ else
+ {
+ // Make up a filter name
+ int i = 0, id = -1;
+ do
+ {
+ ksprintf(&flt_name,"Filter%d", ++i);
+ id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,flt_name.s);
+ }
+ while ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,id) );
+ }
+ // escape quotes
+ kstring_t tmp = {0,0,0};
+ char *t = args->filter_str;
+ while ( *t )
+ {
+ if ( *t=='"' ) kputc('\\',&tmp);
+ kputc(*t,&tmp);
+ t++;
+ }
+ int ret = bcf_hdr_printf(args->hdr, "##FILTER=<ID=%s,Description=\"Set if %s: %s\">", flt_name.s,args->filter_logic & FLT_INCLUDE ? "not true" : "true", tmp.s);
+ if ( ret!=0 )
+ error("Failed to append header line: ##FILTER=<ID=%s,Description=\"Set if %s: %s\">\n", flt_name.s,args->filter_logic & FLT_INCLUDE ? "not true" : "true", tmp.s);
+ args->flt_fail = bcf_hdr_id2int(args->hdr,BCF_DT_ID,flt_name.s); assert( args->flt_fail>=0 );
+ free(flt_name.s);
+ free(tmp.s);
+ }
+
+ if ( args->snp_gap || args->indel_gap )
+ {
+ if ( !args->filter_logic && args->soft_filter && strcmp(args->soft_filter,"+") )
+ {
+ kstring_t tmp = {0,0,0};
+ if ( args->snp_gap ) kputs("\"SnpGap\"", &tmp);
+ if ( args->indel_gap )
+ {
+ if ( tmp.s ) kputs(" and ", &tmp);
+ kputs("\"IndelGap\"", &tmp);
+ }
+ fprintf(pysamerr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
+ free(tmp.s);
+ }
+
+ rbuf_init(&args->rbuf, 64);
+ args->rbuf_lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
+ if ( args->snp_gap )
+ {
+ bcf_hdr_printf(args->hdr, "##FILTER=<ID=SnpGap,Description=\"SNP within %d bp of an indel\">", args->snp_gap);
+ args->SnpGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "SnpGap");
+ assert( args->SnpGap_id>=0 );
+ }
+ if ( args->indel_gap )
+ {
+ bcf_hdr_printf(args->hdr, "##FILTER=<ID=IndelGap,Description=\"Indel within %d bp of an indel\">", args->indel_gap);
+ args->IndelGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "IndelGap");
+ assert( args->IndelGap_id>=0 );
+ }
+ }
+
+ bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter");
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
+}
+
+static void destroy_data(args_t *args)
+{
+ if ( args->rbuf_lines )
+ {
+ int i;
+ for (i=0; i<args->rbuf.m; i++)
+ if ( args->rbuf_lines[i] ) bcf_destroy1(args->rbuf_lines[i]);
+ free(args->rbuf_lines);
+ }
+ if ( args->filter )
+ filter_destroy(args->filter);
+ free(args->tmpi);
+ free(args->tmp_ac);
+}
+
+static void flush_buffer(args_t *args, int n)
+{
+ int i, j;
+ for (i=0; i<n; i++)
+ {
+ int k = rbuf_shift(&args->rbuf);
+ bcf1_t *rec = args->rbuf_lines[k];
+
+ int pass = 1;
+ if ( !args->soft_filter )
+ {
+ for (j=0; j<rec->d.n_flt; j++)
+ {
+ if ( args->indel_gap && rec->d.flt[j]==args->IndelGap_id ) { pass = 0; break; }
+ if ( args->snp_gap && rec->d.flt[j]==args->SnpGap_id ) { pass = 0; break; }
+ }
+ }
+ if ( pass ) bcf_write1(args->out_fh, args->hdr, rec);
+ }
+}
+
+#define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
+static void buffered_filters(args_t *args, bcf1_t *line)
+{
+ /**
+ * The logic of SnpGap=3. The SNPs at positions 1 and 7 are filtered,
+ * positions 0 and 8 are not:
+ * 0123456789
+ * ref .G.GT..G..
+ * del .A.G-..A..
+ * Here the positions 1 and 6 are filtered, 0 and 7 are not:
+ * 0123-456789
+ * ref .G.G-..G..
+ * ins .A.GT..A..
+ *
+ * The logic of IndelGap=2. The second indel is filtered:
+ * 012345678901
+ * ref .GT.GT..GT..
+ * del .G-.G-..G-..
+ * And similarly here, the second is filtered:
+ * 01 23 456 78
+ * ref .A-.A-..A-..
+ * ins .AT.AT..AT..
+ */
+
+ // To avoid additional data structure, we abuse bcf1_t's var and var_type records.
+ const int SnpGap_set = VCF_OTHER<<1;
+ const int IndelGap_set = VCF_OTHER<<2;
+ const int IndelGap_flush = VCF_OTHER<<3;
+
+ int var_type = 0, i;
+ if ( line )
+ {
+ // Still on the same chromosome?
+ int ilast = rbuf_last(&args->rbuf);
+ if ( ilast>=0 && line->rid != args->rbuf_lines[ilast]->rid )
+ flush_buffer(args, args->rbuf.n); // new chromosome, flush everything
+
+ rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n,args->rbuf_lines);
+
+ // Insert the new record in the buffer. The line would be overwritten in
+ // the next bcf_sr_next_line call, therefore we need to swap it with an
+ // unused one
+ ilast = rbuf_append(&args->rbuf);
+ if ( !args->rbuf_lines[ilast] ) args->rbuf_lines[ilast] = bcf_init1();
+ SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->rbuf_lines[ilast]);
+
+ var_type = bcf_get_variant_types(line);
+
+ // Find out the size of an indel. The indel boundaries are based on REF
+ // (POS+1,POS+rlen-1). This is not entirely correct: mpileup likes to
+ // output REF=CAGAGAGAGA, ALT=CAGAGAGAGAGA where REF=C,ALT=CGA could be
+ // used. This filter is therefore more strict and may remove some valid
+ // SNPs.
+ int len = 1;
+ if ( var_type & VCF_INDEL )
+ {
+ for (i=1; i<line->n_allele; i++)
+ if ( len < 1-line->d.var[i].n ) len = 1-line->d.var[i].n;
+ }
+
+ // Set the REF allele's length to max deletion length or to 1 if a SNP or an insertion.
+ line->d.var[0].n = len;
+ }
+
+ int k_flush = 1;
+ if ( args->indel_gap )
+ {
+ k_flush = 0;
+ // Find indels which are too close to each other
+ int last_to = -1;
+ for (i=-1; rbuf_next(&args->rbuf,&i); )
+ {
+ bcf1_t *rec = args->rbuf_lines[i];
+ int rec_from = rec->pos;
+ if ( last_to!=-1 && last_to < rec_from ) break;
+
+ k_flush++;
+ if ( !(rec->d.var_type & VCF_INDEL) ) continue;
+
+ rec->d.var_type |= IndelGap_set;
+ last_to = args->indel_gap + rec->pos + rec->d.var[0].n - 1;
+ }
+ if ( i==args->rbuf.f && line && last_to!=-1 ) k_flush = 0;
+ if ( k_flush || !line )
+ {
+ // Select the best indel from the cluster of k_flush indels
+ int k = 0, max_ac = -1, imax_ac = -1;
+ for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; )
+ {
+ k++;
+ bcf1_t *rec = args->rbuf_lines[i];
+ if ( !(rec->d.var_type & IndelGap_set) ) continue;
+ hts_expand(int, rec->n_allele, args->ntmpi, args->tmpi);
+ int ret = bcf_calc_ac(args->hdr, rec, args->tmpi, BCF_UN_ALL);
+ if ( imax_ac==-1 || (ret && max_ac < args->tmpi[1]) ) { max_ac = args->tmpi[1]; imax_ac = i; }
+ }
+
+ // Filter all but the best indel (with max AF or first if AF not available)
+ k = 0;
+ for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; )
+ {
+ k++;
+ bcf1_t *rec = args->rbuf_lines[i];
+ if ( !(rec->d.var_type & IndelGap_set) ) continue;
+ rec->d.var_type |= IndelGap_flush;
+ if ( i!=imax_ac ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id);
+ }
+ }
+ }
+
+ if ( !line )
+ {
+ // Finished: flush everything
+ flush_buffer(args, args->rbuf.n);
+ return;
+ }
+
+ int j_flush = 1;
+ if ( args->snp_gap )
+ {
+ j_flush = 0;
+ int last_from = line->pos;
+ for (i=-1; rbuf_next(&args->rbuf,&i); )
+ {
+ bcf1_t *rec = args->rbuf_lines[i];
+ int rec_to = rec->pos + rec->d.var[0].n - 1; // last position affected by the variant
+ if ( rec_to + args->snp_gap < last_from )
+ j_flush++;
+ else if ( (var_type & VCF_INDEL) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) )
+ {
+ // this SNP has not been SnpGap-filtered yet
+ rec->d.var_type |= SnpGap_set;
+ bcf_add_filter(args->hdr, rec, args->SnpGap_id);
+ }
+ else if ( (var_type & VCF_SNP) && (rec->d.var_type & VCF_INDEL) )
+ {
+ // the line which we are adding is a SNP and needs to be filtered
+ line->d.var_type |= SnpGap_set;
+ bcf_add_filter(args->hdr, line, args->SnpGap_id);
+ break;
+ }
+ }
+ }
+ flush_buffer(args, j_flush < k_flush ? j_flush : k_flush);
+}
+
+static void set_genotypes(args_t *args, bcf1_t *line, int pass_site)
+{
+ int i,j;
+ if ( !bcf_hdr_nsamples(args->hdr) ) return;
+ if ( args->smpl_pass )
+ {
+ int npass = 0;
+ for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) npass += args->smpl_pass[i];
+
+ // return if all samples pass
+ if ( npass==bcf_hdr_nsamples(args->hdr) && (args->filter_logic & FLT_INCLUDE) ) return;
+ if ( npass==0 && (args->filter_logic & FLT_EXCLUDE) ) return;
+ }
+ else if ( pass_site ) return;
+
+ int an = 0, has_an = bcf_get_info_int32(args->hdr, line, "AN", &args->tmp_ac, &args->ntmp_ac);
+ if ( has_an==1 ) an = args->tmp_ac[0];
+ else has_an = 0;
+
+ int has_ac = bcf_get_info_int32(args->hdr, line, "AC", &args->tmp_ac, &args->ntmp_ac);
+ has_ac = has_ac==line->n_allele-1 ? 1 : 0;
+
+ int new_gt = 0, ngts = bcf_get_format_int32(args->hdr, line, "GT", &args->tmpi, &args->ntmpi);
+ ngts /= bcf_hdr_nsamples(args->hdr);
+ if ( args->set_gts==SET_GTS_MISSING ) new_gt = bcf_gt_missing;
+ else if ( args->set_gts==SET_GTS_REF ) new_gt = bcf_gt_unphased(0);
+ else error("todo: set_gts=%d\n", args->set_gts);
+ for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
+ {
+ if ( args->smpl_pass )
+ {
+ int pass = args->smpl_pass[i];
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( pass ) continue;
+ }
+ int32_t *gts = args->tmpi + ngts*i;
+ for (j=0; j<ngts; j++)
+ {
+ if ( gts[j]==bcf_int32_vector_end ) break;
+ if ( args->set_gts==SET_GTS_MISSING && !bcf_gt_is_missing(gts[j]) )
+ {
+ int ial = bcf_gt_allele(gts[j]);
+ if ( has_ac && ial>0 && ial<=line->n_allele ) args->tmp_ac[ ial-1 ]--;
+ an--;
+ }
+ else if ( args->set_gts==SET_GTS_REF )
+ {
+ int ial = bcf_gt_allele(gts[j]);
+ if ( bcf_gt_is_missing(gts[j]) ) an++;
+ else if ( has_ac && ial>0 && ial<=line->n_allele ) args->tmp_ac[ ial-1 ]--;
+ }
+ gts[j] = new_gt;
+ }
+ }
+ bcf_update_genotypes(args->hdr,line,args->tmpi,ngts*bcf_hdr_nsamples(args->hdr));
+ if ( has_an ) bcf_update_info_int32(args->hdr,line,"AN",&an,1);
+ if ( has_ac ) bcf_update_info_int32(args->hdr,line,"AC",args->tmp_ac,line->n_allele-1);
+}
+
+static void usage(args_t *args)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Apply fixed-threshold filters.\n");
+ fprintf(pysamerr, "Usage: bcftools filter [options] <in.vcf.gz>\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Options:\n");
+ fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(pysamerr, " -g, --SnpGap <int> filter SNPs within <int> base pairs of an indel\n");
+ fprintf(pysamerr, " -G, --IndelGap <int> filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
+ fprintf(pysamerr, " -i, --include <expr> include only sites for which the expression is true (see man page for details\n");
+ fprintf(pysamerr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
+ fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysamerr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " -s, --soft-filter <string> annotate FILTER column with <string> or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n");
+ fprintf(pysamerr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) or ref (0)\n");
+ fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_vcffilter(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->files = bcf_sr_init();
+ args->output_fname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ int regions_is_file = 0, targets_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"set-GTs",required_argument,NULL,'S'},
+ {"mode",required_argument,NULL,'m'},
+ {"soft-filter",required_argument,NULL,'s'},
+ {"exclude",required_argument,NULL,'e'},
+ {"include",required_argument,NULL,'i'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"SnpGap",required_argument,NULL,'g'},
+ {"IndelGap",required_argument,NULL,'G'},
+ {NULL,0,NULL,0}
+ };
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'g':
+ args->snp_gap = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --SnpGap %s\n", optarg);
+ break;
+ case 'G':
+ args->indel_gap = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --IndelGap %s\n", optarg);
+ break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 's': args->soft_filter = optarg; break;
+ case 'm':
+ if ( strchr(optarg,'x') ) args->annot_mode |= ANNOT_RESET;
+ if ( strchr(optarg,'+') ) args->annot_mode |= ANNOT_ADD;
+ break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'S':
+ if ( !strcmp(".",optarg) ) args->set_gts = SET_GTS_MISSING;
+ else if ( !strcmp("0",optarg) ) args->set_gts = SET_GTS_REF;
+ else error("The argument to -S not recognised: %s\n", optarg);
+ break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'h':
+ case '?': usage(args);
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n");
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(args);
+ }
+ else fname = argv[optind];
+
+ // read in the regions from the command line
+ if ( args->regions_list )
+ {
+ args->files->require_index = 1;
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ else if ( optind+1 < argc )
+ {
+ int i;
+ kstring_t tmp = {0,0,0};
+ kputs(argv[optind+1],&tmp);
+ for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); }
+ args->files->require_index = 1;
+ if ( bcf_sr_set_regions(args->files, tmp.s, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ free(tmp.s);
+ }
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+
+ init_data(args);
+ bcf_hdr_write(args->out_fh, args->hdr);
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files, 0);
+ int pass = 1;
+ if ( args->filter )
+ {
+ pass = filter_test(args->filter, line, &args->smpl_pass);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ }
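+        // write the site if it passes, or even when it fails if soft-filtering or -S was requested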
+ if ( args->soft_filter || args->set_gts || pass )
+ {
+ if ( pass )
+ {
+ bcf_unpack(line,BCF_UN_FLT);
+ if ( args->annot_mode & ANNOT_RESET || !line->d.n_flt ) bcf_add_filter(args->hdr, line, args->flt_pass);
+ }
+ else if ( args->soft_filter )
+ {
+ if ( (args->annot_mode & ANNOT_ADD) ) bcf_add_filter(args->hdr, line, args->flt_fail);
+ else bcf_update_filter(args->hdr, line, &args->flt_fail, 1);
+ }
+ if ( args->set_gts ) set_genotypes(args, line, pass);
+ if ( !args->rbuf_lines )
+ bcf_write1(args->out_fh, args->hdr, line);
+ else
+ buffered_filters(args, line);
+ }
+ }
+ buffered_filters(args, NULL);
+
+ hts_close(args->out_fh);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c
new file mode 100644
index 0000000..b741ef6
--- /dev/null
+++ b/bcftools/vcfgtcheck.c
@@ -0,0 +1,804 @@
+/* vcfgtcheck.c -- Check sample identity.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+
+typedef struct
+{
+ bcf_srs_t *files; // first reader is the query VCF - single sample normally or multi-sample for cross-check
+ bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF
+ int ntmp_arr, npl_arr;
+ int32_t *tmp_arr, *pl_arr;
+ double *lks, *sites;
+ int *cnts, *dps, hom_only, cross_check, all_sites;
+ char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample;
+ int argc, no_PLs;
+}
+args_t;
+
+FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
+char *msprintf(const char *fmt, ...);
+void mkdir_p(const char *fmt, ...);
+
+void py_plot(char *script)
+{
+ mkdir_p(script);
+ int len = strlen(script);
+ char *cmd = !strcmp(".py",script+len-3) ? msprintf("python %s", script) : msprintf("python %s.py", script);
+ int ret = system(cmd);
+ if ( ret ) fprintf(stderr, "The command returned non-zero status %d: %s\n", ret, cmd);
+ free(cmd);
+}
+
+static void plot_check(args_t *args, char *target_sample, char *query_sample)
+{
+ char *fname;
+ FILE *fp = open_file(&fname, "w", "%s.py", args->plot);
+ fprintf(fp,
+ "import matplotlib as mpl\n"
+ "mpl.use('Agg')\n"
+ "import matplotlib.pyplot as plt\n"
+ "import matplotlib.gridspec as gridspec\n"
+ "import csv\n"
+ "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
+ "\n"
+ "sample_ids = False\n"
+ "\n"
+ "dat = []\n"
+ "with open('%s.tab', 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " for row in reader:\n"
+ " if row[0][0]=='#': continue\n"
+ " if row[0]!='CN': continue\n"
+ " tgt = 0\n"
+ " if row[4]=='%s': tgt = 1\n"
+ " dat.append([float(row[1]), float(row[2]), float(row[3]), tgt, row[4]])\n"
+ "\n"
+ "dat = sorted(dat)\n"
+ "\n"
+ "iq = -1; dp = 0\n"
+ "for i in range(len(dat)):\n"
+ " if iq==-1 and dat[i][3]==1: iq = i\n"
+ " dp += dat[i][2]\n"
+ "dp /= len(dat)\n"
+ "\n"
+ "fig,ax1 = plt.subplots(figsize=(8,5))\n"
+ "ax2 = ax1.twinx()\n"
+ "plots = ax1.plot([x[0] for x in dat],'o-', ms=3, color='g', mec='g', label='Discordance (total)')\n"
+ "plots += ax1.plot([x[1] for x in dat], '^', ms=3, color='r', mec='r', label='Discordance (avg per site)')\n"
+ "plots += ax2.plot([x[2] for x in dat],'v', ms=3, color='k', label='Number of sites')\n"
+ "if iq!=-1:\n"
+ " ax1.plot([iq],[dat[iq][0]],'o',color='orange', ms=9)\n"
+ " ax1.annotate('%s',xy=(iq,dat[iq][0]), xytext=(5,5), textcoords='offset points',fontsize='xx-small',rotation=45,va='bottom',ha='left')\n"
+ " ax1.plot([iq],[dat[iq][1]],'^',color='red', ms=5)\n"
+ "for tl in ax1.get_yticklabels(): tl.set_color('g')\n"
+ "for tl in ax2.get_yticklabels(): tl.set_color('k'); tl.set_fontsize(9)\n"
+ "min_dp = min([x[2] for x in dat])\n"
+ "max_dp = max([x[2] for x in dat])\n"
+ "ax2.set_ylim(min_dp-1,max_dp+1)\n"
+ "ax1.set_title('Discordance with %s')\n"
+ "ax1.set_xlim(-0.05*len(dat),1.05*(len(dat)-1))\n"
+ "ax1.set_xlabel('Sample ID')\n"
+ "plt.subplots_adjust(left=0.1,right=0.9,bottom=0.1,top=0.9)\n"
+ "if sample_ids:\n"
+ " ax1.set_xticks(range(len(dat)))\n"
+ " ax1.set_xticklabels([x[4] for x in dat],**{'rotation':45, 'ha':'right', 'fontsize':8})\n"
+ " plt.subplots_adjust(bottom=0.2)\n"
+ "ax1.set_ylabel('Discordance',color='g')\n"
+ "ax2.set_ylabel('Number of sites',color='k')\n"
+ "ax2.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n"
+ "ax1.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n"
+ "labels = [l.get_label() for l in plots]\n"
+ "plt.legend(plots,labels,numpoints=1,markerscale=1,loc='best',prop={'size':10},frameon=False)\n"
+ "plt.savefig('%s.png')\n"
+ "plt.close()\n"
+ "\n", args->plot, target_sample, target_sample, query_sample, args->plot
+ );
+ fclose(fp);
+ py_plot(fname);
+ free(fname);
+}
+
+static void plot_cross_check(args_t *args)
+{
+ char *fname;
+ FILE *fp = open_file(&fname, "w", "%s.py", args->plot);
+ fprintf(fp,
+ "import matplotlib as mpl\n"
+ "mpl.use('Agg')\n"
+ "import matplotlib.pyplot as plt\n"
+ "import matplotlib.gridspec as gridspec\n"
+ "import csv\n"
+ "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
+ "avg = []\n"
+ "dp = []\n"
+ "sm2id = {}\n"
+ "dat = None\n"
+ "min = None\n"
+ "max = None\n"
+ "with open('%s.tab', 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " i = 0\n"
+ " for row in reader:\n"
+ " if row[0]=='SM':\n"
+ " sm2id[row[4]] = i\n"
+ " avg.append([i,float(row[1])])\n"
+ " dp.append([i,float(row[2])])\n"
+ " i += 1\n"
+ " elif row[0]=='CN':\n"
+ " val = 0\n"
+ " if int(row[2])!=0: val = float(row[1])/int(row[2])\n"
+ " if not dat:\n"
+ " dat = [[0]*len(sm2id) for x in xrange(len(sm2id))]\n"
+ " min = val\n"
+ " max = val\n"
+ " id_i = sm2id[row[4]]\n"
+ " id_j = sm2id[row[5]]\n"
+ " dat[id_i][id_j] = val\n"
+ " dat[id_j][id_i] = val\n"
+ " if min > val: min = val\n"
+ " if max < val: max = val\n"
+ "\n"
+ "if len(sm2id)<=1: exit(1)\n"
+ "if min==max: exit(1)\n"
+ "\n"
+ "fig = plt.figure(figsize=(6,7))\n"
+ "gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1.5])\n"
+ "ax1 = plt.subplot(gs[0])\n"
+ "ax2 = plt.subplot(gs[1])\n"
+ "\n"
+ "ax1.plot([x[0] for x in avg],[x[1] for x in avg],'^-', ms=3, color='k')\n"
+ "ax3 = ax1.twinx()\n"
+ "ax3.plot([x[0] for x in dp],[x[1] for x in dp],'^-', ms=3, color='r',mec='r')\n"
+ "for tl in ax3.get_yticklabels():\n"
+ " tl.set_color('r')\n"
+ " tl.set_fontsize(9)\n"
+ "\n"
+ "im = ax2.imshow(dat,clim=(min),interpolation='nearest',origin='lower')\n"
+ "cb1 = plt.colorbar(im,ax=ax2)\n"
+ "cb1.set_label('Pairwise discordance')\n"
+ "for t in cb1.ax.get_yticklabels(): t.set_fontsize(9)\n"
+ "\n"
+ "ax1.tick_params(axis='both', which='major', labelsize=9)\n"
+ "ax1.tick_params(axis='both', which='minor', labelsize=9)\n"
+ "ax2.tick_params(axis='both', which='major', labelsize=9)\n"
+ "ax2.tick_params(axis='both', which='minor', labelsize=9)\n"
+ "\n"
+ "ax1.set_title('Sample Discordance Score')\n"
+ "ax2.set_ylabel('Sample ID')\n"
+ "ax2.set_xlabel('Sample ID')\n"
+ "ax3.set_ylabel('Average Depth',color='r')\n"
+ "ax1.set_xlabel('Sample ID')\n"
+ "ax1.set_ylabel('Average discordance')\n"
+ "\n"
+ "plt.subplots_adjust(left=0.15,right=0.87,bottom=0.08,top=0.93,hspace=0.25)\n"
+ "plt.savefig('%s.png')\n"
+ "plt.close()\n"
+ "\n", args->plot,args->plot
+ );
+ fclose(fp);
+ py_plot(fname);
+ free(fname);
+}
+
+static void init_data(args_t *args)
+{
+ args->sm_hdr = args->files->readers[0].header;
+ if ( !bcf_hdr_nsamples(args->sm_hdr) ) error("No samples in %s?\n", args->files->readers[0].fname);
+
+ if ( !args->cross_check )
+ {
+ args->gt_hdr = args->files->readers[1].header;
+ int nsamples = bcf_hdr_nsamples(args->gt_hdr);
+ if ( !nsamples ) error("No samples in %s?\n", args->files->readers[1].fname);
+ args->lks = (double*) calloc(nsamples,sizeof(double));
+ args->cnts = (int*) calloc(nsamples,sizeof(int));
+ args->sites = (double*) calloc(nsamples,sizeof(double));
+ args->dps = (int*) calloc(nsamples,sizeof(int));
+ }
+ else
+ {
+ int nsamples = bcf_hdr_nsamples(args->sm_hdr);
+ int narr = (nsamples-1)*nsamples/2;
+ args->lks = (double*) calloc(narr,sizeof(double));
+ args->cnts = (int*) calloc(narr,sizeof(int));
+ args->dps = (int*) calloc(narr,sizeof(int));
+ }
+}
+
+static void destroy_data(args_t *args)
+{
+ free(args->lks); free(args->cnts); free(args->dps); free(args->cwd); free(args->sites);
+}
+
+static int allele_to_int(bcf1_t *line, char *allele)
+{
+ int i;
+ for (i=0; i<line->n_allele; i++)
+ if ( !strcmp(allele,line->d.allele[i]) ) return i;
+ if ( strcmp(line->d.allele[i-1],"X") ) return -1;
+ return i-1;
+}
+
+static int init_gt2ipl(args_t *args, bcf1_t *gt_line, bcf1_t *sm_line, int *gt2ipl, int n_gt2ipl)
+{
+ int i, j;
+ for (i=0; i<n_gt2ipl; i++) gt2ipl[i] = -1;
+ for (i=0; i<gt_line->n_allele; i++)
+ {
+ // find which of the sm_alleles (k) corresponds to the gt_allele (i)
+ int k = allele_to_int(sm_line, gt_line->d.allele[i]);
+ if ( k<0 ) return 0;
+ for (j=0; j<=i; j++)
+ {
+ int l = allele_to_int(sm_line, gt_line->d.allele[j]);
+ if ( l<0 ) return 0;
+ gt2ipl[ bcf_ij2G(j,i) ] = k<=l ? bcf_ij2G(k,l) : bcf_ij2G(l,k);
+ }
+ }
+ //for (i=0; i<n_gt2ipl; i++) printf("%d .. %d\n", i,gt2ipl[i]);
+ return 1;
+}
+
+static void set_cwd(args_t *args)
+{
+ int i;
+ char *buf;
+ size_t nbuf = 500;
+ args->cwd = (char*) malloc(sizeof(char)*nbuf);
+ for (i=0; i<5; i++)
+ {
+ if ( (buf = getcwd(args->cwd, nbuf)) ) break;
+ nbuf *= 2;
+ args->cwd = (char*) realloc(args->cwd, sizeof(char)*nbuf);
+ }
+ assert(buf);
+}
+
+static void print_header(args_t *args, FILE *fp)
+{
+ fprintf(fp, "# This file was produced by bcftools (%s+htslib-%s), the command line was:\n", bcftools_version(), hts_version());
+ fprintf(fp, "# \t bcftools %s ", args->argv[0]);
+ int i;
+ for (i=1; i<args->argc; i++)
+ fprintf(fp, " %s",args->argv[i]);
+ fprintf(fp, "\n# and the working directory was:\n");
+ fprintf(fp, "# \t %s\n#\n", args->cwd);
+}
+
+static int fake_PLs(args_t *args, bcf_hdr_t *hdr, bcf1_t *line)
+{
+ // PLs not present, use GTs instead.
+ int fake_PL = args->no_PLs ? args->no_PLs : 99; // with 1, discordance is the number of non-matching GTs
+ int nsm_gt, i;
+ if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 )
+ error("GT not present at %s:%d?\n", hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
+ nsm_gt /= bcf_hdr_nsamples(hdr);
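+    // a diploid site with n alleles has n*(n+1)/2 possible genotypes, hence that many PL values per sample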
+ int npl = line->n_allele*(line->n_allele+1)/2;
+ hts_expand(int,npl*bcf_hdr_nsamples(hdr),args->npl_arr,args->pl_arr);
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ {
+ int *gt_ptr = args->tmp_arr + i*nsm_gt;
+ int j, *pl_ptr = args->pl_arr + i*npl;
+ if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) // missing genotype
+ {
+ for (j=0; j<npl; j++) pl_ptr[j] = -1;
+ }
+ else
+ {
+ int a = bcf_gt_allele(gt_ptr[0]);
+ int b = bcf_gt_allele(gt_ptr[1]);
+ for (j=0; j<npl; j++) pl_ptr[j] = fake_PL;
+ int idx = bcf_alleles2gt(a,b);
+ pl_ptr[idx] = 0;
+ }
+ }
+ return npl;
+}
+
+static int cmp_doubleptr(const void *_a, const void *_b)
+{
+ double *a = *((double**)_a);
+ double *b = *((double**)_b);
+ if ( *a < *b ) return -1;
+ else if ( *a == *b ) return 0;
+ return 1;
+}
+
+static void check_gt(args_t *args)
+{
+ int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
+ int fake_pls = args->no_PLs;
+
+ // Initialize things: check which tags are defined in the header, sample names etc.
+ if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
+ if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
+ {
+ if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
+ error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
+ if ( !args->no_PLs )
+ fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
+ fake_pls = 1;
+ }
+
+ FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
+ print_header(args, fp);
+
+ int tgt_isample = -1, query_isample = 0;
+ if ( args->target_sample )
+ {
+ tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
+ if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
+ }
+ if ( args->all_sites )
+ {
+ if ( tgt_isample==-1 )
+ {
+ fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
+ tgt_isample = 0;
+ }
+ }
+ if ( args->query_sample )
+ {
+ query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
+ if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
+ }
+ if ( args->all_sites )
+ fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]match log LK\t[7]Query alleles\t[8-]Query PLs (%s)\n",
+ args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);
+
+ // Main loop
+ float prev_lk = 0;
+ while ( (ret=bcf_sr_next_line(args->files)) )
+ {
+ if ( ret!=2 ) continue;
+ bcf1_t *sm_line = args->files->readers[0].buffer[0]; // the query file
+ bcf1_t *gt_line = args->files->readers[1].buffer[0]; // the -g target file
+ bcf_unpack(sm_line, BCF_UN_FMT);
+ bcf_unpack(gt_line, BCF_UN_FMT);
+
+ // Init mapping from target genotype index to the sample's PL fields
+ int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
+ if ( n_gt2ipl > m_gt2ipl )
+ {
+ m_gt2ipl = n_gt2ipl;
+ gt2ipl = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl);
+ }
+ if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;
+
+ // Target genotypes
+ int ngt, npl;
+        if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 )
+ error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
+ ngt /= bcf_hdr_nsamples(args->gt_hdr);
+ if ( ngt!=2 ) continue; // checking only diploid genotypes
+
+ // Sample PLs
+ if ( !fake_pls )
+ {
+ if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
+ {
+ if ( sm_line->n_allele==1 )
+ {
+ // PL values may not be present when ALT=. (mpileup/bcftools output), in that case
+ // switch automatically to GT at these sites
+ npl = fake_PLs(args, args->sm_hdr, sm_line);
+ }
+ else
+ error("PL not present at %s:%d?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1);
+ }
+ else
+ npl /= bcf_hdr_nsamples(args->sm_hdr);
+ }
+ else
+ npl = fake_PLs(args, args->sm_hdr, sm_line);
+
+ // Calculate likelihoods for all samples, assuming diploid genotypes
+
+ // For faster access to genotype likelihoods (PLs) of the query sample
+ int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
+ double sum_pl = 0; // for converting PLs to probs
+ for (max_ipl=0; max_ipl<npl; max_ipl++)
+ {
+ if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
+ if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
+ sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
+ }
+ if ( sum_pl==0 ) continue; // no PLs present
+ if ( fake_pls && args->no_PLs==1 ) sum_pl = -1;
+
+ // The main stats: concordance of the query sample with the target -g samples
+ for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
+ {
+ int *gt_ptr = gt_arr + i*ngt;
+ if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes
+ if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) continue;
+ int a = bcf_gt_allele(gt_ptr[0]);
+ int b = bcf_gt_allele(gt_ptr[1]);
+ if ( args->hom_only && a!=b ) continue; // heterozygous genotype
+            int igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file
+ int igt_qry = gt2ipl[igt_tgt]; // corresponding genotype in query file
+ if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue; // genotype not present in query sample: haploid or missing
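+            // with -G1 (sum_pl<0) this subtracts one per mismatching GT; otherwise it adds the log-probability of the target genotype under the query PLs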
+ args->lks[i] += sum_pl<0 ? -pl_ptr[igt_qry] : log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl);
+ args->sites[i]++;
+ }
+ if ( args->all_sites )
+ {
+ // Print LKs at all sites for debugging
+ int *gt_ptr = gt_arr + tgt_isample*ngt;
+ if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes
+ int a = bcf_gt_allele(gt_ptr[0]);
+ int b = bcf_gt_allele(gt_ptr[1]);
+ if ( args->hom_only && a!=b ) continue; // heterozygous genotype
+ fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
+ for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
+ fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");
+ fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk);
+ prev_lk = args->lks[query_isample];
+
+ int igt, *pl_ptr = args->pl_arr + query_isample*npl; // PLs of the query sample
+ for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]);
+ for (igt=0; igt<npl; igt++)
+ if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
+ else if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, ".");
+ else fprintf(fp, "\t%d", pl_ptr[igt]);
+ fprintf(fp, "\n");
+ }
+ }
+ free(gt2ipl);
+ free(gt_arr);
+ free(args->pl_arr);
+ free(args->tmp_arr);
+
+ // To be able to plot total discordance (=number of mismatching GTs with -G1) in the same
+ // plot as discordance per site, the latter must be scaled to the same range
+ int nsamples = bcf_hdr_nsamples(args->gt_hdr);
+ double extreme_lk = 0, extreme_lk_per_site = 0;
+ for (i=0; i<nsamples; i++)
+ {
+ if ( args->lks[i] < extreme_lk ) extreme_lk = args->lks[i];
+ if ( args->sites[i] && args->lks[i]/args->sites[i] < extreme_lk_per_site ) extreme_lk_per_site = args->lks[i]/args->sites[i];
+ }
+
+ // Sorted output
+ double **p = (double**) malloc(sizeof(double*)*nsamples);
+ for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
+ qsort(p, nsamples, sizeof(int*), cmp_doubleptr);
+
+ fprintf(fp, "# [1]CN\t[2]Discordance with %s (total)\t[3]Discordance (avg score per site)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
+ for (i=0; i<nsamples; i++)
+ {
+ int idx = p[i] - args->lks;
+ double per_site = 0;
+ if ( args->sites[idx] )
+ {
+ if ( args->sites[idx] && extreme_lk_per_site )
+ {
+ per_site = args->lks[idx]/args->sites[idx];
+ per_site *= extreme_lk / extreme_lk_per_site;
+ }
+ else
+ per_site = 0;
+ }
+ fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", fabs(args->lks[idx]), fabs(per_site), args->sites[idx], args->gt_hdr->samples[idx], i);
+ }
+
+ if ( args->plot )
+ {
+ fclose(fp);
+ plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
+ }
+}
+
+static inline int is_hom_most_likely(int nals, int *pls)
+{
+ int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
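+    // PLs follow the VCF genotype ordering: for each allele ia, the het entries with ib<ia come first, then the hom ia/ia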
+ for (ia=1; ia<nals; ia++)
+ {
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
+ idx++;
+ }
+ if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
+ idx++;
+ }
+ return min_is_hom;
+}
+
+static void cross_check_gts(args_t *args)
+{
+ int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0;
+ unsigned int *dp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)), *ndp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)); // this will overflow one day...
+ int fake_pls = args->no_PLs, ignore_dp = 0;
+
+ int i,j,k,idx, pl_warned = 0, dp_warned = 0;
+ int32_t *dp_arr = NULL;
+ int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL;
+ if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
+ {
+ if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
+ error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
+ if ( !args->no_PLs )
+ fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
+ fake_pls = 1;
+ }
+ if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1;
+
+ FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
+ print_header(args, fp);
+ if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n");
+
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = args->files->readers[0].buffer[0];
+ bcf_unpack(line, BCF_UN_FMT);
+
+ int npl;
+ if ( !fake_pls )
+ {
+ npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr);
+ if ( npl<=0 ) { pl_warned++; continue; }
+ npl /= nsamples;
+ }
+ else
+ npl = fake_PLs(args, args->sm_hdr, line);
+ int mdp = 0;
+ if ( !ignore_dp && (mdp=bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr)) <= 0 ) dp_warned++;
+
+ if ( args->hom_only )
+ {
+ for (i=0; i<nsamples; i++)
+ is_hom[i] = is_hom_most_likely(line->n_allele, args->pl_arr+i*npl);
+ }
+
+ double sum = 0; int nsum = 0;
+ idx = 0;
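+        // lks/cnts/dps hold the pairs (i,j) with j<i laid out linearly; skipping sample i therefore advances idx by i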
+ for (i=0; i<nsamples; i++)
+ {
+ int *ipl = &args->pl_arr[i*npl];
+ if ( *ipl==-1 ) { idx += i; continue; } // missing genotype
+ if ( mdp>0 && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; }
+ if ( args->hom_only && !is_hom[i] ) { idx += i; continue; }
+
+ for (j=0; j<i; j++)
+ {
+ int *jpl = &args->pl_arr[j*npl];
+ if ( *jpl==-1 ) { idx++; continue; } // missing genotype
+ if ( mdp>0 && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; }
+ if ( args->hom_only && !is_hom[j] ) { idx++; continue; }
+
+ int min_pl = INT_MAX;
+ for (k=0; k<npl; k++)
+ {
+ if ( ipl[k]==bcf_int32_missing || jpl[k]==bcf_int32_missing ) break;
+ if ( ipl[k]==bcf_int32_vector_end || jpl[k]==bcf_int32_vector_end ) { k = npl; break; }
+ if ( min_pl > ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k];
+ }
+ if ( k!=npl ) { idx++; continue; }
+
+ if ( args->all_sites ) { sum += min_pl; nsum++; }
+ args->lks[idx] += min_pl;
+ args->cnts[idx]++;
+
+ if ( mdp>0 )
+ {
+ args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j];
+ dp[i] += dp_arr[i]; ndp[i]++;
+ dp[j] += dp_arr[j]; ndp[j]++;
+ }
+ else
+ {
+ args->dps[idx]++;
+ dp[i]++; ndp[i]++;
+ dp[j]++; ndp[j]++;
+ }
+ idx++;
+ }
+ }
+ if ( args->all_sites )
+ fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0);
+ }
+ if ( dp_arr ) free(dp_arr);
+ if ( args->pl_arr ) free(args->pl_arr);
+ if ( args->tmp_arr ) free(args->tmp_arr);
+ if ( is_hom ) free(is_hom);
+
+ if ( pl_warned ) fprintf(stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned);
+ if ( dp_warned ) fprintf(stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned);
+
+ // Output samples sorted by average discordance
+ double *score = (double*) calloc(nsamples,sizeof(double));
+ args->sites = (double*) calloc(nsamples,sizeof(double));
+ idx = 0;
+ for (i=0; i<nsamples; i++)
+ {
+ for (j=0; j<i; j++)
+ {
+ score[i] += args->lks[idx];
+ score[j] += args->lks[idx];
+ args->sites[i] += args->cnts[idx];
+ args->sites[j] += args->cnts[idx];
+ idx++;
+ }
+ }
+ for (i=0; i<nsamples; i++)
+ if ( args->sites[i] ) score[i] /= args->sites[i];
+ double **p = (double**) malloc(sizeof(double*)*nsamples), avg_score = 0;
+ for (i=0; i<nsamples; i++) p[i] = &score[i];
+ qsort(p, nsamples, sizeof(int*), cmp_doubleptr);
+ // The average discordance gives the number of differing sites in % with -G1
+ fprintf(fp, "# [1]SM\t[2]Average Discordance\t[3]Average depth\t[4]Average number of sites\t[5]Sample\t[6]Sample ID\n");
+ for (i=0; i<nsamples; i++)
+ {
+ idx = p[i] - score;
+ double adp = ndp[idx] ? (double)dp[idx]/ndp[idx] : 0;
+ double nsites = args->sites[idx]/(nsamples-1);
+ avg_score += score[idx];
+ fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i);
+ }
+
+ // // Overall score: maximum absolute deviation from the average score
+ // fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n");
+ // fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]); // idx still set
+ free(p);
+ free(score);
+ free(dp);
+ free(ndp);
+
+ // Pairwise discordances
+ fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
+ idx = 0;
+ for (i=0; i<nsamples; i++)
+ {
+ for (j=0; j<i; j++)
+ {
+ fprintf(fp, "CN\t%.0f\t%d\t%.2f\t%s\t%s\n", args->lks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0,
+ args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
+ idx++;
+ }
+ }
+ fclose(fp);
+ if ( args->plot )
+ plot_cross_check(args);
+}
+
+static char *init_prefix(char *prefix)
+{
+ int len = strlen(prefix);
+ if ( prefix[len-1] == '/' || prefix[len-1] == '\\' )
+ return msprintf("%sgtcheck", prefix);
+ return strdup(prefix);
+}
+
+static void usage(void)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Check sample identity. With no -g BCF given, multi-sample cross-check is performed.\n");
+ fprintf(stderr, "Usage: bcftools gtcheck [options] [-g <genotypes.vcf.gz>] <query.vcf.gz>\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -a, --all-sites output comparison for all sites\n");
+ fprintf(stderr, " -g, --genotypes <file> genotypes to compare against\n");
+ fprintf(stderr, " -G, --GTs-only <int> use GTs, ignore PLs, using <int> for unseen genotypes [99]\n");
+ fprintf(stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n");
+ fprintf(stderr, " -p, --plot <prefix> plot\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " -s, --query-sample <string> query sample (by default the first sample is checked)\n");
+ fprintf(stderr, " -S, --target-sample <string> target sample in the -g file (used only for plotting)\n");
+ fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+int main_vcfgtcheck(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->files = bcf_sr_init();
+ args->argc = argc; args->argv = argv; set_cwd(args);
+ char *regions = NULL, *targets = NULL;
+ int regions_is_file = 0, targets_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"GTs-only",1,0,'G'},
+ {"all-sites",0,0,'a'},
+ {"homs-only",0,0,'H'},
+ {"help",0,0,'h'},
+ {"genotypes",1,0,'g'},
+ {"plot",1,0,'p'},
+ {"target-sample",1,0,'S'},
+ {"query-sample",1,0,'s'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {0,0,0,0}
+ };
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'G':
+ args->no_PLs = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg);
+ break;
+ case 'a': args->all_sites = 1; break;
+ case 'H': args->hom_only = 1; break;
+ case 'g': args->gt_fname = optarg; break;
+ case 'p': args->plot = optarg; break;
+ case 'S': args->target_sample = optarg; break;
+ case 's': args->query_sample = optarg; break;
+ case 'r': regions = optarg; break;
+ case 'R': regions = optarg; regions_is_file = 1; break;
+ case 't': targets = optarg; break;
+ case 'T': targets = optarg; targets_is_file = 1; break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ char *fname = NULL;
+ if ( optind==argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(); // no files given
+ }
+ else fname = argv[optind];
+ if ( argc>optind+1 ) usage(); // too many files given
+ if ( !args->gt_fname ) args->cross_check = 1; // no genotype file, run in cross-check mode
+ else args->files->require_index = 1;
+ if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions);
+ if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets);
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+ if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) error("Failed to open %s: %s\n", args->gt_fname,bcf_sr_strerror(args->files->errnum));
+ args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS;
+ if ( args->plot ) args->plot = init_prefix(args->plot);
+ init_data(args);
+ if ( args->cross_check )
+ cross_check_gts(args);
+ else
+ check_gt(args);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ if (args->plot) free(args->plot);
+ free(args);
+ return 0;
+}
+
diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c
new file mode 100644
index 0000000..161ca3c
--- /dev/null
+++ b/bcftools/vcfgtcheck.c.pysam.c
@@ -0,0 +1,806 @@
+#include "pysam.h"
+
+/* vcfgtcheck.c -- Check sample identity.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+
+typedef struct
+{
+ bcf_srs_t *files; // first reader is the query VCF - single sample normally or multi-sample for cross-check
+ bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF
+ int ntmp_arr, npl_arr;
+ int32_t *tmp_arr, *pl_arr;
+ double *lks, *sites;
+ int *cnts, *dps, hom_only, cross_check, all_sites;
+ char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample;
+ int argc, no_PLs;
+}
+args_t;
+
+FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
+char *msprintf(const char *fmt, ...);
+void mkdir_p(const char *fmt, ...);
+
+void py_plot(char *script)
+{
+ mkdir_p(script);
+ int len = strlen(script);
+ char *cmd = !strcmp(".py",script+len-3) ? msprintf("python %s", script) : msprintf("python %s.py", script);
+ int ret = system(cmd);
+ if ( ret ) fprintf(pysamerr, "The command returned non-zero status %d: %s\n", ret, cmd);
+ free(cmd);
+}
+
+static void plot_check(args_t *args, char *target_sample, char *query_sample)
+{
+ char *fname;
+ FILE *fp = open_file(&fname, "w", "%s.py", args->plot);
+ fprintf(fp,
+ "import matplotlib as mpl\n"
+ "mpl.use('Agg')\n"
+ "import matplotlib.pyplot as plt\n"
+ "import matplotlib.gridspec as gridspec\n"
+ "import csv\n"
+ "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
+ "\n"
+ "sample_ids = False\n"
+ "\n"
+ "dat = []\n"
+ "with open('%s.tab', 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " for row in reader:\n"
+ " if row[0][0]=='#': continue\n"
+ " if row[0]!='CN': continue\n"
+ " tgt = 0\n"
+ " if row[4]=='%s': tgt = 1\n"
+ " dat.append([float(row[1]), float(row[2]), float(row[3]), tgt, row[4]])\n"
+ "\n"
+ "dat = sorted(dat)\n"
+ "\n"
+ "iq = -1; dp = 0\n"
+ "for i in range(len(dat)):\n"
+ " if iq==-1 and dat[i][3]==1: iq = i\n"
+ " dp += dat[i][2]\n"
+ "dp /= len(dat)\n"
+ "\n"
+ "fig,ax1 = plt.subplots(figsize=(8,5))\n"
+ "ax2 = ax1.twinx()\n"
+ "plots = ax1.plot([x[0] for x in dat],'o-', ms=3, color='g', mec='g', label='Discordance (total)')\n"
+ "plots += ax1.plot([x[1] for x in dat], '^', ms=3, color='r', mec='r', label='Discordance (avg per site)')\n"
+ "plots += ax2.plot([x[2] for x in dat],'v', ms=3, color='k', label='Number of sites')\n"
+ "if iq!=-1:\n"
+ " ax1.plot([iq],[dat[iq][0]],'o',color='orange', ms=9)\n"
+ " ax1.annotate('%s',xy=(iq,dat[iq][0]), xytext=(5,5), textcoords='offset points',fontsize='xx-small',rotation=45,va='bottom',ha='left')\n"
+ " ax1.plot([iq],[dat[iq][1]],'^',color='red', ms=5)\n"
+ "for tl in ax1.get_yticklabels(): tl.set_color('g')\n"
+ "for tl in ax2.get_yticklabels(): tl.set_color('k'); tl.set_fontsize(9)\n"
+ "min_dp = min([x[2] for x in dat])\n"
+ "max_dp = max([x[2] for x in dat])\n"
+ "ax2.set_ylim(min_dp-1,max_dp+1)\n"
+ "ax1.set_title('Discordance with %s')\n"
+ "ax1.set_xlim(-0.05*len(dat),1.05*(len(dat)-1))\n"
+ "ax1.set_xlabel('Sample ID')\n"
+ "plt.subplots_adjust(left=0.1,right=0.9,bottom=0.1,top=0.9)\n"
+ "if sample_ids:\n"
+ " ax1.set_xticks(range(len(dat)))\n"
+ " ax1.set_xticklabels([x[4] for x in dat],**{'rotation':45, 'ha':'right', 'fontsize':8})\n"
+ " plt.subplots_adjust(bottom=0.2)\n"
+ "ax1.set_ylabel('Discordance',color='g')\n"
+ "ax2.set_ylabel('Number of sites',color='k')\n"
+ "ax2.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n"
+ "ax1.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n"
+ "labels = [l.get_label() for l in plots]\n"
+ "plt.legend(plots,labels,numpoints=1,markerscale=1,loc='best',prop={'size':10},frameon=False)\n"
+ "plt.savefig('%s.png')\n"
+ "plt.close()\n"
+ "\n", args->plot, target_sample, target_sample, query_sample, args->plot
+ );
+ fclose(fp);
+ py_plot(fname);
+ free(fname);
+}
+
+static void plot_cross_check(args_t *args)
+{
+ char *fname;
+ FILE *fp = open_file(&fname, "w", "%s.py", args->plot);
+ fprintf(fp,
+ "import matplotlib as mpl\n"
+ "mpl.use('Agg')\n"
+ "import matplotlib.pyplot as plt\n"
+ "import matplotlib.gridspec as gridspec\n"
+ "import csv\n"
+ "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
+ "avg = []\n"
+ "dp = []\n"
+ "sm2id = {}\n"
+ "dat = None\n"
+ "min = None\n"
+ "max = None\n"
+ "with open('%s.tab', 'rb') as f:\n"
+ " reader = csv.reader(f, 'tab')\n"
+ " i = 0\n"
+ " for row in reader:\n"
+ " if row[0]=='SM':\n"
+ " sm2id[row[4]] = i\n"
+ " avg.append([i,float(row[1])])\n"
+ " dp.append([i,float(row[2])])\n"
+ " i += 1\n"
+ " elif row[0]=='CN':\n"
+ " val = 0\n"
+ " if int(row[2])!=0: val = float(row[1])/int(row[2])\n"
+ " if not dat:\n"
+ " dat = [[0]*len(sm2id) for x in xrange(len(sm2id))]\n"
+ " min = val\n"
+ " max = val\n"
+ " id_i = sm2id[row[4]]\n"
+ " id_j = sm2id[row[5]]\n"
+ " dat[id_i][id_j] = val\n"
+ " dat[id_j][id_i] = val\n"
+ " if min > val: min = val\n"
+ " if max < val: max = val\n"
+ "\n"
+ "if len(sm2id)<=1: exit(1)\n"
+ "if min==max: exit(1)\n"
+ "\n"
+ "fig = plt.figure(figsize=(6,7))\n"
+ "gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1.5])\n"
+ "ax1 = plt.subplot(gs[0])\n"
+ "ax2 = plt.subplot(gs[1])\n"
+ "\n"
+ "ax1.plot([x[0] for x in avg],[x[1] for x in avg],'^-', ms=3, color='k')\n"
+ "ax3 = ax1.twinx()\n"
+ "ax3.plot([x[0] for x in dp],[x[1] for x in dp],'^-', ms=3, color='r',mec='r')\n"
+ "for tl in ax3.get_yticklabels():\n"
+ " tl.set_color('r')\n"
+ " tl.set_fontsize(9)\n"
+ "\n"
+ "im = ax2.imshow(dat,clim=(min),interpolation='nearest',origin='lower')\n"
+ "cb1 = plt.colorbar(im,ax=ax2)\n"
+ "cb1.set_label('Pairwise discordance')\n"
+ "for t in cb1.ax.get_yticklabels(): t.set_fontsize(9)\n"
+ "\n"
+ "ax1.tick_params(axis='both', which='major', labelsize=9)\n"
+ "ax1.tick_params(axis='both', which='minor', labelsize=9)\n"
+ "ax2.tick_params(axis='both', which='major', labelsize=9)\n"
+ "ax2.tick_params(axis='both', which='minor', labelsize=9)\n"
+ "\n"
+ "ax1.set_title('Sample Discordance Score')\n"
+ "ax2.set_ylabel('Sample ID')\n"
+ "ax2.set_xlabel('Sample ID')\n"
+ "ax3.set_ylabel('Average Depth',color='r')\n"
+ "ax1.set_xlabel('Sample ID')\n"
+ "ax1.set_ylabel('Average discordance')\n"
+ "\n"
+ "plt.subplots_adjust(left=0.15,right=0.87,bottom=0.08,top=0.93,hspace=0.25)\n"
+ "plt.savefig('%s.png')\n"
+ "plt.close()\n"
+ "\n", args->plot,args->plot
+ );
+ fclose(fp);
+ py_plot(fname);
+ free(fname);
+}
+
+static void init_data(args_t *args)
+{
+ args->sm_hdr = args->files->readers[0].header;
+ if ( !bcf_hdr_nsamples(args->sm_hdr) ) error("No samples in %s?\n", args->files->readers[0].fname);
+
+ if ( !args->cross_check )
+ {
+ args->gt_hdr = args->files->readers[1].header;
+ int nsamples = bcf_hdr_nsamples(args->gt_hdr);
+ if ( !nsamples ) error("No samples in %s?\n", args->files->readers[1].fname);
+ args->lks = (double*) calloc(nsamples,sizeof(double));
+ args->cnts = (int*) calloc(nsamples,sizeof(int));
+ args->sites = (double*) calloc(nsamples,sizeof(double));
+ args->dps = (int*) calloc(nsamples,sizeof(int));
+ }
+ else
+ {
+ int nsamples = bcf_hdr_nsamples(args->sm_hdr);
+ int narr = (nsamples-1)*nsamples/2;
+ args->lks = (double*) calloc(narr,sizeof(double));
+ args->cnts = (int*) calloc(narr,sizeof(int));
+ args->dps = (int*) calloc(narr,sizeof(int));
+ }
+}
+
+static void destroy_data(args_t *args)
+{
+ free(args->lks); free(args->cnts); free(args->dps); free(args->cwd); free(args->sites);
+}
+
+static int allele_to_int(bcf1_t *line, char *allele)
+{
+ int i;
+ for (i=0; i<line->n_allele; i++)
+ if ( !strcmp(allele,line->d.allele[i]) ) return i;
+ if ( strcmp(line->d.allele[i-1],"X") ) return -1;
+ return i-1;
+}
+
+static int init_gt2ipl(args_t *args, bcf1_t *gt_line, bcf1_t *sm_line, int *gt2ipl, int n_gt2ipl)
+{
+ int i, j;
+ for (i=0; i<n_gt2ipl; i++) gt2ipl[i] = -1;
+ for (i=0; i<gt_line->n_allele; i++)
+ {
+ // find which of the sm_alleles (k) corresponds to the gt_allele (i)
+ int k = allele_to_int(sm_line, gt_line->d.allele[i]);
+ if ( k<0 ) return 0;
+ for (j=0; j<=i; j++)
+ {
+ int l = allele_to_int(sm_line, gt_line->d.allele[j]);
+ if ( l<0 ) return 0;
+ gt2ipl[ bcf_ij2G(j,i) ] = k<=l ? bcf_ij2G(k,l) : bcf_ij2G(l,k);
+ }
+ }
+ //for (i=0; i<n_gt2ipl; i++) printf("%d .. %d\n", i,gt2ipl[i]);
+ return 1;
+}
+
+static void set_cwd(args_t *args)
+{
+ int i;
+ char *buf;
+ size_t nbuf = 500;
+ args->cwd = (char*) malloc(sizeof(char)*nbuf);
+ for (i=0; i<5; i++)
+ {
+ if ( (buf = getcwd(args->cwd, nbuf)) ) break;
+ nbuf *= 2;
+ args->cwd = (char*) realloc(args->cwd, sizeof(char)*nbuf);
+ }
+ assert(buf);
+}
+
+static void print_header(args_t *args, FILE *fp)
+{
+ fprintf(fp, "# This file was produced by bcftools (%s+htslib-%s), the command line was:\n", bcftools_version(), hts_version());
+ fprintf(fp, "# \t bcftools %s ", args->argv[0]);
+ int i;
+ for (i=1; i<args->argc; i++)
+ fprintf(fp, " %s",args->argv[i]);
+ fprintf(fp, "\n# and the working directory was:\n");
+ fprintf(fp, "# \t %s\n#\n", args->cwd);
+}
+
+static int fake_PLs(args_t *args, bcf_hdr_t *hdr, bcf1_t *line)
+{
+ // PLs not present, use GTs instead.
+ int fake_PL = args->no_PLs ? args->no_PLs : 99; // with 1, discordance is the number of non-matching GTs
+ int nsm_gt, i;
+ if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 )
+ error("GT not present at %s:%d?\n", hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
+ nsm_gt /= bcf_hdr_nsamples(hdr);
+ int npl = line->n_allele*(line->n_allele+1)/2;
+ hts_expand(int,npl*bcf_hdr_nsamples(hdr),args->npl_arr,args->pl_arr);
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ {
+ int *gt_ptr = args->tmp_arr + i*nsm_gt;
+ int j, *pl_ptr = args->pl_arr + i*npl;
+ if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) // missing genotype
+ {
+ for (j=0; j<npl; j++) pl_ptr[j] = -1;
+ }
+ else
+ {
+ int a = bcf_gt_allele(gt_ptr[0]);
+ int b = bcf_gt_allele(gt_ptr[1]);
+ for (j=0; j<npl; j++) pl_ptr[j] = fake_PL;
+ int idx = bcf_alleles2gt(a,b);
+ pl_ptr[idx] = 0;
+ }
+ }
+ return npl;
+}
+
+static int cmp_doubleptr(const void *_a, const void *_b)
+{
+ double *a = *((double**)_a);
+ double *b = *((double**)_b);
+ if ( *a < *b ) return -1;
+ else if ( *a == *b ) return 0;
+ return 1;
+}
+
+static void check_gt(args_t *args)
+{
+ int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
+ int fake_pls = args->no_PLs;
+
+ // Initialize things: check which tags are defined in the header, sample names etc.
+ if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
+ if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
+ {
+ if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
+ error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
+ if ( !args->no_PLs )
+ fprintf(pysamerr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
+ fake_pls = 1;
+ }
+
+ FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
+ print_header(args, fp);
+
+ int tgt_isample = -1, query_isample = 0;
+ if ( args->target_sample )
+ {
+ tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
+ if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
+ }
+ if ( args->all_sites )
+ {
+ if ( tgt_isample==-1 )
+ {
+ fprintf(pysamerr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
+ tgt_isample = 0;
+ }
+ }
+ if ( args->query_sample )
+ {
+ query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
+ if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
+ }
+ if ( args->all_sites )
+ fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]match log LK\t[7]Query alleles\t[8-]Query PLs (%s)\n",
+ args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);
+
+ // Main loop
+ float prev_lk = 0;
+ while ( (ret=bcf_sr_next_line(args->files)) )
+ {
+ if ( ret!=2 ) continue;
+ bcf1_t *sm_line = args->files->readers[0].buffer[0]; // the query file
+ bcf1_t *gt_line = args->files->readers[1].buffer[0]; // the -g target file
+ bcf_unpack(sm_line, BCF_UN_FMT);
+ bcf_unpack(gt_line, BCF_UN_FMT);
+
+ // Init mapping from target genotype index to the sample's PL fields
+ int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
+ if ( n_gt2ipl > m_gt2ipl )
+ {
+ m_gt2ipl = n_gt2ipl;
+ gt2ipl = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl);
+ }
+ if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;
+
+ // Target genotypes
+ int ngt, npl;
+        if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 )
+ error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
+ ngt /= bcf_hdr_nsamples(args->gt_hdr);
+ if ( ngt!=2 ) continue; // checking only diploid genotypes
+
+ // Sample PLs
+ if ( !fake_pls )
+ {
+ if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
+ {
+ if ( sm_line->n_allele==1 )
+ {
+ // PL values may not be present when ALT=. (mpileup/bcftools output), in that case
+ // switch automatically to GT at these sites
+ npl = fake_PLs(args, args->sm_hdr, sm_line);
+ }
+ else
+ error("PL not present at %s:%d?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1);
+ }
+ else
+ npl /= bcf_hdr_nsamples(args->sm_hdr);
+ }
+ else
+ npl = fake_PLs(args, args->sm_hdr, sm_line);
+
+ // Calculate likelihoods for all samples, assuming diploid genotypes
+
+ // For faster access to genotype likelihoods (PLs) of the query sample
+ int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
+ double sum_pl = 0; // for converting PLs to probs
+ for (max_ipl=0; max_ipl<npl; max_ipl++)
+ {
+ if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
+ if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
+ sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
+ }
+ if ( sum_pl==0 ) continue; // no PLs present
+ if ( fake_pls && args->no_PLs==1 ) sum_pl = -1;
+
+ // The main stats: concordance of the query sample with the target -g samples
+ for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
+ {
+ int *gt_ptr = gt_arr + i*ngt;
+ if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes
+ if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) continue;
+ int a = bcf_gt_allele(gt_ptr[0]);
+ int b = bcf_gt_allele(gt_ptr[1]);
+ if ( args->hom_only && a!=b ) continue; // heterozygous genotype
+            int igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file
+ int igt_qry = gt2ipl[igt_tgt]; // corresponding genotype in query file
+ if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue; // genotype not present in query sample: haploid or missing
+ args->lks[i] += sum_pl<0 ? -pl_ptr[igt_qry] : log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl);
+ args->sites[i]++;
+ }
+ if ( args->all_sites )
+ {
+ // Print LKs at all sites for debugging
+ int *gt_ptr = gt_arr + tgt_isample*ngt;
+ if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes
+ int a = bcf_gt_allele(gt_ptr[0]);
+ int b = bcf_gt_allele(gt_ptr[1]);
+ if ( args->hom_only && a!=b ) continue; // heterozygous genotype
+ fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
+ for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
+ fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");
+ fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk);
+ prev_lk = args->lks[query_isample];
+
+ int igt, *pl_ptr = args->pl_arr + query_isample*npl; // PLs of the query sample
+ for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]);
+ for (igt=0; igt<npl; igt++)
+ if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
+ else if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, ".");
+ else fprintf(fp, "\t%d", pl_ptr[igt]);
+ fprintf(fp, "\n");
+ }
+ }
+ free(gt2ipl);
+ free(gt_arr);
+ free(args->pl_arr);
+ free(args->tmp_arr);
+
+ // To be able to plot total discordance (=number of mismatching GTs with -G1) in the same
+ // plot as discordance per site, the latter must be scaled to the same range
+ int nsamples = bcf_hdr_nsamples(args->gt_hdr);
+ double extreme_lk = 0, extreme_lk_per_site = 0;
+ for (i=0; i<nsamples; i++)
+ {
+ if ( args->lks[i] < extreme_lk ) extreme_lk = args->lks[i];
+ if ( args->sites[i] && args->lks[i]/args->sites[i] < extreme_lk_per_site ) extreme_lk_per_site = args->lks[i]/args->sites[i];
+ }
+
+ // Sorted output
+ double **p = (double**) malloc(sizeof(double*)*nsamples);
+ for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
+ qsort(p, nsamples, sizeof(int*), cmp_doubleptr);
+
+ fprintf(fp, "# [1]CN\t[2]Discordance with %s (total)\t[3]Discordance (avg score per site)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
+ for (i=0; i<nsamples; i++)
+ {
+ int idx = p[i] - args->lks;
+ double per_site = 0;
+ if ( args->sites[idx] )
+ {
+ if ( args->sites[idx] && extreme_lk_per_site )
+ {
+ per_site = args->lks[idx]/args->sites[idx];
+ per_site *= extreme_lk / extreme_lk_per_site;
+ }
+ else
+ per_site = 0;
+ }
+ fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", fabs(args->lks[idx]), fabs(per_site), args->sites[idx], args->gt_hdr->samples[idx], i);
+ }
+
+ if ( args->plot )
+ {
+ fclose(fp);
+ plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
+ }
+}
+
+static inline int is_hom_most_likely(int nals, int *pls)
+{
+ int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
+ for (ia=1; ia<nals; ia++)
+ {
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
+ idx++;
+ }
+ if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
+ idx++;
+ }
+ return min_is_hom;
+}
+
+static void cross_check_gts(args_t *args)
+{
+ int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0;
+ unsigned int *dp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)), *ndp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)); // this will overflow one day...
+ int fake_pls = args->no_PLs, ignore_dp = 0;
+
+ int i,j,k,idx, pl_warned = 0, dp_warned = 0;
+ int32_t *dp_arr = NULL;
+ int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL;
+ if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
+ {
+ if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
+ error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
+ if ( !args->no_PLs )
+ fprintf(pysamerr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
+ fake_pls = 1;
+ }
+ if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1;
+
+ FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
+ print_header(args, fp);
+ if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n");
+
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = args->files->readers[0].buffer[0];
+ bcf_unpack(line, BCF_UN_FMT);
+
+ int npl;
+ if ( !fake_pls )
+ {
+ npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr);
+ if ( npl<=0 ) { pl_warned++; continue; }
+ npl /= nsamples;
+ }
+ else
+ npl = fake_PLs(args, args->sm_hdr, line);
+ int mdp = 0;
+ if ( !ignore_dp && (mdp=bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr)) <= 0 ) dp_warned++;
+
+ if ( args->hom_only )
+ {
+ for (i=0; i<nsamples; i++)
+ is_hom[i] = is_hom_most_likely(line->n_allele, args->pl_arr+i*npl);
+ }
+
+ double sum = 0; int nsum = 0;
+ idx = 0;
+ for (i=0; i<nsamples; i++)
+ {
+ int *ipl = &args->pl_arr[i*npl];
+ if ( *ipl==-1 ) { idx += i; continue; } // missing genotype
+ if ( mdp>0 && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; }
+ if ( args->hom_only && !is_hom[i] ) { idx += i; continue; }
+
+ for (j=0; j<i; j++)
+ {
+ int *jpl = &args->pl_arr[j*npl];
+ if ( *jpl==-1 ) { idx++; continue; } // missing genotype
+ if ( mdp>0 && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; }
+ if ( args->hom_only && !is_hom[j] ) { idx++; continue; }
+
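+ // Pair score: the minimum of PL_i[k]+PL_j[k] over all genotypes k. It is
+ // 0 when the two samples share a most likely genotype and grows with the
+ // evidence that their most likely genotypes differ.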
+ int min_pl = INT_MAX;
+ for (k=0; k<npl; k++)
+ {
+ if ( ipl[k]==bcf_int32_missing || jpl[k]==bcf_int32_missing ) break;
+ if ( ipl[k]==bcf_int32_vector_end || jpl[k]==bcf_int32_vector_end ) { k = npl; break; }
+ if ( min_pl > ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k];
+ }
+ if ( k!=npl ) { idx++; continue; }
+
+ if ( args->all_sites ) { sum += min_pl; nsum++; }
+ args->lks[idx] += min_pl;
+ args->cnts[idx]++;
+
+ if ( mdp>0 )
+ {
+ args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j];
+ dp[i] += dp_arr[i]; ndp[i]++;
+ dp[j] += dp_arr[j]; ndp[j]++;
+ }
+ else
+ {
+ args->dps[idx]++;
+ dp[i]++; ndp[i]++;
+ dp[j]++; ndp[j]++;
+ }
+ idx++;
+ }
+ }
+ if ( args->all_sites )
+ fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0);
+ }
+ if ( dp_arr ) free(dp_arr);
+ if ( args->pl_arr ) free(args->pl_arr);
+ if ( args->tmp_arr ) free(args->tmp_arr);
+ if ( is_hom ) free(is_hom);
+
+ if ( pl_warned ) fprintf(pysamerr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned);
+ if ( dp_warned ) fprintf(pysamerr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned);
+
+ // Output samples sorted by average discordance
+ double *score = (double*) calloc(nsamples,sizeof(double));
+ args->sites = (double*) calloc(nsamples,sizeof(double));
+ idx = 0;
+ for (i=0; i<nsamples; i++)
+ {
+ for (j=0; j<i; j++)
+ {
+ score[i] += args->lks[idx];
+ score[j] += args->lks[idx];
+ args->sites[i] += args->cnts[idx];
+ args->sites[j] += args->cnts[idx];
+ idx++;
+ }
+ }
+ for (i=0; i<nsamples; i++)
+ if ( args->sites[i] ) score[i] /= args->sites[i];
+ double **p = (double**) malloc(sizeof(double*)*nsamples), avg_score = 0;
+ for (i=0; i<nsamples; i++) p[i] = &score[i];
+ qsort(p, nsamples, sizeof(double*), cmp_doubleptr);
+ // The average discordance gives the number of differing sites in % with -G1
+ fprintf(fp, "# [1]SM\t[2]Average Discordance\t[3]Average depth\t[4]Average number of sites\t[5]Sample\t[6]Sample ID\n");
+ for (i=0; i<nsamples; i++)
+ {
+ idx = p[i] - score;
+ double adp = ndp[idx] ? (double)dp[idx]/ndp[idx] : 0;
+ double nsites = args->sites[idx]/(nsamples-1);
+ avg_score += score[idx];
+ fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i);
+ }
+
+ // // Overall score: maximum absolute deviation from the average score
+ // fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n");
+ // fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]); // idx still set
+ free(p);
+ free(score);
+ free(dp);
+ free(ndp);
+
+ // Pairwise discordances
+ fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
+ idx = 0;
+ for (i=0; i<nsamples; i++)
+ {
+ for (j=0; j<i; j++)
+ {
+ fprintf(fp, "CN\t%.0f\t%d\t%.2f\t%s\t%s\n", args->lks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0,
+ args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
+ idx++;
+ }
+ }
+ fclose(fp);
+ if ( args->plot )
+ plot_cross_check(args);
+}
+
+static char *init_prefix(char *prefix)
+{
+ int len = strlen(prefix);
+ if ( prefix[len-1] == '/' || prefix[len-1] == '\\' )
+ return msprintf("%sgtcheck", prefix);
+ return strdup(prefix);
+}
+
+static void usage(void)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Check sample identity. With no -g BCF given, multi-sample cross-check is performed.\n");
+ fprintf(pysamerr, "Usage: bcftools gtcheck [options] [-g <genotypes.vcf.gz>] <query.vcf.gz>\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Options:\n");
+ fprintf(pysamerr, " -a, --all-sites output comparison for all sites\n");
+ fprintf(pysamerr, " -g, --genotypes <file> genotypes to compare against\n");
+ fprintf(pysamerr, " -G, --GTs-only <int> use GTs, ignore PLs, using <int> for unseen genotypes [99]\n");
+ fprintf(pysamerr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n");
+ fprintf(pysamerr, " -p, --plot <prefix> plot\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " -s, --query-sample <string> query sample (by default the first sample is checked)\n");
+ fprintf(pysamerr, " -S, --target-sample <string> target sample in the -g file (used only for plotting)\n");
+ fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_vcfgtcheck(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->files = bcf_sr_init();
+ args->argc = argc; args->argv = argv; set_cwd(args);
+ char *regions = NULL, *targets = NULL;
+ int regions_is_file = 0, targets_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"GTs-only",1,0,'G'},
+ {"all-sites",0,0,'a'},
+ {"homs-only",0,0,'H'},
+ {"help",0,0,'h'},
+ {"genotypes",1,0,'g'},
+ {"plot",1,0,'p'},
+ {"target-sample",1,0,'S'},
+ {"query-sample",1,0,'s'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {0,0,0,0}
+ };
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'G':
+ args->no_PLs = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg);
+ break;
+ case 'a': args->all_sites = 1; break;
+ case 'H': args->hom_only = 1; break;
+ case 'g': args->gt_fname = optarg; break;
+ case 'p': args->plot = optarg; break;
+ case 'S': args->target_sample = optarg; break;
+ case 's': args->query_sample = optarg; break;
+ case 'r': regions = optarg; break;
+ case 'R': regions = optarg; regions_is_file = 1; break;
+ case 't': targets = optarg; break;
+ case 'T': targets = optarg; targets_is_file = 1; break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ char *fname = NULL;
+ if ( optind==argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(); // no files given
+ }
+ else fname = argv[optind];
+ if ( argc>optind+1 ) usage(); // too many files given
+ if ( !args->gt_fname ) args->cross_check = 1; // no genotype file, run in cross-check mode
+ else args->files->require_index = 1;
+ if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions);
+ if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets);
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+ if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) error("Failed to open %s: %s\n", args->gt_fname,bcf_sr_strerror(args->files->errnum));
+ args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS;
+ if ( args->plot ) args->plot = init_prefix(args->plot);
+ init_data(args);
+ if ( args->cross_check )
+ cross_check_gts(args);
+ else
+ check_gt(args);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ if (args->plot) free(args->plot);
+ free(args);
+ return 0;
+}
+
diff --git a/bcftools/vcfindex.c b/bcftools/vcfindex.c
new file mode 100644
index 0000000..e40fab5
--- /dev/null
+++ b/bcftools/vcfindex.c
@@ -0,0 +1,240 @@
+
+/* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
+
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Shane McCarthy <sm15 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <htslib/vcf.h>
+#include <htslib/tbx.h>
+#include <sys/stat.h>
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include "bcftools.h"
+
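+// Default CSI min-shift: index bins of 2^14 = 16kb, the same interval size
+// used by the TBI linear index.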
+#define BCF_LIDX_SHIFT 14
+
+static void usage(void)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Index bgzip compressed VCF/BCF files for random access.\n");
+ fprintf(stderr, "Usage: bcftools index [options] <in.bcf>|<in.vcf.gz>\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Indexing options:\n");
+ fprintf(stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
+ fprintf(stderr, " -f, --force overwrite index if it already exists\n");
+ fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
+ fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Stats options:\n");
+ fprintf(stderr, " -n, --nrecords print number of records based on existing index file\n");
+ fprintf(stderr, " -s, --stats print per contig stats based on existing index file\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+int vcf_index_stats(char *fname, int stats)
+{
+ char *fn_out = NULL;
+ FILE *out;
+ out = fn_out ? fopen(fn_out, "w") : stdout;
+
+ const char **seq;
+ int i, nseq;
+ tbx_t *tbx = NULL;
+ hts_idx_t *idx = NULL;
+
+ htsFile *fp = hts_open(fname,"r");
+ if ( !fp ) { fprintf(stderr,"Could not read %s\n", fname); return 1; }
+ bcf_hdr_t *hdr = bcf_hdr_read(fp);
+ if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", fname); return 1; }
+
+ if ( hts_get_format(fp)->format==vcf )
+ {
+ tbx = tbx_index_load(fname);
+ if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", fname); return 1; }
+ }
+ else if ( hts_get_format(fp)->format==bcf )
+ {
+ idx = bcf_index_load(fname);
+ if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", fname); return 1; }
+ }
+ else
+ {
+ fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
+ return 1;
+ }
+
+ seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq);
+ uint64_t sum = 0;
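+ // Per-contig report (-s): name, declared length (from the header ##contig
+ // line when present) and number of indexed records; with -n (stats&2) only
+ // the grand total is printed after the loop.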
+ for (i=0; i<nseq; i++)
+ {
+ uint64_t records, v;
+ hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v);
+ sum+=records;
+ if (stats&2 || !records) continue;
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
+ int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
+ fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
+ }
+ if (!sum)
+ {
+ // No counts found.
+ // Is this because index version has no stored count data, or no records?
+ bcf1_t *rec = bcf_init1();
+ if (bcf_read1(fp, hdr, rec) >= 0)
+ {
+ fprintf(stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname);
+ return 1;
+ }
+ bcf_destroy1(rec);
+ }
+ if (stats&2) fprintf(out, "%" PRIu64 "\n", sum);
+ free(seq);
+ fclose(out);
+ hts_close(fp);
+ bcf_hdr_destroy(hdr);
+ if (tbx)
+ tbx_destroy(tbx);
+ if (idx)
+ hts_idx_destroy(idx);
+ return 0;
+}
+
+int main_vcfindex(int argc, char *argv[])
+{
+ int c, force = 0, tbi = 0, stats = 0;
+ int min_shift = BCF_LIDX_SHIFT;
+
+ static struct option loptions[] =
+ {
+ {"csi",no_argument,NULL,'c'},
+ {"tbi",no_argument,NULL,'t'},
+ {"force",no_argument,NULL,'f'},
+ {"min-shift",required_argument,NULL,'m'},
+ {"stats",no_argument,NULL,'s'},
+ {"nrecords",no_argument,NULL,'n'},
+ {NULL, 0, NULL, 0}
+ };
+
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "ctfm:sn", loptions, NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 'c': tbi = 0; break;
+ case 't': tbi = 1; min_shift = 0; break;
+ case 'f': force = 1; break;
+ case 'm':
+ min_shift = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg);
+ break;
+ case 's': stats |= 1; break;
+ case 'n': stats |= 2; break;
+ default: usage();
+ }
+ }
+ if ( optind==argc ) usage();
+ if (stats>2)
+ {
+ fprintf(stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
+ return 1;
+ }
+ if (tbi && min_shift>0)
+ {
+ fprintf(stderr, "[E::%s] min-shift option only expected for CSI indices \n", __func__);
+ return 1;
+ }
+ if (min_shift < 0 || min_shift > 30)
+ {
+ fprintf(stderr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift);
+ return 1;
+ }
+
+ char *fname = argv[optind];
+ if (stats) return vcf_index_stats(fname, stats);
+
+ htsFile *fp = hts_open(fname,"r");
+ htsFormat type = *hts_get_format(fp);
+ hts_close(fp);
+
+ if ( (type.format!=bcf && type.format!=vcf) || type.compression!=bgzf )
+ {
+ fprintf(stderr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__);
+ if ( type.compression!=bgzf )
+ fprintf(stderr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__);
+ return 1;
+ }
+ if (tbi && type.format==bcf)
+ {
+ fprintf(stderr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n");
+ tbi = 0; min_shift = BCF_LIDX_SHIFT;
+ }
+ if (min_shift == 0 && type.format==bcf)
+ {
+ fprintf(stderr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__);
+ return 1;
+ }
+ if (!tbi && type.format==vcf && min_shift == 0)
+ {
+ fprintf(stderr, "[Warning] min-shift set to 0 for VCF file. Generating TBI file.\n");
+ tbi = 1;
+ }
+
+ if (!force)
+ {
+ // Before complaining about existing index, check if the VCF file isn't newer.
+ char *idx_fname = (char*)alloca(strlen(fname) + 5);
+ strcat(strcpy(idx_fname, fname), tbi ? ".tbi" : ".csi");
+ struct stat stat_tbi, stat_file;
+ if ( stat(idx_fname, &stat_tbi)==0 )
+ {
+ stat(fname, &stat_file);
+ if ( stat_file.st_mtime <= stat_tbi.st_mtime )
+ {
+ fprintf(stderr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__);
+ return 1;
+ }
+ }
+ }
+
+ if (type.format==bcf)
+ {
+ if ( bcf_index_build(fname, min_shift) != 0 )
+ {
+ fprintf(stderr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname);
+ return 1;
+ }
+ }
+ else
+ {
+ if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 )
+ {
+ fprintf(stderr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname);
+ return 1;
+ }
+ }
+ return 0;
+}
diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c
new file mode 100644
index 0000000..1cfde16
--- /dev/null
+++ b/bcftools/vcfindex.c.pysam.c
@@ -0,0 +1,242 @@
+#include "pysam.h"
+
+
+/* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
+
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Shane McCarthy <sm15 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <htslib/vcf.h>
+#include <htslib/tbx.h>
+#include <sys/stat.h>
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include "bcftools.h"
+
+#define BCF_LIDX_SHIFT 14
+
+static void usage(void)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Index bgzip compressed VCF/BCF files for random access.\n");
+ fprintf(pysamerr, "Usage: bcftools index [options] <in.bcf>|<in.vcf.gz>\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Indexing options:\n");
+ fprintf(pysamerr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
+ fprintf(pysamerr, " -f, --force overwrite index if it already exists\n");
+ fprintf(pysamerr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
+ fprintf(pysamerr, " -t, --tbi generate TBI-format index for VCF files\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Stats options:\n");
+ fprintf(pysamerr, " -n, --nrecords print number of records based on existing index file\n");
+ fprintf(pysamerr, " -s, --stats print per contig stats based on existing index file\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int vcf_index_stats(char *fname, int stats)
+{
+ char *fn_out = NULL;
+ FILE *out;
+ out = fn_out ? fopen(fn_out, "w") : stdout;
+
+ const char **seq;
+ int i, nseq;
+ tbx_t *tbx = NULL;
+ hts_idx_t *idx = NULL;
+
+ htsFile *fp = hts_open(fname,"r");
+ if ( !fp ) { fprintf(pysamerr,"Could not read %s\n", fname); return 1; }
+ bcf_hdr_t *hdr = bcf_hdr_read(fp);
+ if ( !hdr ) { fprintf(pysamerr,"Could not read the header: %s\n", fname); return 1; }
+
+ if ( hts_get_format(fp)->format==vcf )
+ {
+ tbx = tbx_index_load(fname);
+ if ( !tbx ) { fprintf(pysamerr,"Could not load TBI index: %s\n", fname); return 1; }
+ }
+ else if ( hts_get_format(fp)->format==bcf )
+ {
+ idx = bcf_index_load(fname);
+ if ( !idx ) { fprintf(pysamerr,"Could not load CSI index: %s\n", fname); return 1; }
+ }
+ else
+ {
+ fprintf(pysamerr,"Could not detect the file type as VCF or BCF: %s\n", fname);
+ return 1;
+ }
+
+ seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq);
+ uint64_t sum = 0;
+ for (i=0; i<nseq; i++)
+ {
+ uint64_t records, v;
+ hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v);
+ sum+=records;
+ if (stats&2 || !records) continue;
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
+ int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
+ fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
+ }
+ if (!sum)
+ {
+ // No counts found.
+ // Is this because index version has no stored count data, or no records?
+ bcf1_t *rec = bcf_init1();
+ if (bcf_read1(fp, hdr, rec) >= 0)
+ {
+ fprintf(pysamerr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname);
+ return 1;
+ }
+ bcf_destroy1(rec);
+ }
+ if (stats&2) fprintf(out, "%" PRIu64 "\n", sum);
+ free(seq);
+ fclose(out);
+ hts_close(fp);
+ bcf_hdr_destroy(hdr);
+ if (tbx)
+ tbx_destroy(tbx);
+ if (idx)
+ hts_idx_destroy(idx);
+ return 0;
+}
+
+int main_vcfindex(int argc, char *argv[])
+{
+ int c, force = 0, tbi = 0, stats = 0;
+ int min_shift = BCF_LIDX_SHIFT;
+
+ static struct option loptions[] =
+ {
+ {"csi",no_argument,NULL,'c'},
+ {"tbi",no_argument,NULL,'t'},
+ {"force",no_argument,NULL,'f'},
+ {"min-shift",required_argument,NULL,'m'},
+ {"stats",no_argument,NULL,'s'},
+ {"nrecords",no_argument,NULL,'n'},
+ {NULL, 0, NULL, 0}
+ };
+
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "ctfm:sn", loptions, NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 'c': tbi = 0; break;
+ case 't': tbi = 1; min_shift = 0; break;
+ case 'f': force = 1; break;
+ case 'm':
+ min_shift = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg);
+ break;
+ case 's': stats |= 1; break;
+ case 'n': stats |= 2; break;
+ default: usage();
+ }
+ }
+ if ( optind==argc ) usage();
+ if (stats>2)
+ {
+ fprintf(pysamerr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
+ return 1;
+ }
+ if (tbi && min_shift>0)
+ {
+ fprintf(pysamerr, "[E::%s] min-shift option only expected for CSI indices \n", __func__);
+ return 1;
+ }
+ if (min_shift < 0 || min_shift > 30)
+ {
+ fprintf(pysamerr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift);
+ return 1;
+ }
+
+ char *fname = argv[optind];
+ if (stats) return vcf_index_stats(fname, stats);
+
+ htsFile *fp = hts_open(fname,"r");
+ htsFormat type = *hts_get_format(fp);
+ hts_close(fp);
+
+ if ( (type.format!=bcf && type.format!=vcf) || type.compression!=bgzf )
+ {
+ fprintf(pysamerr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__);
+ if ( type.compression!=bgzf )
+ fprintf(pysamerr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__);
+ return 1;
+ }
+ if (tbi && type.format==bcf)
+ {
+ fprintf(pysamerr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n");
+ tbi = 0; min_shift = BCF_LIDX_SHIFT;
+ }
+ if (min_shift == 0 && type.format==bcf)
+ {
+ fprintf(pysamerr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__);
+ return 1;
+ }
+ if (!tbi && type.format==vcf && min_shift == 0)
+ {
+ fprintf(pysamerr, "[Warning] min-shift set to 0 for VCF file. Generating TBI file.\n");
+ tbi = 1;
+ }
+
+ if (!force)
+ {
+ // Before complaining about existing index, check if the VCF file isn't newer.
+ char *idx_fname = (char*)alloca(strlen(fname) + 5);
+ strcat(strcpy(idx_fname, fname), tbi ? ".tbi" : ".csi");
+ struct stat stat_tbi, stat_file;
+ if ( stat(idx_fname, &stat_tbi)==0 )
+ {
+ stat(fname, &stat_file);
+ if ( stat_file.st_mtime <= stat_tbi.st_mtime )
+ {
+ fprintf(pysamerr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__);
+ return 1;
+ }
+ }
+ }
+
+ if (type.format==bcf)
+ {
+ if ( bcf_index_build(fname, min_shift) != 0 )
+ {
+ fprintf(pysamerr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname);
+ return 1;
+ }
+ }
+ else
+ {
+ if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 )
+ {
+ fprintf(pysamerr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname);
+ return 1;
+ }
+ }
+ return 0;
+}
diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c
new file mode 100644
index 0000000..6115146
--- /dev/null
+++ b/bcftools/vcfisec.c
@@ -0,0 +1,596 @@
+/* vcfisec.c -- Create intersections, unions and complements of VCF files.
+
+ Copyright (C) 2012-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+#include "filter.h"
+
+#define OP_PLUS 1
+#define OP_MINUS 2
+#define OP_EQUAL 3
+#define OP_VENN 4
+#define OP_COMPLEMENT 5
+#define OP_EXACT 6
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+typedef struct
+{
+ int isec_op, isec_n, *write, iwrite, nwrite, output_type, n_threads;
+ int nflt, *flt_logic;
+ filter_t **flt;
+ char **flt_expr;
+ bcf_srs_t *files;
+ FILE *fh_log, *fh_sites;
+ htsFile **fh_out;
+ char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list;
+ char *isec_exact;
+ int argc;
+}
+args_t;
+
+/**
+ * mkdir_p() - create the directories leading to a file
+ * @fmt: printf-style format (and arguments) giving the file name; the part after the last "/" is ignored
+ */
+void mkdir_p(const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ int n = vsnprintf(NULL, 0, fmt, ap) + 2;
+ va_end(ap);
+
+ char *path = (char*)malloc(n);
+ va_start(ap, fmt);
+ vsnprintf(path, n, fmt, ap);
+ va_end(ap);
+
+ char *tmp = strdup(path), *p = tmp+1;
+ while (*p)
+ {
+ while (*p && *p!='/') p++;
+ if ( *p )
+ {
+ *p = 0;
+ mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+ *p = '/';
+ p++;
+ }
+ }
+ free(tmp);
+ free(path);
+}
+
+/**
+ * open_file() - open new file creating the file name using vsnprintf
+ * @fname: if not NULL, on output will point to newly allocated fname string
+ * @mode: if NULL, only the file name string will be created
+ * @fmt: vsnprintf format and args
+ *
+ * Returns an open FILE stream, or NULL if mode is NULL.
+ */
+FILE *open_file(char **fname, const char *mode, const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ int n = vsnprintf(NULL, 0, fmt, ap) + 2;
+ va_end(ap);
+
+ char *str = (char*)malloc(n);
+ va_start(ap, fmt);
+ vsnprintf(str, n, fmt, ap);
+ va_end(ap);
+
+ mkdir_p(str);
+ if ( !mode )
+ {
+ if ( !fname ) error("Uh: expected fname or mode\n");
+ *fname = str;
+ return NULL;
+ }
+
+ FILE *fp = fopen(str,mode);
+ if ( fname ) *fname = str;
+ else free(str);
+ return fp;
+}
+
+void isec_vcf(args_t *args)
+{
+ bcf_srs_t *files = args->files;
+ kstring_t str = {0,0,0};
+ htsFile *out_fh = NULL;
+
+ // When only one VCF is output, print VCF to stdout or -o file
+ int out_std = 0;
+ if ( args->nwrite==1 && !args->prefix ) out_std = 1;
+ if ( args->targets_list && files->nreaders==1 ) out_std = 1;
+ if ( out_std )
+ {
+ out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+ bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
+ bcf_hdr_write(out_fh, files->readers[args->iwrite].header);
+ }
+ if ( !args->nwrite && !out_std && !args->prefix )
+ fprintf(stderr,"Note: -w option not given, printing list of sites...\n");
+
+ int n;
+ while ( (n=bcf_sr_next_line(files)) )
+ {
+ bcf_sr_t *reader = NULL;
+ bcf1_t *line = NULL;
+ int i, ret = 0;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !bcf_sr_has_line(files,i) ) continue;
+
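+ // Per-reader -i/-e filter: a record that fails its filter is treated as
+ // absent from that file (has_line cleared and the match count n decremented).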
+ if ( args->nflt && args->flt[i] )
+ {
+ bcf1_t *rec = bcf_sr_get_line(files, i);
+ int pass = filter_test(args->flt[i], rec, NULL);
+ if ( args->flt_logic[i] & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass )
+ {
+ files->has_line[i] = 0;
+ n--;
+ continue;
+ }
+ }
+
+ if ( !line )
+ {
+ line = files->readers[i].buffer[0];
+ reader = &files->readers[i];
+ }
+ ret |= 1<<i; // this may overflow for many files, but will be used only with two (OP_VENN)
+ }
+
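+ // Site selection for -n/-C: n is the number of readers that still carry
+ // this position after filtering; OP_EXACT instead compares the per-file
+ // presence pattern against the 0/1 bitmask given as -n ~BITMASK.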
+ switch (args->isec_op)
+ {
+ case OP_COMPLEMENT: if ( n!=1 || !bcf_sr_has_line(files,0) ) continue; break;
+ case OP_EQUAL: if ( n != args->isec_n ) continue; break;
+ case OP_PLUS: if ( n < args->isec_n ) continue; break;
+ case OP_MINUS: if ( n > args->isec_n ) continue; break;
+ case OP_EXACT:
+ for (i=0; i<files->nreaders; i++)
+ if ( files->has_line[i] != args->isec_exact[i] ) break;
+ if ( i<files->nreaders ) continue;
+ break;
+ }
+
+ if ( out_std )
+ {
+ if ( bcf_sr_has_line(files,args->iwrite) )
+ bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0]);
+ continue;
+ }
+ else if ( args->fh_sites )
+ {
+ str.l = 0;
+ kputs(reader->header->id[BCF_DT_CTG][line->rid].key, &str); kputc('\t', &str);
+ kputw(line->pos+1, &str); kputc('\t', &str);
+ if (line->n_allele > 0) kputs(line->d.allele[0], &str);
+ else kputc('.', &str);
+ kputc('\t', &str);
+ if (line->n_allele > 1) kputs(line->d.allele[1], &str);
+ else kputc('.', &str);
+ for (i=2; i<line->n_allele; i++)
+ {
+ kputc(',', &str);
+ kputs(line->d.allele[i], &str);
+ }
+ kputc('\t', &str);
+ for (i=0; i<files->nreaders; i++)
+ kputc(bcf_sr_has_line(files,i)?'1':'0', &str);
+ kputc('\n', &str);
+ fwrite(str.s,sizeof(char),str.l,args->fh_sites);
+ }
+
+ if ( args->prefix )
+ {
+ if ( args->isec_op==OP_VENN && ret==3 )
+ {
+ if ( !args->nwrite || args->write[0] )
+ bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0));
+ if ( !args->nwrite || args->write[1] )
+ bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1));
+ }
+ else
+ {
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !bcf_sr_has_line(files,i) ) continue;
+ if ( args->write && !args->write[i] ) continue;
+ bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0]);
+ }
+ }
+ }
+ }
+ if ( str.s ) free(str.s);
+ if ( out_fh ) hts_close(out_fh);
+}
+
+static void add_filter(args_t *args, char *expr, int logic)
+{
+ args->nflt++;
+ args->flt_expr = (char**) realloc(args->flt_expr,sizeof(char*)*args->nflt);
+ args->flt_logic = (int*) realloc(args->flt_logic,sizeof(int)*args->nflt);
+ args->flt = (filter_t**) realloc(args->flt,sizeof(filter_t*)*args->nflt);
+ if ( expr[0]=='-' && expr[1]==0 )
+ {
+ args->flt_expr[args->nflt-1] = NULL;
+ args->flt[args->nflt-1] = NULL;
+ }
+ else
+ args->flt_expr[args->nflt-1] = expr;
+ args->flt_logic[args->nflt-1] = logic;
+}
+
+static void destroy_data(args_t *args);
+static void init_data(args_t *args)
+{
+ int i;
+ if ( args->nflt )
+ {
+ if ( args->nflt > 1 && args->nflt!=args->files->nreaders )
+ error("Error: expected either one -i/-e option or as many as there are input files\n");
+ if ( args->nflt < args->files->nreaders )
+ {
+ if ( !args->flt_expr[0] ) error("Error: useless use of -i/-e\n");
+ args->nflt = args->files->nreaders;
+ args->flt_expr = (char**) realloc(args->flt_expr,sizeof(char*)*args->nflt);
+ args->flt_logic = (int*) realloc(args->flt_logic,sizeof(int)*args->nflt);
+ args->flt = (filter_t**) realloc(args->flt,sizeof(filter_t*)*args->nflt);
+ for (i=1; i<args->nflt; i++)
+ {
+ args->flt_expr[i] = args->flt_expr[0];
+ args->flt_logic[i] = args->flt_logic[0];
+ args->flt[i] = filter_init(args->files->readers[i].header,args->flt_expr[i]);
+ }
+ args->flt[0] = filter_init(args->files->readers[0].header,args->flt_expr[0]);
+ }
+ else
+ {
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !args->flt_expr[i] ) continue;
+ args->flt[i] = filter_init(args->files->readers[i].header,args->flt_expr[i]);
+ }
+ }
+ }
+
+ if ( args->isec_op==OP_EXACT )
+ {
+ if ( strlen(args->isec_exact)!=args->files->nreaders )
+ error("The number of files does not match the bitmask: %d vs %s\n", args->files->nreaders,args->isec_exact);
+ for (i=0; i<args->files->nreaders; i++)
+ if ( args->isec_exact[i]!='0' && args->isec_exact[i]!='1' ) error("Unexpected bitmask: %s\n",args->isec_exact);
+ for (i=0; i<args->files->nreaders; i++)
+ args->isec_exact[i] -= '0';
+ }
+
+ // Which files to write: parse the string passed with -w
+ char *p = args->write_files;
+ while (p && *p)
+ {
+ if ( !args->write ) args->write = (int*) calloc(args->files->nreaders,sizeof(int));
+ if ( sscanf(p,"%d",&i)!=1 ) error("Could not parse --write %s\n", args->write_files);
+ if ( i<0 || i>args->files->nreaders ) error("The index is out of range: %d (%s)\n", i, args->write_files);
+ args->write[i-1] = 1;
+ args->iwrite = i-1;
+ args->nwrite++;
+ while (*p && *p!=',') p++;
+ if ( *p==',' ) p++;
+ }
+ if ( args->nwrite>1 && !args->prefix ) error("Expected -p when mutliple output files given: --write %s\n", args->write_files);
+ if ( args->isec_op==OP_COMPLEMENT && args->nwrite )
+ {
+ if ( args->nwrite>1 ) error("Multiple files to -w make no sense with -C\n");
+ if ( !args->write[0] ) error("Only -w1 makes sense with -C\n");
+ }
+
+ if ( args->prefix )
+ {
+ // Init output directory and create the readme file
+ args->fh_log = open_file(NULL,"w","%s/README.txt", args->prefix);
+ if ( !args->fh_log ) error("%s/README.txt: %s\n", args->prefix, strerror(errno));
+
+ fprintf(args->fh_log,"This file was produced by vcfisec.\n");
+ fprintf(args->fh_log,"The command line was:\tbcftools %s ", args->argv[0]);
+ int i;
+ for (i=1; i<args->argc; i++) fprintf(args->fh_log," %s",args->argv[i]);
+ fprintf(args->fh_log,"\n\nUsing the following file names:\n");
+
+ const char *suffix = "vcf";
+ if ( args->output_type & FT_BCF ) suffix = "bcf";
+ else if ( args->output_type & FT_GZ ) suffix = "vcf.gz";
+
+ // Open output files and write the legend
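+ // With exactly two input files and no -n (OP_VENN), four outputs are
+ // created: 0000 = private to A, 0001 = private to B, 0002/0003 = records
+ // shared by both, taken from A and B respectively; README.txt records
+ // this mapping.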
+ if ( args->isec_op==OP_VENN )
+ {
+ args->fh_out = (htsFile**) malloc(sizeof(htsFile*)*4);
+ args->fnames = (char**) calloc(4,sizeof(char*));
+
+ #define OPEN_FILE(i,j) { \
+ open_file(&args->fnames[i], NULL, "%s/%04d.%s", args->prefix, i, suffix); \
+ args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type)); \
+ if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \
+ if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \
+ bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
+ bcf_hdr_write(args->fh_out[i], args->files->readers[j].header); \
+ }
+ if ( !args->nwrite || args->write[0] )
+ {
+ OPEN_FILE(0,0);
+ fprintf(args->fh_log,"%s\tfor records private to\t%s\n", args->fnames[0], args->files->readers[0].fname);
+ }
+ if ( !args->nwrite || args->write[1] )
+ {
+ OPEN_FILE(1,1);
+ fprintf(args->fh_log,"%s\tfor records private to\t%s\n", args->fnames[1], args->files->readers[1].fname);
+ }
+ if ( !args->nwrite || args->write[0] )
+ {
+ OPEN_FILE(2,0);
+ fprintf(args->fh_log,"%s\tfor records from %s shared by both\t%s %s\n", args->fnames[2], args->files->readers[0].fname, args->files->readers[0].fname, args->files->readers[1].fname);
+ }
+ if ( !args->nwrite || args->write[1] )
+ {
+ OPEN_FILE(3,1);
+ fprintf(args->fh_log,"%s\tfor records from %s shared by both\t%s %s\n", args->fnames[3], args->files->readers[1].fname, args->files->readers[0].fname, args->files->readers[1].fname);
+ }
+ }
+ else
+ {
+ // Init one output file for each reader
+ args->fh_out = (htsFile**) calloc(args->files->nreaders, sizeof(htsFile*));
+ args->fnames = (char**) calloc(args->files->nreaders, sizeof(char*));
+
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( args->write && !args->write[i] ) continue;
+ if ( args->isec_op==OP_COMPLEMENT && i>0 ) break;
+ OPEN_FILE(i,i);
+ fprintf(args->fh_log,"%s\tfor stripped\t%s\n", args->fnames[i], args->files->readers[i].fname);
+ }
+ #undef OPEN_FILE
+
+ args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix);
+ if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno));
+ }
+ }
+ else {
+ if (args->output_fname) {
+ args->fh_sites = fopen(args->output_fname, "w");
+ if ( args->fh_sites == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ }
+ else
+ args->fh_sites = stdout;
+ }
+}
+
+static void destroy_data(args_t *args)
+{
+ int i;
+ if ( args->nflt )
+ {
+ for (i=0; i<args->nflt; i++)
+ {
+ if ( !args->flt[i] ) continue;
+ filter_destroy(args->flt[i]);
+ }
+ free(args->flt_expr);
+ free(args->flt);
+ free(args->flt_logic);
+ }
+ if ( args->prefix )
+ {
+ fclose(args->fh_log);
+ int n = args->isec_op==OP_VENN ? 4 : args->files->nreaders;
+ for (i=0; i<n; i++)
+ {
+ if ( !args->fnames[i] ) continue;
+ hts_close(args->fh_out[i]);
+ if ( args->output_type==FT_VCF_GZ )
+ {
+ tbx_conf_t conf = tbx_conf_vcf;
+ tbx_index_build(args->fnames[i], -1, &conf);
+ }
+ else if ( args->output_type==FT_BCF_GZ )
+ {
+ if ( bcf_index_build(args->fnames[i],14) ) error("Could not index %s\n", args->fnames[i]);
+ }
+ free(args->fnames[i]);
+ }
+ free(args->fh_out);
+ free(args->fnames);
+ if ( args->fh_sites ) fclose(args->fh_sites);
+ if ( args->write ) free(args->write);
+ }
+}
+
+static void usage(void)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Create intersections, unions and complements of VCF files.\n");
+ fprintf(stderr, "Usage: bcftools isec [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+ fprintf(stderr, " -C, --complement output positions present only in the first file but missing in the others\n");
+ fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
+ fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(stderr, " -i, --include <expr> include only sites for which the expression is true\n");
+ fprintf(stderr, " -n, --nfiles [+-=~]<int> output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n");
+ fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(stderr, " -p, --prefix <dir> if given, subset each of the input files accordingly, see also -w\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " -w, --write <list> list of files to write with -p given as 1-based indexes. By default, all files are written\n");
+ fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Examples:\n");
+ fprintf(stderr, " # Create intersection and complements of two sets saving the output in dir/*\n");
+ fprintf(stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " # Filter sites in A and B (but not in C) and create intersection\n");
+ fprintf(stderr, " bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e - A.vcf.gz B.vcf.gz C.vcf.gz -p dir\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " # Extract and write records from A shared by both A and B using exact allele match\n");
+ fprintf(stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n =2 -w 1\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " # Extract records private to A or B comparing by position only\n");
+ fprintf(stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+int main_vcfisec(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->files = bcf_sr_init();
+ args->argc = argc; args->argv = argv;
+ args->output_fname = NULL;
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ int targets_is_file = 0, regions_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"help",no_argument,NULL,'h'},
+ {"exclude",required_argument,NULL,'e'},
+ {"include",required_argument,NULL,'i'},
+ {"collapse",required_argument,NULL,'c'},
+ {"complement",no_argument,NULL,'C'},
+ {"apply-filters",required_argument,NULL,'f'},
+ {"nfiles",required_argument,NULL,'n'},
+ {"prefix",required_argument,NULL,'p'},
+ {"write",required_argument,NULL,'w'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {NULL,0,NULL,0}
+ };
+ while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'c':
+ if ( !strcmp(optarg,"snps") ) args->files->collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(optarg,"indels") ) args->files->collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"any") ) args->files->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"all") ) args->files->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"some") ) args->files->collapse |= COLLAPSE_SOME;
+ else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE;
+ else error("The --collapse string \"%s\" not recognised.\n", optarg);
+ break;
+ case 'f': args->files->apply_filters = optarg; break;
+ case 'C': args->isec_op = OP_COMPLEMENT; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'p': args->prefix = optarg; break;
+ case 'w': args->write_files = optarg; break;
+ case 'i': add_filter(args, optarg, FLT_INCLUDE); break;
+ case 'e': add_filter(args, optarg, FLT_EXCLUDE); break;
+ case 'n':
+ {
+ char *p = optarg;
+ if ( *p=='-' ) { args->isec_op = OP_MINUS; p++; }
+ else if ( *p=='+' ) { args->isec_op = OP_PLUS; p++; }
+ else if ( *p=='=' ) { args->isec_op = OP_EQUAL; p++; }
+ else if ( *p=='~' ) { args->isec_op = OP_EXACT; p++; }
+ else if ( isdigit(*p) ) args->isec_op = OP_EQUAL;
+ else error("Could not parse --nfiles %s\n", optarg);
+ if ( args->isec_op == OP_EXACT ) args->isec_exact = p;
+ else if ( sscanf(p,"%d",&args->isec_n)!=1 ) error("Could not parse --nfiles %s\n", optarg);
+ }
+ break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ if ( argc-optind<1 ) usage(); // no file given
+ if ( args->targets_list && bcf_sr_set_targets(args->files, args->targets_list, targets_is_file,0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ if ( argc-optind==2 && !args->isec_op )
+ {
+ args->isec_op = OP_VENN;
+ if ( !args->prefix ) error("Expected the -p option\n");
+ }
+ if ( !args->targets_list )
+ {
+ if ( argc-optind<2 ) error("Expected multiple files or the --targets option\n");
+ if ( !args->isec_op ) error("Expected two file names or one of the options --complement, --nfiles or --targets\n");
+ }
+ args->files->require_index = 1;
+ while (optind<argc)
+ {
+ if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
+ optind++;
+ }
+ init_data(args);
+ isec_vcf(args);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
+
diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c
new file mode 100644
index 0000000..2418895
--- /dev/null
+++ b/bcftools/vcfisec.c.pysam.c
@@ -0,0 +1,598 @@
+#include "pysam.h"
+
+/* vcfisec.c -- Create intersections, unions and complements of VCF files.
+
+ Copyright (C) 2012-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+#include "filter.h"
+
+#define OP_PLUS 1
+#define OP_MINUS 2
+#define OP_EQUAL 3
+#define OP_VENN 4
+#define OP_COMPLEMENT 5
+#define OP_EXACT 6
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+typedef struct
+{
+ int isec_op, isec_n, *write, iwrite, nwrite, output_type, n_threads;
+ int nflt, *flt_logic;
+ filter_t **flt;
+ char **flt_expr;
+ bcf_srs_t *files;
+ FILE *fh_log, *fh_sites;
+ htsFile **fh_out;
+ char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list;
+ char *isec_exact;
+ int argc;
+}
+args_t;
+
+/**
+ * mkdir_p() - create the directories leading to a file
+ * @fmt: printf-style format (and arguments) giving the file name; the part after the last "/" is ignored
+ */
+void mkdir_p(const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ int n = vsnprintf(NULL, 0, fmt, ap) + 2;
+ va_end(ap);
+
+ char *path = (char*)malloc(n);
+ va_start(ap, fmt);
+ vsnprintf(path, n, fmt, ap);
+ va_end(ap);
+
+ char *tmp = strdup(path), *p = tmp+1;
+ while (*p)
+ {
+ while (*p && *p!='/') p++;
+ if ( *p )
+ {
+ *p = 0;
+ mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+ *p = '/';
+ p++;
+ }
+ }
+ free(tmp);
+ free(path);
+}
+
+/**
+ * open_file() - open new file creating the file name using vsnprintf
+ * @fname: if not NULL, on output will point to newly allocated fname string
+ * @mode: if NULL, only the file name string will be created
+ * @fmt: vsnprintf format and args
+ *
+ * Returns an open FILE stream, or NULL if mode is NULL.
+ */
+FILE *open_file(char **fname, const char *mode, const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ int n = vsnprintf(NULL, 0, fmt, ap) + 2;
+ va_end(ap);
+
+ char *str = (char*)malloc(n);
+ va_start(ap, fmt);
+ vsnprintf(str, n, fmt, ap);
+ va_end(ap);
+
+ mkdir_p(str);
+ if ( !mode )
+ {
+ if ( !fname ) error("Uh: expected fname or mode\n");
+ *fname = str;
+ return NULL;
+ }
+
+ FILE *fp = fopen(str,mode);
+ if ( fname ) *fname = str;
+ else free(str);
+ return fp;
+}
+
+void isec_vcf(args_t *args)
+{
+ bcf_srs_t *files = args->files;
+ kstring_t str = {0,0,0};
+ htsFile *out_fh = NULL;
+
+ // When only one VCF is output, print VCF to stdout or -o file
+ int out_std = 0;
+ if ( args->nwrite==1 && !args->prefix ) out_std = 1;
+ if ( args->targets_list && files->nreaders==1 ) out_std = 1;
+ if ( out_std )
+ {
+ out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+ if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
+ bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
+ bcf_hdr_write(out_fh, files->readers[args->iwrite].header);
+ }
+ if ( !args->nwrite && !out_std && !args->prefix )
+ fprintf(pysamerr,"Note: -w option not given, printing list of sites...\n");
+
+ int n;
+ while ( (n=bcf_sr_next_line(files)) )
+ {
+ bcf_sr_t *reader = NULL;
+ bcf1_t *line = NULL;
+ int i, ret = 0;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !bcf_sr_has_line(files,i) ) continue;
+
+ if ( args->nflt && args->flt[i] )
+ {
+ bcf1_t *rec = bcf_sr_get_line(files, i);
+ int pass = filter_test(args->flt[i], rec, NULL);
+ if ( args->flt_logic[i] & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass )
+ {
+ files->has_line[i] = 0;
+ n--;
+ continue;
+ }
+ }
+
+ if ( !line )
+ {
+ line = files->readers[i].buffer[0];
+ reader = &files->readers[i];
+ }
+ ret |= 1<<i; // this may overflow for many files, but will be used only with two (OP_VENN)
+ }
+
+ switch (args->isec_op)
+ {
+ case OP_COMPLEMENT: if ( n!=1 || !bcf_sr_has_line(files,0) ) continue; break;
+ case OP_EQUAL: if ( n != args->isec_n ) continue; break;
+ case OP_PLUS: if ( n < args->isec_n ) continue; break;
+ case OP_MINUS: if ( n > args->isec_n ) continue; break;
+ case OP_EXACT:
+ for (i=0; i<files->nreaders; i++)
+ if ( files->has_line[i] != args->isec_exact[i] ) break;
+ if ( i<files->nreaders ) continue;
+ break;
+ }
+
+ if ( out_std )
+ {
+ if ( bcf_sr_has_line(files,args->iwrite) )
+ bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0]);
+ continue;
+ }
+ else if ( args->fh_sites )
+ {
+ str.l = 0;
+ kputs(reader->header->id[BCF_DT_CTG][line->rid].key, &str); kputc('\t', &str);
+ kputw(line->pos+1, &str); kputc('\t', &str);
+ if (line->n_allele > 0) kputs(line->d.allele[0], &str);
+ else kputc('.', &str);
+ kputc('\t', &str);
+ if (line->n_allele > 1) kputs(line->d.allele[1], &str);
+ else kputc('.', &str);
+ for (i=2; i<line->n_allele; i++)
+ {
+ kputc(',', &str);
+ kputs(line->d.allele[i], &str);
+ }
+ kputc('\t', &str);
+ for (i=0; i<files->nreaders; i++)
+ kputc(bcf_sr_has_line(files,i)?'1':'0', &str);
+ kputc('\n', &str);
+ fwrite(str.s,sizeof(char),str.l,args->fh_sites);
+ }
+
+ if ( args->prefix )
+ {
+ if ( args->isec_op==OP_VENN && ret==3 )
+ {
+ if ( !args->nwrite || args->write[0] )
+ bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0));
+ if ( !args->nwrite || args->write[1] )
+ bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1));
+ }
+ else
+ {
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !bcf_sr_has_line(files,i) ) continue;
+ if ( args->write && !args->write[i] ) continue;
+ bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0]);
+ }
+ }
+ }
+ }
+ if ( str.s ) free(str.s);
+ if ( out_fh ) hts_close(out_fh);
+}
+
+static void add_filter(args_t *args, char *expr, int logic)
+{
+ args->nflt++;
+ args->flt_expr = (char**) realloc(args->flt_expr,sizeof(char*)*args->nflt);
+ args->flt_logic = (int*) realloc(args->flt_logic,sizeof(int)*args->nflt);
+ args->flt = (filter_t**) realloc(args->flt,sizeof(filter_t*)*args->nflt);
+ if ( expr[0]=='-' && expr[1]==0 )
+ {
+ args->flt_expr[args->nflt-1] = NULL;
+ args->flt[args->nflt-1] = NULL;
+ }
+ else
+ args->flt_expr[args->nflt-1] = expr;
+ args->flt_logic[args->nflt-1] = logic;
+}
+
+static void destroy_data(args_t *args);
+static void init_data(args_t *args)
+{
+ int i;
+ if ( args->nflt )
+ {
+ if ( args->nflt > 1 && args->nflt!=args->files->nreaders )
+ error("Error: expected either one -i/-e option or as many as there are input files\n");
+ if ( args->nflt < args->files->nreaders )
+ {
+ if ( !args->flt_expr[0] ) error("Error: useless use of -i/-e\n");
+ args->nflt = args->files->nreaders;
+ args->flt_expr = (char**) realloc(args->flt_expr,sizeof(char*)*args->nflt);
+ args->flt_logic = (int*) realloc(args->flt_logic,sizeof(int)*args->nflt);
+ args->flt = (filter_t**) realloc(args->flt,sizeof(filter_t*)*args->nflt);
+ for (i=1; i<args->nflt; i++)
+ {
+ args->flt_expr[i] = args->flt_expr[0];
+ args->flt_logic[i] = args->flt_logic[0];
+ args->flt[i] = filter_init(args->files->readers[i].header,args->flt_expr[i]);
+ }
+ args->flt[0] = filter_init(args->files->readers[0].header,args->flt_expr[0]);
+ }
+ else
+ {
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !args->flt_expr[i] ) continue;
+ args->flt[i] = filter_init(args->files->readers[i].header,args->flt_expr[i]);
+ }
+ }
+ }
+
+ if ( args->isec_op==OP_EXACT )
+ {
+ if ( strlen(args->isec_exact)!=args->files->nreaders )
+ error("The number of files does not match the bitmask: %d vs %s\n", args->files->nreaders,args->isec_exact);
+ for (i=0; i<args->files->nreaders; i++)
+ if ( args->isec_exact[i]!='0' && args->isec_exact[i]!='1' ) error("Unexpected bitmask: %s\n",args->isec_exact);
+ for (i=0; i<args->files->nreaders; i++)
+ args->isec_exact[i] -= '0';
+ }
+
+ // Which files to write: parse the string passed with -w
+ char *p = args->write_files;
+ while (p && *p)
+ {
+ if ( !args->write ) args->write = (int*) calloc(args->files->nreaders,sizeof(int));
+ if ( sscanf(p,"%d",&i)!=1 ) error("Could not parse --write %s\n", args->write_files);
+ if ( i<0 || i>args->files->nreaders ) error("The index is out of range: %d (%s)\n", i, args->write_files);
+ args->write[i-1] = 1;
+ args->iwrite = i-1;
+ args->nwrite++;
+ while (*p && *p!=',') p++;
+ if ( *p==',' ) p++;
+ }
+ if ( args->nwrite>1 && !args->prefix ) error("Expected -p when mutliple output files given: --write %s\n", args->write_files);
+ if ( args->isec_op==OP_COMPLEMENT && args->nwrite )
+ {
+ if ( args->nwrite>1 ) error("Multiple files to -w make no sense with -C\n");
+ if ( !args->write[0] ) error("Only -w1 makes sense with -C\n");
+ }
+
+ if ( args->prefix )
+ {
+ // Init output directory and create the readme file
+ args->fh_log = open_file(NULL,"w","%s/README.txt", args->prefix);
+ if ( !args->fh_log ) error("%s/README.txt: %s\n", args->prefix, strerror(errno));
+
+ fprintf(args->fh_log,"This file was produced by vcfisec.\n");
+ fprintf(args->fh_log,"The command line was:\tbcftools %s ", args->argv[0]);
+ int i;
+ for (i=1; i<args->argc; i++) fprintf(args->fh_log," %s",args->argv[i]);
+ fprintf(args->fh_log,"\n\nUsing the following file names:\n");
+
+ const char *suffix = "vcf";
+ if ( args->output_type & FT_BCF ) suffix = "bcf";
+ else if ( args->output_type & FT_GZ ) suffix = "vcf.gz";
+
+ // Open output files and write the legend
+ if ( args->isec_op==OP_VENN )
+ {
+ args->fh_out = (htsFile**) malloc(sizeof(htsFile*)*4);
+ args->fnames = (char**) calloc(4,sizeof(char*));
+
+ #define OPEN_FILE(i,j) { \
+ open_file(&args->fnames[i], NULL, "%s/%04d.%s", args->prefix, i, suffix); \
+ args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type)); \
+ if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \
+ if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \
+ bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
+ bcf_hdr_write(args->fh_out[i], args->files->readers[j].header); \
+ }
+ if ( !args->nwrite || args->write[0] )
+ {
+ OPEN_FILE(0,0);
+ fprintf(args->fh_log,"%s\tfor records private to\t%s\n", args->fnames[0], args->files->readers[0].fname);
+ }
+ if ( !args->nwrite || args->write[1] )
+ {
+ OPEN_FILE(1,1);
+ fprintf(args->fh_log,"%s\tfor records private to\t%s\n", args->fnames[1], args->files->readers[1].fname);
+ }
+ if ( !args->nwrite || args->write[0] )
+ {
+ OPEN_FILE(2,0);
+ fprintf(args->fh_log,"%s\tfor records from %s shared by both\t%s %s\n", args->fnames[2], args->files->readers[0].fname, args->files->readers[0].fname, args->files->readers[1].fname);
+ }
+ if ( !args->nwrite || args->write[1] )
+ {
+ OPEN_FILE(3,1);
+ fprintf(args->fh_log,"%s\tfor records from %s shared by both\t%s %s\n", args->fnames[3], args->files->readers[1].fname, args->files->readers[0].fname, args->files->readers[1].fname);
+ }
+ }
+ else
+ {
+ // Init one output file for each reader
+ args->fh_out = (htsFile**) calloc(args->files->nreaders, sizeof(htsFile*));
+ args->fnames = (char**) calloc(args->files->nreaders, sizeof(char*));
+
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( args->write && !args->write[i] ) continue;
+ if ( args->isec_op==OP_COMPLEMENT && i>0 ) break;
+ OPEN_FILE(i,i);
+ fprintf(args->fh_log,"%s\tfor stripped\t%s\n", args->fnames[i], args->files->readers[i].fname);
+ }
+ #undef OPEN_FILE
+
+ args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix);
+ if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno));
+ }
+ }
+ else {
+ if (args->output_fname) {
+ args->fh_sites = fopen(args->output_fname, "w");
+ if ( args->fh_sites == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ }
+ else
+ args->fh_sites = stdout;
+ }
+}
+
+static void destroy_data(args_t *args)
+{
+ int i;
+ if ( args->nflt )
+ {
+ for (i=0; i<args->nflt; i++)
+ {
+ if ( !args->flt[i] ) continue;
+ filter_destroy(args->flt[i]);
+ }
+ free(args->flt_expr);
+ free(args->flt);
+ free(args->flt_logic);
+ }
+ if ( args->prefix )
+ {
+ fclose(args->fh_log);
+ int n = args->isec_op==OP_VENN ? 4 : args->files->nreaders;
+ for (i=0; i<n; i++)
+ {
+ if ( !args->fnames[i] ) continue;
+ hts_close(args->fh_out[i]);
+ if ( args->output_type==FT_VCF_GZ )
+ {
+ tbx_conf_t conf = tbx_conf_vcf;
+ tbx_index_build(args->fnames[i], -1, &conf);
+ }
+ else if ( args->output_type==FT_BCF_GZ )
+ {
+ if ( bcf_index_build(args->fnames[i],14) ) error("Could not index %s\n", args->fnames[i]);
+ }
+ free(args->fnames[i]);
+ }
+ free(args->fh_out);
+ free(args->fnames);
+ if ( args->fh_sites ) fclose(args->fh_sites);
+ if ( args->write ) free(args->write);
+ }
+}
+
+static void usage(void)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Create intersections, unions and complements of VCF files.\n");
+ fprintf(pysamerr, "Usage: bcftools isec [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Options:\n");
+ fprintf(pysamerr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+ fprintf(pysamerr, " -C, --complement output positions present only in the first file but missing in the others\n");
+ fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
+ fprintf(pysamerr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(pysamerr, " -i, --include <expr> include only sites for which the expression is true\n");
+ fprintf(pysamerr, " -n, --nfiles [+-=~]<int> output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n");
+ fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysamerr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(pysamerr, " -p, --prefix <dir> if given, subset each of the input files accordingly, see also -w\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -w, --write <list> list of files to write with -p given as 1-based indexes. By default, all files are written\n");
+ fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Examples:\n");
+ fprintf(pysamerr, " # Create intersection and complements of two sets saving the output in dir/*\n");
+ fprintf(pysamerr, " bcftools isec A.vcf.gz B.vcf.gz -p dir\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, " # Filter sites in A and B (but not in C) and create intersection\n");
+ fprintf(pysamerr, " bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e - A.vcf.gz B.vcf.gz C.vcf.gz -p dir\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, " # Extract and write records from A shared by both A and B using exact allele match\n");
+ fprintf(pysamerr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n =2 -w 1\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, " # Extract records private to A or B comparing by position only\n");
+ fprintf(pysamerr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_vcfisec(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->files = bcf_sr_init();
+ args->argc = argc; args->argv = argv;
+ args->output_fname = NULL;
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ int targets_is_file = 0, regions_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"help",no_argument,NULL,'h'},
+ {"exclude",required_argument,NULL,'e'},
+ {"include",required_argument,NULL,'i'},
+ {"collapse",required_argument,NULL,'c'},
+ {"complement",no_argument,NULL,'C'},
+ {"apply-filters",required_argument,NULL,'f'},
+ {"nfiles",required_argument,NULL,'n'},
+ {"prefix",required_argument,NULL,'p'},
+ {"write",required_argument,NULL,'w'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {NULL,0,NULL,0}
+ };
+ while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'c':
+ if ( !strcmp(optarg,"snps") ) args->files->collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(optarg,"indels") ) args->files->collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"any") ) args->files->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"all") ) args->files->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"some") ) args->files->collapse |= COLLAPSE_SOME;
+ else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE;
+ else error("The --collapse string \"%s\" not recognised.\n", optarg);
+ break;
+ case 'f': args->files->apply_filters = optarg; break;
+ case 'C': args->isec_op = OP_COMPLEMENT; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'p': args->prefix = optarg; break;
+ case 'w': args->write_files = optarg; break;
+ case 'i': add_filter(args, optarg, FLT_INCLUDE); break;
+ case 'e': add_filter(args, optarg, FLT_EXCLUDE); break;
+ case 'n':
+ {
+ char *p = optarg;
+ if ( *p=='-' ) { args->isec_op = OP_MINUS; p++; }
+ else if ( *p=='+' ) { args->isec_op = OP_PLUS; p++; }
+ else if ( *p=='=' ) { args->isec_op = OP_EQUAL; p++; }
+ else if ( *p=='~' ) { args->isec_op = OP_EXACT; p++; }
+ else if ( isdigit(*p) ) args->isec_op = OP_EQUAL;
+ else error("Could not parse --nfiles %s\n", optarg);
+ if ( args->isec_op == OP_EXACT ) args->isec_exact = p;
+ else if ( sscanf(p,"%d",&args->isec_n)!=1 ) error("Could not parse --nfiles %s\n", optarg);
+ }
+ break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ if ( argc-optind<1 ) usage(); // no file given
+ if ( args->targets_list && bcf_sr_set_targets(args->files, args->targets_list, targets_is_file,0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ if ( argc-optind==2 && !args->isec_op )
+ {
+ args->isec_op = OP_VENN;
+ if ( !args->prefix ) error("Expected the -p option\n");
+ }
+ if ( !args->targets_list )
+ {
+ if ( argc-optind<2 ) error("Expected multiple files or the --targets option\n");
+ if ( !args->isec_op ) error("Expected two file names or one of the options --complement, --nfiles or --targets\n");
+ }
+ args->files->require_index = 1;
+ while (optind<argc)
+ {
+ if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
+ optind++;
+ }
+ init_data(args);
+ isec_vcf(args);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
+
diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c
new file mode 100644
index 0000000..0517bd5
--- /dev/null
+++ b/bcftools/vcfmerge.c
@@ -0,0 +1,2067 @@
+/* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
+
+ Copyright (C) 2012-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include <math.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "vcmp.h"
+
+#include <htslib/khash.h>
+KHASH_MAP_INIT_STR(strdict, int)
+typedef khash_t(strdict) strdict_t;
+
+#define SKIP_DONE 1
+#define SKIP_DIFF 2
+
+#define IS_VL_G(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_G)
+#define IS_VL_A(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_A)
+#define IS_VL_R(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_R)
+
+// For merging INFO Number=A,G,R tags
+typedef struct
+{
+ const char *hdr_tag;
+ int type, nvals;
+ int nbuf, mbuf;
+ uint8_t *buf;
+}
+AGR_info_t;
+
+// Rules for merging arbitrary INFO tags
+typedef struct _info_rule_t
+{
+ char *hdr_tag;
+ void (*merger)(bcf_hdr_t *hdr, bcf1_t *line, struct _info_rule_t *rule);
+ int type; // one of BCF_HT_*
+ int block_size; // number of values in a block
+ int nblocks; // number of blocks in nvals (the number of merged files)
+ int nvals, mvals; // used and total size of vals array
+ void *vals; // the info tag values
+}
+info_rule_t;
+
+// Auxiliary merge data for selecting the right combination
+// of buffered records across multiple readers. maux1_t
+// corresponds to one buffered line.
+typedef struct
+{
+ int skip;
+ int *map; // mapping from input alleles to the output array
+ int mmap; // size of map array (only buffer[i].n_allele is actually used)
+ int als_differ;
+}
+maux1_t;
+typedef struct
+{
+ int n; // number of readers
+ char **als, **out_als; // merged alleles (temp, may contain empty records) and merged alleles ready for output
+ int nals, mals, nout_als, mout_als; // size of the output array
+ int *cnt, ncnt; // number of records that refer to the alleles
+ int *nbuf; // readers have buffers of varying lengths
+ int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases)
+ int *flt, mflt, minf;
+ bcf_info_t *inf;// out_line's INFO fields
+ bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to i*nreader+j, first row is reserved for GT
+ int nfmt_map; // number of rows in the fmt_map array
+ int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes
+ void *tmp_arr;
+ int ntmp_arr;
+ maux1_t **d; // d[i][j] i-th reader, j-th buffer line
+ AGR_info_t *AGR_info;
+ int nAGR_info, mAGR_info;
+ bcf_srs_t *files;
+ int *has_line; // which files are being merged
+}
+maux_t;
+
+typedef struct
+{
+ vcmp_t *vcmp;
+ maux_t *maux;
+ int header_only, collapse, output_type, force_samples, merge_by_id;
+ char *header_fname, *output_fname, *regions_list, *info_rules, *file_list;
+ info_rule_t *rules;
+ int nrules;
+ strdict_t *tmph;
+ kstring_t tmps;
+ bcf_srs_t *files;
+ bcf1_t *out_line;
+ htsFile *out_fh;
+ bcf_hdr_t *out_hdr;
+ char **argv;
+ int argc, n_threads;
+}
+args_t;
+
+static void info_rules_merge_sum(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
+{
+ if ( !rule->nvals ) return;
+ int i, j, ndim = rule->block_size;
+ #define BRANCH(type_t,is_missing) { \
+ type_t *ptr = (type_t*) rule->vals; \
+ for (i=0; i<rule->nvals; i++) if ( is_missing ) ptr[i] = 0; \
+ for (i=1; i<rule->nblocks; i++) \
+ { \
+ for (j=0; j<ndim; j++) ptr[j] += ptr[j+i*ndim]; \
+ } \
+ }
+ switch (rule->type) {
+ case BCF_HT_INT: BRANCH(int32_t, ptr[i]==bcf_int32_missing); break;
+ case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[i])); break;
+ default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+ }
+ #undef BRANCH
+
+ bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,ndim,rule->type);
+}
+static void info_rules_merge_avg(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
+{
+ if ( !rule->nvals ) return;
+ int i, j, ndim = rule->block_size;
+ #define BRANCH(type_t,is_missing) { \
+ type_t *ptr = (type_t*) rule->vals; \
+ for (i=0; i<rule->nvals; i++) if ( is_missing ) ptr[i] = 0; \
+ for (j=0; j<ndim; j++) \
+ { \
+ double sum = 0; \
+ for (i=0; i<rule->nblocks; i++) sum += ptr[j+i*ndim]; \
+ ptr[j] = sum / rule->nblocks; \
+ } \
+ }
+ switch (rule->type) {
+ case BCF_HT_INT: BRANCH(int32_t, ptr[i]==bcf_int32_missing); break;
+ case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[i])); break;
+ default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+ }
+ #undef BRANCH
+
+ bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,ndim,rule->type);
+}
+static void info_rules_merge_min(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
+{
+ if ( !rule->nvals ) return;
+ int i, j, ndim = rule->block_size;
+ #define BRANCH(type_t,is_missing,set_missing,huge_val) { \
+ type_t *ptr = (type_t*) rule->vals; \
+ for (i=0; i<rule->nvals; i++) if ( is_missing ) ptr[i] = huge_val; \
+ for (i=1; i<rule->nblocks; i++) \
+ { \
+ for (j=0; j<ndim; j++) if ( ptr[j] > ptr[j+i*ndim] ) ptr[j] = ptr[j+i*ndim]; \
+ } \
+ for (i=0; i<rule->nvals; i++) if ( ptr[i]==huge_val ) set_missing; \
+ }
+ switch (rule->type) {
+ case BCF_HT_INT: BRANCH(int32_t, ptr[i]==bcf_int32_missing, ptr[i]=bcf_int32_missing, INT32_MAX); break;
+ case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[i]), bcf_float_set_missing(ptr[i]), HUGE_VAL); break;
+ default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+ }
+ #undef BRANCH
+
+ bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,ndim,rule->type);
+}
+static void info_rules_merge_max(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
+{
+ if ( !rule->nvals ) return;
+ int i, j, ndim = rule->block_size;
+ #define BRANCH(type_t,is_missing,set_missing,huge_val) { \
+ type_t *ptr = (type_t*) rule->vals; \
+ for (i=0; i<rule->nvals; i++) if ( is_missing ) ptr[i] = huge_val; \
+ for (i=1; i<rule->nblocks; i++) \
+ { \
+ for (j=0; j<ndim; j++) if ( ptr[j] < ptr[j+i*ndim] ) ptr[j] = ptr[j+i*ndim]; \
+ } \
+ for (i=0; i<rule->nvals; i++) if ( ptr[i]==huge_val ) set_missing; \
+ }
+ switch (rule->type) {
+ case BCF_HT_INT: BRANCH(int32_t, ptr[i]==bcf_int32_missing, ptr[i]=bcf_int32_missing, INT32_MIN); break;
+ case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[i]), bcf_float_set_missing(ptr[i]), -HUGE_VAL); break;
+ default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+ }
+ #undef BRANCH
+
+ bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,ndim,rule->type);
+}
+static void info_rules_merge_join(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
+{
+ if ( !rule->nvals ) return;
+ if ( rule->type==BCF_HT_STR )
+ {
+ ((char*)rule->vals)[rule->nvals] = 0;
+ bcf_update_info_string(hdr,line,rule->hdr_tag,rule->vals);
+ }
+ else
+ bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,rule->nvals,rule->type);
+}
+
+static int info_rules_comp_key2(const void *a, const void *b)
+{
+ info_rule_t *rule1 = (info_rule_t*) a;
+ info_rule_t *rule2 = (info_rule_t*) b;
+ return strcmp(rule1->hdr_tag, rule2->hdr_tag);
+}
+static int info_rules_comp_key(const void *a, const void *b)
+{
+ char *key = (char*) a;
+ info_rule_t *rule = (info_rule_t*) b;
+ return strcmp(key, rule->hdr_tag);
+}
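+// Parse the -i/--info-rules string of TAG:METHOD pairs into args->rules.
+// When no rules are given, default to DP:sum and DP4:sum if those tags are defined in the output header.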
+static void info_rules_init(args_t *args)
+{
+ if ( args->info_rules && !strcmp("-",args->info_rules) ) return;
+
+ kstring_t str = {0,0,0};
+ if ( !args->info_rules )
+ {
+ if ( bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "DP")) ) kputs("DP:sum",&str);
+ if ( bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "DP4")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("DP4:sum",&str);
+ }
+ if ( !str.l ) return;
+ args->info_rules = str.s;
+ }
+
+ args->nrules = 1;
+ char *ss = strdup(args->info_rules), *tmp = ss;
+ int n = 0;
+ while ( *ss )
+ {
+ if ( *ss==':' ) { *ss = 0; n++; if ( n%2==0 ) error("Could not parse INFO rules: \"%s\"\n", args->info_rules); }
+ else if ( *ss==',' ) { *ss = 0; args->nrules++; n++; if ( n%2==1 ) error("Could not parse INFO rules: \"%s\"\n", args->info_rules); }
+ ss++;
+ }
+ if ( n%2==0 ) error("Could not parse INFO rules: \"%s\"\n", args->info_rules);
+ args->rules = (info_rule_t*) calloc(args->nrules,sizeof(info_rule_t));
+
+ n = 0;
+ ss = tmp;
+ while ( n < args->nrules )
+ {
+ info_rule_t *rule = &args->rules[n];
+ rule->hdr_tag = strdup(ss);
+ int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag);
+ if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The tag is not defined in the header: \"%s\"\n", rule->hdr_tag);
+ rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id);
+ if ( rule->type!=BCF_HT_INT && rule->type!=BCF_HT_REAL && rule->type!=BCF_HT_STR ) error("The type is not supported: \"%s\"\n", rule->hdr_tag);
+
+ while ( *ss ) ss++; ss++;
+ if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag);
+
+ int is_join = 0;
+ if ( !strcasecmp(ss,"sum") ) rule->merger = info_rules_merge_sum;
+ else if ( !strcasecmp(ss,"avg") ) rule->merger = info_rules_merge_avg;
+ else if ( !strcasecmp(ss,"min") ) rule->merger = info_rules_merge_min;
+ else if ( !strcasecmp(ss,"max") ) rule->merger = info_rules_merge_max;
+ else if ( !strcasecmp(ss,"join") ) { rule->merger = info_rules_merge_join; is_join = 1; }
+ else error("The rule logic \"%s\" not recognised\n", ss);
+
+ if ( !is_join && rule->type==BCF_HT_STR )
+ error("Numeric operation \"%s\" requested on non-numeric field: %s\n", ss, rule->hdr_tag);
+ if ( bcf_hdr_id2number(args->out_hdr,BCF_HL_INFO,id)==0xfffff )
+ {
+ int is_agr = (
+ bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_A ||
+ bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_G ||
+ bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_R
+ ) ? 1 : 0;
+ if ( is_join && is_agr )
+ error("Cannot -i %s:join on Number=[AGR] tags is not supported.\n", rule->hdr_tag);
+ if ( !is_join && !is_agr )
+ error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag);
+ }
+
+ while ( *ss ) ss++; ss++; n++;
+ }
+ free(str.s);
+ free(tmp);
+
+ qsort(args->rules, args->nrules, sizeof(*args->rules), info_rules_comp_key2);
+}
+static void info_rules_destroy(args_t *args)
+{
+ int i;
+ for (i=0; i<args->nrules; i++)
+ {
+ info_rule_t *rule = &args->rules[i];
+ free(rule->hdr_tag);
+ free(rule->vals);
+ }
+ free(args->rules);
+}
+static void info_rules_reset(args_t *args)
+{
+ int i;
+ for (i=0; i<args->nrules; i++)
+ args->rules[i].nblocks = args->rules[i].nvals = args->rules[i].block_size = 0;
+}
+static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule, maux1_t *als, int var_len)
+{
+ int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &args->maux->ntmp_arr, rule->type);
+ if ( ret<=0 ) error("FIXME: error parsing %s at %s:%d .. %d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1,ret);
+
+ rule->nblocks++;
+
+ if ( rule->type==BCF_HT_STR )
+ {
+ int need_comma = rule->nblocks==1 ? 0 : 1;
+ hts_expand(char,rule->nvals+ret+need_comma+1,rule->mvals,rule->vals); // 1 for null-termination
+ char *tmp = (char*) rule->vals + rule->nvals;
+ if ( rule->nvals>0 ) { *tmp = ','; tmp++; }
+ strncpy(tmp,(char*)args->maux->tmp_arr,ret);
+ rule->nvals += ret + need_comma;
+ return 1;
+ }
+
+ int i, j;
+ if ( var_len==BCF_VL_A )
+ {
+ assert( ret==line->n_allele-1 );
+ args->maux->nagr_map = ret;
+ hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
+ // create mapping from source file ALT indexes to dst file indexes
+ for (i=0; i<ret; i++) args->maux->agr_map[i] = als->map[i+1] - 1;
+ rule->block_size = args->maux->nout_als - 1;
+ }
+ else if ( var_len==BCF_VL_R )
+ {
+ assert( ret==line->n_allele );
+ args->maux->nagr_map = ret;
+ hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
+ for (i=0; i<ret; i++) args->maux->agr_map[i] = als->map[i];
+ rule->block_size = args->maux->nout_als;
+ }
+ else if ( var_len==BCF_VL_G )
+ {
+ args->maux->nagr_map = bcf_alleles2gt(line->n_allele-1,line->n_allele-1)+1;
+ assert( ret==line->n_allele || ret==args->maux->nagr_map );
+ if ( ret==line->n_allele ) // haploid
+ {
+ args->maux->nagr_map = line->n_allele;
+ hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
+ for (i=0; i<ret; i++) args->maux->agr_map[i] = als->map[i];
+ rule->block_size = args->maux->nout_als;
+ }
+ else
+ {
+ hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
+ int k_src = 0;
+ for (i=0; i<line->n_allele; i++)
+ {
+ for (j=0; j<=i; j++)
+ {
+ args->maux->agr_map[k_src] = bcf_alleles2gt(als->map[i],als->map[j]);
+ k_src++;
+ }
+ }
+ rule->block_size = bcf_alleles2gt(args->maux->nout_als-1,args->maux->nout_als-1)+1;
+ }
+ }
+ else
+ {
+ if ( rule->nblocks>1 && ret!=rule->block_size )
+ error("Mismatch in number of values for INFO/%s at %s:%d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1);
+ rule->block_size = ret;
+ args->maux->nagr_map = 0;
+ }
+
+ #define BRANCH(src_type_t,dst_type_t,set_missing) { \
+ src_type_t *src = (src_type_t *) args->maux->tmp_arr; \
+ hts_expand0(dst_type_t,(rule->nvals+rule->block_size),rule->mvals,rule->vals); \
+ dst_type_t *dst = (dst_type_t *) rule->vals + rule->nvals; \
+ rule->nvals += rule->block_size; \
+ if ( !args->maux->nagr_map ) \
+ { \
+ for (i=0; i<ret; i++) dst[i] = src[i]; \
+ } \
+ else \
+ { \
+ for (i=0; i<rule->block_size; i++) set_missing; \
+ for (i=0; i<ret; i++) dst[args->maux->agr_map[i]] = src[i]; \
+ } \
+ }
+ switch (rule->type) {
+ case BCF_HT_INT: BRANCH(int, int32_t, dst[i] = bcf_int32_missing); break;
+ case BCF_HT_REAL: BRANCH(float, float, bcf_float_set_missing(dst[i])); break;
+ default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+ }
+ #undef BRANCH
+
+ return 1;
+}
+
+int bcf_hdr_sync(bcf_hdr_t *h);
+
+void merge_headers(bcf_hdr_t *hw, const bcf_hdr_t *hr, const char *clash_prefix, int force_samples)
+{
+ // header lines
+ hw = bcf_hdr_merge(hw, hr);
+
+ // samples
+ int i;
+ for (i=0; i<bcf_hdr_nsamples(hr); i++)
+ {
+ char *name = hr->samples[i];
+ if ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, name)!=-1 )
+ {
+ // there is a sample with the same name
+ if ( !force_samples ) error("Error: Duplicate sample names (%s), use --force-samples to proceed anyway.\n", name);
+
+ int len = strlen(hr->samples[i]) + strlen(clash_prefix) + 1;
+ name = (char*) malloc(sizeof(char)*(len+1));
+ sprintf(name,"%s:%s",clash_prefix,hr->samples[i]);
+ bcf_hdr_add_sample(hw,name);
+ free(name);
+ }
+ else
+ bcf_hdr_add_sample(hw,name);
+ }
+}
+
+void debug_als(char **als, int nals)
+{
+ int k; for (k=0; k<nals; k++) fprintf(stderr,"%s ", als[k]);
+ fprintf(stderr,"\n");
+}
+
+/**
+ * normalize_alleles() - create smallest possible representation of the alleles
+ * @als: alleles to be merged, first is REF (rw)
+ * @nals: number of alleles in $als
+ *
+ * Best explained on an example:
+ * In: REF=GTTT ALT=GTT
+ * Out: REF=GT ALT=G
+ *
+ * Note: the als array will be modified
+ */
+void normalize_alleles(char **als, int nals)
+{
+ if ( !als[0][1] ) return; // ref is 1base long, we're done
+
+ int j, i = 1, done = 0;
+ int *lens = (int*) malloc(sizeof(int)*nals);
+ for (j=0; j<nals; j++) lens[j] = strlen(als[j]);
+
+ while ( i<lens[0] )
+ {
+ for (j=1; j<nals; j++)
+ {
+ if ( i>=lens[j] ) done = 1;
+ if ( als[j][lens[j]-i] != als[0][lens[0]-i] ) { done = 1; break; }
+ }
+ if ( done ) break;
+ i++;
+ }
+ if ( i>1 )
+ {
+ i--;
+ als[0][lens[0]-i] = 0;
+ for (j=1; j<nals; j++) als[j][lens[j]-i] = 0;
+ }
+ free(lens);
+}
+
+ /**
+ * merge_alleles() - merge two REF,ALT records, $a and $b into $b.
+ * @a: alleles to be merged, first is REF
+ * @na: number of $a alleles
+ * @map: map from the original $a indexes to new $b indexes (0-based)
+ * @b: alleles to be merged, the array will be expanded as required
+ * @nb: number of $b alleles
+ * @mb: size of $b
+ *
+ * Returns NULL on error or $b expanded to incorporate $a alleles and sets
+ * $map. Best explained on an example:
+ * In: REF ALT
+ * a: ACG, AC,A (1bp and 2bp deletion)
+ * b: ACGT, A (3bp deletion)
+ * Out:
+ * b: ACGT, A,ACT,AT (3bp, 1bp and 2bp deletion)
+ * map: 0,2,3
+ * Here the mapping from the original $a alleles to the new $b alleles is 0->0,
+ * 1->2, and 2->3.
+ */
+char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb)
+{
+ // reference allele never changes
+ map[0] = 0;
+
+ int i,j;
+ int rla = !a[0][1] ? 1 : strlen(a[0]);
+ int rlb = !b[0][1] ? 1 : strlen(b[0]);
+
+ // the most common case: same SNPs
+ if ( na==2 && *nb==2 && rla==1 && rlb==1 && a[1][0]==b[1][0] && !a[1][1] && !b[1][1] )
+ {
+ map[1] = 1;
+ return b;
+ }
+
+ // Sanity check: reference prefixes must be identical
+ if ( strncmp(a[0],b[0],rla<rlb?rla:rlb) )
+ {
+ if ( strncasecmp(a[0],b[0],rla<rlb?rla:rlb) )
+ {
+ fprintf(stderr, "The REF prefixes differ: %s vs %s (%d,%d)\n", a[0],b[0],rla,rlb);
+ return NULL;
+ }
+ // Different case, change to uppercase
+ for (i=0; i<na; i++)
+ {
+ int len = strlen(a[i]);
+ for (j=0; j<len; j++) a[i][j] = toupper(a[i][j]);
+ }
+ for (i=0; i<*nb; i++)
+ {
+ int len = strlen(b[i]);
+ for (j=0; j<len; j++) b[i][j] = toupper(b[i][j]);
+ }
+ }
+
+ int n = *nb + na;
+ hts_expand0(char*,n,*mb,b);
+
+ // $b alleles need expanding
+ if ( rla>rlb )
+ {
+ for (i=0; i<*nb; i++)
+ {
+ int l = strlen(b[i]);
+ b[i] = (char*) realloc(b[i],l+rla-rlb+1);
+ memcpy(b[i]+l,a[0]+rlb,rla-rlb+1);
+ }
+ }
+
+ // now check if the $a alleles are present and if not add them
+ for (i=1; i<na; i++)
+ {
+ char *ai;
+ if ( rlb>rla ) // $a alleles need expanding
+ {
+ int l = strlen(a[i]);
+ ai = (char*) malloc(l+rlb-rla+1);
+ memcpy(ai,a[i],l);
+ memcpy(ai+l,b[0]+rla,rlb-rla+1);
+ }
+ else
+ ai = a[i];
+
+ for (j=1; j<*nb; j++)
+ if ( !strcasecmp(ai,b[j]) ) break;
+
+ if ( j<*nb ) // $b already has the same allele
+ {
+ map[i] = j;
+ if ( rlb>rla ) free(ai);
+ continue;
+ }
+ // new allele
+ map[i] = *nb;
+ b[*nb] = rlb>rla ? ai : strdup(ai);
+ (*nb)++;
+ }
+ return b;
+}
+
+maux_t *maux_init(bcf_srs_t *files)
+{
+ maux_t *ma = (maux_t*) calloc(1,sizeof(maux_t));
+ ma->n = files->nreaders;
+ ma->nbuf = (int *) calloc(ma->n,sizeof(int));
+ ma->d = (maux1_t**) calloc(ma->n,sizeof(maux1_t*));
+ ma->files = files;
+ int i, n_smpl = 0;
+ for (i=0; i<ma->n; i++)
+ n_smpl += bcf_hdr_nsamples(files->readers[i].header);
+ ma->smpl_ploidy = (int*) calloc(n_smpl,sizeof(int));
+ ma->smpl_nGsize = (int*) malloc(n_smpl*sizeof(int));
+ ma->has_line = (int*) malloc(ma->n*sizeof(int));
+ return ma;
+}
+void maux_destroy(maux_t *ma)
+{
+ int i;
+ for (i=0; i<ma->n; i++) // for each reader
+ {
+ if ( !ma->d[i] ) continue;
+ int j;
+ for (j=0; j<ma->nbuf[i]; j++) // for each buffered line
+ if ( ma->d[i][j].map ) free(ma->d[i][j].map);
+ free(ma->d[i]);
+ }
+ for (i=0; i<ma->mAGR_info; i++)
+ free(ma->AGR_info[i].buf);
+ free(ma->agr_map);
+ free(ma->AGR_info);
+ if (ma->ntmp_arr) free(ma->tmp_arr);
+ if (ma->nfmt_map) free(ma->fmt_map);
+ // ma->inf freed in bcf_destroy1
+ free(ma->d);
+ free(ma->nbuf);
+ for (i=0; i<ma->mals; i++) free(ma->als[i]);
+ if (ma->mout_als) free(ma->out_als);
+ free(ma->als);
+ free(ma->cnt);
+ free(ma->smpl_ploidy);
+ free(ma->smpl_nGsize);
+ free(ma->has_line);
+ free(ma);
+}
+void maux_expand1(maux_t *ma, int i)
+{
+ if ( ma->nbuf[i] <= ma->files->readers[i].nbuffer )
+ {
+ int n = ma->files->readers[i].nbuffer + 1;
+ ma->d[i] = (maux1_t*) realloc(ma->d[i], sizeof(maux1_t)*n);
+ memset(ma->d[i]+ma->nbuf[i],0,sizeof(maux1_t)*(n-ma->nbuf[i]));
+ ma->nbuf[i] = n;
+ }
+}
+void maux_reset(maux_t *ma)
+{
+ int i;
+ for (i=0; i<ma->n; i++) maux_expand1(ma, i);
+ for (i=1; i<ma->ncnt; i++) ma->cnt[i] = 0;
+}
+void maux_debug(maux_t *ma, int ir, int ib)
+{
+ printf("[%d,%d]\t", ir,ib);
+ int i;
+ for (i=0; i<ma->nals; i++)
+ {
+ printf(" %s [%d]", ma->als[i], ma->cnt[i]);
+ }
+ printf("\n");
+}
+
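+// Set CHROM, POS, ID (semicolon-joined union) and QUAL (maximum of the input records)
+// of the output line and build the merged ALT allele list.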
+void merge_chrom2qual(args_t *args, bcf1_t *out)
+{
+ bcf_srs_t *files = args->files;
+ bcf_hdr_t *out_hdr = args->out_hdr;
+
+ int i, ret;
+ khiter_t kitr;
+ strdict_t *tmph = args->tmph;
+ kh_clear(strdict, tmph);
+ kstring_t *tmps = &args->tmps;
+ tmps->l = 0;
+
+ maux_t *ma = args->maux;
+ int *al_idxs = (int*) calloc(ma->nals,sizeof(int));
+ bcf_float_set_missing(out->qual);
+
+ // CHROM, POS, ID, QUAL
+ out->pos = -1;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !ma->has_line[i] ) continue;
+
+ bcf_sr_t *reader = &files->readers[i];
+ bcf1_t *line = reader->buffer[0];
+ bcf_hdr_t *hdr = reader->header;
+
+ // alleles
+ int j;
+ for (j=1; j<line->n_allele; j++)
+ al_idxs[ ma->d[i][0].map[j] ] = 1;
+
+ // position
+ if ( out->pos==-1 )
+ {
+ const char *chr = hdr->id[BCF_DT_CTG][line->rid].key;
+ out->rid = bcf_hdr_name2id(out_hdr, chr);
+ if ( strcmp(chr,out_hdr->id[BCF_DT_CTG][out->rid].key) ) error("Failed to translate the contig name %s to the output header\n", chr);
+ out->pos = line->pos;
+ }
+
+ // ID
+ if ( line->d.id[0]!='.' || line->d.id[1] )
+ {
+ kitr = kh_get(strdict, tmph, line->d.id);
+ if ( kitr == kh_end(tmph) )
+ {
+ if ( tmps->l ) kputc(';', tmps);
+ kputs(line->d.id, tmps);
+ kh_put(strdict, tmph, line->d.id, &ret);
+ }
+ }
+
+ // set QUAL to the max qual value. Not exactly correct, but good enough for now
+ if ( !bcf_float_is_missing(files->readers[i].buffer[0]->qual) )
+ {
+ if ( bcf_float_is_missing(out->qual) || out->qual < files->readers[i].buffer[0]->qual ) out->qual = files->readers[i].buffer[0]->qual;
+ }
+ }
+
+ // set ID
+ if ( !tmps->l ) kputs(".", tmps);
+ if ( out->d.id ) free(out->d.id);
+ out->d.id = strdup(tmps->s);
+
+ // set alleles
+ ma->nout_als = 0;
+ for (i=1; i<ma->nals; i++)
+ {
+ if ( !al_idxs[i] ) continue;
+ ma->nout_als++;
+
+ // Adjust the indexes, the allele map could be created for multiple collapsed records,
+ // some of which might be unused for this output line
+ int ir, j;
+ for (ir=0; ir<files->nreaders; ir++)
+ {
+ if ( !ma->has_line[ir] ) continue;
+ bcf1_t *line = files->readers[ir].buffer[0];
+ for (j=1; j<line->n_allele; j++)
+ if ( ma->d[ir][0].map[j]==i ) ma->d[ir][0].map[j] = ma->nout_als;
+ }
+ }
+ // Expand the arrays and realloc the alleles string. Note that all alleles are in a single allocated block.
+ ma->nout_als++;
+ hts_expand0(char*, ma->nout_als, ma->mout_als, ma->out_als);
+ int k = 0;
+ for (i=0; i<ma->nals; i++)
+ if ( i==0 || al_idxs[i] ) ma->out_als[k++] = strdup(ma->als[i]);
+ assert( k==ma->nout_als );
+ normalize_alleles(ma->out_als, ma->nout_als);
+ bcf_update_alleles(out_hdr, out, (const char**) ma->out_als, ma->nout_als);
+ free(al_idxs);
+ for (i=0; i<ma->nout_als; i++) free(ma->out_als[i]);
+}
+
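+// Merge FILTER: take the union of the filters seen across the input records,
+// dropping PASS when it would be mixed with other filters.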
+void merge_filter(args_t *args, bcf1_t *out)
+{
+ bcf_srs_t *files = args->files;
+ bcf_hdr_t *out_hdr = args->out_hdr;
+
+ int i, ret;
+ khiter_t kitr;
+ strdict_t *tmph = args->tmph;
+ kh_clear(strdict, tmph);
+
+ maux_t *ma = args->maux;
+ out->d.n_flt = 0;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !ma->has_line[i]) continue;
+
+ bcf_sr_t *reader = &files->readers[i];
+ bcf1_t *line = reader->buffer[0];
+ bcf_hdr_t *hdr = reader->header;
+ bcf_unpack(line, BCF_UN_ALL);
+
+ int k;
+ for (k=0; k<line->d.n_flt; k++)
+ {
+ const char *flt = hdr->id[BCF_DT_ID][line->d.flt[k]].key;
+ kitr = kh_get(strdict, tmph, flt);
+ if ( kitr == kh_end(tmph) )
+ {
+ int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, flt);
+ if ( id==-1 ) error("Error: The filter is not defined in the header: %s\n", flt);
+ hts_expand(int,out->d.n_flt+1,ma->mflt,ma->flt);
+ ma->flt[out->d.n_flt] = id;
+ out->d.n_flt++;
+ kh_put(strdict, tmph, flt, &ret);
+ }
+ }
+ }
+ // Check if PASS is not mixed with other filters
+ if ( out->d.n_flt>1 )
+ {
+ int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS");
+ for (i=0; i<out->d.n_flt; i++)
+ if ( ma->flt[i]==id ) break;
+ if ( i<out->d.n_flt )
+ {
+ out->d.n_flt--;
+ for (; i<out->d.n_flt; i++) ma->flt[i] = ma->flt[i+1];
+ }
+ }
+ out->d.flt = ma->flt;
+}
+
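+// Rewrite the numeric key of a packed INFO field so it matches the output header:
+// re-encode in place when the new id occupies the same number of bytes, otherwise
+// rebuild the field in a freshly allocated buffer.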
+static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *tmp_str)
+{
+ assert( !info->vptr_free );
+
+ uint8_t *ptr = info->vptr - info->vptr_off;
+ bcf_dec_typed_int1(ptr, &ptr);
+
+ tmp_str->l = 0;
+ bcf_enc_int1(tmp_str, id);
+
+ if ( tmp_str->l == ptr - info->vptr + info->vptr_off )
+ {
+ // the new id is represented with the same number of bytes
+ memcpy(info->vptr - info->vptr_off, tmp_str->s, tmp_str->l);
+ return;
+ }
+
+ kputsn_(ptr, info->vptr - ptr, tmp_str);
+ info->vptr_off = tmp_str->l;
+ kputsn_(info->vptr, info->len << bcf_type_shift[info->type], tmp_str);
+
+ info->vptr = (uint8_t*) tmp_str->s + info->vptr_off;
+ info->vptr_free = 1;
+ line->d.shared_dirty |= BCF1_DIRTY_INF;
+ tmp_str->s = NULL;
+ tmp_str->m = 0;
+ tmp_str->l = 0;
+}
+
+/*
+ * copy_string_field() - copy a comma-separated field
+ * @param src: source string
+ * @param isrc: index of the field to copy
+ * @param src_len: length of source string (excluding the terminating \0)
+ * @param dst: destination kstring (must be initialized)
+ * @param idst: index of the destination field
+ */
+int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst)
+{
+ int ith_src = 0, start_src = 0; // i-th field in src string
+ while ( ith_src<isrc && start_src<src_len )
+ {
+ if ( src[start_src]==',' ) { ith_src++; }
+ start_src++;
+ }
+ if ( ith_src!=isrc ) return -1; // requested field not found
+ int end_src = start_src;
+ while ( end_src<src_len && src[end_src]!=',' ) end_src++;
+
+ int nsrc_cpy = end_src - start_src;
+ if ( nsrc_cpy==1 && src[start_src]=='.' ) return 0; // don't write missing values, dst is already initialized
+
+ int ith_dst = 0, start_dst = 0;
+ while ( ith_dst<idst && start_dst<dst->l )
+ {
+ if ( dst->s[start_dst]==',' ) { ith_dst++; }
+ start_dst++;
+ }
+ if ( ith_dst!=idst ) return -2;
+ int end_dst = start_dst;
+ while ( end_dst<dst->l && dst->s[end_dst]!=',' ) end_dst++;
+
+ if ( end_dst - start_dst>1 || dst->s[start_dst]!='.' ) return 0; // do not overwrite non-empty values
+
+ // Now start_dst and end_dst are indexes to the destination memory area
+ // which needs to be replaced with nsrc_cpy
+ // source bytes, end_dst points just after.
+ int ndst_shift = nsrc_cpy - (end_dst - start_dst);
+ int ndst_move = dst->l - end_dst + 1; // how many bytes must be moved (including \0)
+ if ( ndst_shift )
+ {
+ ks_resize(dst, dst->l + ndst_shift + 1); // plus \0
+ memmove(dst->s+end_dst+ndst_shift, dst->s+end_dst, ndst_move);
+ }
+ memcpy(dst->s+start_dst, src+start_src, nsrc_cpy);
+ dst->l += ndst_shift;
+ return 0;
+}
+
+static void merge_AGR_info_tag(bcf_hdr_t *hdr, bcf1_t *line, bcf_info_t *info, int len, maux1_t *als, AGR_info_t *agr)
+{
+ int i;
+ if ( !agr->nbuf )
+ {
+ if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 || info->type==BCF_BT_FLOAT )
+ {
+ agr->nbuf = 4 * agr->nvals;
+ hts_expand(uint8_t,agr->nbuf,agr->mbuf,agr->buf);
+ if ( info->type!=BCF_BT_FLOAT )
+ {
+ int32_t *tmp = (int32_t*) agr->buf;
+ for (i=0; i<agr->nvals; i++) tmp[i] = bcf_int32_missing;
+ }
+ else
+ {
+ float *tmp = (float*) agr->buf;
+ for (i=0; i<agr->nvals; i++) bcf_float_set_missing(tmp[i]);
+ }
+ }
+ else if ( info->type==BCF_BT_CHAR )
+ {
+ kstring_t tmp; tmp.l = 0; tmp.m = agr->mbuf; tmp.s = (char*)agr->buf;
+ kputc('.',&tmp);
+ for (i=1; i<agr->nvals; i++) kputs(",.",&tmp);
+ agr->mbuf = tmp.m; agr->nbuf = tmp.l; agr->buf = (uint8_t*)tmp.s;
+ }
+ else
+ error("Not ready for type [%d]: %s at %d\n", info->type,agr->hdr_tag,line->pos+1);
+ }
+
+ if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 || info->type==BCF_BT_FLOAT )
+ {
+ if ( len==BCF_VL_A || len==BCF_VL_R )
+ {
+ int ifrom = len==BCF_VL_A ? 1 : 0;
+ #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
+ type_t *src = (type_t *) info->vptr; \
+ out_type_t *tgt = (out_type_t *) agr->buf; \
+ int iori, inew; \
+ for (iori=ifrom; iori<line->n_allele; iori++) \
+ { \
+ if ( is_vector_end ) break; \
+ if ( is_missing ) continue; \
+ inew = als->map[iori] - ifrom; \
+ tgt[inew] = *src; \
+ src++; \
+ } \
+ }
+ switch (info->type) {
+ case BCF_BT_INT8: BRANCH(int8_t, *src==bcf_int8_missing, *src==bcf_int8_vector_end, int); break;
+ case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, int); break;
+ case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, int); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), float); break;
+ default: fprintf(stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
+ }
+ #undef BRANCH
+ }
+ else
+ {
+ #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
+ type_t *src = (type_t *) info->vptr; \
+ out_type_t *tgt = (out_type_t *) agr->buf; \
+ int iori,jori, inew,jnew; \
+ for (iori=0; iori<line->n_allele; iori++) \
+ { \
+ inew = als->map[iori]; \
+ for (jori=0; jori<=iori; jori++) \
+ { \
+ jnew = als->map[jori]; \
+ int kori = iori*(iori+1)/2 + jori; \
+ if ( is_vector_end ) break; \
+ if ( is_missing ) continue; \
+ int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \
+ tgt[knew] = src[kori]; \
+ } \
+ if ( jori<=iori ) break; \
+ } \
+ }
+ switch (info->type) {
+ case BCF_BT_INT8: BRANCH(int8_t, src[kori]==bcf_int8_missing, src[kori]==bcf_int8_vector_end, int); break;
+ case BCF_BT_INT16: BRANCH(int16_t, src[kori]==bcf_int16_missing, src[kori]==bcf_int16_vector_end, int); break;
+ case BCF_BT_INT32: BRANCH(int32_t, src[kori]==bcf_int32_missing, src[kori]==bcf_int32_vector_end, int); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(src[kori]), bcf_float_is_vector_end(src[kori]), float); break;
+ default: fprintf(stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
+ }
+ #undef BRANCH
+ }
+ }
+ else
+ {
+ kstring_t tmp; tmp.l = agr->nbuf; tmp.m = agr->mbuf; tmp.s = (char*)agr->buf;
+ if ( len==BCF_VL_A || len==BCF_VL_R )
+ {
+ int iori, ifrom = len==BCF_VL_A ? 1 : 0;
+ for (iori=ifrom; iori<line->n_allele; iori++)
+ {
+ int ret = copy_string_field((char*)info->vptr, iori-ifrom, info->len, &tmp, als->map[iori]-ifrom);
+ if ( ret )
+ error("Error at %s:%d: wrong number of fields in %s?\n", bcf_seqname(hdr,line),line->pos+1,agr->hdr_tag);
+ }
+ }
+ else
+ {
+ int iori,jori, inew,jnew;
+ for (iori=0; iori<line->n_allele; iori++)
+ {
+ inew = als->map[iori];
+ for (jori=0; jori<=iori; jori++)
+ {
+ jnew = als->map[jori];
+ int kori = iori*(iori+1)/2 + jori;
+ int knew = bcf_alleles2gt(inew,jnew);
+ int ret = copy_string_field((char*)info->vptr, kori, info->len, &tmp, knew);
+ if ( ret )
+ error("Error at %s:%d: wrong number of fields in %s?\n", bcf_seqname(hdr,line),line->pos+1,agr->hdr_tag);
+ }
+ }
+ }
+ agr->mbuf = tmp.m; agr->nbuf = tmp.l; agr->buf = (uint8_t*)tmp.s;
+ }
+}
+
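+// Merge INFO fields: AC and AN are skipped (recomputed in merge_format()), tags with
+// user-defined -i rules or Number=A,G,R get special treatment, and any other tag keeps
+// the value from the first reader that carries it.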
+void merge_info(args_t *args, bcf1_t *out)
+{
+ bcf_srs_t *files = args->files;
+ bcf_hdr_t *out_hdr = args->out_hdr;
+
+ int i, j, ret;
+ khiter_t kitr;
+ strdict_t *tmph = args->tmph;
+ kh_clear(strdict, tmph);
+
+ maux_t *ma = args->maux;
+ ma->nAGR_info = 0;
+ out->n_info = 0;
+ info_rules_reset(args);
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !ma->has_line[i] ) continue;
+ bcf_sr_t *reader = &files->readers[i];
+ bcf1_t *line = reader->buffer[0];
+ bcf_hdr_t *hdr = reader->header;
+ for (j=0; j<line->n_info; j++)
+ {
+ bcf_info_t *inf = &line->d.info[j];
+
+ const char *key = hdr->id[BCF_DT_ID][inf->key].key;
+ if ( !strcmp("AC",key) || !strcmp("AN",key) ) continue; // AC and AN are done in merge_format() after genotypes are done
+
+ int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, key);
+ if ( id==-1 ) error("Error: The INFO field is not defined in the header: %s\n", key);
+
+ kitr = kh_get(strdict, tmph, key); // have we seen the tag in one of the readers?
+ int len = bcf_hdr_id2length(hdr,BCF_HL_INFO,inf->key);
+ if ( args->nrules )
+ {
+ info_rule_t *rule = (info_rule_t*) bsearch(key, args->rules, args->nrules, sizeof(*args->rules), info_rules_comp_key);
+ if ( rule )
+ {
+ maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->d[i][0] : NULL;
+ if ( info_rules_add_values(args, hdr, line, rule, als, len) ) continue;
+ }
+ }
+
+ // Todo: Number=AGR tags should use the newer info_rules_* functions (info_rules_merge_first to be added)
+ // and merge_AGR_info_tag to be made obsolete.
+ if ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) // Number=R,G,A requires special treatment
+ {
+ if ( kitr == kh_end(tmph) )
+ {
+ // first occurrence in this reader, alloc arrays
+ ma->nAGR_info++;
+ hts_expand0(AGR_info_t,ma->nAGR_info,ma->mAGR_info,ma->AGR_info);
+ kitr = kh_put(strdict, tmph, key, &ret);
+ kh_val(tmph,kitr) = ma->nAGR_info - 1;
+ ma->AGR_info[ma->nAGR_info-1].hdr_tag = key;
+ ma->AGR_info[ma->nAGR_info-1].type = bcf_hdr_id2type(hdr,BCF_HL_INFO,inf->key);
+ ma->AGR_info[ma->nAGR_info-1].nbuf = 0; // size of the buffer
+ switch (len)
+ {
+ case BCF_VL_A: ma->AGR_info[ma->nAGR_info-1].nvals = ma->nout_als - 1; break;
+ case BCF_VL_G: ma->AGR_info[ma->nAGR_info-1].nvals = bcf_alleles2gt(ma->nout_als-1,ma->nout_als-1)+1; break;
+ case BCF_VL_R: ma->AGR_info[ma->nAGR_info-1].nvals = ma->nout_als; break;
+ }
+ }
+ kitr = kh_get(strdict, tmph, key);
+ int idx = kh_val(tmph, kitr);
+ if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%d\n", key,bcf_seqname(hdr,line),line->pos+1);
+ merge_AGR_info_tag(hdr, line,inf,len,&ma->d[i][0],&ma->AGR_info[idx]);
+ continue;
+ }
+
+ if ( kitr == kh_end(tmph) )
+ {
+ hts_expand0(bcf_info_t,out->n_info+1,ma->minf,ma->inf);
+ ma->inf[out->n_info].key = id;
+ ma->inf[out->n_info].type = inf->type;
+ ma->inf[out->n_info].len = inf->len;
+ ma->inf[out->n_info].vptr = inf->vptr;
+ ma->inf[out->n_info].v1.i = inf->v1.i;
+ ma->inf[out->n_info].v1.f = inf->v1.f;
+ ma->inf[out->n_info].vptr_off = inf->vptr_off;
+ ma->inf[out->n_info].vptr_len = inf->vptr_len;
+ ma->inf[out->n_info].vptr_free = inf->vptr_free;
+ if ( (args->output_type & FT_BCF) && id!=bcf_hdr_id2int(hdr, BCF_DT_ID, key) )
+ {
+ // The existing packed info cannot be reused. Change the id.
+ // Although quite hacky, it's faster than anything else given
+ // the data structures
+ bcf_info_set_id(out, &ma->inf[out->n_info], id, &args->tmps);
+ }
+ out->n_info++;
+ kitr = kh_put(strdict, tmph, key, &ret);
+ kh_val(tmph,kitr) = -(out->n_info-1); // arbitrary negative value
+ }
+ }
+ }
+ out->d.info = ma->inf;
+ out->d.m_info = ma->minf;
+ for (i=0; i<args->nrules; i++)
+ args->rules[i].merger(args->out_hdr, out, &args->rules[i]);
+ for (i=0; i<ma->nAGR_info; i++)
+ {
+ AGR_info_t *agr = &ma->AGR_info[i];
+ bcf_update_info(out_hdr,out,agr->hdr_tag,agr->buf,agr->nvals,agr->type);
+ }
+}
+
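+// Recompute INFO/AN and INFO/AC of the merged record from the FORMAT/GT genotypes.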
+void update_AN_AC(bcf_hdr_t *hdr, bcf1_t *line)
+{
+ int32_t an = 0, *tmp = (int32_t*) malloc(sizeof(int)*line->n_allele);
+ int ret = bcf_calc_ac(hdr, line, tmp, BCF_UN_FMT);
+ if ( ret>0 )
+ {
+ int i;
+ for (i=0; i<line->n_allele; i++) an += tmp[i];
+ bcf_update_info_int32(hdr, line, "AN", &an, 1);
+ bcf_update_info_int32(hdr, line, "AC", tmp+1, line->n_allele-1);
+ }
+ free(tmp);
+}
+
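+// Merge FORMAT/GT across readers: remap per-file allele indexes to the merged numbering,
+// fill samples from readers without GT at this site with missing alleles (maximum ploidy
+// assumed), and record each sample's ploidy in smpl_ploidy for later Number=G handling.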
+void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
+{
+ bcf_srs_t *files = args->files;
+ bcf_hdr_t *out_hdr = args->out_hdr;
+ maux_t *ma = args->maux;
+ int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr);
+
+ int nsize = 0, msize = sizeof(int32_t);
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !fmt_map[i] ) continue;
+ if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n;
+ }
+
+ if ( ma->ntmp_arr < nsamples*nsize*msize )
+ {
+ ma->ntmp_arr = nsamples*nsize*msize;
+ ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr);
+ }
+ memset(ma->smpl_ploidy,0,nsamples*sizeof(int));
+
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf_sr_t *reader = &files->readers[i];
+ bcf_hdr_t *hdr = reader->header;
+ bcf_fmt_t *fmt_ori = fmt_map[i];
+ int32_t *tmp = (int32_t *) ma->tmp_arr + ismpl*nsize;
+
+ int j, k;
+ if ( !fmt_ori )
+ {
+ // missing values: assume maximum ploidy
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++)
+ {
+ for (k=0; k<nsize; k++) { tmp[k] = 0; ma->smpl_ploidy[ismpl+j]++; }
+ tmp += nsize;
+ }
+ ismpl += bcf_hdr_nsamples(hdr);
+ continue;
+ }
+
+ #define BRANCH(type_t, vector_end) { \
+ type_t *p_ori = (type_t*) fmt_ori->p; \
+ if ( !ma->d[i][0].als_differ ) \
+ { \
+ /* the allele numbering is unchanged */ \
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
+ { \
+ for (k=0; k<fmt_ori->n; k++) \
+ { \
+ if ( p_ori[k]==vector_end ) break; /* smaller ploidy */ \
+ ma->smpl_ploidy[ismpl+j]++; \
+ if ( bcf_gt_is_missing(p_ori[k]) ) tmp[k] = 0; /* missing allele */ \
+ else tmp[k] = p_ori[k]; \
+ } \
+ for (; k<nsize; k++) tmp[k] = bcf_int32_vector_end; \
+ tmp += nsize; \
+ p_ori += fmt_ori->n; \
+ } \
+ ismpl += bcf_hdr_nsamples(hdr); \
+ continue; \
+ } \
+ /* allele numbering needs to be changed */ \
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
+ { \
+ for (k=0; k<fmt_ori->n; k++) \
+ { \
+ if ( p_ori[k]==vector_end ) break; /* smaller ploidy */ \
+ ma->smpl_ploidy[ismpl+j]++; \
+ if ( bcf_gt_is_missing(p_ori[k]) ) tmp[k] = 0; /* missing allele */ \
+ else \
+ { \
+ int al = (p_ori[k]>>1) - 1; \
+ al = al<=0 ? al + 1 : ma->d[i][0].map[al] + 1; \
+ tmp[k] = (al << 1) | ((p_ori[k])&1); \
+ } \
+ } \
+ for (; k<nsize; k++) tmp[k] = bcf_int32_vector_end; \
+ tmp += nsize; \
+ p_ori += fmt_ori->n; \
+ } \
+ ismpl += bcf_hdr_nsamples(hdr); \
+ }
+ switch (fmt_ori->type)
+ {
+ case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+ default: error("Unexpected case: %d\n", fmt_ori->type);
+ }
+ #undef BRANCH
+ }
+ bcf_update_format_int32(out_hdr, out, "GT", (int32_t*)ma->tmp_arr, nsamples*nsize);
+}
+
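+// Merge one non-GT FORMAT field across readers, remapping Number=A,R,G values to the
+// merged allele numbering and filling samples from readers without the tag with missing values.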
+void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
+{
+ bcf_srs_t *files = args->files;
+ bcf_hdr_t *out_hdr = args->out_hdr;
+ maux_t *ma = args->maux;
+ int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr);
+
+ const char *key = NULL;
+ int nsize = 0, length = BCF_VL_FIXED, type = -1;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !ma->has_line[i] ) continue;
+ if ( !fmt_map[i] ) continue;
+ if ( !key ) key = files->readers[i].header->id[BCF_DT_ID][fmt_map[i]->id].key;
+ type = fmt_map[i]->type;
+ if ( IS_VL_G(files->readers[i].header, fmt_map[i]->id) )
+ {
+ length = BCF_VL_G;
+ nsize = out->n_allele*(out->n_allele + 1)/2;
+ break;
+ }
+ if ( IS_VL_A(files->readers[i].header, fmt_map[i]->id) )
+ {
+ length = BCF_VL_A;
+ nsize = out->n_allele - 1;
+ break;
+ }
+ if ( IS_VL_R(files->readers[i].header, fmt_map[i]->id) )
+ {
+ length = BCF_VL_R;
+ nsize = out->n_allele;
+ break;
+ }
+ if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n;
+ }
+
+ int msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);
+ if ( ma->ntmp_arr < nsamples*nsize*msize )
+ {
+ ma->ntmp_arr = nsamples*nsize*msize;
+ ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr);
+ }
+
+ // Fill the temp array for all samples by collecting values from all files
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf_sr_t *reader = &files->readers[i];
+ bcf_hdr_t *hdr = reader->header;
+ bcf_fmt_t *fmt_ori = fmt_map[i];
+ if ( fmt_ori )
+ {
+ type = fmt_ori->type;
+ int nals_ori = reader->buffer[0]->n_allele;
+ if ( length==BCF_VL_G )
+ {
+ // if all fields are missing then n==1 is valid
+ if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori*(nals_ori+1)/2 && fmt_map[i]->n != nals_ori )
+ error("Incorrect number of %s fields (%d) at %s:%d, cannot merge.\n", key,fmt_ori->n,bcf_seqname(args->out_hdr,out),out->pos+1);
+ }
+ else if ( length==BCF_VL_A )
+ {
+ if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori-1 )
+ error("Incorrect number of %s fields (%d) at %s:%d, cannot merge.\n", key,fmt_ori->n,bcf_seqname(args->out_hdr,out),out->pos+1);
+ }
+ else if ( length==BCF_VL_R )
+ {
+ if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori )
+ error("Incorrect number of %s fields (%d) at %s:%d, cannot merge.\n", key,fmt_ori->n,bcf_seqname(args->out_hdr,out),out->pos+1);
+ }
+ }
+
+ // set the values
+ #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+ int j, l, k; \
+ tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+ if ( !fmt_ori ) \
+ { \
+ /* the field is not present in this file, set missing values */ \
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
+ { \
+ tgt_set_missing; tgt++; for (l=1; l<nsize; l++) { tgt_set_vector_end; tgt++; } \
+ } \
+ ismpl += bcf_hdr_nsamples(hdr); \
+ continue; \
+ } \
+ assert( ma->has_line[i] ); \
+ bcf1_t *line = reader->buffer[0]; \
+ src_type_t *src = (src_type_t*) fmt_ori->p; \
+ if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->d[i][0].als_differ) ) \
+ { \
+ /* alleles unchanged, copy over */ \
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
+ { \
+ for (l=0; l<fmt_ori->n; l++) \
+ { \
+ if ( src_is_vector_end ) break; \
+ else if ( src_is_missing ) tgt_set_missing; \
+ else *tgt = *src; \
+ tgt++; src++; \
+ } \
+ for (k=l; k<nsize; k++) { tgt_set_vector_end; tgt++; } \
+ src += fmt_ori->n - l; \
+ } \
+ ismpl += bcf_hdr_nsamples(hdr); \
+ continue; \
+ } \
+ /* allele numbering needs to be changed */ \
+ if ( length==BCF_VL_G ) \
+ { \
+ /* Number=G tags */ \
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
+ { \
+ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize; \
+ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+ if ( (src_is_missing && fmt_ori->n==1) || (++src && src_is_vector_end) ) \
+ { \
+ /* tag with missing value "." */ \
+ tgt_set_missing; \
+ for (l=1; l<nsize; l++) { tgt++; tgt_set_vector_end; } \
+ continue; \
+ } \
+ int ngsize = ma->smpl_ploidy[ismpl+j]==1 ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
+ for (l=0; l<ngsize; l++) { tgt_set_missing; tgt++; } \
+ for (; l<nsize; l++) { tgt_set_vector_end; tgt++; } \
+ if ( ma->smpl_ploidy[ismpl+j]==1 ) \
+ { \
+ /* Haploid */ \
+ int iori, inew; \
+ for (iori=0; iori<line->n_allele; iori++) \
+ { \
+ inew = ma->d[i][0].map[iori]; \
+ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + iori; \
+ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
+ if ( src_is_vector_end ) break; \
+ if ( src_is_missing ) tgt_set_missing; \
+ else *tgt = *src; \
+ } \
+ } \
+ else \
+ { \
+ /* Diploid */ \
+ int iori,jori, inew,jnew; \
+ for (iori=0; iori<line->n_allele; iori++) \
+ { \
+ inew = ma->d[i][0].map[iori]; \
+ for (jori=0; jori<=iori; jori++) \
+ { \
+ jnew = ma->d[i][0].map[jori]; \
+ int kori = iori*(iori+1)/2 + jori; \
+ int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \
+ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + kori; \
+ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + knew; \
+ if ( src_is_vector_end ) \
+ { \
+ iori = line->n_allele; \
+ break; \
+ } \
+ if ( src_is_missing ) tgt_set_missing; \
+ else *tgt = *src; \
+ } \
+ } \
+ } \
+ } \
+ } \
+ else \
+ { \
+ /* Number=A or Number=R tags */ \
+ int ifrom = length==BCF_VL_A ? 1 : 0; \
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
+ { \
+ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize; \
+ src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \
+ if ( (src_is_missing && fmt_ori->n==1) || (++src && src_is_vector_end) ) \
+ { \
+ /* tag with missing value "." */ \
+ tgt_set_missing; \
+ for (l=1; l<nsize; l++) { tgt++; tgt_set_vector_end; } \
+ continue; \
+ } \
+ src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \
+ for (l=0; l<nsize; l++) { tgt_set_missing; tgt++; } \
+ int iori,inew; \
+ for (iori=ifrom; iori<line->n_allele; iori++) \
+ { \
+ inew = ma->d[i][0].map[iori] - ifrom; \
+ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
+ if ( src_is_vector_end ) break; \
+ if ( src_is_missing ) tgt_set_missing; \
+ else *tgt = *src; \
+ src++; \
+ } \
+ } \
+ } \
+ ismpl += bcf_hdr_nsamples(hdr); \
+ }
+ switch (type)
+ {
+ case BCF_BT_INT8: BRANCH(int32_t, int8_t, *src==bcf_int8_missing, *src==bcf_int8_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int32_t, int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
+ case BCF_BT_CHAR: BRANCH(uint8_t, uint8_t, *src==bcf_str_missing, *src==bcf_str_vector_end, *tgt=bcf_str_missing, *tgt=bcf_str_vector_end); break;
+ default: error("Unexpected case: %d, %s\n", type, key);
+ }
+ #undef BRANCH
+ }
+ if ( type==BCF_BT_FLOAT )
+ bcf_update_format_float(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nsize);
+ else if ( type==BCF_BT_CHAR )
+ bcf_update_format_char(out_hdr, out, key, (char*)ma->tmp_arr, nsamples*nsize);
+ else
+ bcf_update_format_int32(out_hdr, out, key, (int32_t*)ma->tmp_arr, nsamples*nsize);
+}
+
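+// Merge all FORMAT fields: collect each tag into a row of fmt_map (row 0 is reserved for GT),
+// merge GT first, update INFO/AN and INFO/AC, then merge the remaining tags one by one.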
+void merge_format(args_t *args, bcf1_t *out)
+{
+ bcf_srs_t *files = args->files;
+ bcf_hdr_t *out_hdr = args->out_hdr;
+ maux_t *ma = args->maux;
+ if ( !ma->nfmt_map )
+ {
+ ma->nfmt_map = 2;
+ ma->fmt_map = (bcf_fmt_t**) calloc(ma->nfmt_map*files->nreaders, sizeof(bcf_fmt_t*));
+ }
+ else
+ memset(ma->fmt_map, 0, ma->nfmt_map*files->nreaders*sizeof(bcf_fmt_t**));
+
+ khiter_t kitr;
+ strdict_t *tmph = args->tmph;
+ kh_clear(strdict, tmph);
+ int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !ma->has_line[i] ) continue;
+ bcf_sr_t *reader = &files->readers[i];
+ bcf1_t *line = reader->buffer[0];
+ bcf_hdr_t *hdr = reader->header;
+ for (j=0; j<line->n_fmt; j++)
+ {
+ // Was this tag already seen?
+ bcf_fmt_t *fmt = &line->d.fmt[j];
+ const char *key = hdr->id[BCF_DT_ID][fmt->id].key;
+ kitr = kh_get(strdict, tmph, key);
+
+ int ifmt;
+ if ( kitr != kh_end(tmph) )
+ ifmt = kh_value(tmph, kitr); // seen
+ else
+ {
+ // new FORMAT tag
+ if ( key[0]=='G' && key[1]=='T' && key[2]==0 ) { has_GT = 1; ifmt = 0; }
+ else
+ {
+ ifmt = ++max_ifmt;
+ if ( max_ifmt >= ma->nfmt_map )
+ {
+ ma->fmt_map = (bcf_fmt_t**) realloc(ma->fmt_map, sizeof(bcf_fmt_t*)*(max_ifmt+1)*files->nreaders);
+ memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*));
+ ma->nfmt_map = max_ifmt+1;
+ }
+ }
+ kitr = kh_put(strdict, tmph, key, &ret);
+ kh_value(tmph, kitr) = ifmt;
+ }
+ ma->fmt_map[ifmt*files->nreaders+i] = fmt;
+ }
+ // Check if the allele numbering must be changed
+ for (j=1; j<reader->buffer[0]->n_allele; j++)
+ if ( ma->d[i][0].map[j]!=j ) break;
+ ma->d[i][0].als_differ = j==reader->buffer[0]->n_allele ? 0 : 1;
+ }
+
+ out->n_sample = bcf_hdr_nsamples(out_hdr);
+ if ( has_GT )
+ merge_GT(args, ma->fmt_map, out);
+ update_AN_AC(out_hdr, out);
+
+ if ( out->d.info!=ma->inf )
+ {
+ // hacky, we rely on htslib internals: bcf_update_info() reallocated the info
+ ma->inf = out->d.info;
+ ma->minf = out->d.m_info;
+ }
+
+ for (i=1; i<=max_ifmt; i++)
+ merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
+ out->d.indiv_dirty = 1;
+}
+
+// The core merging function; from each reader either one line or none is merged
+void merge_line(args_t *args)
+{
+ bcf1_t *out = args->out_line;
+ bcf_clear1(out);
+ out->unpacked = BCF_UN_ALL;
+
+ merge_chrom2qual(args, out);
+ merge_filter(args, out);
+ merge_info(args, out);
+ merge_format(args, out);
+
+ bcf_write1(args->out_fh, args->out_hdr, out);
+}
+
+
+void debug_buffers(FILE *fp, bcf_srs_t *files);
+void debug_buffer(FILE *fp, bcf_sr_t *reader);
+
+#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
+
+// Clean the reader's buffer and make it ready for the next next_line() call.
+// Moves finished records (SKIP_DONE flag set) to the end of the buffer and puts
+// the rest at the beginning. Then shortens the buffer so that the last element
+// points to the last unfinished record. There are two special cases: the last
+// line of the buffer typically has a different position and must stay at the
+// end; next, the first record of the buffer must be one of those already
+// printed, as it will be discarded by next_line().
+//
+void shake_buffer(maux_t *maux, int ir, int pos)
+{
+ bcf_sr_t *reader = &maux->files->readers[ir];
+ maux1_t *m = maux->d[ir];
+
+ if ( !reader->buffer ) return;
+
+ int i;
+ // FILE *fp = stdout;
+ // fprintf(fp,"<going to shake> nbuf=%d\t", reader->nbuffer); for (i=0; i<reader->nbuffer; i++) fprintf(fp," %d", skip[i]); fprintf(fp,"\n");
+ // debug_buffer(fp,reader);
+ // fprintf(fp,"--\n");
+
+ int a = 1, b = reader->nbuffer;
+ if ( reader->buffer[b]->pos != pos ) b--; // move the last line separately afterwards
+
+ while ( a<b )
+ {
+ if ( !(m[a].skip&SKIP_DONE) ) { a++; continue; }
+ if ( m[b].skip&SKIP_DONE ) { b--; continue; }
+ SWAP(bcf1_t*, reader->buffer[a], reader->buffer[b]);
+ SWAP(maux1_t, m[a], m[b]);
+ a++;
+ b--;
+ }
+
+ // advance $a past the block of unfinished records at the beginning
+ while ( a<=reader->nbuffer && !(m[a].skip&SKIP_DONE) ) a++;
+
+ if ( a<reader->nbuffer )
+ {
+ // there is a gap between the unfinished lines at the beginning and the
+ // last line. The last line must be brought forward to fill the gap
+ if ( reader->buffer[reader->nbuffer]->pos != pos )
+ {
+ SWAP(bcf1_t*, reader->buffer[a], reader->buffer[reader->nbuffer]);
+ SWAP(maux1_t, m[a], m[reader->nbuffer]);
+ reader->nbuffer = a;
+ }
+ }
+
+ if ( !(m[0].skip&SKIP_DONE) && reader->buffer[0]->pos==pos )
+ {
+ // the first record is unfinished, replace it with an empty line
+ // from the end of the buffer or else next_line will remove it
+ if ( reader->nbuffer + 1 >= maux->nbuf[ir] )
+ {
+ reader->nbuffer++;
+ maux_expand1(maux, ir);
+ reader->nbuffer--;
+ m = maux->d[ir];
+ }
+ if ( reader->nbuffer+1 >= reader->mbuffer )
+ error("Uh, did not expect this: %d vs %d\n", reader->nbuffer,reader->mbuffer);
+
+ if ( reader->buffer[reader->nbuffer]->pos!=pos )
+ {
+ // 4way swap
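+ // the spare empty record at nbuffer+1 becomes the new buffer[0], the unfinished
+ // buffer[0] moves to position nbuffer, and the old last line (different position)
+ // is pushed one slot further so that it stays at the end after nbuffer++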
+ bcf1_t *tmp = reader->buffer[0];
+ reader->buffer[0] = reader->buffer[reader->nbuffer+1];
+ reader->buffer[reader->nbuffer+1] = reader->buffer[reader->nbuffer];
+ reader->buffer[reader->nbuffer] = tmp;
+ m[reader->nbuffer].skip = m[0].skip;
+ m[reader->nbuffer+1].skip = SKIP_DIFF;
+ reader->nbuffer++;
+ }
+ else
+ {
+ SWAP(bcf1_t*, reader->buffer[0], reader->buffer[reader->nbuffer+1]);
+ SWAP(maux1_t, m[0], m[reader->nbuffer+1]);
+ }
+ }
+
+ // debug_buffer(fp,reader);
+ // fprintf(fp,"<shaken>\t"); for (i=0; i<reader->nbuffer; i++) fprintf(fp," %d", skip[i]);
+ // fprintf(fp,"\n\n");
+
+ // set position of finished buffer[0] line to -1, otherwise swapping may
+ // bring it back after next_line()
+ reader->buffer[0]->pos = -1;
+
+ // trim the buffer, remove finished lines from the end
+ i = reader->nbuffer;
+ while ( i>=1 && m[i--].skip&SKIP_DONE )
+ reader->nbuffer--;
+}
+
+void debug_maux(args_t *args, int pos, int var_type)
+{
+ bcf_srs_t *files = args->files;
+ maux_t *maux = args->maux;
+ int j,k,l;
+
+ fprintf(stderr,"Alleles to merge at %d\n", pos+1);
+ for (j=0; j<files->nreaders; j++)
+ {
+ bcf_sr_t *reader = &files->readers[j];
+ fprintf(stderr," reader %d: ", j);
+ for (k=0; k<=reader->nbuffer; k++)
+ {
+ if ( maux->d[j][k].skip==SKIP_DONE ) continue;
+ bcf1_t *line = reader->buffer[k];
+ if ( line->pos!=pos ) continue;
+ fprintf(stderr,"\t");
+ if ( maux->d[j][k].skip ) fprintf(stderr,"["); // this record will not be merged in this round
+ for (l=0; l<line->n_allele; l++)
+ fprintf(stderr,"%s%s", l==0?"":",", line->d.allele[l]);
+ if ( maux->d[j][k].skip ) fprintf(stderr,"]");
+ }
+ fprintf(stderr,"\n");
+ }
+ fprintf(stderr," counts: ");
+ for (j=0; j<maux->nals; j++) fprintf(stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); fprintf(stderr,"\n");
+ for (j=0; j<files->nreaders; j++)
+ {
+ bcf_sr_t *reader = &files->readers[j];
+ fprintf(stderr," out %d: ", j);
+ for (k=0; k<=reader->nbuffer; k++)
+ {
+ if ( maux->d[j][k].skip==SKIP_DONE ) continue;
+ bcf1_t *line = reader->buffer[k];
+ if ( line->pos!=pos ) continue;
+ if ( maux->d[j][k].skip ) continue;
+ fprintf(stderr,"\t");
+ for (l=0; l<line->n_allele; l++)
+ fprintf(stderr,"%s%s", l==0?"":",", maux->als[maux->d[j][k].map[l]]);
+ }
+ fprintf(stderr,"\n");
+ }
+ fprintf(stderr,"\n");
+}
+
+// Determine which line should be merged from which reader: go through all
+// readers and all buffered lines, expand REF,ALT and try to match lines with
+// the same ALTs. A step towards output that is independent of the input
+// ordering of the lines.
+void merge_buffer(args_t *args)
+{
+ bcf_srs_t *files = args->files;
+ int i, pos = -1, var_type = 0;
+ char *id = NULL;
+ maux_t *maux = args->maux;
+ maux_reset(maux);
+
+ // set the current position
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( bcf_sr_has_line(files,i) )
+ {
+ bcf1_t *line = bcf_sr_get_line(files,i);
+ pos = line->pos;
+ var_type = bcf_get_variant_types(line);
+ id = line->d.id;
+ break;
+ }
+ }
+
+ // In this loop we select compatible candidate lines (i.e. SNPs or indels)
+ // from each reader. Go through all files and all lines at this
+ // position and normalize the relevant alleles.
+ // REF-only sites may be associated with both SNPs and indels.
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf_sr_t *reader = &files->readers[i];
+ if ( !reader->buffer ) continue;
+ int j, k;
+ for (j=0; j<=reader->nbuffer; j++)
+ {
+ bcf1_t *line = reader->buffer[j];
+ int line_type = bcf_get_variant_types(line);
+ // select relevant lines
+ maux->d[i][j].skip = SKIP_DIFF;
+ if ( pos!=line->pos )
+ {
+ if ( j==0 ) maux->d[i][j].skip |= SKIP_DONE; // left from previous run, force to ignore
+ continue;
+ }
+ if ( args->merge_by_id )
+ {
+ if ( strcmp(id,line->d.id) ) continue;
+ }
+ else
+ {
+ if ( args->collapse==COLLAPSE_NONE && maux->nals )
+ {
+ // All alleles of the tested record must be present in the
+ // selected maux record plus variant types must be the same
+ if ( var_type!=line->d.var_type ) continue;
+ if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible
+ for (k=1; k<line->n_allele; k++)
+ {
+ if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break;
+ }
+ if ( k==line->n_allele ) continue; // no matching allele
+ }
+ if ( !(args->collapse&COLLAPSE_ANY) )
+ {
+ int compatible = 0;
+ if ( line_type==var_type ) compatible = 1;
+ else if ( line_type==VCF_REF ) compatible = 1; // REF can go with anything
+ else if ( var_type&VCF_SNP && line_type&VCF_SNP ) compatible = 1;
+ else if ( var_type&VCF_INDEL && line_type&VCF_INDEL ) compatible = 1;
+ else if ( var_type&VCF_MNP && line_type&VCF_MNP ) compatible = 1;
+ else if ( var_type&VCF_SNP && line_type&VCF_MNP ) compatible = 1;
+ else if ( var_type&VCF_MNP && line_type&VCF_SNP ) compatible = 1;
+ if ( !compatible ) continue;
+ }
+ }
+ maux->d[i][j].skip = 0;
+
+ hts_expand(int, line->n_allele, maux->d[i][j].mmap, maux->d[i][j].map);
+ if ( !maux->nals ) // first record, copy the alleles to the output
+ {
+ maux->nals = line->n_allele;
+ hts_expand0(char*, maux->nals, maux->mals, maux->als);
+ hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+ for (k=0; k<maux->nals; k++)
+ {
+ maux->als[k] = strdup(line->d.allele[k]);
+ maux->d[i][j].map[k] = k;
+ maux->cnt[k] = 1;
+ }
+ pos = line->pos;
+ continue;
+ }
+
+ // normalize alleles
+ maux->als = merge_alleles(line->d.allele, line->n_allele, maux->d[i][j].map, maux->als, &maux->nals, &maux->mals);
+ if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(bcf_sr_get_header(args->files,j),line),line->pos+1,reader->fname);
+ hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+ for (k=1; k<line->n_allele; k++)
+ maux->cnt[ maux->d[i][j].map[k] ]++; // how many times an allele appears in the files
+ maux->cnt[0]++;
+ }
+ }
+
+ // debug_maux(args, pos, var_type);
+
+ // Select records that have the same alleles; the input ordering of indels
+ // must not matter. Multiple VCF lines can be emitted from this loop.
+ // We expect only very few alleles and not many records with the same
+ // position in the buffers, therefore the nested loops should not slow us
+ // much.
+ while (1)
+ {
+ // take the most frequent allele present in multiple files
+ int icnt = 0;
+ for (i=1; i<maux->nals; i++)
+ if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
+ if ( maux->cnt[icnt]<0 ) break;
+
+ int nmask = 0;
+ for (i=0; i<files->nreaders; i++)
+ {
+ maux->has_line[i] = 0;
+
+ bcf_sr_t *reader = &files->readers[i];
+ if ( !reader->buffer ) continue;
+
+ // find lines with the same allele
+ int j;
+ for (j=0; j<=reader->nbuffer; j++)
+ {
+ if ( maux->d[i][j].skip ) continue;
+ int k;
+ for (k=0; k<reader->buffer[j]->n_allele; k++)
+ if ( icnt==maux->d[i][j].map[k] ) break;
+ if ( k<reader->buffer[j]->n_allele ) break;
+ }
+ if ( j>reader->nbuffer )
+ {
+ // no matching allele found in this file
+ if ( args->collapse==COLLAPSE_NONE ) continue;
+
+ for (j=0; j<=reader->nbuffer; j++)
+ {
+ if ( maux->d[i][j].skip ) continue;
+ if ( args->collapse&COLLAPSE_ANY ) break;
+ int line_type = bcf_get_variant_types(reader->buffer[j]);
+ if ( var_type&VCF_SNP && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( var_type&VCF_INDEL && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( line_type==VCF_REF )
+ {
+ if ( var_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( var_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
+ }
+ else if ( var_type==VCF_REF )
+ {
+ if ( line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
+ }
+ }
+ }
+ if ( j<=reader->nbuffer )
+ {
+ // found a suitable line for merging, place it at the beginning
+ if ( j>0 )
+ {
+ SWAP(bcf1_t*, reader->buffer[0], reader->buffer[j]);
+ SWAP(maux1_t, maux->d[i][0], maux->d[i][j]);
+ }
+ // mark as finished so that it's ignored next time
+ maux->d[i][0].skip |= SKIP_DONE;
+ maux->has_line[i] = 1;
+ nmask++;
+ }
+ }
+ if ( !nmask ) break; // done, no more lines suitable for merging found
+ merge_line(args); // merge and output the line
+ maux->cnt[icnt] = -1; // do not pick this allele again, mark it as finished
+ }
+
+ // clean the alleles
+ for (i=0; i<maux->nals; i++)
+ {
+ free(maux->als[i]);
+ maux->als[i] = 0;
+ }
+ maux->nals = 0;
+
+ // get the buffers ready for the next next_line() call
+ for (i=0; i<files->nreaders; i++)
+ shake_buffer(maux, i, pos);
+}
+
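+// Append ##<cmd>Version and ##<cmd>Command header lines recording the bcftools and
+// htslib versions and the command line used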
+void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd)
+{
+ kstring_t str = {0,0,0};
+ ksprintf(&str,"##%sVersion=%s+htslib-%s\n", cmd, bcftools_version(), hts_version());
+ bcf_hdr_append(hdr,str.s);
+
+ str.l = 0;
+ ksprintf(&str,"##%sCommand=%s", cmd, argv[0]);
+ int i;
+ for (i=1; i<argc; i++)
+ {
+ if ( strchr(argv[i],' ') )
+ ksprintf(&str, " '%s'", argv[i]);
+ else
+ ksprintf(&str, " %s", argv[i]);
+ }
+ kputc('\n', &str);
+ bcf_hdr_append(hdr,str.s);
+ free(str.s);
+
+ bcf_hdr_sync(hdr);
+}
+
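+// Top-level driver: build (or read with --use-header) the merged header, write it out,
+// then repeatedly merge the buffered records until all readers are exhausted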
+void merge_vcf(args_t *args)
+{
+ args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ args->out_hdr = bcf_hdr_init("w");
+
+ if ( args->header_fname )
+ {
+ if ( bcf_hdr_set(args->out_hdr,args->header_fname) ) error("Could not read/parse the header: %s\n", args->header_fname);
+ }
+ else
+ {
+ int i;
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ char buf[10]; snprintf(buf,10,"%d",i+1);
+ merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples);
+ }
+ bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge");
+ bcf_hdr_sync(args->out_hdr);
+ }
+ info_rules_init(args);
+
+ bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header));
+ bcf_hdr_write(args->out_fh, args->out_hdr);
+ if ( args->header_only )
+ {
+ bcf_hdr_destroy(args->out_hdr);
+ hts_close(args->out_fh);
+ return;
+ }
+
+ if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init();
+ args->maux = maux_init(args->files);
+ args->out_line = bcf_init1();
+ args->tmph = kh_init(strdict);
+ int ret;
+ while ( (ret=bcf_sr_next_line(args->files)) )
+ {
+ merge_buffer(args);
+ }
+ info_rules_destroy(args);
+ maux_destroy(args->maux);
+ bcf_hdr_destroy(args->out_hdr);
+ hts_close(args->out_fh);
+ bcf_destroy1(args->out_line);
+ kh_destroy(strdict, args->tmph);
+ if ( args->tmps.m ) free(args->tmps.s);
+ if ( args->vcmp ) vcmp_destroy(args->vcmp);
+}
+
+static void usage(void)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Merge multiple VCF/BCF files from non-overlapping sample sets to create one multi-sample file.\n");
+ fprintf(stderr, " Note that only records from different files can be merged, never from the same file. For\n");
+ fprintf(stderr, " \"vertical\" merge take a look at \"bcftools norm\" instead.\n");
+ fprintf(stderr, "Usage: bcftools merge [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " --force-samples resolve duplicate sample names\n");
+ fprintf(stderr, " --print-header print only the merged header and exit\n");
+ fprintf(stderr, " --use-header <file> use the provided header\n");
+ fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(stderr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
+ fprintf(stderr, " -l, --file-list <file> read file names from the file\n");
+ fprintf(stderr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
+ fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type <b|u|z|v> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+int main_vcfmerge(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->files = bcf_sr_init();
+ args->argc = argc; args->argv = argv;
+ args->output_fname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ args->collapse = COLLAPSE_BOTH;
+ int regions_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"help",no_argument,NULL,'h'},
+ {"merge",required_argument,NULL,'m'},
+ {"file-list",required_argument,NULL,'l'},
+ {"apply-filters",required_argument,NULL,'f'},
+ {"use-header",required_argument,NULL,1},
+ {"print-header",no_argument,NULL,2},
+ {"force-samples",no_argument,NULL,3},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"info-rules",required_argument,NULL,'i'},
+ {NULL,0,NULL,0}
+ };
+ while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'l': args->file_list = optarg; break;
+ case 'i': args->info_rules = optarg; break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'm':
+ args->collapse = COLLAPSE_NONE;
+ if ( !strcmp(optarg,"snps") ) args->collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(optarg,"indels") ) args->collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"both") ) args->collapse |= COLLAPSE_BOTH;
+ else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE;
+ else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
+ else error("The -m type \"%s\" is not recognised.\n", optarg);
+ break;
+ case 'f': args->files->apply_filters = optarg; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 1 : args->header_fname = optarg; break;
+ case 2 : args->header_only = 1; break;
+ case 3 : args->force_samples = 1; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ if ( argc==optind && !args->file_list ) usage();
+ if ( argc-optind<2 && !args->file_list ) usage();
+
+ args->files->require_index = 1;
+ if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+
+ while (optind<argc)
+ {
+ if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
+ optind++;
+ }
+ if ( args->file_list )
+ {
+ int nfiles, i;
+ char **files = hts_readlines(args->file_list, &nfiles);
+ if ( !files ) error("Failed to read from %s\n", args->file_list);
+ for (i=0;i<nfiles; i++)
+ if ( !bcf_sr_add_reader(args->files, files[i]) ) error("Failed to open %s: %s\n", files[i],bcf_sr_strerror(args->files->errnum));
+ for (i=0; i<nfiles; i++) free(files[i]);
+ free(files);
+ }
+ merge_vcf(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
+
diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c
new file mode 100644
index 0000000..94b5252
--- /dev/null
+++ b/bcftools/vcfmerge.c.pysam.c
@@ -0,0 +1,2069 @@
+#include "pysam.h"
+
+/* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
+
+ Copyright (C) 2012-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include <math.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "vcmp.h"
+
+#include <htslib/khash.h>
+KHASH_MAP_INIT_STR(strdict, int)
+typedef khash_t(strdict) strdict_t;
+
+#define SKIP_DONE 1
+#define SKIP_DIFF 2
+
+#define IS_VL_G(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_G)
+#define IS_VL_A(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_A)
+#define IS_VL_R(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_R)
+
+// For merging INFO Number=A,G,R tags
+typedef struct
+{
+ const char *hdr_tag;
+ int type, nvals;
+ int nbuf, mbuf;
+ uint8_t *buf;
+}
+AGR_info_t;
+
+// Rules for merging arbitrary INFO tags
+typedef struct _info_rule_t
+{
+ char *hdr_tag;
+ void (*merger)(bcf_hdr_t *hdr, bcf1_t *line, struct _info_rule_t *rule);
+ int type; // one of BCF_HT_*
+ int block_size; // number of values in a block
+ int nblocks; // number of blocks in nvals (the number of merged files)
+ int nvals, mvals; // used and total size of vals array
+ void *vals; // the info tag values
+}
+info_rule_t;
+
+// Auxiliary merge data for selecting the right combination
+// of buffered records across multiple readers. maux1_t
+// corresponds to one buffered line.
+typedef struct
+{
+ int skip;
+ int *map; // mapping from input alleles to the output array
+ int mmap; // size of map array (only buffer[i].n_allele is actually used)
+ int als_differ;
+}
+maux1_t;
+typedef struct
+{
+ int n; // number of readers
+ char **als, **out_als; // merged alleles (temp, may contain empty records) and merged alleles ready for output
+ int nals, mals, nout_als, mout_als; // size of the output array
+ int *cnt, ncnt; // number of records that refer to the alleles
+ int *nbuf; // readers have buffers of varying lengths
+ int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases)
+ int *flt, mflt, minf;
+ bcf_info_t *inf;// out_line's INFO fields
+ bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to i*nreader+j, first row is reserved for GT
+ int nfmt_map; // number of rows in the fmt_map array
+ int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes
+ void *tmp_arr;
+ int ntmp_arr;
+ maux1_t **d; // d[i][j] i-th reader, j-th buffer line
+ AGR_info_t *AGR_info;
+ int nAGR_info, mAGR_info;
+ bcf_srs_t *files;
+ int *has_line; // which files are being merged
+}
+maux_t;
+
+typedef struct
+{
+ vcmp_t *vcmp;
+ maux_t *maux;
+ int header_only, collapse, output_type, force_samples, merge_by_id;
+ char *header_fname, *output_fname, *regions_list, *info_rules, *file_list;
+ info_rule_t *rules;
+ int nrules;
+ strdict_t *tmph;
+ kstring_t tmps;
+ bcf_srs_t *files;
+ bcf1_t *out_line;
+ htsFile *out_fh;
+ bcf_hdr_t *out_hdr;
+ char **argv;
+ int argc, n_threads;
+}
+args_t;
+
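+// INFO -i rule mergers: rule->vals holds rule->nblocks blocks of rule->block_size
+// values, one block per merged file. The numeric mergers (sum,avg,min,max) reduce
+// the blocks into the first one; "join" keeps all collected values.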
+static void info_rules_merge_sum(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
+{
+ if ( !rule->nvals ) return;
+ int i, j, ndim = rule->block_size;
+ #define BRANCH(type_t,is_missing) { \
+ type_t *ptr = (type_t*) rule->vals; \
+ for (i=0; i<rule->nvals; i++) if ( is_missing ) ptr[i] = 0; \
+ for (i=1; i<rule->nblocks; i++) \
+ { \
+ for (j=0; j<ndim; j++) ptr[j] += ptr[j+i*ndim]; \
+ } \
+ }
+ switch (rule->type) {
+ case BCF_HT_INT: BRANCH(int32_t, ptr[i]==bcf_int32_missing); break;
+ case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[i])); break;
+ default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+ }
+ #undef BRANCH
+
+ bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,ndim,rule->type);
+}
+static void info_rules_merge_avg(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
+{
+ if ( !rule->nvals ) return;
+ int i, j, ndim = rule->block_size;
+ #define BRANCH(type_t,is_missing) { \
+ type_t *ptr = (type_t*) rule->vals; \
+ for (i=0; i<rule->nvals; i++) if ( is_missing ) ptr[i] = 0; \
+ for (j=0; j<ndim; j++) \
+ { \
+ double sum = 0; \
+ for (i=0; i<rule->nblocks; i++) sum += ptr[j+i*ndim]; \
+ ptr[j] = sum / rule->nblocks; \
+ } \
+ }
+ switch (rule->type) {
+ case BCF_HT_INT: BRANCH(int32_t, ptr[i]==bcf_int32_missing); break;
+ case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[i])); break;
+ default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+ }
+ #undef BRANCH
+
+ bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,ndim,rule->type);
+}
+static void info_rules_merge_min(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
+{
+ if ( !rule->nvals ) return;
+ int i, j, ndim = rule->block_size;
+ #define BRANCH(type_t,is_missing,set_missing,huge_val) { \
+ type_t *ptr = (type_t*) rule->vals; \
+ for (i=0; i<rule->nvals; i++) if ( is_missing ) ptr[i] = huge_val; \
+ for (i=1; i<rule->nblocks; i++) \
+ { \
+ for (j=0; j<ndim; j++) if ( ptr[j] > ptr[j+i*ndim] ) ptr[j] = ptr[j+i*ndim]; \
+ } \
+ for (i=0; i<rule->nvals; i++) if ( ptr[i]==huge_val ) set_missing; \
+ }
+ switch (rule->type) {
+ case BCF_HT_INT: BRANCH(int32_t, ptr[i]==bcf_int32_missing, ptr[i]=bcf_int32_missing, INT32_MAX); break;
+ case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[i]), bcf_float_set_missing(ptr[i]), HUGE_VAL); break;
+ default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+ }
+ #undef BRANCH
+
+ bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,ndim,rule->type);
+}
+static void info_rules_merge_max(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
+{
+ if ( !rule->nvals ) return;
+ int i, j, ndim = rule->block_size;
+ #define BRANCH(type_t,is_missing,set_missing,huge_val) { \
+ type_t *ptr = (type_t*) rule->vals; \
+ for (i=0; i<rule->nvals; i++) if ( is_missing ) ptr[i] = huge_val; \
+ for (i=1; i<rule->nblocks; i++) \
+ { \
+ for (j=0; j<ndim; j++) if ( ptr[j] < ptr[j+i*ndim] ) ptr[j] = ptr[j+i*ndim]; \
+ } \
+ for (i=0; i<rule->nvals; i++) if ( ptr[i]==huge_val ) set_missing; \
+ }
+ switch (rule->type) {
+ case BCF_HT_INT: BRANCH(int32_t, ptr[i]==bcf_int32_missing, ptr[i]=bcf_int32_missing, INT32_MIN); break;
+ case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[i]), bcf_float_set_missing(ptr[i]), -HUGE_VAL); break;
+ default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+ }
+ #undef BRANCH
+
+ bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,ndim,rule->type);
+}
+static void info_rules_merge_join(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
+{
+ if ( !rule->nvals ) return;
+ if ( rule->type==BCF_HT_STR )
+ {
+ ((char*)rule->vals)[rule->nvals] = 0;
+ bcf_update_info_string(hdr,line,rule->hdr_tag,rule->vals);
+ }
+ else
+ bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,rule->nvals,rule->type);
+}
+
+static int info_rules_comp_key2(const void *a, const void *b)
+{
+ info_rule_t *rule1 = (info_rule_t*) a;
+ info_rule_t *rule2 = (info_rule_t*) b;
+ return strcmp(rule1->hdr_tag, rule2->hdr_tag);
+}
+static int info_rules_comp_key(const void *a, const void *b)
+{
+ char *key = (char*) a;
+ info_rule_t *rule = (info_rule_t*) b;
+ return strcmp(key, rule->hdr_tag);
+}
+static void info_rules_init(args_t *args)
+{
+ if ( args->info_rules && !strcmp("-",args->info_rules) ) return;
+
+ kstring_t str = {0,0,0};
+ if ( !args->info_rules )
+ {
+ if ( bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "DP")) ) kputs("DP:sum",&str);
+ if ( bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "DP4")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("DP4:sum",&str);
+ }
+ if ( !str.l ) return;
+ args->info_rules = str.s;
+ }
+
+ args->nrules = 1;
+ char *ss = strdup(args->info_rules), *tmp = ss;
+ int n = 0;
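+ // tokenize the "TAG:method[,TAG:method]" string in place by replacing ':' and ','
+ // with '\0'; n counts the separators so that the tag/method alternation can be checked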
+ while ( *ss )
+ {
+ if ( *ss==':' ) { *ss = 0; n++; if ( n%2==0 ) error("Could not parse INFO rules: \"%s\"\n", args->info_rules); }
+ else if ( *ss==',' ) { *ss = 0; args->nrules++; n++; if ( n%2==1 ) error("Could not parse INFO rules: \"%s\"\n", args->info_rules); }
+ ss++;
+ }
+ if ( n%2==0 ) error("Could not parse INFO rules: \"%s\"\n", args->info_rules);
+ args->rules = (info_rule_t*) calloc(args->nrules,sizeof(info_rule_t));
+
+ n = 0;
+ ss = tmp;
+ while ( n < args->nrules )
+ {
+ info_rule_t *rule = &args->rules[n];
+ rule->hdr_tag = strdup(ss);
+ int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag);
+ if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The tag is not defined in the header: \"%s\"\n", rule->hdr_tag);
+ rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id);
+ if ( rule->type!=BCF_HT_INT && rule->type!=BCF_HT_REAL && rule->type!=BCF_HT_STR ) error("The type is not supported: \"%s\"\n", rule->hdr_tag);
+
+ while ( *ss ) ss++; ss++;
+ if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag);
+
+ int is_join = 0;
+ if ( !strcasecmp(ss,"sum") ) rule->merger = info_rules_merge_sum;
+ else if ( !strcasecmp(ss,"avg") ) rule->merger = info_rules_merge_avg;
+ else if ( !strcasecmp(ss,"min") ) rule->merger = info_rules_merge_min;
+ else if ( !strcasecmp(ss,"max") ) rule->merger = info_rules_merge_max;
+ else if ( !strcasecmp(ss,"join") ) { rule->merger = info_rules_merge_join; is_join = 1; }
+ else error("The rule logic \"%s\" not recognised\n", ss);
+
+ if ( !is_join && rule->type==BCF_HT_STR )
+ error("Numeric operation \"%s\" requested on non-numeric field: %s\n", ss, rule->hdr_tag);
+ if ( bcf_hdr_id2number(args->out_hdr,BCF_HL_INFO,id)==0xfffff )
+ {
+ int is_agr = (
+ bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_A ||
+ bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_G ||
+ bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_R
+ ) ? 1 : 0;
+ if ( is_join && is_agr )
+ error("Cannot -i %s:join on Number=[AGR] tags is not supported.\n", rule->hdr_tag);
+ if ( !is_join && !is_agr )
+ error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag);
+ }
+
+ while ( *ss ) ss++; ss++; n++;
+ }
+ free(str.s);
+ free(tmp);
+
+ qsort(args->rules, args->nrules, sizeof(*args->rules), info_rules_comp_key2);
+}
+static void info_rules_destroy(args_t *args)
+{
+ int i;
+ for (i=0; i<args->nrules; i++)
+ {
+ info_rule_t *rule = &args->rules[i];
+ free(rule->hdr_tag);
+ free(rule->vals);
+ }
+ free(args->rules);
+}
+static void info_rules_reset(args_t *args)
+{
+ int i;
+ for (i=0; i<args->nrules; i++)
+ args->rules[i].nblocks = args->rules[i].nvals = args->rules[i].block_size = 0;
+}
+static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule, maux1_t *als, int var_len)
+{
+ int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &args->maux->ntmp_arr, rule->type);
+ if ( ret<=0 ) error("FIXME: error parsing %s at %s:%d .. %d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1,ret);
+
+ rule->nblocks++;
+
+ if ( rule->type==BCF_HT_STR )
+ {
+ int need_comma = rule->nblocks==1 ? 0 : 1;
+ hts_expand(char,rule->nvals+ret+need_comma+1,rule->mvals,rule->vals); // 1 for null-termination
+ char *tmp = (char*) rule->vals + rule->nvals;
+ if ( rule->nvals>0 ) { *tmp = ','; tmp++; }
+ strncpy(tmp,(char*)args->maux->tmp_arr,ret);
+ rule->nvals += ret + need_comma;
+ return 1;
+ }
+
+ int i, j;
+ if ( var_len==BCF_VL_A )
+ {
+ assert( ret==line->n_allele-1 );
+ args->maux->nagr_map = ret;
+ hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
+ // create mapping from source file ALT indexes to dst file indexes
+ for (i=0; i<ret; i++) args->maux->agr_map[i] = als->map[i+1] - 1;
+ rule->block_size = args->maux->nout_als - 1;
+ }
+ else if ( var_len==BCF_VL_R )
+ {
+ assert( ret==line->n_allele );
+ args->maux->nagr_map = ret;
+ hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
+ for (i=0; i<ret; i++) args->maux->agr_map[i] = als->map[i];
+ rule->block_size = args->maux->nout_als;
+ }
+ else if ( var_len==BCF_VL_G )
+ {
+ args->maux->nagr_map = bcf_alleles2gt(line->n_allele-1,line->n_allele-1)+1;
+ assert( ret==line->n_allele || ret==args->maux->nagr_map );
+ if ( ret==line->n_allele ) // haploid
+ {
+ args->maux->nagr_map = line->n_allele;
+ hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
+ for (i=0; i<ret; i++) args->maux->agr_map[i] = als->map[i];
+ rule->block_size = args->maux->nout_als;
+ }
+ else
+ {
+ hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
+ int k_src = 0;
+ for (i=0; i<line->n_allele; i++)
+ {
+ for (j=0; j<=i; j++)
+ {
+ args->maux->agr_map[k_src] = bcf_alleles2gt(als->map[i],als->map[j]);
+ k_src++;
+ }
+ }
+ rule->block_size = bcf_alleles2gt(args->maux->nout_als-1,args->maux->nout_als-1)+1;
+ }
+ }
+ else
+ {
+ if ( rule->nblocks>1 && ret!=rule->block_size )
+ error("Mismatch in number of values for INFO/%s at %s:%d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1);
+ rule->block_size = ret;
+ args->maux->nagr_map = 0;
+ }
+
+ #define BRANCH(src_type_t,dst_type_t,set_missing) { \
+ src_type_t *src = (src_type_t *) args->maux->tmp_arr; \
+ hts_expand0(dst_type_t,(rule->nvals+rule->block_size),rule->mvals,rule->vals); \
+ dst_type_t *dst = (dst_type_t *) rule->vals + rule->nvals; \
+ rule->nvals += rule->block_size; \
+ if ( !args->maux->nagr_map ) \
+ { \
+ for (i=0; i<ret; i++) dst[i] = src[i]; \
+ } \
+ else \
+ { \
+ for (i=0; i<rule->block_size; i++) set_missing; \
+ for (i=0; i<ret; i++) dst[args->maux->agr_map[i]] = src[i]; \
+ } \
+ }
+ switch (rule->type) {
+ case BCF_HT_INT: BRANCH(int, int32_t, dst[i] = bcf_int32_missing); break;
+ case BCF_HT_REAL: BRANCH(float, float, bcf_float_set_missing(dst[i])); break;
+ default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+ }
+ #undef BRANCH
+
+ return 1;
+}
+
+int bcf_hdr_sync(bcf_hdr_t *h);
+
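+// Merge the header lines and samples of the reader header hr into the output header hw;
+// duplicate sample names are prefixed with clash_prefix when --force-samples is given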
+void merge_headers(bcf_hdr_t *hw, const bcf_hdr_t *hr, const char *clash_prefix, int force_samples)
+{
+ // header lines
+ hw = bcf_hdr_merge(hw, hr);
+
+ // samples
+ int i;
+ for (i=0; i<bcf_hdr_nsamples(hr); i++)
+ {
+ char *name = hr->samples[i];
+ if ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, name)!=-1 )
+ {
+ // there is a sample with the same name
+ if ( !force_samples ) error("Error: Duplicate sample names (%s), use --force-samples to proceed anyway.\n", name);
+
+ int len = strlen(hr->samples[i]) + strlen(clash_prefix) + 1;
+ name = (char*) malloc(sizeof(char)*(len+1));
+ sprintf(name,"%s:%s",clash_prefix,hr->samples[i]);
+ bcf_hdr_add_sample(hw,name);
+ free(name);
+ }
+ else
+ bcf_hdr_add_sample(hw,name);
+ }
+}
+
+void debug_als(char **als, int nals)
+{
+ int k; for (k=0; k<nals; k++) fprintf(pysamerr,"%s ", als[k]);
+ fprintf(pysamerr,"\n");
+}
+
+/**
+ * normalize_alleles() - create smallest possible representation of the alleles
+ * @als: alleles to be merged, first is REF (rw)
+ * @nals: number of $a alleles
+ *
+ * Best explained on an example:
+ * In: REF=GTTT ALT=GTT
+ * Out: REF=GT ALT=G
+ *
+ * Note: the als array will be modified
+ */
+void normalize_alleles(char **als, int nals)
+{
+ if ( !als[0][1] ) return; // ref is 1base long, we're done
+
+ int j, i = 1, done = 0;
+ int *lens = (int*) malloc(sizeof(int)*nals);
+ for (j=0; j<nals; j++) lens[j] = strlen(als[j]);
+
+ while ( i<lens[0] )
+ {
+ for (j=1; j<nals; j++)
+ {
+ if ( i>=lens[j] ) done = 1;
+ if ( als[j][lens[j]-i] != als[0][lens[0]-i] ) { done = 1; break; }
+ }
+ if ( done ) break;
+ i++;
+ }
+ if ( i>1 )
+ {
+ i--;
+ als[0][lens[0]-i] = 0;
+ for (j=1; j<nals; j++) als[j][lens[j]-i] = 0;
+ }
+ free(lens);
+}
+
+ /**
+ * merge_alleles() - merge two REF,ALT records, $a and $b into $b.
+ * @a: alleles to be merged, first is REF
+ * @na: number of $a alleles
+ * @map: map from the original $a indexes to new $b indexes (0-based)
+ * @b: alleles to be merged, the array will be expanded as required
+ * @nb: number of $b alleles
+ * @mb: size of $b
+ *
+ * Returns NULL on error or $b expanded to incorporate $a alleles and sets
+ * $map. Best explained on an example:
+ * In: REF ALT
+ * a: ACG, AC,A (1bp and 2bp deletion)
+ * b: ACGT, A (3bp deletion)
+ * Out:
+ * b: ACGT, A,ACT,AT (3bp, 1bp and 2bp deletion)
+ * map: 0,2,3
+ * Here the mapping from the original $a alleles to the new $b alleles is 0->0,
+ * 1->2, and 2->3.
+ */
+char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb)
+{
+ // reference allele never changes
+ map[0] = 0;
+
+ int i,j;
+ int rla = !a[0][1] ? 1 : strlen(a[0]);
+ int rlb = !b[0][1] ? 1 : strlen(b[0]);
+
+ // the most common case: same SNPs
+ if ( na==2 && *nb==2 && rla==1 && rlb==1 && a[1][0]==b[1][0] && !a[1][1] && !b[1][1] )
+ {
+ map[1] = 1;
+ return b;
+ }
+
+ // Sanity check: reference prefixes must be identical
+ if ( strncmp(a[0],b[0],rla<rlb?rla:rlb) )
+ {
+ if ( strncasecmp(a[0],b[0],rla<rlb?rla:rlb) )
+ {
+ fprintf(pysamerr, "The REF prefixes differ: %s vs %s (%d,%d)\n", a[0],b[0],rla,rlb);
+ return NULL;
+ }
+ // Different case, change to uppercase
+ for (i=0; i<na; i++)
+ {
+ int len = strlen(a[i]);
+ for (j=0; j<len; j++) a[i][j] = toupper(a[i][j]);
+ }
+ for (i=0; i<*nb; i++)
+ {
+ int len = strlen(b[i]);
+ for (j=0; j<len; j++) b[i][j] = toupper(b[i][j]);
+ }
+ }
+
+ int n = *nb + na;
+ hts_expand0(char*,n,*mb,b);
+
+ // $b alleles need expanding
+ if ( rla>rlb )
+ {
+ for (i=0; i<*nb; i++)
+ {
+ int l = strlen(b[i]);
+ b[i] = (char*) realloc(b[i],l+rla-rlb+1);
+ memcpy(b[i]+l,a[0]+rlb,rla-rlb+1);
+ }
+ }
+
+ // now check if the $a alleles are present and if not add them
+ for (i=1; i<na; i++)
+ {
+ char *ai;
+ if ( rlb>rla ) // $a alleles need expanding
+ {
+ int l = strlen(a[i]);
+ ai = (char*) malloc(l+rlb-rla+1);
+ memcpy(ai,a[i],l);
+ memcpy(ai+l,b[0]+rla,rlb-rla+1);
+ }
+ else
+ ai = a[i];
+
+ for (j=1; j<*nb; j++)
+ if ( !strcasecmp(ai,b[j]) ) break;
+
+ if ( j<*nb ) // $b already has the same allele
+ {
+ map[i] = j;
+ if ( rlb>rla ) free(ai);
+ continue;
+ }
+ // new allele
+ map[i] = *nb;
+ b[*nb] = rlb>rla ? ai : strdup(ai);
+ (*nb)++;
+ }
+ return b;
+}
+
+maux_t *maux_init(bcf_srs_t *files)
+{
+ maux_t *ma = (maux_t*) calloc(1,sizeof(maux_t));
+ ma->n = files->nreaders;
+ ma->nbuf = (int *) calloc(ma->n,sizeof(int));
+ ma->d = (maux1_t**) calloc(ma->n,sizeof(maux1_t*));
+ ma->files = files;
+ int i, n_smpl = 0;
+ for (i=0; i<ma->n; i++)
+ n_smpl += bcf_hdr_nsamples(files->readers[i].header);
+ ma->smpl_ploidy = (int*) calloc(n_smpl,sizeof(int));
+ ma->smpl_nGsize = (int*) malloc(n_smpl*sizeof(int));
+ ma->has_line = (int*) malloc(ma->n*sizeof(int));
+ return ma;
+}
+void maux_destroy(maux_t *ma)
+{
+ int i;
+ for (i=0; i<ma->n; i++) // for each reader
+ {
+ if ( !ma->d[i] ) continue;
+ int j;
+ for (j=0; j<ma->nbuf[i]; j++) // for each buffered line
+ if ( ma->d[i][j].map ) free(ma->d[i][j].map);
+ free(ma->d[i]);
+ }
+ for (i=0; i<ma->mAGR_info; i++)
+ free(ma->AGR_info[i].buf);
+ free(ma->agr_map);
+ free(ma->AGR_info);
+ if (ma->ntmp_arr) free(ma->tmp_arr);
+ if (ma->nfmt_map) free(ma->fmt_map);
+ // ma->inf freed in bcf_destroy1
+ free(ma->d);
+ free(ma->nbuf);
+ for (i=0; i<ma->mals; i++) free(ma->als[i]);
+ if (ma->mout_als) free(ma->out_als);
+ free(ma->als);
+ free(ma->cnt);
+ free(ma->smpl_ploidy);
+ free(ma->smpl_nGsize);
+ free(ma->has_line);
+ free(ma);
+}
+void maux_expand1(maux_t *ma, int i)
+{
+ if ( ma->nbuf[i] <= ma->files->readers[i].nbuffer )
+ {
+ int n = ma->files->readers[i].nbuffer + 1;
+ ma->d[i] = (maux1_t*) realloc(ma->d[i], sizeof(maux1_t)*n);
+ memset(ma->d[i]+ma->nbuf[i],0,sizeof(maux1_t)*(n-ma->nbuf[i]));
+ ma->nbuf[i] = n;
+ }
+}
+void maux_reset(maux_t *ma)
+{
+ int i;
+ for (i=0; i<ma->n; i++) maux_expand1(ma, i);
+ for (i=1; i<ma->ncnt; i++) ma->cnt[i] = 0;
+}
+void maux_debug(maux_t *ma, int ir, int ib)
+{
+ printf("[%d,%d]\t", ir,ib);
+ int i;
+ for (i=0; i<ma->nals; i++)
+ {
+ printf(" %s [%d]", ma->als[i], ma->cnt[i]);
+ }
+ printf("\n");
+}
+
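+// Fill CHROM, POS, ID and QUAL of the output record and build the merged set of
+// output alleles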
+void merge_chrom2qual(args_t *args, bcf1_t *out)
+{
+ bcf_srs_t *files = args->files;
+ bcf_hdr_t *out_hdr = args->out_hdr;
+
+ int i, ret;
+ khiter_t kitr;
+ strdict_t *tmph = args->tmph;
+ kh_clear(strdict, tmph);
+ kstring_t *tmps = &args->tmps;
+ tmps->l = 0;
+
+ maux_t *ma = args->maux;
+ int *al_idxs = (int*) calloc(ma->nals,sizeof(int));
+ bcf_float_set_missing(out->qual);
+
+ // CHROM, POS, ID, QUAL
+ out->pos = -1;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !ma->has_line[i] ) continue;
+
+ bcf_sr_t *reader = &files->readers[i];
+ bcf1_t *line = reader->buffer[0];
+ bcf_hdr_t *hdr = reader->header;
+
+ // alleles
+ int j;
+ for (j=1; j<line->n_allele; j++)
+ al_idxs[ ma->d[i][0].map[j] ] = 1;
+
+ // position
+ if ( out->pos==-1 )
+ {
+ const char *chr = hdr->id[BCF_DT_CTG][line->rid].key;
+ out->rid = bcf_hdr_name2id(out_hdr, chr);
+ if ( strcmp(chr,out_hdr->id[BCF_DT_CTG][out->rid].key) ) error("Uh\n");
+ out->pos = line->pos;
+ }
+
+ // ID
+ if ( line->d.id[0]!='.' || line->d.id[1] )
+ {
+ kitr = kh_get(strdict, tmph, line->d.id);
+ if ( kitr == kh_end(tmph) )
+ {
+ if ( tmps->l ) kputc(';', tmps);
+ kputs(line->d.id, tmps);
+ kh_put(strdict, tmph, line->d.id, &ret);
+ }
+ }
+
+ // set QUAL to the max qual value. Not exactly correct, but good enough for now
+ if ( !bcf_float_is_missing(files->readers[i].buffer[0]->qual) )
+ {
+ if ( bcf_float_is_missing(out->qual) || out->qual < files->readers[i].buffer[0]->qual ) out->qual = files->readers[i].buffer[0]->qual;
+ }
+ }
+
+ // set ID
+ if ( !tmps->l ) kputs(".", tmps);
+ if ( out->d.id ) free(out->d.id);
+ out->d.id = strdup(tmps->s);
+
+ // set alleles
+ ma->nout_als = 0;
+ for (i=1; i<ma->nals; i++)
+ {
+ if ( !al_idxs[i] ) continue;
+ ma->nout_als++;
+
+ // Adjust the indexes, the allele map could be created for multiple collapsed records,
+ // some of which might be unused for this output line
+ int ir, j;
+ for (ir=0; ir<files->nreaders; ir++)
+ {
+ if ( !ma->has_line[ir] ) continue;
+ bcf1_t *line = files->readers[ir].buffer[0];
+ for (j=1; j<line->n_allele; j++)
+ if ( ma->d[ir][0].map[j]==i ) ma->d[ir][0].map[j] = ma->nout_als;
+ }
+ }
+ // Expand the arrays and realloc the alleles string. Note that all alleles are in a single allocated block.
+ ma->nout_als++;
+ hts_expand0(char*, ma->nout_als, ma->mout_als, ma->out_als);
+ int k = 0;
+ for (i=0; i<ma->nals; i++)
+ if ( i==0 || al_idxs[i] ) ma->out_als[k++] = strdup(ma->als[i]);
+ assert( k==ma->nout_als );
+ normalize_alleles(ma->out_als, ma->nout_als);
+ bcf_update_alleles(out_hdr, out, (const char**) ma->out_als, ma->nout_als);
+ free(al_idxs);
+ for (i=0; i<ma->nout_als; i++) free(ma->out_als[i]);
+}
+
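+// Merge the FILTER column: the union of all filters, with PASS dropped whenever it
+// would be mixed with other filters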
+void merge_filter(args_t *args, bcf1_t *out)
+{
+ bcf_srs_t *files = args->files;
+ bcf_hdr_t *out_hdr = args->out_hdr;
+
+ int i, ret;
+ khiter_t kitr;
+ strdict_t *tmph = args->tmph;
+ kh_clear(strdict, tmph);
+
+ maux_t *ma = args->maux;
+ out->d.n_flt = 0;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !ma->has_line[i]) continue;
+
+ bcf_sr_t *reader = &files->readers[i];
+ bcf1_t *line = reader->buffer[0];
+ bcf_hdr_t *hdr = reader->header;
+ bcf_unpack(line, BCF_UN_ALL);
+
+ int k;
+ for (k=0; k<line->d.n_flt; k++)
+ {
+ const char *flt = hdr->id[BCF_DT_ID][line->d.flt[k]].key;
+ kitr = kh_get(strdict, tmph, flt);
+ if ( kitr == kh_end(tmph) )
+ {
+ int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, flt);
+ if ( id==-1 ) error("Error: The filter is not defined in the header: %s\n", flt);
+ hts_expand(int,out->d.n_flt+1,ma->mflt,ma->flt);
+ ma->flt[out->d.n_flt] = id;
+ out->d.n_flt++;
+ kh_put(strdict, tmph, flt, &ret);
+ }
+ }
+ }
+ // Check if PASS is not mixed with other filters
+ if ( out->d.n_flt>1 )
+ {
+ int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS");
+ for (i=0; i<out->d.n_flt; i++)
+ if ( ma->flt[i]==id ) break;
+ if ( i<out->d.n_flt )
+ {
+ out->d.n_flt--;
+ for (; i<out->d.n_flt; i++) ma->flt[i] = ma->flt[i+1];
+ }
+ }
+ out->d.flt = ma->flt;
+}
+
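+// Rewrite the numeric ID stored in a packed INFO field so that it refers to the output
+// header's dictionary; the typed integer is patched in place when it has the same width,
+// otherwise the field is re-encoded into a new buffer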
+static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *tmp_str)
+{
+ assert( !info->vptr_free );
+
+ uint8_t *ptr = info->vptr - info->vptr_off;
+ bcf_dec_typed_int1(ptr, &ptr);
+
+ tmp_str->l = 0;
+ bcf_enc_int1(tmp_str, id);
+
+ if ( tmp_str->l == ptr - info->vptr + info->vptr_off )
+ {
+ // the new id is represented with the same number of bytes
+ memcpy(info->vptr - info->vptr_off, tmp_str->s, tmp_str->l);
+ return;
+ }
+
+ kputsn_(ptr, info->vptr - ptr, tmp_str);
+ info->vptr_off = tmp_str->l;
+ kputsn_(info->vptr, info->len << bcf_type_shift[info->type], tmp_str);
+
+ info->vptr = (uint8_t*) tmp_str->s + info->vptr_off;
+ info->vptr_free = 1;
+ line->d.shared_dirty |= BCF1_DIRTY_INF;
+ tmp_str->s = NULL;
+ tmp_str->m = 0;
+ tmp_str->l = 0;
+}
+
+/*
+ * copy_string_field() - copy a comma-separated field
+ * @param src: source string
+ * @param isrc: index of the field to copy
+ * @param src_len: length of source string (excluding the terminating \0)
+ * @param dst: destination kstring (must be initialized)
+ * @param idst: index of the destination field
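+ * @return 0 on success, -1 if the source field is not found, -2 if the destination field is not found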
+ */
+int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst)
+{
+ int ith_src = 0, start_src = 0; // i-th field in src string
+ while ( ith_src<isrc && start_src<src_len )
+ {
+ if ( src[start_src]==',' ) { ith_src++; }
+ start_src++;
+ }
+ if ( ith_src!=isrc ) return -1; // requested field not found
+ int end_src = start_src;
+ while ( end_src<src_len && src[end_src]!=',' ) end_src++;
+
+ int nsrc_cpy = end_src - start_src;
+ if ( nsrc_cpy==1 && src[start_src]=='.' ) return 0; // don't write missing values, dst is already initialized
+
+ int ith_dst = 0, start_dst = 0;
+ while ( ith_dst<idst && start_dst<dst->l )
+ {
+ if ( dst->s[start_dst]==',' ) { ith_dst++; }
+ start_dst++;
+ }
+ if ( ith_dst!=idst ) return -2;
+ int end_dst = start_dst;
+ while ( end_dst<dst->l && dst->s[end_dst]!=',' ) end_dst++;
+
+ if ( end_dst - start_dst>1 || dst->s[start_dst]!='.' ) return 0; // do not overwrite non-empty values
+
+ // Now start_dst and end_dst are indexes to the destination memory area
+ // which needs to be replaced with nsrc_cpy
+ // source bytes, end_dst points just after.
+ int ndst_shift = nsrc_cpy - (end_dst - start_dst);
+ int ndst_move = dst->l - end_dst + 1; // how many bytes must be moved (including \0)
+ if ( ndst_shift )
+ {
+ ks_resize(dst, dst->l + ndst_shift + 1); // plus \0
+ memmove(dst->s+end_dst+ndst_shift, dst->s+end_dst, ndst_move);
+ }
+ memcpy(dst->s+start_dst, src+start_src, nsrc_cpy);
+ dst->l += ndst_shift;
+ return 0;
+}
+
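+// Merge one Number=A/G/R INFO tag from a single reader into the shared output buffer,
+// remapping the source allele (or genotype) indexes to the output ordering via als->map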
+static void merge_AGR_info_tag(bcf_hdr_t *hdr, bcf1_t *line, bcf_info_t *info, int len, maux1_t *als, AGR_info_t *agr)
+{
+ int i;
+ if ( !agr->nbuf )
+ {
+ if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 || info->type==BCF_BT_FLOAT )
+ {
+ agr->nbuf = 4 * agr->nvals;
+ hts_expand(uint8_t,agr->nbuf,agr->mbuf,agr->buf);
+ if ( info->type!=BCF_BT_FLOAT )
+ {
+ int32_t *tmp = (int32_t*) agr->buf;
+ for (i=0; i<agr->nvals; i++) tmp[i] = bcf_int32_missing;
+ }
+ else
+ {
+ float *tmp = (float*) agr->buf;
+ for (i=0; i<agr->nvals; i++) bcf_float_set_missing(tmp[i]);
+ }
+ }
+ else if ( info->type==BCF_BT_CHAR )
+ {
+ kstring_t tmp; tmp.l = 0; tmp.m = agr->mbuf; tmp.s = (char*)agr->buf;
+ kputc('.',&tmp);
+ for (i=1; i<agr->nvals; i++) kputs(",.",&tmp);
+ agr->mbuf = tmp.m; agr->nbuf = tmp.l; agr->buf = (uint8_t*)tmp.s;
+ }
+ else
+ error("Not ready for type [%d]: %s at %d\n", info->type,agr->hdr_tag,line->pos+1);
+ }
+
+ if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 || info->type==BCF_BT_FLOAT )
+ {
+ if ( len==BCF_VL_A || len==BCF_VL_R )
+ {
+ int ifrom = len==BCF_VL_A ? 1 : 0;
+ #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
+ type_t *src = (type_t *) info->vptr; \
+ out_type_t *tgt = (out_type_t *) agr->buf; \
+ int iori, inew; \
+ for (iori=ifrom; iori<line->n_allele; iori++) \
+ { \
+ if ( is_vector_end ) break; \
+ if ( is_missing ) continue; \
+ inew = als->map[iori] - ifrom; \
+ tgt[inew] = *src; \
+ src++; \
+ } \
+ }
+ switch (info->type) {
+ case BCF_BT_INT8: BRANCH(int8_t, *src==bcf_int8_missing, *src==bcf_int8_vector_end, int); break;
+ case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, int); break;
+ case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, int); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), float); break;
+ default: fprintf(pysamerr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
+ }
+ #undef BRANCH
+ }
+ else
+ {
+ #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
+ type_t *src = (type_t *) info->vptr; \
+ out_type_t *tgt = (out_type_t *) agr->buf; \
+ int iori,jori, inew,jnew; \
+ for (iori=0; iori<line->n_allele; iori++) \
+ { \
+ inew = als->map[iori]; \
+ for (jori=0; jori<=iori; jori++) \
+ { \
+ jnew = als->map[jori]; \
+ int kori = iori*(iori+1)/2 + jori; \
+ if ( is_vector_end ) break; \
+ if ( is_missing ) continue; \
+ int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \
+ tgt[knew] = src[kori]; \
+ } \
+ if ( jori<=iori ) break; \
+ } \
+ }
+ switch (info->type) {
+ case BCF_BT_INT8: BRANCH(int8_t, src[kori]==bcf_int8_missing, src[kori]==bcf_int8_vector_end, int); break;
+ case BCF_BT_INT16: BRANCH(int16_t, src[kori]==bcf_int16_missing, src[kori]==bcf_int16_vector_end, int); break;
+ case BCF_BT_INT32: BRANCH(int32_t, src[kori]==bcf_int32_missing, src[kori]==bcf_int32_vector_end, int); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(src[kori]), bcf_float_is_vector_end(src[kori]), float); break;
+ default: fprintf(pysamerr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
+ }
+ #undef BRANCH
+ }
+ }
+ else
+ {
+ kstring_t tmp; tmp.l = agr->nbuf; tmp.m = agr->mbuf; tmp.s = (char*)agr->buf;
+ if ( len==BCF_VL_A || len==BCF_VL_R )
+ {
+ int iori, ifrom = len==BCF_VL_A ? 1 : 0;
+ for (iori=ifrom; iori<line->n_allele; iori++)
+ {
+ int ret = copy_string_field((char*)info->vptr, iori-ifrom, info->len, &tmp, als->map[iori]-ifrom);
+ if ( ret )
+ error("Error at %s:%d: wrong number of fields in %s?\n", bcf_seqname(hdr,line),line->pos+1,agr->hdr_tag);
+ }
+ }
+ else
+ {
+ int iori,jori, inew,jnew;
+ for (iori=0; iori<line->n_allele; iori++)
+ {
+ inew = als->map[iori];
+ for (jori=0; jori<=iori; jori++)
+ {
+ jnew = als->map[jori];
+ int kori = iori*(iori+1)/2 + jori;
+ int knew = bcf_alleles2gt(inew,jnew);
+ int ret = copy_string_field((char*)info->vptr, kori, info->len, &tmp, knew);
+ if ( ret )
+ error("Error at %s:%d: wrong number of fields in %s?\n", bcf_seqname(hdr,line),line->pos+1,agr->hdr_tag);
+ }
+ }
+ }
+ agr->mbuf = tmp.m; agr->nbuf = tmp.l; agr->buf = (uint8_t*)tmp.s;
+ }
+}
+
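+// Merge the INFO column: apply -i rules where defined, remap Number=A/G/R tags to the
+// output alleles, and otherwise keep the value from the first file carrying the tag
+// (AC and AN are recomputed later from the merged genotypes)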
+void merge_info(args_t *args, bcf1_t *out)
+{
+ bcf_srs_t *files = args->files;
+ bcf_hdr_t *out_hdr = args->out_hdr;
+
+ int i, j, ret;
+ khiter_t kitr;
+ strdict_t *tmph = args->tmph;
+ kh_clear(strdict, tmph);
+
+ maux_t *ma = args->maux;
+ ma->nAGR_info = 0;
+ out->n_info = 0;
+ info_rules_reset(args);
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !ma->has_line[i] ) continue;
+ bcf_sr_t *reader = &files->readers[i];
+ bcf1_t *line = reader->buffer[0];
+ bcf_hdr_t *hdr = reader->header;
+ for (j=0; j<line->n_info; j++)
+ {
+ bcf_info_t *inf = &line->d.info[j];
+
+ const char *key = hdr->id[BCF_DT_ID][inf->key].key;
+ if ( !strcmp("AC",key) || !strcmp("AN",key) ) continue; // AC and AN are done in merge_format() after genotypes are done
+
+ int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, key);
+ if ( id==-1 ) error("Error: The INFO field is not defined in the header: %s\n", key);
+
+ kitr = kh_get(strdict, tmph, key); // have we seen the tag in one of the readers?
+ int len = bcf_hdr_id2length(hdr,BCF_HL_INFO,inf->key);
+ if ( args->nrules )
+ {
+ info_rule_t *rule = (info_rule_t*) bsearch(key, args->rules, args->nrules, sizeof(*args->rules), info_rules_comp_key);
+ if ( rule )
+ {
+ maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->d[i][0] : NULL;
+ if ( info_rules_add_values(args, hdr, line, rule, als, len) ) continue;
+ }
+ }
+
+ // Todo: Number=AGR tags should use the newer info_rules_* functions (info_rules_merge_first to be added)
+ // and merge_AGR_info_tag to be made obsolete.
+ if ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) // Number=R,G,A requires special treatment
+ {
+ if ( kitr == kh_end(tmph) )
+ {
+ // first occurrence in this reader, alloc arrays
+ ma->nAGR_info++;
+ hts_expand0(AGR_info_t,ma->nAGR_info,ma->mAGR_info,ma->AGR_info);
+ kitr = kh_put(strdict, tmph, key, &ret);
+ kh_val(tmph,kitr) = ma->nAGR_info - 1;
+ ma->AGR_info[ma->nAGR_info-1].hdr_tag = key;
+ ma->AGR_info[ma->nAGR_info-1].type = bcf_hdr_id2type(hdr,BCF_HL_INFO,inf->key);
+ ma->AGR_info[ma->nAGR_info-1].nbuf = 0; // size of the buffer
+ switch (len)
+ {
+ case BCF_VL_A: ma->AGR_info[ma->nAGR_info-1].nvals = ma->nout_als - 1; break;
+ case BCF_VL_G: ma->AGR_info[ma->nAGR_info-1].nvals = bcf_alleles2gt(ma->nout_als-1,ma->nout_als-1)+1; break;
+ case BCF_VL_R: ma->AGR_info[ma->nAGR_info-1].nvals = ma->nout_als; break;
+ }
+ }
+ kitr = kh_get(strdict, tmph, key);
+ int idx = kh_val(tmph, kitr);
+ if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%d\n", key,bcf_seqname(hdr,line),line->pos+1);
+ merge_AGR_info_tag(hdr, line,inf,len,&ma->d[i][0],&ma->AGR_info[idx]);
+ continue;
+ }
+
+ if ( kitr == kh_end(tmph) )
+ {
+ hts_expand0(bcf_info_t,out->n_info+1,ma->minf,ma->inf);
+ ma->inf[out->n_info].key = id;
+ ma->inf[out->n_info].type = inf->type;
+ ma->inf[out->n_info].len = inf->len;
+ ma->inf[out->n_info].vptr = inf->vptr;
+ ma->inf[out->n_info].v1.i = inf->v1.i;
+ ma->inf[out->n_info].v1.f = inf->v1.f;
+ ma->inf[out->n_info].vptr_off = inf->vptr_off;
+ ma->inf[out->n_info].vptr_len = inf->vptr_len;
+ ma->inf[out->n_info].vptr_free = inf->vptr_free;
+ if ( (args->output_type & FT_BCF) && id!=bcf_hdr_id2int(hdr, BCF_DT_ID, key) )
+ {
+ // The existing packed info cannot be reused. Change the id.
+ // Although quite hacky, it's faster than anything else given
+ // the data structures
+ bcf_info_set_id(out, &ma->inf[out->n_info], id, &args->tmps);
+ }
+ out->n_info++;
+ kitr = kh_put(strdict, tmph, key, &ret);
+ kh_val(tmph,kitr) = -(out->n_info-1); // arbitrary negative value
+ }
+ }
+ }
+ out->d.info = ma->inf;
+ out->d.m_info = ma->minf;
+ for (i=0; i<args->nrules; i++)
+ args->rules[i].merger(args->out_hdr, out, &args->rules[i]);
+ for (i=0; i<ma->nAGR_info; i++)
+ {
+ AGR_info_t *agr = &ma->AGR_info[i];
+ bcf_update_info(out_hdr,out,agr->hdr_tag,agr->buf,agr->nvals,agr->type);
+ }
+}
+
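+// Recompute INFO/AN and INFO/AC of the merged line from the FORMAT/GT genotypes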
+void update_AN_AC(bcf_hdr_t *hdr, bcf1_t *line)
+{
+ int32_t an = 0, *tmp = (int32_t*) malloc(sizeof(int)*line->n_allele);
+ int ret = bcf_calc_ac(hdr, line, tmp, BCF_UN_FMT);
+ if ( ret>0 )
+ {
+ int i;
+ for (i=0; i<line->n_allele; i++) an += tmp[i];
+ bcf_update_info_int32(hdr, line, "AN", &an, 1);
+ bcf_update_info_int32(hdr, line, "AC", tmp+1, line->n_allele-1);
+ }
+ free(tmp);
+}
+
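+// Merge the FORMAT/GT field across all readers; samples from files without a matching
+// record are filled with missing alleles, padded to the maximum GT length seen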
+void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
+{
+ bcf_srs_t *files = args->files;
+ bcf_hdr_t *out_hdr = args->out_hdr;
+ maux_t *ma = args->maux;
+ int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr);
+
+ int nsize = 0, msize = sizeof(int32_t);
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !fmt_map[i] ) continue;
+ if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n;
+ }
+
+ if ( ma->ntmp_arr < nsamples*nsize*msize )
+ {
+ ma->ntmp_arr = nsamples*nsize*msize;
+ ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr);
+ }
+ memset(ma->smpl_ploidy,0,nsamples*sizeof(int));
+
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf_sr_t *reader = &files->readers[i];
+ bcf_hdr_t *hdr = reader->header;
+ bcf_fmt_t *fmt_ori = fmt_map[i];
+ int32_t *tmp = (int32_t *) ma->tmp_arr + ismpl*nsize;
+
+ int j, k;
+ if ( !fmt_ori )
+ {
+ // missing values: assume maximum ploidy
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++)
+ {
+ for (k=0; k<nsize; k++) { tmp[k] = 0; ma->smpl_ploidy[ismpl+j]++; }
+ tmp += nsize;
+ }
+ ismpl += bcf_hdr_nsamples(hdr);
+ continue;
+ }
+
+ #define BRANCH(type_t, vector_end) { \
+ type_t *p_ori = (type_t*) fmt_ori->p; \
+ if ( !ma->d[i][0].als_differ ) \
+ { \
+ /* the allele numbering is unchanged */ \
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
+ { \
+ for (k=0; k<fmt_ori->n; k++) \
+ { \
+ if ( p_ori[k]==vector_end ) break; /* smaller ploidy */ \
+ ma->smpl_ploidy[ismpl+j]++; \
+ if ( bcf_gt_is_missing(p_ori[k]) ) tmp[k] = 0; /* missing allele */ \
+ else tmp[k] = p_ori[k]; \
+ } \
+ for (; k<nsize; k++) tmp[k] = bcf_int32_vector_end; \
+ tmp += nsize; \
+ p_ori += fmt_ori->n; \
+ } \
+ ismpl += bcf_hdr_nsamples(hdr); \
+ continue; \
+ } \
+ /* allele numbering needs to be changed */ \
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
+ { \
+ for (k=0; k<fmt_ori->n; k++) \
+ { \
+ if ( p_ori[k]==vector_end ) break; /* smaller ploidy */ \
+ ma->smpl_ploidy[ismpl+j]++; \
+ if ( bcf_gt_is_missing(p_ori[k]) ) tmp[k] = 0; /* missing allele */ \
+ else \
+ { \
+ int al = (p_ori[k]>>1) - 1; \
+ al = al<=0 ? al + 1 : ma->d[i][0].map[al] + 1; \
+ tmp[k] = (al << 1) | ((p_ori[k])&1); \
+ } \
+ } \
+ for (; k<nsize; k++) tmp[k] = bcf_int32_vector_end; \
+ tmp += nsize; \
+ p_ori += fmt_ori->n; \
+ } \
+ ismpl += bcf_hdr_nsamples(hdr); \
+ }
+ switch (fmt_ori->type)
+ {
+ case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+ default: error("Unexpected case: %d\n", fmt_ori->type);
+ }
+ #undef BRANCH
+ }
+ bcf_update_format_int32(out_hdr, out, "GT", (int32_t*)ma->tmp_arr, nsamples*nsize);
+}
+
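+// Number=G values follow the standard VCF genotype ordering: the entry for the
+// unordered allele pair j<=k sits at index k*(k+1)/2 + j (bcf_alleles2gt). For
+// example, with 3 alleles the 6 values correspond to 0/0, 0/1, 1/1, 0/2, 1/2, 2/2.
+// The remapping below moves each value from its index under the input record's
+// alleles to the corresponding index under the merged output alleles.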
+void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
+{
+ bcf_srs_t *files = args->files;
+ bcf_hdr_t *out_hdr = args->out_hdr;
+ maux_t *ma = args->maux;
+ int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr);
+
+ const char *key = NULL;
+ int nsize = 0, length = BCF_VL_FIXED, type = -1;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !ma->has_line[i] ) continue;
+ if ( !fmt_map[i] ) continue;
+ if ( !key ) key = files->readers[i].header->id[BCF_DT_ID][fmt_map[i]->id].key;
+ type = fmt_map[i]->type;
+ if ( IS_VL_G(files->readers[i].header, fmt_map[i]->id) )
+ {
+ length = BCF_VL_G;
+ nsize = out->n_allele*(out->n_allele + 1)/2;
+ break;
+ }
+ if ( IS_VL_A(files->readers[i].header, fmt_map[i]->id) )
+ {
+ length = BCF_VL_A;
+ nsize = out->n_allele - 1;
+ break;
+ }
+ if ( IS_VL_R(files->readers[i].header, fmt_map[i]->id) )
+ {
+ length = BCF_VL_R;
+ nsize = out->n_allele;
+ break;
+ }
+ if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n;
+ }
+
+ int msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);
+ if ( ma->ntmp_arr < nsamples*nsize*msize )
+ {
+ ma->ntmp_arr = nsamples*nsize*msize;
+ ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr);
+ }
+
+ // Fill the temp array for all samples by collecting values from all files
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf_sr_t *reader = &files->readers[i];
+ bcf_hdr_t *hdr = reader->header;
+ bcf_fmt_t *fmt_ori = fmt_map[i];
+ if ( fmt_ori )
+ {
+ type = fmt_ori->type;
+ int nals_ori = reader->buffer[0]->n_allele;
+ if ( length==BCF_VL_G )
+ {
+ // if all fields are missing then n==1 is valid
+ if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori*(nals_ori+1)/2 && fmt_map[i]->n != nals_ori )
+ error("Incorrect number of %s fields (%d) at %s:%d, cannot merge.\n", key,fmt_ori->n,bcf_seqname(args->out_hdr,out),out->pos+1);
+ }
+ else if ( length==BCF_VL_A )
+ {
+ if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori-1 )
+ error("Incorrect number of %s fields (%d) at %s:%d, cannot merge.\n", key,fmt_ori->n,bcf_seqname(args->out_hdr,out),out->pos+1);
+ }
+ else if ( length==BCF_VL_R )
+ {
+ if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori )
+ error("Incorrect number of %s fields (%d) at %s:%d, cannot merge.\n", key,fmt_ori->n,bcf_seqname(args->out_hdr,out),out->pos+1);
+ }
+ }
+
+ // set the values
+ #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+ int j, l, k; \
+ tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+ if ( !fmt_ori ) \
+ { \
+ /* the field is not present in this file, set missing values */ \
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
+ { \
+ tgt_set_missing; tgt++; for (l=1; l<nsize; l++) { tgt_set_vector_end; tgt++; } \
+ } \
+ ismpl += bcf_hdr_nsamples(hdr); \
+ continue; \
+ } \
+ assert( ma->has_line[i] ); \
+ bcf1_t *line = reader->buffer[0]; \
+ src_type_t *src = (src_type_t*) fmt_ori->p; \
+ if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->d[i][0].als_differ) ) \
+ { \
+ /* alleles unchanged, copy over */ \
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
+ { \
+ for (l=0; l<fmt_ori->n; l++) \
+ { \
+ if ( src_is_vector_end ) break; \
+ else if ( src_is_missing ) tgt_set_missing; \
+ else *tgt = *src; \
+ tgt++; src++; \
+ } \
+ for (k=l; k<nsize; k++) { tgt_set_vector_end; tgt++; } \
+ src += fmt_ori->n - l; \
+ } \
+ ismpl += bcf_hdr_nsamples(hdr); \
+ continue; \
+ } \
+ /* allele numbering needs to be changed */ \
+ if ( length==BCF_VL_G ) \
+ { \
+ /* Number=G tags */ \
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
+ { \
+ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize; \
+ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+ if ( (src_is_missing && fmt_ori->n==1) || (++src && src_is_vector_end) ) \
+ { \
+ /* tag with missing value "." */ \
+ tgt_set_missing; \
+ for (l=1; l<nsize; l++) { tgt++; tgt_set_vector_end; } \
+ continue; \
+ } \
+ int ngsize = ma->smpl_ploidy[ismpl+j]==1 ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
+ for (l=0; l<ngsize; l++) { tgt_set_missing; tgt++; } \
+ for (; l<nsize; l++) { tgt_set_vector_end; tgt++; } \
+ if ( ma->smpl_ploidy[ismpl+j]==1 ) \
+ { \
+ /* Haploid */ \
+ int iori, inew; \
+ for (iori=0; iori<line->n_allele; iori++) \
+ { \
+ inew = ma->d[i][0].map[iori]; \
+ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + iori; \
+ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
+ if ( src_is_vector_end ) break; \
+ if ( src_is_missing ) tgt_set_missing; \
+ else *tgt = *src; \
+ } \
+ } \
+ else \
+ { \
+ /* Diploid */ \
+ int iori,jori, inew,jnew; \
+ for (iori=0; iori<line->n_allele; iori++) \
+ { \
+ inew = ma->d[i][0].map[iori]; \
+ for (jori=0; jori<=iori; jori++) \
+ { \
+ jnew = ma->d[i][0].map[jori]; \
+ int kori = iori*(iori+1)/2 + jori; \
+ int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \
+ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + kori; \
+ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + knew; \
+ if ( src_is_vector_end ) \
+ { \
+ iori = line->n_allele; \
+ break; \
+ } \
+ if ( src_is_missing ) tgt_set_missing; \
+ else *tgt = *src; \
+ } \
+ } \
+ } \
+ } \
+ } \
+ else \
+ { \
+ /* Number=A or Number=R tags */ \
+ int ifrom = length==BCF_VL_A ? 1 : 0; \
+ for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
+ { \
+ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize; \
+ src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \
+ if ( (src_is_missing && fmt_ori->n==1) || (++src && src_is_vector_end) ) \
+ { \
+ /* tag with missing value "." */ \
+ tgt_set_missing; \
+ for (l=1; l<nsize; l++) { tgt++; tgt_set_vector_end; } \
+ continue; \
+ } \
+ src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \
+ for (l=0; l<nsize; l++) { tgt_set_missing; tgt++; } \
+ int iori,inew; \
+ for (iori=ifrom; iori<line->n_allele; iori++) \
+ { \
+ inew = ma->d[i][0].map[iori] - ifrom; \
+ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
+ if ( src_is_vector_end ) break; \
+ if ( src_is_missing ) tgt_set_missing; \
+ else *tgt = *src; \
+ src++; \
+ } \
+ } \
+ } \
+ ismpl += bcf_hdr_nsamples(hdr); \
+ }
+ switch (type)
+ {
+ case BCF_BT_INT8: BRANCH(int32_t, int8_t, *src==bcf_int8_missing, *src==bcf_int8_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int32_t, int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
+ case BCF_BT_CHAR: BRANCH(uint8_t, uint8_t, *src==bcf_str_missing, *src==bcf_str_vector_end, *tgt=bcf_str_missing, *tgt=bcf_str_vector_end); break;
+ default: error("Unexpected case: %d, %s\n", type, key);
+ }
+ #undef BRANCH
+ }
+ if ( type==BCF_BT_FLOAT )
+ bcf_update_format_float(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nsize);
+ else if ( type==BCF_BT_CHAR )
+        bcf_update_format_char(out_hdr, out, key, (char*)ma->tmp_arr, nsamples*nsize);
+ else
+ bcf_update_format_int32(out_hdr, out, key, (int32_t*)ma->tmp_arr, nsamples*nsize);
+}
+
+void merge_format(args_t *args, bcf1_t *out)
+{
+ bcf_srs_t *files = args->files;
+ bcf_hdr_t *out_hdr = args->out_hdr;
+ maux_t *ma = args->maux;
+ if ( !ma->nfmt_map )
+ {
+ ma->nfmt_map = 2;
+ ma->fmt_map = (bcf_fmt_t**) calloc(ma->nfmt_map*files->nreaders, sizeof(bcf_fmt_t*));
+ }
+ else
+ memset(ma->fmt_map, 0, ma->nfmt_map*files->nreaders*sizeof(bcf_fmt_t**));
+
+ khiter_t kitr;
+ strdict_t *tmph = args->tmph;
+ kh_clear(strdict, tmph);
+ int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !ma->has_line[i] ) continue;
+ bcf_sr_t *reader = &files->readers[i];
+ bcf1_t *line = reader->buffer[0];
+ bcf_hdr_t *hdr = reader->header;
+ for (j=0; j<line->n_fmt; j++)
+ {
+            // Was this tag already seen?
+ bcf_fmt_t *fmt = &line->d.fmt[j];
+ const char *key = hdr->id[BCF_DT_ID][fmt->id].key;
+ kitr = kh_get(strdict, tmph, key);
+
+ int ifmt;
+ if ( kitr != kh_end(tmph) )
+ ifmt = kh_value(tmph, kitr); // seen
+ else
+ {
+ // new FORMAT tag
+ if ( key[0]=='G' && key[1]=='T' && key[2]==0 ) { has_GT = 1; ifmt = 0; }
+ else
+ {
+ ifmt = ++max_ifmt;
+ if ( max_ifmt >= ma->nfmt_map )
+ {
+ ma->fmt_map = (bcf_fmt_t**) realloc(ma->fmt_map, sizeof(bcf_fmt_t*)*(max_ifmt+1)*files->nreaders);
+ memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*));
+ ma->nfmt_map = max_ifmt+1;
+ }
+ }
+ kitr = kh_put(strdict, tmph, key, &ret);
+ kh_value(tmph, kitr) = ifmt;
+ }
+ ma->fmt_map[ifmt*files->nreaders+i] = fmt;
+ }
+ // Check if the allele numbering must be changed
+ for (j=1; j<reader->buffer[0]->n_allele; j++)
+ if ( ma->d[i][0].map[j]!=j ) break;
+ ma->d[i][0].als_differ = j==reader->buffer[0]->n_allele ? 0 : 1;
+ }
+
+ out->n_sample = bcf_hdr_nsamples(out_hdr);
+ if ( has_GT )
+ merge_GT(args, ma->fmt_map, out);
+ update_AN_AC(out_hdr, out);
+
+ if ( out->d.info!=ma->inf )
+ {
+ // hacky, we rely on htslib internals: bcf_update_info() reallocated the info
+ ma->inf = out->d.info;
+ ma->minf = out->d.m_info;
+ }
+
+ for (i=1; i<=max_ifmt; i++)
+ merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
+ out->d.indiv_dirty = 1;
+}
+
+// The core merging function, one or none line from each reader
+void merge_line(args_t *args)
+{
+ bcf1_t *out = args->out_line;
+ bcf_clear1(out);
+ out->unpacked = BCF_UN_ALL;
+
+ merge_chrom2qual(args, out);
+ merge_filter(args, out);
+ merge_info(args, out);
+ merge_format(args, out);
+
+ bcf_write1(args->out_fh, args->out_hdr, out);
+}
+
+
+void debug_buffers(FILE *fp, bcf_srs_t *files);
+void debug_buffer(FILE *fp, bcf_sr_t *reader);
+
+#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
+
+// Clean the reader's buffer and make it ready for the next next_line() call.
+// Moves finished records (SKIP_DONE flag set) to the end of the buffer and puts
+// the rest at the beginning. Then shortens the buffer so that the last element
+// points to the last unfinished record. There are two special cases: the last
+// line of the buffer typically has a different position and must stay at the
+// end; next, the first record of the buffer must be one of those already
+// printed, as it will be discarded by next_line().
+//
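+// A rough sketch of one pass, writing D for finished records, U for unfinished
+// records at the current position and N for the next-position record kept at the
+// end of the buffer:
+//      before:  [ D, U, D, U, N ]
+//      after:   [ D, U, U, N ]
+// where buffer[0] is an already printed record (its pos is reset to -1 below) and
+// the finished records have been trimmed from the end.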
+void shake_buffer(maux_t *maux, int ir, int pos)
+{
+ bcf_sr_t *reader = &maux->files->readers[ir];
+ maux1_t *m = maux->d[ir];
+
+ if ( !reader->buffer ) return;
+
+ int i;
+ // FILE *fp = stdout;
+ // fprintf(fp,"<going to shake> nbuf=%d\t", reader->nbuffer); for (i=0; i<reader->nbuffer; i++) fprintf(fp," %d", skip[i]); fprintf(fp,"\n");
+ // debug_buffer(fp,reader);
+ // fprintf(fp,"--\n");
+
+ int a = 1, b = reader->nbuffer;
+ if ( reader->buffer[b]->pos != pos ) b--; // move the last line separately afterwards
+
+ while ( a<b )
+ {
+ if ( !(m[a].skip&SKIP_DONE) ) { a++; continue; }
+ if ( m[b].skip&SKIP_DONE ) { b--; continue; }
+ SWAP(bcf1_t*, reader->buffer[a], reader->buffer[b]);
+ SWAP(maux1_t, m[a], m[b]);
+ a++;
+ b--;
+ }
+
+    // advance $a past the unfinished records at the beginning
+ while ( a<=reader->nbuffer && !(m[a].skip&SKIP_DONE) ) a++;
+
+ if ( a<reader->nbuffer )
+ {
+        // there is a gap between the unfinished lines at the beginning and the
+ // last line. The last line must be brought forward to fill the gap
+ if ( reader->buffer[reader->nbuffer]->pos != pos )
+ {
+ SWAP(bcf1_t*, reader->buffer[a], reader->buffer[reader->nbuffer]);
+ SWAP(maux1_t, m[a], m[reader->nbuffer]);
+ reader->nbuffer = a;
+ }
+ }
+
+ if ( !(m[0].skip&SKIP_DONE) && reader->buffer[0]->pos==pos )
+ {
+ // the first record is unfinished, replace it with an empty line
+ // from the end of the buffer or else next_line will remove it
+ if ( reader->nbuffer + 1 >= maux->nbuf[ir] )
+ {
+ reader->nbuffer++;
+ maux_expand1(maux, ir);
+ reader->nbuffer--;
+ m = maux->d[ir];
+ }
+ if ( reader->nbuffer+1 >= reader->mbuffer )
+ error("Uh, did not expect this: %d vs %d\n", reader->nbuffer,reader->mbuffer);
+
+ if ( reader->buffer[reader->nbuffer]->pos!=pos )
+ {
+ // 4way swap
+ bcf1_t *tmp = reader->buffer[0];
+ reader->buffer[0] = reader->buffer[reader->nbuffer+1];
+ reader->buffer[reader->nbuffer+1] = reader->buffer[reader->nbuffer];
+ reader->buffer[reader->nbuffer] = tmp;
+ m[reader->nbuffer].skip = m[0].skip;
+ m[reader->nbuffer+1].skip = SKIP_DIFF;
+ reader->nbuffer++;
+ }
+ else
+ {
+ SWAP(bcf1_t*, reader->buffer[0], reader->buffer[reader->nbuffer+1]);
+ SWAP(maux1_t, m[0], m[reader->nbuffer+1]);
+ }
+ }
+
+ // debug_buffer(fp,reader);
+ // fprintf(fp,"<shaken>\t"); for (i=0; i<reader->nbuffer; i++) fprintf(fp," %d", skip[i]);
+ // fprintf(fp,"\n\n");
+
+ // set position of finished buffer[0] line to -1, otherwise swapping may
+ // bring it back after next_line()
+ reader->buffer[0]->pos = -1;
+
+ // trim the buffer, remove finished lines from the end
+ i = reader->nbuffer;
+ while ( i>=1 && m[i--].skip&SKIP_DONE )
+ reader->nbuffer--;
+}
+
+void debug_maux(args_t *args, int pos, int var_type)
+{
+ bcf_srs_t *files = args->files;
+ maux_t *maux = args->maux;
+ int j,k,l;
+
+ fprintf(pysamerr,"Alleles to merge at %d\n", pos+1);
+ for (j=0; j<files->nreaders; j++)
+ {
+ bcf_sr_t *reader = &files->readers[j];
+ fprintf(pysamerr," reader %d: ", j);
+ for (k=0; k<=reader->nbuffer; k++)
+ {
+ if ( maux->d[j][k].skip==SKIP_DONE ) continue;
+ bcf1_t *line = reader->buffer[k];
+ if ( line->pos!=pos ) continue;
+ fprintf(pysamerr,"\t");
+ if ( maux->d[j][k].skip ) fprintf(pysamerr,"["); // this record will not be merged in this round
+ for (l=0; l<line->n_allele; l++)
+ fprintf(pysamerr,"%s%s", l==0?"":",", line->d.allele[l]);
+ if ( maux->d[j][k].skip ) fprintf(pysamerr,"]");
+ }
+ fprintf(pysamerr,"\n");
+ }
+ fprintf(pysamerr," counts: ");
+ for (j=0; j<maux->nals; j++) fprintf(pysamerr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); fprintf(pysamerr,"\n");
+ for (j=0; j<files->nreaders; j++)
+ {
+ bcf_sr_t *reader = &files->readers[j];
+ fprintf(pysamerr," out %d: ", j);
+ for (k=0; k<=reader->nbuffer; k++)
+ {
+ if ( maux->d[j][k].skip==SKIP_DONE ) continue;
+ bcf1_t *line = reader->buffer[k];
+ if ( line->pos!=pos ) continue;
+ if ( maux->d[j][k].skip ) continue;
+ fprintf(pysamerr,"\t");
+ for (l=0; l<line->n_allele; l++)
+ fprintf(pysamerr,"%s%s", l==0?"":",", maux->als[maux->d[j][k].map[l]]);
+ }
+ fprintf(pysamerr,"\n");
+ }
+ fprintf(pysamerr,"\n");
+}
+
+// Determine which line should be merged from which reader: go through all
+// readers and all buffered lines, expand REF,ALT and try to match lines with
+// the same ALTs. A step towards making the output independent of the input
+// ordering of the lines.
+void merge_buffer(args_t *args)
+{
+ bcf_srs_t *files = args->files;
+ int i, pos = -1, var_type = 0;
+ char *id = NULL;
+ maux_t *maux = args->maux;
+ maux_reset(maux);
+
+ // set the current position
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( bcf_sr_has_line(files,i) )
+ {
+ bcf1_t *line = bcf_sr_get_line(files,i);
+ pos = line->pos;
+ var_type = bcf_get_variant_types(line);
+ id = line->d.id;
+ break;
+ }
+ }
+
+    // In this loop we select compatible candidate lines (i.e. SNPs or indels)
+    // from each reader: go through all files and all lines at this position
+    // and normalize the relevant alleles.
+ // REF-only sites may be associated with both SNPs and indels.
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf_sr_t *reader = &files->readers[i];
+ if ( !reader->buffer ) continue;
+ int j, k;
+ for (j=0; j<=reader->nbuffer; j++)
+ {
+ bcf1_t *line = reader->buffer[j];
+ int line_type = bcf_get_variant_types(line);
+ // select relevant lines
+ maux->d[i][j].skip = SKIP_DIFF;
+ if ( pos!=line->pos )
+ {
+ if ( j==0 ) maux->d[i][j].skip |= SKIP_DONE; // left from previous run, force to ignore
+ continue;
+ }
+ if ( args->merge_by_id )
+ {
+ if ( strcmp(id,line->d.id) ) continue;
+ }
+ else
+ {
+ if ( args->collapse==COLLAPSE_NONE && maux->nals )
+ {
+ // All alleles of the tested record must be present in the
+                    // selected maux record and the variant types must be the same
+ if ( var_type!=line->d.var_type ) continue;
+ if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible
+ for (k=1; k<line->n_allele; k++)
+ {
+ if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break;
+ }
+ if ( k==line->n_allele ) continue; // no matching allele
+ }
+ if ( !(args->collapse&COLLAPSE_ANY) )
+ {
+ int compatible = 0;
+ if ( line_type==var_type ) compatible = 1;
+ else if ( line_type==VCF_REF ) compatible = 1; // REF can go with anything
+ else if ( var_type&VCF_SNP && line_type&VCF_SNP ) compatible = 1;
+ else if ( var_type&VCF_INDEL && line_type&VCF_INDEL ) compatible = 1;
+ else if ( var_type&VCF_MNP && line_type&VCF_MNP ) compatible = 1;
+ else if ( var_type&VCF_SNP && line_type&VCF_MNP ) compatible = 1;
+ else if ( var_type&VCF_MNP && line_type&VCF_SNP ) compatible = 1;
+ if ( !compatible ) continue;
+ }
+ }
+ maux->d[i][j].skip = 0;
+
+ hts_expand(int, line->n_allele, maux->d[i][j].mmap, maux->d[i][j].map);
+ if ( !maux->nals ) // first record, copy the alleles to the output
+ {
+ maux->nals = line->n_allele;
+ hts_expand0(char*, maux->nals, maux->mals, maux->als);
+ hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+ for (k=0; k<maux->nals; k++)
+ {
+ maux->als[k] = strdup(line->d.allele[k]);
+ maux->d[i][j].map[k] = k;
+ maux->cnt[k] = 1;
+ }
+ pos = line->pos;
+ continue;
+ }
+
+ // normalize alleles
+ maux->als = merge_alleles(line->d.allele, line->n_allele, maux->d[i][j].map, maux->als, &maux->nals, &maux->mals);
+ if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(bcf_sr_get_header(args->files,j),line),line->pos+1,reader->fname);
+ hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+ for (k=1; k<line->n_allele; k++)
+ maux->cnt[ maux->d[i][j].map[k] ]++; // how many times an allele appears in the files
+ maux->cnt[0]++;
+ }
+ }
+
+ // debug_maux(args, pos, var_type);
+
+ // Select records that have the same alleles; the input ordering of indels
+ // must not matter. Multiple VCF lines can be emitted from this loop.
+ // We expect only very few alleles and not many records with the same
+ // position in the buffers, therefore the nested loops should not slow us
+    // down much.
+ while (1)
+ {
+ // take the most frequent allele present in multiple files
+ int icnt = 0;
+ for (i=1; i<maux->nals; i++)
+ if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
+ if ( maux->cnt[icnt]<0 ) break;
+
+ int nmask = 0;
+ for (i=0; i<files->nreaders; i++)
+ {
+ maux->has_line[i] = 0;
+
+ bcf_sr_t *reader = &files->readers[i];
+ if ( !reader->buffer ) continue;
+
+ // find lines with the same allele
+ int j;
+ for (j=0; j<=reader->nbuffer; j++)
+ {
+ if ( maux->d[i][j].skip ) continue;
+ int k;
+ for (k=0; k<reader->buffer[j]->n_allele; k++)
+ if ( icnt==maux->d[i][j].map[k] ) break;
+ if ( k<reader->buffer[j]->n_allele ) break;
+ }
+ if ( j>reader->nbuffer )
+ {
+ // no matching allele found in this file
+ if ( args->collapse==COLLAPSE_NONE ) continue;
+
+ for (j=0; j<=reader->nbuffer; j++)
+ {
+ if ( maux->d[i][j].skip ) continue;
+ if ( args->collapse&COLLAPSE_ANY ) break;
+ int line_type = bcf_get_variant_types(reader->buffer[j]);
+ if ( var_type&VCF_SNP && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( var_type&VCF_INDEL && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( line_type==VCF_REF )
+ {
+ if ( var_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( var_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
+ }
+ else if ( var_type==VCF_REF )
+ {
+ if ( line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
+ }
+ }
+ }
+ if ( j<=reader->nbuffer )
+ {
+                // found a suitable line for merging, place it at the beginning
+ if ( j>0 )
+ {
+ SWAP(bcf1_t*, reader->buffer[0], reader->buffer[j]);
+ SWAP(maux1_t, maux->d[i][0], maux->d[i][j]);
+ }
+ // mark as finished so that it's ignored next time
+ maux->d[i][0].skip |= SKIP_DONE;
+ maux->has_line[i] = 1;
+ nmask++;
+ }
+ }
+ if ( !nmask ) break; // done, no more lines suitable for merging found
+ merge_line(args); // merge and output the line
+ maux->cnt[icnt] = -1; // do not pick this allele again, mark it as finished
+ }
+
+ // clean the alleles
+ for (i=0; i<maux->nals; i++)
+ {
+ free(maux->als[i]);
+ maux->als[i] = 0;
+ }
+ maux->nals = 0;
+
+ // get the buffers ready for the next next_line() call
+ for (i=0; i<files->nreaders; i++)
+ shake_buffer(maux, i, pos);
+}
+
+void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd)
+{
+ kstring_t str = {0,0,0};
+ ksprintf(&str,"##%sVersion=%s+htslib-%s\n", cmd, bcftools_version(), hts_version());
+ bcf_hdr_append(hdr,str.s);
+
+ str.l = 0;
+ ksprintf(&str,"##%sCommand=%s", cmd, argv[0]);
+ int i;
+ for (i=1; i<argc; i++)
+ {
+ if ( strchr(argv[i],' ') )
+ ksprintf(&str, " '%s'", argv[i]);
+ else
+ ksprintf(&str, " %s", argv[i]);
+ }
+ kputc('\n', &str);
+ bcf_hdr_append(hdr,str.s);
+ free(str.s);
+
+ bcf_hdr_sync(hdr);
+}
+
+void merge_vcf(args_t *args)
+{
+ args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ args->out_hdr = bcf_hdr_init("w");
+
+ if ( args->header_fname )
+ {
+ if ( bcf_hdr_set(args->out_hdr,args->header_fname) ) error("Could not read/parse the header: %s\n", args->header_fname);
+ }
+ else
+ {
+ int i;
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ char buf[10]; snprintf(buf,10,"%d",i+1);
+ merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples);
+ }
+ bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge");
+ bcf_hdr_sync(args->out_hdr);
+ }
+ info_rules_init(args);
+
+ bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header));
+ bcf_hdr_write(args->out_fh, args->out_hdr);
+ if ( args->header_only )
+ {
+ bcf_hdr_destroy(args->out_hdr);
+ hts_close(args->out_fh);
+ return;
+ }
+
+ if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init();
+ args->maux = maux_init(args->files);
+ args->out_line = bcf_init1();
+ args->tmph = kh_init(strdict);
+ int ret;
+ while ( (ret=bcf_sr_next_line(args->files)) )
+ {
+ merge_buffer(args);
+ }
+ info_rules_destroy(args);
+ maux_destroy(args->maux);
+ bcf_hdr_destroy(args->out_hdr);
+ hts_close(args->out_fh);
+ bcf_destroy1(args->out_line);
+ kh_destroy(strdict, args->tmph);
+ if ( args->tmps.m ) free(args->tmps.s);
+ if ( args->vcmp ) vcmp_destroy(args->vcmp);
+}
+
+static void usage(void)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Merge multiple VCF/BCF files from non-overlapping sample sets to create one multi-sample file.\n");
+ fprintf(pysamerr, " Note that only records from different files can be merged, never from the same file. For\n");
+ fprintf(pysamerr, " \"vertical\" merge take a look at \"bcftools norm\" instead.\n");
+ fprintf(pysamerr, "Usage: bcftools merge [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Options:\n");
+ fprintf(pysamerr, " --force-samples resolve duplicate sample names\n");
+ fprintf(pysamerr, " --print-header print only the merged header and exit\n");
+ fprintf(pysamerr, " --use-header <file> use the provided header\n");
+ fprintf(pysamerr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(pysamerr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
+ fprintf(pysamerr, " -l, --file-list <file> read file names from the file\n");
+ fprintf(pysamerr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
+ fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysamerr, " -O, --output-type <b|u|z|v> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_vcfmerge(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->files = bcf_sr_init();
+ args->argc = argc; args->argv = argv;
+ args->output_fname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ args->collapse = COLLAPSE_BOTH;
+ int regions_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"help",no_argument,NULL,'h'},
+ {"merge",required_argument,NULL,'m'},
+ {"file-list",required_argument,NULL,'l'},
+ {"apply-filters",required_argument,NULL,'f'},
+ {"use-header",required_argument,NULL,1},
+ {"print-header",no_argument,NULL,2},
+ {"force-samples",no_argument,NULL,3},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"info-rules",required_argument,NULL,'i'},
+ {NULL,0,NULL,0}
+ };
+ while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'l': args->file_list = optarg; break;
+ case 'i': args->info_rules = optarg; break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'm':
+ args->collapse = COLLAPSE_NONE;
+ if ( !strcmp(optarg,"snps") ) args->collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(optarg,"indels") ) args->collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"both") ) args->collapse |= COLLAPSE_BOTH;
+ else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE;
+ else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
+ else error("The -m type \"%s\" is not recognised.\n", optarg);
+ break;
+ case 'f': args->files->apply_filters = optarg; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 1 : args->header_fname = optarg; break;
+ case 2 : args->header_only = 1; break;
+ case 3 : args->force_samples = 1; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ if ( argc==optind && !args->file_list ) usage();
+ if ( argc-optind<2 && !args->file_list ) usage();
+
+ args->files->require_index = 1;
+ if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+
+ while (optind<argc)
+ {
+ if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
+ optind++;
+ }
+ if ( args->file_list )
+ {
+ int nfiles, i;
+ char **files = hts_readlines(args->file_list, &nfiles);
+ if ( !files ) error("Failed to read from %s\n", args->file_list);
+ for (i=0;i<nfiles; i++)
+ if ( !bcf_sr_add_reader(args->files, files[i]) ) error("Failed to open %s: %s\n", files[i],bcf_sr_strerror(args->files->errnum));
+ for (i=0; i<nfiles; i++) free(files[i]);
+ free(files);
+ }
+ merge_vcf(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
+
diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c
new file mode 100644
index 0000000..732eca9
--- /dev/null
+++ b/bcftools/vcfnorm.c
@@ -0,0 +1,1810 @@
+/* vcfnorm.c -- Left-align and normalize indels.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/faidx.h>
+#include "bcftools.h"
+#include "rbuf.h"
+
+#define CHECK_REF_EXIT 0
+#define CHECK_REF_WARN 1
+#define CHECK_REF_SKIP 2
+#define CHECK_REF_FIX 4
+
+#define MROWS_SPLIT 1
+#define MROWS_MERGE 2
+
+// for -m+, mapping from allele indexes of a single input record
+// to allele indexes of output record
+typedef struct
+{
+ int nals, mals, *map;
+}
+map_t;
+
+typedef struct
+{
+ char *tseq, *seq;
+ int mseq;
+ bcf1_t **lines, **tmp_lines, **alines, **blines, *mrow_out;
+ int ntmp_lines, mtmp_lines, nalines, malines, nblines, mblines;
+ map_t *maps; // mrow map for each buffered record
+ char **als;
+ int mmaps, nals, mals;
+ uint8_t *tmp_arr1, *tmp_arr2, *diploid;
+ int ntmp_arr1, ntmp_arr2;
+ kstring_t *tmp_str;
+ kstring_t *tmp_als, tmp_als_str;
+ int ntmp_als;
+ rbuf_t rbuf;
+ int buf_win; // maximum distance between two records to consider
+ int aln_win; // the realignment window size (maximum repeat size)
+ bcf_srs_t *files; // using the synced reader only for -r option
+ bcf_hdr_t *hdr;
+ faidx_t *fai;
+ struct { int tot, set, swap; } nref;
+ char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
+ int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels;
+ int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious;
+}
+args_t;
+
+static inline int replace_iupac_codes(char *seq, int nseq)
+{
+    // Replace ambiguity codes with N for now; it remains to be seen what the VCF spec will eventually codify.
+ int i, n = 0;
+ for (i=0; i<nseq; i++)
+ {
+ char c = toupper(seq[i]);
+ if ( c!='A' && c!='C' && c!='G' && c!='T' ) { seq[i] = 'N'; n++; }
+ }
+ return n;
+}
+
+static void fix_ref(args_t *args, bcf1_t *line)
+{
+ int reflen = strlen(line->d.allele[0]);
+ int i, maxlen = reflen, len;
+ for (i=1; i<line->n_allele; i++)
+ {
+ int len = strlen(line->d.allele[i]);
+ if ( maxlen < len ) maxlen = len;
+ }
+
+ char *ref = faidx_fetch_seq(args->fai, (char*)bcf_seqname(args->hdr,line), line->pos, line->pos+maxlen-1, &len);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1);
+ replace_iupac_codes(ref,len);
+
+ args->nref.tot++;
+
+ // is the REF different?
+ if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+
+ // is the REF allele missing or N?
+ if ( reflen==1 && (line->d.allele[0][0]=='.' || line->d.allele[0][0]=='N' || line->d.allele[0][0]=='n') )
+ {
+ line->d.allele[0][0] = ref[0];
+ args->nref.set++;
+ free(ref);
+ bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ return;
+ }
+
+ // does REF contain non-standard bases?
+ if ( replace_iupac_codes(line->d.allele[0],strlen(line->d.allele[0])) )
+ {
+ args->nref.set++;
+ bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+ }
+
+ // is it swapped?
+ for (i=1; i<line->n_allele; i++)
+ {
+ int len = strlen(line->d.allele[i]);
+ if ( !strncasecmp(line->d.allele[i],ref,len) ) break;
+ }
+
+ kstring_t str = {0,0,0};
+ if ( i==line->n_allele )
+ {
+ // none of the alternate alleles matches the reference
+ if ( line->n_allele>1 )
+ args->nref.set++;
+ else
+ args->nref.swap++;
+
+ kputs(line->d.allele[0],&str);
+ kputc(',',&str);
+ for (i=1; i<line->n_allele; i++)
+ {
+ kputs(line->d.allele[i],&str);
+ kputc(',',&str);
+ }
+ kputc(ref[0],&str);
+ bcf_update_alleles_str(args->hdr,line,str.s);
+ str.l = 0;
+ }
+ else
+ args->nref.swap++;
+ free(ref);
+
+ // swap the alleles
+ int j;
+ kputs(line->d.allele[i],&str);
+ for (j=1; j<i; j++)
+ {
+ kputc(',',&str);
+ kputs(line->d.allele[j],&str);
+ }
+ kputc(',',&str);
+ kputs(line->d.allele[0],&str);
+ for (j=i+1; j<line->n_allele; j++)
+ {
+ kputc(',',&str);
+ kputs(line->d.allele[j],&str);
+ }
+ bcf_update_alleles_str(args->hdr,line,str.s);
+
+ // swap genotypes
+ int ntmp = args->ntmp_arr1 / sizeof(int32_t); // reuse tmp_arr declared as uint8_t
+ int ngts = bcf_get_genotypes(args->hdr, line, &args->tmp_arr1, &ntmp);
+ args->ntmp_arr1 = ntmp * sizeof(int32_t);
+ int32_t *gts = (int32_t*) args->tmp_arr1;
+ int ni = 0;
+ for (j=0; j<ngts; j++)
+ {
+ if ( gts[j]==bcf_gt_unphased(0) ) { gts[j] = bcf_gt_unphased(i); ni++; }
+ else if ( gts[j]==bcf_gt_phased(0) ) { gts[j] = bcf_gt_phased(i); ni++; }
+ else if ( gts[j]==bcf_gt_unphased(i) ) gts[j] = bcf_gt_unphased(0);
+ else if ( gts[j]==bcf_gt_phased(i) ) gts[j] = bcf_gt_phased(0);
+ }
+ bcf_update_genotypes(args->hdr,line,gts,ngts);
+
+ // update AC
+ int nac = bcf_get_info_int32(args->hdr, line, "AC", &args->tmp_arr1, &ntmp);
+ args->ntmp_arr1 = ntmp * sizeof(int32_t);
+ if ( i <= nac )
+ {
+ int32_t *ac = (int32_t*)args->tmp_arr1;
+ ac[i-1] = ni;
+ bcf_update_info_int32(args->hdr, line, "AC", ac, nac);
+ }
+
+ free(str.s);
+}
+
+static void fix_dup_alt(args_t *args, bcf1_t *line)
+{
+ // update alleles, create a mapping between old and new indexes
+ hts_expand(uint8_t,line->n_allele,args->ntmp_arr1,args->tmp_arr1);
+ args->tmp_arr1[0] = 0; // ref always unchanged
+
+ int i, j, nals = line->n_allele, nals_ori = line->n_allele;
+ for (i=1, j=1; i<line->n_allele; i++)
+ {
+ if ( strcmp(line->d.allele[0],line->d.allele[i]) )
+ {
+ args->tmp_arr1[i] = j++;
+ continue;
+ }
+ args->tmp_arr1[i] = 0;
+ nals--;
+ }
+ for (i=1, j=1; i<line->n_allele; i++)
+ {
+ if ( !args->tmp_arr1[i] ) continue;
+ line->d.allele[j++] = line->d.allele[i];
+ }
+ bcf_update_alleles(args->hdr, line, (const char**)line->d.allele, nals);
+
+
+ // update genotypes
+ int ntmp = args->ntmp_arr2 / sizeof(int32_t); // reuse tmp_arr declared as uint8_t
+ int ngts = bcf_get_genotypes(args->hdr, line, &args->tmp_arr2, &ntmp);
+ args->ntmp_arr2 = ntmp * sizeof(int32_t);
+ int32_t *gts = (int32_t*) args->tmp_arr2;
+ int changed = 0;
+ for (i=0; i<ngts; i++)
+ {
+ if ( bcf_gt_is_missing(gts[i]) || gts[i]==bcf_int32_vector_end ) continue;
+ int ial = bcf_gt_allele(gts[i]);
+ if ( ial<nals_ori && ial==args->tmp_arr1[ial] ) continue;
+ int ial_new = ial<nals_ori ? args->tmp_arr1[ial] : 0;
+ gts[i] = bcf_gt_is_phased(gts[i]) ? bcf_gt_phased(ial_new) : bcf_gt_unphased(ial_new);
+ changed = 1;
+ }
+ if ( changed ) bcf_update_genotypes(args->hdr,line,gts,ngts);
+}
+
+#define ERR_DUP_ALLELE -2
+#define ERR_REF_MISMATCH -1
+#define ERR_OK 0
+#define ERR_SYMBOLIC 1
+
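+// A small worked example of the trimming below: REF=CAAA, ALT=CAA is trimmed from
+// the right while the last bases agree (CAAA/CAA -> CAA/CA -> CA/C); the left trim
+// then stops because at least one shared base must remain, so the record becomes
+// REF=CA, ALT=C at the original POS.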
+static int realign(args_t *args, bcf1_t *line)
+{
+ bcf_unpack(line, BCF_UN_STR);
+
+ // Sanity check REF
+ int i, nref, reflen = strlen(line->d.allele[0]);
+ char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
+ replace_iupac_codes(ref,nref);
+
+ // does REF contain non-standard bases?
+ if ( replace_iupac_codes(line->d.allele[0],reflen) )
+ {
+ args->nchanged++;
+ bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ }
+ if ( strcasecmp(ref,line->d.allele[0]) )
+ {
+ if ( args->check_ref==CHECK_REF_EXIT )
+ error("Reference allele mismatch at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]);
+ if ( args->check_ref & CHECK_REF_WARN )
+ fprintf(stderr,"REF_MISMATCH\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]);
+ free(ref);
+ return ERR_REF_MISMATCH;
+ }
+ free(ref);
+ ref = NULL;
+
+ if ( line->n_allele == 1 ) return ERR_OK; // a REF
+
+ // make a copy of each allele for trimming
+ hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als);
+ kstring_t *als = args->tmp_als;
+ for (i=0; i<line->n_allele; i++)
+ {
+ if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele
+
+ als[i].l = 0;
+ kputs(line->d.allele[i], &als[i]);
+
+ if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
+ }
+
+
+ // trim from right
+ int ori_pos = line->pos;
+ while (1)
+ {
+ // is the rightmost base identical in all alleles?
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( als[0].s[ als[0].l-1 ]!=als[i].s[ als[i].l-1 ] ) break;
+ }
+ if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
+
+ int pad_from_left = 0;
+ for (i=0; i<line->n_allele; i++) // trim all alleles
+ {
+ als[i].l--;
+ if ( !als[i].l ) pad_from_left = 1;
+ }
+ if ( pad_from_left )
+ {
+ int npad = line->pos >= args->aln_win ? args->aln_win : line->pos;
+ free(ref);
+ ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad, line->pos-1, &nref);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad+1);
+ replace_iupac_codes(ref,nref);
+ for (i=0; i<line->n_allele; i++)
+ {
+ ks_resize(&als[i], als[i].l + npad);
+ if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l);
+ memcpy(als[i].s,ref,npad);
+ als[i].l += npad;
+ }
+ line->pos -= npad;
+ }
+ }
+ free(ref);
+
+ // trim from left
+ int ntrim_left = 0;
+ while (1)
+ {
+ // is the first base identical in all alleles?
+ int min_len = als[0].l - ntrim_left;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break;
+ if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
+ }
+        if ( i!=line->n_allele || min_len==1 ) break; // the bases differ or only one base is left, cannot trim further
+ ntrim_left++;
+ }
+ if ( ntrim_left )
+ {
+ for (i=0; i<line->n_allele; i++)
+ {
+ memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
+ als[i].l -= ntrim_left;
+ }
+ line->pos += ntrim_left;
+ }
+
+ // Have the alleles changed?
+ als[0].s[ als[0].l ] = 0; // in order for strcmp to work
+ if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
+
+ // Create new block of alleles and update
+ args->tmp_als_str.l = 0;
+ for (i=0; i<line->n_allele; i++)
+ {
+ if (i>0) kputc(',',&args->tmp_als_str);
+ kputsn(als[i].s,als[i].l,&args->tmp_als_str);
+ }
+ args->tmp_als_str.s[ args->tmp_als_str.l ] = 0;
+ bcf_update_alleles_str(args->hdr,line,args->tmp_als_str.s);
+ args->nchanged++;
+
+ return ERR_OK;
+}
+
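+// When a multiallelic record is split, the biallelic record for the 0-based ALT
+// index ialt (allele ialt+1 in the source record) keeps only a subset of each
+// per-allele tag:
+//   Number=A: the single value at index ialt
+//   Number=R: the values for alleles 0 and ialt+1
+//   Number=G: the diploid entries for 0/0, 0/(ialt+1) and (ialt+1)/(ialt+1), i.e.
+//             indexes 0, bcf_alleles2gt(0,ialt+1) and bcf_alleles2gt(ialt+1,ialt+1)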
+static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
+{
+ #define BRANCH_NUMERIC(type,type_t) \
+ { \
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); \
+ int ntmp = args->ntmp_arr1 / sizeof(type_t); \
+ int ret = bcf_get_info_##type(args->hdr,src,tag,&args->tmp_arr1,&ntmp); \
+ args->ntmp_arr1 = ntmp * sizeof(type_t); \
+ assert( ret>0 ); \
+ type_t *vals = (type_t*) args->tmp_arr1; \
+ int len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key); \
+ if ( len==BCF_VL_A ) \
+ { \
+ assert( ret==src->n_allele-1); \
+ bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \
+ } \
+ else if ( len==BCF_VL_R ) \
+ { \
+ assert( ret==src->n_allele); \
+ if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
+ bcf_update_info_##type(args->hdr,dst,tag,vals,2); \
+ } \
+ else if ( len==BCF_VL_G ) \
+ { \
+ assert( ret==src->n_allele*(src->n_allele+1)/2 ); \
+ if ( ialt!=0 ) \
+ { \
+ vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \
+ vals[2] = vals[bcf_alleles2gt(ialt+1,ialt+1)]; \
+ } \
+ bcf_update_info_##type(args->hdr,dst,tag,vals,3); \
+ } \
+ else \
+ bcf_update_info_##type(args->hdr,dst,tag,vals,ret); \
+ }
+ switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key))
+ {
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float); break;
+ }
+ #undef BRANCH_NUMERIC
+}
+// Find n-th field in a comma-separated list and move it to dst.
+// The memory areas may overlap.
+#define STR_MOVE_NTH(dst,src,end,nth,len) \
+{ \
+ char *ss = src, *se = src; \
+ int j = 0; \
+ while ( *se && se<(end) ) \
+ { \
+ if ( *se==',' ) \
+ { \
+ if ( j==nth ) break; \
+ j++; \
+ ss = se+1; \
+ } \
+ se++; \
+ } \
+ if ( j==nth ) \
+ { \
+ int n = se - ss; \
+ memmove((dst),ss,n); \
+ src = se; \
+ len += n; \
+ } \
+ else len = -1; \
+}
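+// For example, with src pointing at "3,5,7", STR_MOVE_NTH(dst,src,end,1,len) copies
+// the middle field "5" to dst, advances src to the end of that field and adds its
+// length (1) to len; asking for a field past the last one sets len to -1 instead.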
+static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
+{
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
+ int ret = bcf_get_info_string(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1);
+ assert( ret>0 );
+
+ kstring_t str;
+ str.m = args->ntmp_arr1;
+ str.l = ret;
+ str.s = (char*) args->tmp_arr1;
+
+ int len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key);
+ if ( len==BCF_VL_A )
+ {
+ char *tmp = str.s;
+ int len = 0;
+ STR_MOVE_NTH(str.s,tmp,str.s+str.l,ialt,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ str.s[len] = 0;
+ bcf_update_info_string(args->hdr,dst,tag,str.s);
+ }
+ else if ( len==BCF_VL_R )
+ {
+ char *tmp = str.s;
+ int len = 0;
+ STR_MOVE_NTH(str.s,tmp,str.s+str.l,0,len);
+ str.s[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,ialt,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ str.s[len] = 0;
+ bcf_update_info_string(args->hdr,dst,tag,str.s);
+ }
+ else if ( len==BCF_VL_G )
+ {
+ int i0a = bcf_alleles2gt(0,ialt+1), iaa = bcf_alleles2gt(ialt+1,ialt+1);
+ char *tmp = str.s;
+ int len = 0;
+ STR_MOVE_NTH(str.s,tmp,str.s+str.l,0,len);
+ str.s[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,i0a-1,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ str.s[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,iaa-i0a-1,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ str.s[len] = 0;
+ bcf_update_info_string(args->hdr,dst,tag,str.s);
+ }
+ else
+ bcf_update_info_string(args->hdr,dst,tag,str.s);
+}
+static void split_info_flag(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
+{
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
+ int ret = bcf_get_info_flag(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1);
+ bcf_update_info_flag(args->hdr,dst,tag,NULL,ret);
+}
+
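+// Only the selected ALT survives the genotype split: alleles equal to ialt+1 in the
+// source record become allele 1 and all other non-reference alleles are reset to 0.
+// For example, splitting out the second ALT (ialt=1) turns 1/2 into 0/1, 2/2 into
+// 1/1 and 0/1 into 0/0.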
+static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
+{
+ int ntmp = args->ntmp_arr1 / 4;
+ int ngts = bcf_get_genotypes(args->hdr,src,&args->tmp_arr1,&ntmp);
+ args->ntmp_arr1 = ntmp * 4;
+ assert( ngts >0 );
+
+ int32_t *gt = (int32_t*) args->tmp_arr1;
+ int i, j, nsmpl = bcf_hdr_nsamples(args->hdr);
+ ngts /= nsmpl;
+ for (i=0; i<nsmpl; i++)
+ {
+ for (j=0; j<ngts; j++)
+ {
+ if ( gt[j]==bcf_int32_vector_end ) break;
+ if ( bcf_gt_is_missing(gt[j]) || bcf_gt_allele(gt[j])==0 ) continue; // missing allele or ref: leave as is
+ if ( bcf_gt_allele(gt[j])==ialt+1 )
+ gt[j] = bcf_gt_unphased(1) | bcf_gt_is_phased(gt[j]); // set to first ALT
+ else
+ gt[j] = bcf_gt_unphased(0) | bcf_gt_is_phased(gt[j]); // set to REF
+ }
+ gt += ngts;
+ }
+ bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl);
+}
+static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
+{
+ #define BRANCH_NUMERIC(type,type_t,is_vector_end,set_vector_end) \
+ { \
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \
+ int ntmp = args->ntmp_arr1 / sizeof(type_t); \
+ int nvals = bcf_get_format_##type(args->hdr,src,tag,&args->tmp_arr1,&ntmp); \
+ args->ntmp_arr1 = ntmp * sizeof(type_t); \
+ assert( nvals>0 ); \
+ type_t *vals = (type_t *) args->tmp_arr1; \
+ int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id); \
+ int i, nsmpl = bcf_hdr_nsamples(args->hdr); \
+ if ( nvals==nsmpl ) /* all values are missing */ \
+ { \
+ bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \
+ return; \
+ } \
+ if ( len==BCF_VL_A ) \
+ { \
+ assert( nvals==(src->n_allele-1)*nsmpl); \
+ nvals /= nsmpl; \
+ type_t *src_vals = vals, *dst_vals = vals; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ dst_vals[0] = src_vals[ialt]; \
+ dst_vals += 1; \
+ src_vals += nvals; \
+ } \
+ bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \
+ } \
+ else if ( len==BCF_VL_R ) \
+ { \
+ assert( nvals==src->n_allele*nsmpl); \
+ nvals /= nsmpl; \
+ type_t *src_vals = vals, *dst_vals = vals; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ dst_vals[0] = src_vals[0]; \
+ dst_vals[1] = src_vals[ialt+1]; \
+ dst_vals += 2; \
+ src_vals += nvals; \
+ } \
+ bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl*2); \
+ } \
+ else if ( len==BCF_VL_G ) \
+ { \
+ if ( nvals!=src->n_allele*(src->n_allele+1)/2*nsmpl && nvals!=src->n_allele*nsmpl ) \
+ error("Error at %s:%d, the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \
+ nvals /= nsmpl; \
+ int all_haploid = nvals==src->n_allele ? 1 : 0; \
+ type_t *src_vals = vals, *dst_vals = vals; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ int haploid = all_haploid; \
+ if ( !haploid ) \
+ { \
+ int j; \
+ for (j=0; j<nvals; j++) if ( is_vector_end ) break; \
+ if ( j!=nvals ) haploid = 1; \
+ } \
+ dst_vals[0] = src_vals[0]; \
+ if ( haploid ) \
+ { \
+ dst_vals[1] = src_vals[ialt+1]; \
+ if ( !all_haploid ) set_vector_end; \
+ } \
+ else \
+ { \
+ dst_vals[1] = src_vals[bcf_alleles2gt(0,ialt+1)]; \
+ dst_vals[2] = src_vals[bcf_alleles2gt(ialt+1,ialt+1)]; \
+ } \
+ dst_vals += all_haploid ? 2 : 3; \
+ src_vals += nvals; \
+ } \
+ bcf_update_format_##type(args->hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \
+ } \
+ else \
+ bcf_update_format_##type(args->hdr,dst,tag,vals,nvals); \
+ }
+ switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
+ {
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, dst_vals[2]=bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break;
+ }
+ #undef BRANCH_NUMERIC
+}
+static void squeeze_format_char(char *str, int src_blen, int dst_blen, int n)
+{
+ int i, isrc = 0, idst = 0;
+ for (i=0; i<n; i++)
+ {
+ memmove(str+idst,str+isrc,dst_blen);
+ idst += dst_blen;
+ isrc += src_blen;
+ }
+}
+static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
+{
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id);
+ int ret = bcf_get_format_char(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1);
+ assert( ret>0 );
+
+ kstring_t str;
+ str.m = args->ntmp_arr1;
+ str.l = ret;
+ str.s = (char*) args->tmp_arr1;
+
+ int nsmpl = bcf_hdr_nsamples(args->hdr);
+ int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id);
+ if ( len==BCF_VL_A )
+ {
+ int i, blen = ret/nsmpl, maxlen = 0;
+ char *ptr = str.s;
+ for (i=0; i<nsmpl; i++)
+ {
+ char *tmp = ptr;
+ int len = 0;
+ STR_MOVE_NTH(tmp,tmp,ptr+blen,ialt,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ if ( maxlen < len ) maxlen = len;
+ ptr += blen;
+ }
+ if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
+ bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+ }
+ else if ( len==BCF_VL_R )
+ {
+ int i, blen = ret/nsmpl, maxlen = 0;
+ char *ptr = str.s;
+ for (i=0; i<nsmpl; i++)
+ {
+ char *tmp = ptr;
+ int len = 0;
+ STR_MOVE_NTH(ptr,tmp,ptr+blen,0,len);
+ ptr[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,ialt,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ if ( maxlen < len ) maxlen = len;
+ ptr += blen;
+ }
+ if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
+ bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+ }
+ else if ( len==BCF_VL_G )
+ {
+ int i, blen = ret/nsmpl, maxlen = 0, i0a = bcf_alleles2gt(0,ialt+1), iaa = bcf_alleles2gt(ialt+1,ialt+1);
+ char *ptr = str.s;
+ for (i=0; i<nsmpl; i++)
+ {
+ char *se = ptr, *sx = ptr+blen;
+ int nfields = 1;
+ while ( *se && se<sx )
+ {
+ if ( *se==',' ) nfields++;
+ se++;
+ }
+ assert( nfields==src->n_allele*(src->n_allele+1)/2 || nfields==src->n_allele );
+ int len = 0;
+ if ( nfields==src->n_allele ) // haploid
+ {
+ char *tmp = ptr;
+ STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,0,len);
+ ptr[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,ialt,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ }
+ else // diploid
+ {
+ char *tmp = ptr;
+ STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,0,len);
+ ptr[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,i0a-1,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ ptr[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,iaa-i0a-1,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ }
+ if ( maxlen < len ) maxlen = len;
+ ptr += blen;
+ }
+ if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
+ bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+ }
+ else
+ bcf_update_format_char(args->hdr,dst,tag,str.s,str.l);
+}
+
+
+static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
+{
+ int i;
+
+ bcf_unpack(line, BCF_UN_ALL);
+
+ // Init the target biallelic lines
+ args->ntmp_lines = line->n_allele-1;
+ if ( args->mtmp_lines < args->ntmp_lines )
+ {
+ args->tmp_lines = (bcf1_t **)realloc(args->tmp_lines,sizeof(bcf1_t*)*args->ntmp_lines);
+ for (i=args->mtmp_lines; i<args->ntmp_lines; i++)
+ args->tmp_lines[i] = NULL;
+ args->mtmp_lines = args->ntmp_lines;
+ }
+ kstring_t tmp = {0,0,0};
+ kputs(line->d.allele[0], &tmp);
+ kputc(',', &tmp);
+ int rlen = tmp.l;
+ int gt_id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"GT");
+ for (i=0; i<args->ntmp_lines; i++) // for each ALT allele
+ {
+ if ( !args->tmp_lines[i] ) args->tmp_lines[i] = bcf_init1();
+ bcf1_t *dst = args->tmp_lines[i];
+ bcf_clear(dst);
+
+ dst->rid = line->rid;
+ dst->pos = line->pos;
+ dst->qual = line->qual;
+
+ // Not quite sure how to handle IDs, they can be assigned to a specific
+ // ALT. For now we leave the ID unchanged for all.
+ bcf_update_id(args->hdr, dst, line->d.id ? line->d.id : ".");
+
+ tmp.l = rlen;
+ kputs(line->d.allele[i+1],&tmp);
+ bcf_update_alleles_str(args->hdr,dst,tmp.s);
+
+ if ( line->d.n_flt ) bcf_update_filter(args->hdr, dst, line->d.flt, line->d.n_flt);
+
+ int j;
+ for (j=0; j<line->n_info; j++)
+ {
+ bcf_info_t *info = &line->d.info[j];
+ int type = bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key);
+ if ( type==BCF_HT_INT || type==BCF_HT_REAL ) split_info_numeric(args, line, info, i, dst);
+ else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst);
+ else split_info_string(args, line, info, i, dst);
+ }
+
+ dst->n_sample = line->n_sample;
+ for (j=0; j<line->n_fmt; j++)
+ {
+ bcf_fmt_t *fmt = &line->d.fmt[j];
+ int type = bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id);
+ if ( fmt->id==gt_id ) split_format_genotype(args, line, fmt, i, dst);
+ else if ( type==BCF_HT_INT || type==BCF_HT_REAL ) split_format_numeric(args, line, fmt, i, dst);
+ else split_format_string(args, line, fmt, i, dst);
+ }
+ }
+ free(tmp.s);
+}
+
+// Enlarge a FORMAT array containing nsmpl samples, each with nvals_ori values,
+// to accommodate nvals values for each sample, filling the gaps with missing
+// values. Works also for INFO arrays, with nsmpl set to 1.
+#define ENLARGE_ARRAY(type_t,set_missing,arr,narr_bytes,nsmpl,nvals_ori,nvals) \
+{ \
+ int nbytes_new = (nsmpl)*(nvals)*sizeof(type_t); \
+ hts_expand(uint8_t,nbytes_new,narr_bytes,arr); \
+ int ismpl, k; \
+ for (ismpl=nsmpl-1; ismpl>=0; ismpl--) \
+ { \
+ type_t *dst_ptr = ((type_t*)arr) + ismpl*(nvals); \
+ type_t *src_ptr = ((type_t*)arr) + ismpl*nvals_ori; \
+ memmove(dst_ptr,src_ptr,sizeof(type_t)*nvals_ori); \
+ for (k=nvals_ori; k<nvals; k++) set_missing; \
+ } \
+}
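+// Example: when a biallelic record is folded into a 3-allele output, a Number=R
+// INFO array is grown in place from nvals_ori=2 to nvals=3 values (nsmpl=1 for
+// INFO); the original two values stay at indexes 0-1 and index 2 is set to the
+// missing value.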
+static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_info_t *info, bcf1_t *dst)
+{
+ #define BRANCH_NUMERIC(type,type_t,set_missing,is_vector_end) \
+ { \
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); \
+ int ntmp = args->ntmp_arr1 / sizeof(type_t); \
+ int nvals_ori = bcf_get_info_##type(args->hdr,lines[0],tag,&args->tmp_arr1,&ntmp); \
+ args->ntmp_arr1 = ntmp * sizeof(type_t); \
+ assert( nvals_ori>0 ); \
+ type_t *vals = (type_t*) args->tmp_arr1, *vals2; \
+ int i,k,len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key); \
+ if ( len==BCF_VL_A ) \
+ { \
+ if (nvals_ori!=lines[0]->n_allele - 1) \
+ error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele-1); \
+ int nvals = dst->n_allele - 1; \
+ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \
+ vals = (type_t*) args->tmp_arr1; \
+ for (i=1; i<nlines; i++) \
+ { \
+ int ntmp2 = args->ntmp_arr2 / sizeof(type_t); \
+ int nvals2 = bcf_get_info_##type(args->hdr,lines[i],tag,&args->tmp_arr2,&ntmp2); \
+ if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \
+ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \
+ if (nvals2!=lines[i]->n_allele-1) \
+ error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \
+ vals2 = (type_t*) args->tmp_arr2; \
+ for (k=0; k<nvals2; k++) \
+ { \
+ if ( is_vector_end ) break; \
+ vals[ args->maps[i].map[k+1] - 1 ] = vals2[k]; \
+ } \
+ } \
+ bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+ } \
+ else if ( len==BCF_VL_R ) \
+ { \
+ if (nvals_ori!=lines[0]->n_allele) \
+ error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele); \
+ int nvals = dst->n_allele; \
+ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \
+ vals = (type_t*) args->tmp_arr1; \
+ for (i=1; i<nlines; i++) \
+ { \
+ int ntmp2 = args->ntmp_arr2 / sizeof(type_t); \
+ int nvals2 = bcf_get_info_##type(args->hdr,lines[i],tag,&args->tmp_arr2,&ntmp2); \
+ if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \
+ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \
+ if (nvals2!=lines[i]->n_allele) \
+ error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \
+ vals2 = (type_t*) args->tmp_arr2; \
+ for (k=0; k<nvals2; k++) \
+ { \
+ if ( is_vector_end ) break; \
+ vals[ args->maps[i].map[k] ] = vals2[k]; \
+ } \
+ } \
+ bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+ } \
+ else if ( len==BCF_VL_G ) \
+ { \
+ /* expecting diploid gt in INFO */ \
+ if (nvals_ori!=lines[0]->n_allele*(lines[0]->n_allele+1)/2) { \
+ fprintf(stderr, "todo: merge Number=G INFO fields for haploid sites\n"); \
+ error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele*(lines[0]->n_allele+1)/2); \
+ } \
+ int nvals = dst->n_allele*(dst->n_allele+1)/2; \
+ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \
+ vals = (type_t*) args->tmp_arr1; \
+ for (i=1; i<nlines; i++) \
+ { \
+ int ntmp2 = args->ntmp_arr2 / sizeof(type_t); \
+ int nvals2 = bcf_get_info_##type(args->hdr,lines[i],tag,&args->tmp_arr2,&ntmp2); \
+ if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \
+ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \
+ if (nvals2!=lines[i]->n_allele*(lines[i]->n_allele+1)/2) \
+ error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \
+ vals2 = (type_t*) args->tmp_arr2; \
+ int ia,ib; \
+ k = 0; \
+ for (ia=0; ia<lines[i]->n_allele; ia++) \
+ { \
+ for (ib=0; ib<=ia; ib++) \
+ { \
+ if ( is_vector_end ) break; \
+ int l = bcf_alleles2gt(args->maps[i].map[ia],args->maps[i].map[ib]); \
+ vals[l] = vals2[k]; \
+ k++; \
+ } \
+ } \
+ } \
+ bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+ } \
+ else \
+ bcf_update_info_##type(args->hdr,dst,tag,vals,nvals_ori); \
+ }
+ switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key))
+ {
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, dst_ptr[k]=bcf_int32_missing, vals2[k]==bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_set_missing(dst_ptr[k]), bcf_float_is_vector_end(vals2[k])); break;
+ }
+ #undef BRANCH_NUMERIC
+}
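+// Note on the Number=G scattering above: values follow the standard VCF genotype
+// ordering 0/0, 0/1, 1/1, 0/2, 1/2, 2/2, ..., and bcf_alleles2gt(a,b) returns the
+// index of genotype a/b in that ordering (b + a*(a+1)/2 for a>=b).  The ia/ib
+// loops therefore walk the source record's genotype values in order and scatter
+// each one to the index of the remapped allele pair in the merged record.
+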
+static void merge_info_flag(args_t *args, bcf1_t **lines, int nlines, bcf_info_t *info, bcf1_t *dst)
+{
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
+ int ret = bcf_get_info_flag(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
+ bcf_update_info_flag(args->hdr,dst,tag,NULL,ret);
+}
+int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
+static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info_t *info, bcf1_t *dst)
+{
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
+
+ kstring_t str;
+ str.m = args->ntmp_arr1;
+ str.l = 0;
+ str.s = (char*) args->tmp_arr1;
+
+ int i, j, len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key);
+ if ( len==BCF_VL_A || len==BCF_VL_R )
+ {
+ int jfrom = len==BCF_VL_A ? 1 : 0;
+ kputc('.',&str);
+ for (i=jfrom+1; i<dst->n_allele; i++) kputs(",.",&str);
+ for (i=0; i<nlines; i++)
+ {
+ bcf_info_t *src = bcf_get_info(args->hdr,lines[i],tag);
+ if (!src) continue;
+ for (j=jfrom; j<lines[i]->n_allele; j++)
+ copy_string_field((char*)src->vptr, j-jfrom, src->len, &str, args->maps[i].map[j]-jfrom);
+ }
+ str.s[str.l] = 0;
+ args->tmp_arr1 = (uint8_t*) str.s;
+ args->ntmp_arr1 = str.m;
+ bcf_update_info_string(args->hdr,dst,tag,str.s);
+ }
+ else if ( len==BCF_VL_G )
+ {
+ int ngts = dst->n_allele*(dst->n_allele+1)/2;
+ kputc('.',&str);
+ for (i=1; i<ngts; i++) kputs(",.",&str);
+ for (i=0; i<nlines; i++)
+ {
+ bcf_info_t *src = bcf_get_info(args->hdr,lines[i],tag);
+ if (!src) continue;
+ int iori, jori, kori = 0;
+ for (iori=0; iori<lines[i]->n_allele; iori++)
+ {
+ int inew = args->maps[i].map[iori];
+ for (jori=0; jori<=iori; jori++)
+ {
+ int jnew = args->maps[i].map[jori];
+ int knew = bcf_alleles2gt(inew,jnew);
+ copy_string_field((char*)src->vptr,kori,src->len,&str,knew);
+ kori++;
+ }
+ }
+ }
+ str.s[str.l] = 0;
+ args->tmp_arr1 = (uint8_t*) str.s;
+ args->ntmp_arr1 = str.m;
+ bcf_update_info_string(args->hdr,dst,tag,str.s);
+ }
+ else
+ {
+ bcf_get_info_string(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
+ bcf_update_info_string(args->hdr,dst,tag,args->tmp_arr1);
+ }
+}
+static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst)
+{
+ int ntmp = args->ntmp_arr1 / 4;
+ int ngts = bcf_get_genotypes(args->hdr,lines[0],&args->tmp_arr1,&ntmp);
+ args->ntmp_arr1 = ntmp * 4;
+ assert( ngts >0 );
+
+ int nsmpl = bcf_hdr_nsamples(args->hdr);
+ ngts /= nsmpl;
+
+ int i, j, k;
+ for (i=1; i<nlines; i++)
+ {
+ int ntmp2 = args->ntmp_arr2 / 4;
+ int ngts2 = bcf_get_genotypes(args->hdr,lines[i],&args->tmp_arr2,&ntmp2);
+ args->ntmp_arr2 = ntmp2 * 4;
+ ngts2 /= nsmpl;
+ if ( ngts!=ngts2 ) error("Error at %s:%d: cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1);
+
+ int32_t *gt = (int32_t*) args->tmp_arr1;
+ int32_t *gt2 = (int32_t*) args->tmp_arr2;
+ for (j=0; j<nsmpl; j++)
+ {
+ for (k=0; k<ngts; k++)
+ {
+ if ( gt2[k]==bcf_int32_vector_end ) break;
+ if ( bcf_gt_is_missing(gt2[k]) || bcf_gt_allele(gt2[k])==0 ) continue;
+ if ( gt2[k]==0 ) gt[k] = 0; // missing genotype
+ else
+ {
+ int ial = bcf_gt_allele(gt2[k]);
+ assert( ial<args->maps[i].nals );
+ gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]);
+ }
+ }
+ gt += ngts;
+ gt2 += ngts;
+ }
+ }
+ bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl);
+}
+static int diploid_to_haploid(int size, int nsmpl, int nals, uint8_t *vals)
+{
+ int i, dsrc = size*nals*(nals+1)/2, ddst = size*nals;
+ uint8_t *src_ptr = vals + dsrc, *dst_ptr = vals + ddst;
+ for (i=1; i<nsmpl; i++)
+ {
+ memmove(dst_ptr,src_ptr,ddst);
+ dst_ptr += ddst;
+ src_ptr += dsrc;
+ }
+ return nals;
+}
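+// For illustration: with nals=3 the diploid layout stores 3*(3+1)/2 = 6 values
+// per sample while the haploid layout stores 3, so for two samples
+//   [s0_1..s0_6, s1_1..s1_6]  ->  [s0_1..s0_3, s1_1..s1_3]
+// Only the leading nals values of each sample are kept; sample 0 is already in
+// place, so the compaction starts from sample 1.
+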
+static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst)
+{
+ #define BRANCH_NUMERIC(type,type_t,set_missing,is_vector_end,set_vector_end) \
+ { \
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \
+ int ntmp = args->ntmp_arr1 / sizeof(type_t); \
+ int nvals_ori = bcf_get_format_##type(args->hdr,lines[0],tag,&args->tmp_arr1,&ntmp); \
+ args->ntmp_arr1 = ntmp * sizeof(type_t); \
+ assert( nvals_ori>0 ); \
+ type_t *vals2, *vals = (type_t *) args->tmp_arr1; \
+ int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id); \
+ int i, j, k, nsmpl = bcf_hdr_nsamples(args->hdr); \
+ nvals_ori /= nsmpl; \
+ if ( len==BCF_VL_A ) \
+ { \
+ int nvals = dst->n_allele - 1; \
+ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,nsmpl,nvals_ori,nvals); \
+ for (i=1; i<nlines; i++) \
+ { \
+ int ntmp2 = args->ntmp_arr2 / sizeof(type_t); \
+ int nvals2 = bcf_get_format_##type(args->hdr,lines[i],tag,&args->tmp_arr2,&ntmp2); \
+ if (nvals2<0) continue; /* format tag does not exist in this record, skip */ \
+ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \
+ nvals2 /= nsmpl; \
+ if (nvals2!=lines[i]->n_allele-1) \
+ error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \
+ vals = (type_t*) args->tmp_arr1; \
+ vals2 = (type_t*) args->tmp_arr2; \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ for (k=0; k<nvals2; k++) \
+ { \
+ if ( is_vector_end ) break; \
+ vals[ args->maps[i].map[k+1] - 1 ] = vals2[k]; \
+ } \
+ vals += nvals; \
+ vals2 += nvals2; \
+ } \
+ } \
+ bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+ } \
+ else if ( len==BCF_VL_R ) \
+ { \
+ int nvals = dst->n_allele; \
+ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,nsmpl,nvals_ori,nvals); \
+ for (i=1; i<nlines; i++) \
+ { \
+ int ntmp2 = args->ntmp_arr2 / sizeof(type_t); \
+ int nvals2 = bcf_get_format_##type(args->hdr,lines[i],tag,&args->tmp_arr2,&ntmp2); \
+ if (nvals2<0) continue; /* format tag does not exist in this record, skip */ \
+ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \
+ nvals2 /= nsmpl; \
+ if (nvals2!=lines[i]->n_allele) \
+ error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \
+ vals = (type_t*) args->tmp_arr1; \
+ vals2 = (type_t*) args->tmp_arr2; \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ for (k=0; k<nvals2; k++) \
+ { \
+ if ( is_vector_end ) break; \
+ vals[ args->maps[i].map[k] ] = vals2[k]; \
+ } \
+ vals += nvals; \
+ vals2 += nvals2; \
+ } \
+ } \
+ bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+ } \
+ else if ( len==BCF_VL_G ) \
+ { \
+ /* which samples are diploid */ \
+ memset(args->diploid,0,nsmpl); \
+ int all_haploid = 1; \
+ if ( nvals_ori > lines[0]->n_allele ) /* line possibly diploid */ \
+ { \
+ vals2 = (type_t*) args->tmp_arr1; \
+ int ndiploid = lines[0]->n_allele*(lines[0]->n_allele+1)/2; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ if ( !args->diploid[i] ) \
+ { \
+ for (k=0; k<nvals_ori; k++) if ( is_vector_end ) break; \
+ if ( k==ndiploid ) { args->diploid[i] = 1; all_haploid = 0; }\
+ } \
+ vals2 += nvals_ori; \
+ } \
+ } \
+ int nvals = dst->n_allele*(dst->n_allele+1)/2; \
+ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,nsmpl,nvals_ori,nvals); \
+ for (i=1; i<nlines; i++) \
+ { \
+ int ntmp2 = args->ntmp_arr2 / sizeof(type_t); \
+ int nvals2 = bcf_get_format_##type(args->hdr,lines[i],tag,&args->tmp_arr2,&ntmp2); \
+ if (nvals2<0) continue; /* format tag does not exist in this record, skip */ \
+ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \
+ nvals2 /= nsmpl; \
+ int ndiploid = lines[i]->n_allele*(lines[i]->n_allele+1)/2; \
+ int line_diploid = nvals2==ndiploid ? 1 : 0; \
+ if (!(nvals2==1 || nvals2==lines[i]->n_allele || nvals2==lines[i]->n_allele*(lines[i]->n_allele+1)/2)) \
+ error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \
+ vals = (type_t*) args->tmp_arr1; \
+ vals2 = (type_t*) args->tmp_arr2; \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ int smpl_diploid = line_diploid; \
+ if ( smpl_diploid ) \
+ { \
+ for (k=0; k<nvals2; k++) if ( is_vector_end ) break; \
+ if ( k!=ndiploid ) smpl_diploid = 0; \
+ } \
+ if ( smpl_diploid && !args->diploid[j] ) { args->diploid[j] = 1; all_haploid = 0; } \
+ if ( !smpl_diploid ) \
+ { \
+ for (k=0; k<lines[i]->n_allele; k++) vals[args->maps[i].map[k]] = vals2[k]; \
+ } \
+ else \
+ { \
+ k = 0; \
+ int ia,ib; \
+ for (ia=0; ia<lines[i]->n_allele; ia++) \
+ { \
+ for (ib=0; ib<=ia; ib++) \
+ { \
+ int l = bcf_alleles2gt(args->maps[i].map[ia],args->maps[i].map[ib]); \
+ vals[l] = vals2[k]; \
+ k++; \
+ } \
+ } \
+ } \
+ vals += nvals; \
+ vals2 += nvals2; \
+ } \
+ } \
+ if ( all_haploid ) \
+ nvals = diploid_to_haploid(sizeof(type_t),nsmpl,dst->n_allele,args->tmp_arr1); \
+ else \
+ {\
+ k = dst->n_allele;\
+ vals2 = (type_t*) args->tmp_arr1;\
+ for (i=0; i<nsmpl; i++)\
+ {\
+ if ( !args->diploid[i] ) set_vector_end;\
+ vals2 += nvals;\
+ }\
+ }\
+ bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+ } \
+ else \
+ bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \
+ }
+ switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
+ {
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, dst_ptr[k]=bcf_int32_missing, vals2[k]==bcf_int32_vector_end, vals2[k]=bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_set_missing(dst_ptr[k]), bcf_float_is_vector_end(vals2[k]), bcf_float_set_vector_end(vals2[k])); break;
+ }
+ #undef BRANCH_NUMERIC
+}
+static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst)
+{
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id);
+
+ int i, j, k, len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id);
+ if ( len!=BCF_VL_A && len!=BCF_VL_R && len!=BCF_VL_G )
+ {
+ int nret = bcf_get_format_char(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
+ bcf_update_format_char(args->hdr,dst,tag,args->tmp_arr1,nret);
+ return;
+ }
+
+ int nsmpl = bcf_hdr_nsamples(args->hdr);
+ for (i=0; i<nsmpl; i++) args->tmp_str[i].l = 0;
+
+ if ( len==BCF_VL_A || len==BCF_VL_R )
+ {
+ int jfrom = len==BCF_VL_A ? 1 : 0;
+ for (i=0; i<nsmpl; i++)
+ {
+ kstring_t *tmp = &args->tmp_str[i];
+ kputc('.',tmp);
+ for (k=jfrom+1; k<dst->n_allele; k++) kputs(",.",tmp);
+ }
+ for (i=0; i<nlines; i++)
+ {
+ int nret = bcf_get_format_char(args->hdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1);
+            if (nret<0) continue; /* format tag does not exist in this record, skip */
+ nret /= nsmpl;
+ for (k=0; k<nsmpl; k++)
+ {
+ kstring_t *tmp = &args->tmp_str[k];
+ char *src = (char*)args->tmp_arr1 + k*nret;
+ for (j=jfrom; j<lines[i]->n_allele; j++)
+ copy_string_field(src, j-jfrom, nret, tmp, args->maps[i].map[j]-jfrom);
+ }
+ }
+ }
+ else if ( len==BCF_VL_G )
+ {
+ hts_expand(uint8_t,nsmpl,args->ntmp_arr2,args->tmp_arr2);
+ uint8_t *haploid = args->tmp_arr2;
+ int nret = bcf_get_format_char(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
+ nret /= nsmpl;
+ for (i=0; i<nsmpl; i++)
+ {
+ char *ss = (char*)args->tmp_arr1 + i*nret, *se = ss+nret;
+ int nfields = 1;
+ while ( *ss && ss<se )
+ {
+ if ( *ss==',' ) nfields++;
+ ss++;
+ }
+ if ( nfields==lines[0]->n_allele )
+ {
+ haploid[i] = 1;
+ nfields = dst->n_allele;
+ }
+ else if ( nfields==lines[0]->n_allele*(lines[0]->n_allele+1)/2 )
+ {
+ haploid[i] = 0;
+ nfields = dst->n_allele*(dst->n_allele+1)/2;
+ }
+            else error("The field %s at %s:%d is neither diploid nor haploid\n", tag,bcf_seqname(args->hdr,dst),dst->pos+1);
+
+ kstring_t *tmp = &args->tmp_str[i];
+ kputc('.',tmp);
+ for (j=1; j<nfields; j++) kputs(",.",tmp);
+ }
+ for (i=0; i<nlines; i++)
+ {
+ if ( i ) // we already have a copy
+ {
+ nret = bcf_get_format_char(args->hdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1);
+                if (nret<0) continue; /* format tag does not exist in this record, skip */
+ nret /= nsmpl;
+ }
+ for (k=0; k<nsmpl; k++)
+ {
+ kstring_t *tmp = &args->tmp_str[k];
+ char *src = (char*)args->tmp_arr1 + k*nret;
+ if ( haploid[k] )
+ {
+ for (j=0; j<lines[i]->n_allele; j++)
+ copy_string_field(src,j,nret, tmp, args->maps[i].map[j]);
+ }
+ else
+ {
+ int iori, jori, kori = 0;
+ for (iori=0; iori<lines[i]->n_allele; iori++)
+ {
+ int inew = args->maps[i].map[iori];
+ for (jori=0; jori<=iori; jori++)
+ {
+ int jnew = args->maps[i].map[jori];
+ int knew = bcf_alleles2gt(inew,jnew);
+ copy_string_field(src,kori,nret,tmp,knew);
+ kori++;
+ }
+ }
+ }
+ }
+ }
+ }
+ kstring_t str;
+ str.m = args->ntmp_arr2;
+ str.l = 0;
+ str.s = (char*) args->tmp_arr2;
+
+ int max_len = 0;
+ for (i=0; i<nsmpl; i++)
+ if ( max_len < args->tmp_str[i].l ) max_len = args->tmp_str[i].l;
+ for (i=0; i<nsmpl; i++)
+ {
+ kstring_t *tmp = &args->tmp_str[i];
+ kputsn(tmp->s,tmp->l,&str);
+        for (j=tmp->l; j<max_len; j++) kputc(0,&str);   // pad with NULs so every sample occupies max_len bytes in the FORMAT array
+ }
+ args->ntmp_arr2 = str.m;
+ args->tmp_arr2 = (uint8_t*)str.s;
+ bcf_update_format_char(args->hdr,dst,tag,str.s,str.l);
+}
+
+char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb); // see vcfmerge.c
+static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t **lines, int nlines)
+{
+ int i;
+ for (i=0; i<nlines; i++)
+ bcf_unpack(lines[i], BCF_UN_ALL);
+
+ dst->rid = lines[0]->rid;
+ dst->pos = lines[0]->pos;
+
+ // take max for QUAL
+ bcf_float_set_missing(dst->qual);
+ for (i=0; i<nlines; i++) {
+ if (bcf_float_is_missing(lines[i]->qual)) continue;
+ if (bcf_float_is_missing(dst->qual) || dst->qual<lines[i]->qual)
+ dst->qual = lines[i]->qual;
+ }
+
+ bcf_update_id(args->hdr, dst, lines[0]->d.id);
+
+ // Merge and set the alleles, create a mapping from source allele indexes to dst idxs
+ hts_expand0(map_t,nlines,args->mmaps,args->maps); // a mapping for each line
+ args->nals = args->maps[0].nals = lines[0]->n_allele;
+ hts_expand(int,args->maps[0].nals,args->maps[0].mals,args->maps[0].map);
+ hts_expand(char*,args->nals,args->mals,args->als);
+ for (i=0; i<args->maps[0].nals; i++)
+ {
+ args->maps[0].map[i] = i;
+ args->als[i] = strdup(lines[0]->d.allele[i]);
+ }
+ for (i=1; i<nlines; i++)
+ {
+ if (lines[i]->d.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->hdr, dst, lines[i]->d.id);
+ args->maps[i].nals = lines[i]->n_allele;
+ hts_expand(int,args->maps[i].nals,args->maps[i].mals,args->maps[i].map);
+ args->als = merge_alleles(lines[i]->d.allele, lines[i]->n_allele, args->maps[i].map, args->als, &args->nals, &args->mals);
+ if ( !args->als ) error("Failed to merge alleles at %s:%d\n", bcf_seqname(args->hdr,dst),dst->pos+1);
+ }
+ bcf_update_alleles(args->hdr, dst, (const char**)args->als, args->nals);
+ for (i=0; i<args->nals; i++)
+ {
+ free(args->als[i]);
+ args->als[i] = NULL;
+ }
+
+ if ( lines[0]->d.n_flt ) bcf_update_filter(args->hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt);
+ for (i=1; i<nlines; i++) {
+ int j;
+ for (j=0; j<lines[i]->d.n_flt; j++) {
+ // if strict_filter, set FILTER to PASS if any site PASS
+ // otherwise accumulate FILTERs
+ if (lines[i]->d.flt[j] == bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PASS")) {
+ if (args->strict_filter) {
+ bcf_update_filter(args->hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt);
+ break;
+ }
+ else
+ continue;
+ }
+ bcf_add_filter(args->hdr, dst, lines[i]->d.flt[j]);
+ }
+ }
+
+ // merge info
+ for (i=0; i<lines[0]->n_info; i++)
+ {
+ bcf_info_t *info = &lines[0]->d.info[i];
+ int type = bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key);
+ if ( type==BCF_HT_INT || type==BCF_HT_REAL ) merge_info_numeric(args, lines, nlines, info, dst);
+ else if ( type==BCF_HT_FLAG ) merge_info_flag(args, lines, nlines, info, dst);
+ else merge_info_string(args, lines, nlines, info, dst);
+ }
+
+ // merge format
+ int gt_id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"GT");
+ dst->n_sample = lines[0]->n_sample;
+ for (i=0; i<lines[0]->n_fmt; i++)
+ {
+ bcf_fmt_t *fmt = &lines[0]->d.fmt[i];
+ int type = bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id);
+ if ( fmt->id==gt_id ) merge_format_genotype(args, lines, nlines, fmt, dst);
+ else if ( type==BCF_HT_INT || type==BCF_HT_REAL ) merge_format_numeric(args, lines, nlines, fmt, dst);
+ else merge_format_string(args, lines, nlines, fmt, dst);
+ }
+}
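+
+// For illustration: merging two biallelic records at the same position,
+//   line0: REF=A ALT=C   and   line1: REF=A ALT=G,
+// produces dst alleles A,C,G.  The per-line allele maps then read
+//   maps[0].map = {0,1}   (A stays 0, C stays allele 1)
+//   maps[1].map = {0,2}   (A stays 0, G becomes allele 2)
+// and these maps drive the INFO/FORMAT scattering in the merge_* routines above.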
+
+#define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
+static void mrows_schedule(args_t *args, bcf1_t **line)
+{
+ int i,m;
+ if ( args->mrows_collapse==COLLAPSE_ANY // merge all record types together
+ || bcf_get_variant_types(*line)&VCF_SNP // SNP, put into alines
+ || bcf_get_variant_types(*line)==VCF_REF ) // ref
+ {
+ args->nalines++;
+ m = args->malines;
+ hts_expand(bcf1_t*,args->nalines,args->malines,args->alines);
+ for (i=m; i<args->malines; i++) args->alines[i] = bcf_init1();
+ SWAP(bcf1_t*, args->alines[args->nalines-1], *line);
+ }
+ else
+ {
+ args->nblines++;
+ m = args->mblines;
+ hts_expand(bcf1_t*,args->nblines,args->mblines,args->blines);
+ for (i=m; i<args->mblines; i++) args->blines[i] = bcf_init1();
+ SWAP(bcf1_t*, args->blines[args->nblines-1], *line);
+ }
+}
+static int mrows_ready_to_flush(args_t *args, bcf1_t *line)
+{
+ if ( args->nalines && (args->alines[0]->rid!=line->rid || args->alines[0]->pos!=line->pos) ) return 1;
+ if ( args->nblines && (args->blines[0]->rid!=line->rid || args->blines[0]->pos!=line->pos) ) return 1;
+ return 0;
+}
+static bcf1_t *mrows_flush(args_t *args)
+{
+ if ( args->nblines && args->nalines==1 && bcf_get_variant_types(args->alines[0])==VCF_REF )
+ {
+ // By default, REF lines are merged with SNPs if SNPs and indels are to be kept separately.
+ // However, if there are indels only and a single REF line, merge it with indels.
+ args->nblines++;
+ int i,m = args->mblines;
+ hts_expand(bcf1_t*,args->nblines,args->mblines,args->blines);
+ for (i=m; i<args->mblines; i++) args->blines[i] = bcf_init1();
+ SWAP(bcf1_t*, args->blines[args->nblines-1], args->alines[0]);
+ args->nalines--;
+ }
+ if ( args->nalines )
+ {
+ if ( args->nalines==1 )
+ {
+ args->nalines = 0;
+ return args->alines[0];
+ }
+ bcf_clear(args->mrow_out);
+ merge_biallelics_to_multiallelic(args, args->mrow_out, args->alines, args->nalines);
+ args->nalines = 0;
+ return args->mrow_out;
+ }
+ else if ( args->nblines )
+ {
+ if ( args->nblines==1 )
+ {
+ args->nblines = 0;
+ return args->blines[0];
+ }
+ bcf_clear(args->mrow_out);
+ merge_biallelics_to_multiallelic(args, args->mrow_out, args->blines, args->nblines);
+ args->nblines = 0;
+ return args->mrow_out;
+ }
+ return NULL;
+}
+static void flush_buffer(args_t *args, htsFile *file, int n)
+{
+ bcf1_t *line;
+ int i, k;
+ for (i=0; i<n; i++)
+ {
+ k = rbuf_shift(&args->rbuf);
+ if ( args->mrows_op==MROWS_MERGE )
+ {
+ if ( mrows_ready_to_flush(args, args->lines[k]) )
+ {
+ while ( (line=mrows_flush(args)) ) bcf_write1(file, args->hdr, line);
+ }
+ int merge = 1;
+ if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
+ {
+ if ( !(bcf_get_variant_types(args->lines[k]) & args->mrows_collapse) ) merge = 0;
+ }
+ if ( merge )
+ {
+ mrows_schedule(args, &args->lines[k]);
+ continue;
+ }
+ }
+ bcf_write1(file, args->hdr, args->lines[k]);
+ }
+ if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n )
+ {
+ while ( (line=mrows_flush(args)) ) bcf_write1(file, args->hdr, line);
+ }
+}
+
+static void init_data(args_t *args)
+{
+ args->hdr = args->files->readers[0].header;
+ rbuf_init(&args->rbuf, 100);
+ args->lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
+ if ( args->ref_fname )
+ {
+ args->fai = fai_load(args->ref_fname);
+ if ( !args->fai ) error("Failed to load the fai index: %s\n", args->ref_fname);
+ }
+ if ( args->mrows_op==MROWS_MERGE )
+ {
+ args->mrow_out = bcf_init1();
+ args->tmp_str = (kstring_t*) calloc(bcf_hdr_nsamples(args->hdr),sizeof(kstring_t));
+ args->diploid = (uint8_t*) malloc(bcf_hdr_nsamples(args->hdr));
+ }
+}
+
+static void destroy_data(args_t *args)
+{
+ int i;
+ for (i=0; i<args->rbuf.m; i++)
+ if ( args->lines[i] ) bcf_destroy1(args->lines[i]);
+ free(args->lines);
+ for (i=0; i<args->mtmp_lines; i++)
+ if ( args->tmp_lines[i] ) bcf_destroy1(args->tmp_lines[i]);
+ free(args->tmp_lines);
+ for (i=0; i<args->malines; i++)
+ bcf_destroy1(args->alines[i]);
+ free(args->alines);
+ for (i=0; i<args->mblines; i++)
+ bcf_destroy1(args->blines[i]);
+ free(args->blines);
+ for (i=0; i<args->mmaps; i++)
+ free(args->maps[i].map);
+ for (i=0; i<args->ntmp_als; i++)
+ free(args->tmp_als[i].s);
+ free(args->tmp_als);
+ free(args->tmp_als_str.s);
+ if ( args->tmp_str )
+ {
+ for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) free(args->tmp_str[i].s);
+ free(args->tmp_str);
+ }
+ free(args->maps);
+ free(args->als);
+ free(args->tmp_arr1);
+ free(args->tmp_arr2);
+ free(args->diploid);
+ if ( args->mrow_out ) bcf_destroy1(args->mrow_out);
+ if ( args->fai ) fai_destroy(args->fai);
+ if ( args->mseq ) free(args->seq);
+}
+
+
+static void normalize_line(args_t *args, bcf1_t **line_ptr)
+{
+ bcf1_t *line = *line_ptr;
+ if ( args->fai )
+ {
+ if ( args->check_ref & CHECK_REF_FIX ) fix_ref(args, line);
+ if ( args->do_indels )
+ {
+ int ret = realign(args, line);
+
+ // exclude broken VCF lines
+ if ( ret==ERR_REF_MISMATCH && args->check_ref & CHECK_REF_SKIP )
+ {
+ args->nskipped++;
+ return;
+ }
+ if ( ret==ERR_DUP_ALLELE )
+ {
+ if ( args->check_ref & CHECK_REF_FIX )
+ fix_dup_alt(args, line);
+ else if ( args->check_ref==CHECK_REF_EXIT )
+                    error("Duplicate alleles at %s:%d; run with -cw to turn the error into a warning or with -cs to fix.\n", bcf_seqname(args->hdr,line),line->pos+1);
+ else if ( args->check_ref & CHECK_REF_WARN )
+ fprintf(stderr,"ALT_DUP\t%s\t%d\n", bcf_seqname(args->hdr,line),line->pos+1);
+ }
+ }
+ }
+
+ // insert into sorted buffer
+ rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n+1,args->lines);
+ int i,j;
+ i = j = rbuf_append(&args->rbuf);
+ if ( !args->lines[i] ) args->lines[i] = bcf_init1();
+ SWAP(bcf1_t*, (*line_ptr), args->lines[i]);
+ while ( rbuf_prev(&args->rbuf,&i) )
+ {
+ if ( args->lines[i]->pos > args->lines[j]->pos ) SWAP(bcf1_t*, args->lines[i], args->lines[j]);
+ j = i;
+ }
+}
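+
+// For illustration: if the buffer holds records at positions 100, 105 and 110 and
+// realignment moves a new record to position 103, the record is appended at the
+// end and then bubbled leftwards by the loop above until the window reads
+// 100, 103, 105, 110, keeping the ring buffer sorted by position.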
+
+static void normalize_vcf(args_t *args)
+{
+ htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out, args->n_threads);
+ bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
+ bcf_hdr_write(out, args->hdr);
+
+ int prev_rid = -1, prev_pos = -1, prev_type = 0;
+ while ( bcf_sr_next_line(args->files) )
+ {
+ args->ntotal++;
+
+ bcf1_t *line = args->files->readers[0].buffer[0];
+ if ( args->rmdup )
+ {
+ int line_type = bcf_get_variant_types(line);
+ if ( prev_rid>=0 && prev_rid==line->rid && prev_pos==line->pos )
+ {
+ if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
+ if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+ if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+ }
+ else
+ {
+ prev_rid = line->rid;
+ prev_pos = line->pos;
+ prev_type = 0;
+ }
+ prev_type |= line_type;
+ }
+
+ // still on the same chromosome?
+ int i,j,ilast = rbuf_last(&args->rbuf);
+ if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, out, args->rbuf.n); // new chromosome
+
+ int split = 0;
+ if ( args->mrows_op==MROWS_SPLIT )
+ {
+ split = 1;
+ if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
+ {
+ if ( !(bcf_get_variant_types(line) & args->mrows_collapse) ) split = 0;
+ }
+ if ( split && line->n_allele>2 )
+ {
+ args->nsplit++;
+ split_multiallelic_to_biallelics(args, line);
+ for (j=0; j<args->ntmp_lines; j++)
+ normalize_line(args, &args->tmp_lines[j]);
+ }
+ else
+ split = 0;
+ }
+ if ( !split )
+ normalize_line(args, &args->files->readers[0].buffer[0]);
+
+ // find out how many sites to flush
+ ilast = rbuf_last(&args->rbuf);
+ j = 0;
+ for (i=-1; rbuf_next(&args->rbuf,&i); )
+ {
+ if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break;
+ j++;
+ }
+ if ( args->rbuf.n==args->rbuf.m ) j = 1;
+ if ( j>0 ) flush_buffer(args, out, j);
+ }
+ flush_buffer(args, out, args->rbuf.n);
+ hts_close(out);
+
+ fprintf(stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
+ if ( args->check_ref & CHECK_REF_FIX )
+ fprintf(stderr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set);
+}
+
+static void usage(void)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Left-align and normalize indels; check if REF alleles match the reference;\n");
+ fprintf(stderr, " split multiallelic sites into multiple rows; recover multiallelics from\n");
+ fprintf(stderr, " multiple rows.\n");
+ fprintf(stderr, "Usage: bcftools norm [options] <in.vcf.gz>\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -c, --check-ref <e|w|x|s> check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
+ fprintf(stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n");
+ fprintf(stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|any\n");
+ fprintf(stderr, " -f, --fasta-ref <file> reference sequence\n");
+ fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
+ fprintf(stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n");
+ fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
+ fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
+ fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
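+
+// Example invocations (illustrative only; file names are placeholders):
+//   bcftools norm -f ref.fa -m -any in.vcf.gz -Oz -o norm.vcf.gz   # left-align and split multiallelics
+//   bcftools norm -m +both -s in.vcf.gz -Ob -o joined.bcf          # re-join biallelics, strict PASS
+//   bcftools norm -d both in.vcf.gz > dedup.vcf                    # drop duplicate SNPs and indels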
+
+int main_vcfnorm(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->files = bcf_sr_init();
+ args->output_fname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ args->aln_win = 100;
+ args->buf_win = 1000;
+ args->mrows_collapse = COLLAPSE_BOTH;
+ args->do_indels = 1;
+ int region_is_file = 0;
+ int targets_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"help",no_argument,NULL,'h'},
+ {"fasta-ref",required_argument,NULL,'f'},
+ {"do-not-normalize",no_argument,NULL,'N'},
+ {"multiallelics",required_argument,NULL,'m'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {"site-win",required_argument,NULL,'w'},
+ {"remove-duplicates",no_argument,NULL,'D'},
+ {"rm-dup",required_argument,NULL,'d'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"check-ref",required_argument,NULL,'c'},
+ {"strict-filter",no_argument,NULL,'s'},
+ {NULL,0,NULL,0}
+ };
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sN",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'N': args->do_indels = 0; break;
+ case 'd':
+ if ( !strcmp("snps",optarg) ) args->rmdup = COLLAPSE_SNPS<<1;
+ else if ( !strcmp("indels",optarg) ) args->rmdup = COLLAPSE_INDELS<<1;
+ else if ( !strcmp("both",optarg) ) args->rmdup = COLLAPSE_BOTH<<1;
+ else if ( !strcmp("any",optarg) ) args->rmdup = COLLAPSE_ANY<<1;
+                else error("The argument to -d is not recognised: %s\n", optarg);
+ break;
+ case 'm':
+ if ( optarg[0]=='-' ) args->mrows_op = MROWS_SPLIT;
+ else if ( optarg[0]=='+' ) args->mrows_op = MROWS_MERGE;
+ else error("Expected '+' or '-' with -m\n");
+ if ( optarg[1]!=0 )
+ {
+ if ( !strcmp("snps",optarg+1) ) args->mrows_collapse = COLLAPSE_SNPS;
+ else if ( !strcmp("indels",optarg+1) ) args->mrows_collapse = COLLAPSE_INDELS;
+ else if ( !strcmp("both",optarg+1) ) args->mrows_collapse = COLLAPSE_BOTH;
+ else if ( !strcmp("any",optarg+1) ) args->mrows_collapse = COLLAPSE_ANY;
+                    else error("The argument to -m is not recognised: %s\n", optarg);
+ }
+ break;
+ case 'c':
+ if ( strchr(optarg,'w') ) args->check_ref |= CHECK_REF_WARN;
+ if ( strchr(optarg,'x') ) args->check_ref |= CHECK_REF_SKIP;
+ if ( strchr(optarg,'s') ) args->check_ref |= CHECK_REF_FIX;
+ if ( strchr(optarg,'e') ) args->check_ref = CHECK_REF_EXIT; // overrides the above
+ break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+                    default: error("The output type \"%s\" is not recognised\n", optarg);
+ }
+ break;
+ case 'o': args->output_fname = optarg; break;
+ case 'D':
+ fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n");
+                args->rmdup = COLLAPSE_BOTH<<1;   // alias of `-d both`; COLLAPSE_NONE<<1 evaluates to 0 and would silently disable deduplication
+ break;
+ case 's': args->strict_filter = 1; break;
+ case 'f': args->ref_fname = optarg; break;
+ case 'r': args->region = optarg; break;
+ case 'R': args->region = optarg; region_is_file = 1; break;
+ case 't': args->targets = optarg; break;
+ case 'T': args->targets = optarg; targets_is_file = 1; break;
+ case 'w':
+ args->buf_win = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --site-win %s\n", optarg);
+ break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ if ( argc>optind+1 ) usage();
+ if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) usage();
+ if ( !args->ref_fname && args->check_ref&CHECK_REF_FIX ) error("Expected --fasta-ref with --check-ref s\n");
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage();
+ }
+ else fname = argv[optind];
+
+ if ( args->region )
+ {
+ if ( bcf_sr_set_regions(args->files, args->region,region_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->region);
+ }
+ if ( args->targets )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets,targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets);
+ }
+
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+ if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n");
+ init_data(args);
+ normalize_vcf(args);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
+
diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c
new file mode 100644
index 0000000..2cdf399
--- /dev/null
+++ b/bcftools/vcfnorm.c.pysam.c
@@ -0,0 +1,1812 @@
+#include "pysam.h"
+
+/* vcfnorm.c -- Left-align and normalize indels.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/faidx.h>
+#include "bcftools.h"
+#include "rbuf.h"
+
+#define CHECK_REF_EXIT 0
+#define CHECK_REF_WARN 1
+#define CHECK_REF_SKIP 2
+#define CHECK_REF_FIX 4
+
+#define MROWS_SPLIT 1
+#define MROWS_MERGE 2
+
+// for -m+, mapping from allele indexes of a single input record
+// to allele indexes of output record
+typedef struct
+{
+ int nals, mals, *map;
+}
+map_t;
+
+typedef struct
+{
+ char *tseq, *seq;
+ int mseq;
+ bcf1_t **lines, **tmp_lines, **alines, **blines, *mrow_out;
+ int ntmp_lines, mtmp_lines, nalines, malines, nblines, mblines;
+ map_t *maps; // mrow map for each buffered record
+ char **als;
+ int mmaps, nals, mals;
+ uint8_t *tmp_arr1, *tmp_arr2, *diploid;
+ int ntmp_arr1, ntmp_arr2;
+ kstring_t *tmp_str;
+ kstring_t *tmp_als, tmp_als_str;
+ int ntmp_als;
+ rbuf_t rbuf;
+ int buf_win; // maximum distance between two records to consider
+ int aln_win; // the realignment window size (maximum repeat size)
+ bcf_srs_t *files; // using the synced reader only for -r option
+ bcf_hdr_t *hdr;
+ faidx_t *fai;
+ struct { int tot, set, swap; } nref;
+ char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
+ int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels;
+ int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious;
+}
+args_t;
+
+static inline int replace_iupac_codes(char *seq, int nseq)
+{
+    // Replace ambiguity codes with N for now; it remains to be seen what the VCF spec will eventually codify
+ int i, n = 0;
+ for (i=0; i<nseq; i++)
+ {
+ char c = toupper(seq[i]);
+ if ( c!='A' && c!='C' && c!='G' && c!='T' ) { seq[i] = 'N'; n++; }
+ }
+ return n;
+}
+
+static void fix_ref(args_t *args, bcf1_t *line)
+{
+ int reflen = strlen(line->d.allele[0]);
+ int i, maxlen = reflen, len;
+ for (i=1; i<line->n_allele; i++)
+ {
+ int len = strlen(line->d.allele[i]);
+ if ( maxlen < len ) maxlen = len;
+ }
+
+ char *ref = faidx_fetch_seq(args->fai, (char*)bcf_seqname(args->hdr,line), line->pos, line->pos+maxlen-1, &len);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1);
+ replace_iupac_codes(ref,len);
+
+ args->nref.tot++;
+
+ // is the REF different?
+ if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+
+ // is the REF allele missing or N?
+ if ( reflen==1 && (line->d.allele[0][0]=='.' || line->d.allele[0][0]=='N' || line->d.allele[0][0]=='n') )
+ {
+ line->d.allele[0][0] = ref[0];
+ args->nref.set++;
+ free(ref);
+ bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ return;
+ }
+
+ // does REF contain non-standard bases?
+ if ( replace_iupac_codes(line->d.allele[0],strlen(line->d.allele[0])) )
+ {
+ args->nref.set++;
+ bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+ }
+
+ // is it swapped?
+ for (i=1; i<line->n_allele; i++)
+ {
+ int len = strlen(line->d.allele[i]);
+ if ( !strncasecmp(line->d.allele[i],ref,len) ) break;
+ }
+
+ kstring_t str = {0,0,0};
+ if ( i==line->n_allele )
+ {
+ // none of the alternate alleles matches the reference
+ if ( line->n_allele>1 )
+ args->nref.set++;
+ else
+ args->nref.swap++;
+
+ kputs(line->d.allele[0],&str);
+ kputc(',',&str);
+ for (i=1; i<line->n_allele; i++)
+ {
+ kputs(line->d.allele[i],&str);
+ kputc(',',&str);
+ }
+ kputc(ref[0],&str);
+ bcf_update_alleles_str(args->hdr,line,str.s);
+ str.l = 0;
+ }
+ else
+ args->nref.swap++;
+ free(ref);
+
+ // swap the alleles
+ int j;
+ kputs(line->d.allele[i],&str);
+ for (j=1; j<i; j++)
+ {
+ kputc(',',&str);
+ kputs(line->d.allele[j],&str);
+ }
+ kputc(',',&str);
+ kputs(line->d.allele[0],&str);
+ for (j=i+1; j<line->n_allele; j++)
+ {
+ kputc(',',&str);
+ kputs(line->d.allele[j],&str);
+ }
+ bcf_update_alleles_str(args->hdr,line,str.s);
+
+ // swap genotypes
+ int ntmp = args->ntmp_arr1 / sizeof(int32_t); // reuse tmp_arr declared as uint8_t
+ int ngts = bcf_get_genotypes(args->hdr, line, &args->tmp_arr1, &ntmp);
+ args->ntmp_arr1 = ntmp * sizeof(int32_t);
+ int32_t *gts = (int32_t*) args->tmp_arr1;
+ int ni = 0;
+ for (j=0; j<ngts; j++)
+ {
+ if ( gts[j]==bcf_gt_unphased(0) ) { gts[j] = bcf_gt_unphased(i); ni++; }
+ else if ( gts[j]==bcf_gt_phased(0) ) { gts[j] = bcf_gt_phased(i); ni++; }
+ else if ( gts[j]==bcf_gt_unphased(i) ) gts[j] = bcf_gt_unphased(0);
+ else if ( gts[j]==bcf_gt_phased(i) ) gts[j] = bcf_gt_phased(0);
+ }
+ bcf_update_genotypes(args->hdr,line,gts,ngts);
+
+ // update AC
+ int nac = bcf_get_info_int32(args->hdr, line, "AC", &args->tmp_arr1, &ntmp);
+ args->ntmp_arr1 = ntmp * sizeof(int32_t);
+ if ( i <= nac )
+ {
+ int32_t *ac = (int32_t*)args->tmp_arr1;
+ ac[i-1] = ni;
+ bcf_update_info_int32(args->hdr, line, "AC", ac, nac);
+ }
+
+ free(str.s);
+}
+
+static void fix_dup_alt(args_t *args, bcf1_t *line)
+{
+ // update alleles, create a mapping between old and new indexes
+ hts_expand(uint8_t,line->n_allele,args->ntmp_arr1,args->tmp_arr1);
+ args->tmp_arr1[0] = 0; // ref always unchanged
+
+ int i, j, nals = line->n_allele, nals_ori = line->n_allele;
+ for (i=1, j=1; i<line->n_allele; i++)
+ {
+ if ( strcmp(line->d.allele[0],line->d.allele[i]) )
+ {
+ args->tmp_arr1[i] = j++;
+ continue;
+ }
+ args->tmp_arr1[i] = 0;
+ nals--;
+ }
+ for (i=1, j=1; i<line->n_allele; i++)
+ {
+ if ( !args->tmp_arr1[i] ) continue;
+ line->d.allele[j++] = line->d.allele[i];
+ }
+ bcf_update_alleles(args->hdr, line, (const char**)line->d.allele, nals);
+
+
+ // update genotypes
+ int ntmp = args->ntmp_arr2 / sizeof(int32_t); // reuse tmp_arr declared as uint8_t
+ int ngts = bcf_get_genotypes(args->hdr, line, &args->tmp_arr2, &ntmp);
+ args->ntmp_arr2 = ntmp * sizeof(int32_t);
+ int32_t *gts = (int32_t*) args->tmp_arr2;
+ int changed = 0;
+ for (i=0; i<ngts; i++)
+ {
+ if ( bcf_gt_is_missing(gts[i]) || gts[i]==bcf_int32_vector_end ) continue;
+ int ial = bcf_gt_allele(gts[i]);
+ if ( ial<nals_ori && ial==args->tmp_arr1[ial] ) continue;
+ int ial_new = ial<nals_ori ? args->tmp_arr1[ial] : 0;
+ gts[i] = bcf_gt_is_phased(gts[i]) ? bcf_gt_phased(ial_new) : bcf_gt_unphased(ial_new);
+ changed = 1;
+ }
+ if ( changed ) bcf_update_genotypes(args->hdr,line,gts,ngts);
+}
+
+#define ERR_DUP_ALLELE -2
+#define ERR_REF_MISMATCH -1
+#define ERR_OK 0
+#define ERR_SYMBOLIC 1
+
+static int realign(args_t *args, bcf1_t *line)
+{
+ bcf_unpack(line, BCF_UN_STR);
+
+ // Sanity check REF
+ int i, nref, reflen = strlen(line->d.allele[0]);
+ char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
+ replace_iupac_codes(ref,nref);
+
+ // does REF contain non-standard bases?
+ if ( replace_iupac_codes(line->d.allele[0],reflen) )
+ {
+ args->nchanged++;
+ bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ }
+ if ( strcasecmp(ref,line->d.allele[0]) )
+ {
+ if ( args->check_ref==CHECK_REF_EXIT )
+ error("Reference allele mismatch at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]);
+ if ( args->check_ref & CHECK_REF_WARN )
+ fprintf(pysamerr,"REF_MISMATCH\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]);
+ free(ref);
+ return ERR_REF_MISMATCH;
+ }
+ free(ref);
+ ref = NULL;
+
+ if ( line->n_allele == 1 ) return ERR_OK; // a REF
+
+ // make a copy of each allele for trimming
+ hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als);
+ kstring_t *als = args->tmp_als;
+ for (i=0; i<line->n_allele; i++)
+ {
+ if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele
+
+ als[i].l = 0;
+ kputs(line->d.allele[i], &als[i]);
+
+ if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
+ }
+
+
+ // trim from right
+ int ori_pos = line->pos;
+ while (1)
+ {
+ // is the rightmost base identical in all alleles?
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( als[0].s[ als[0].l-1 ]!=als[i].s[ als[i].l-1 ] ) break;
+ }
+ if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
+
+ int pad_from_left = 0;
+ for (i=0; i<line->n_allele; i++) // trim all alleles
+ {
+ als[i].l--;
+ if ( !als[i].l ) pad_from_left = 1;
+ }
+ if ( pad_from_left )
+ {
+ int npad = line->pos >= args->aln_win ? args->aln_win : line->pos;
+ free(ref);
+ ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad, line->pos-1, &nref);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad+1);
+ replace_iupac_codes(ref,nref);
+ for (i=0; i<line->n_allele; i++)
+ {
+ ks_resize(&als[i], als[i].l + npad);
+ if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l);
+ memcpy(als[i].s,ref,npad);
+ als[i].l += npad;
+ }
+ line->pos -= npad;
+ }
+ }
+ free(ref);
+
+ // trim from left
+ int ntrim_left = 0;
+ while (1)
+ {
+ // is the first base identical in all alleles?
+ int min_len = als[0].l - ntrim_left;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break;
+ if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
+ }
+ if ( i!=line->n_allele || min_len==1 ) break; // there are differences, cannot be trimmed
+ ntrim_left++;
+ }
+ if ( ntrim_left )
+ {
+ for (i=0; i<line->n_allele; i++)
+ {
+ memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
+ als[i].l -= ntrim_left;
+ }
+ line->pos += ntrim_left;
+ }
+
+ // Have the alleles changed?
+ als[0].s[ als[0].l ] = 0; // in order for strcmp to work
+ if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
+
+ // Create new block of alleles and update
+ args->tmp_als_str.l = 0;
+ for (i=0; i<line->n_allele; i++)
+ {
+ if (i>0) kputc(',',&args->tmp_als_str);
+ kputsn(als[i].s,als[i].l,&args->tmp_als_str);
+ }
+ args->tmp_als_str.s[ args->tmp_als_str.l ] = 0;
+ bcf_update_alleles_str(args->hdr,line,args->tmp_als_str.s);
+ args->nchanged++;
+
+ return ERR_OK;
+}
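+
+// Worked example of the trimming above, assuming a hypothetical reference GAAAC
+// at positions 100-104: a record POS=103 REF=A ALT=AA (an A inserted into the A
+// homopolymer) first right-trims to an empty REF, which triggers left padding
+// with reference bases; continued right-trimming followed by left-trimming then
+// settles on POS=100 REF=G ALT=GA, the left-aligned representation.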
+
+static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
+{
+ #define BRANCH_NUMERIC(type,type_t) \
+ { \
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); \
+ int ntmp = args->ntmp_arr1 / sizeof(type_t); \
+ int ret = bcf_get_info_##type(args->hdr,src,tag,&args->tmp_arr1,&ntmp); \
+ args->ntmp_arr1 = ntmp * sizeof(type_t); \
+ assert( ret>0 ); \
+ type_t *vals = (type_t*) args->tmp_arr1; \
+ int len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key); \
+ if ( len==BCF_VL_A ) \
+ { \
+ assert( ret==src->n_allele-1); \
+ bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \
+ } \
+ else if ( len==BCF_VL_R ) \
+ { \
+ assert( ret==src->n_allele); \
+ if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
+ bcf_update_info_##type(args->hdr,dst,tag,vals,2); \
+ } \
+ else if ( len==BCF_VL_G ) \
+ { \
+ assert( ret==src->n_allele*(src->n_allele+1)/2 ); \
+ if ( ialt!=0 ) \
+ { \
+ vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \
+ vals[2] = vals[bcf_alleles2gt(ialt+1,ialt+1)]; \
+ } \
+ bcf_update_info_##type(args->hdr,dst,tag,vals,3); \
+ } \
+ else \
+ bcf_update_info_##type(args->hdr,dst,tag,vals,ret); \
+ }
+ switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key))
+ {
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float); break;
+ }
+ #undef BRANCH_NUMERIC
+}
+// Find n-th field in a comma-separated list and move it to dst.
+// The memory areas may overlap.
+#define STR_MOVE_NTH(dst,src,end,nth,len) \
+{ \
+ char *ss = src, *se = src; \
+ int j = 0; \
+ while ( *se && se<(end) ) \
+ { \
+ if ( *se==',' ) \
+ { \
+ if ( j==nth ) break; \
+ j++; \
+ ss = se+1; \
+ } \
+ se++; \
+ } \
+ if ( j==nth ) \
+ { \
+ int n = se - ss; \
+ memmove((dst),ss,n); \
+ src = se; \
+ len += n; \
+ } \
+ else len = -1; \
+}
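+// For illustration: with src pointing at "x,y,z" and nth=1 (fields are counted
+// from 0), STR_MOVE_NTH copies the field "y" to dst, advances src to the end of
+// that field and adds the field's length to len; if the list has fewer than
+// nth+1 fields, len is set to -1 so callers can skip malformed records.
+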
+static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
+{
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
+ int ret = bcf_get_info_string(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1);
+ assert( ret>0 );
+
+ kstring_t str;
+ str.m = args->ntmp_arr1;
+ str.l = ret;
+ str.s = (char*) args->tmp_arr1;
+
+ int len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key);
+ if ( len==BCF_VL_A )
+ {
+ char *tmp = str.s;
+ int len = 0;
+ STR_MOVE_NTH(str.s,tmp,str.s+str.l,ialt,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ str.s[len] = 0;
+ bcf_update_info_string(args->hdr,dst,tag,str.s);
+ }
+ else if ( len==BCF_VL_R )
+ {
+ char *tmp = str.s;
+ int len = 0;
+ STR_MOVE_NTH(str.s,tmp,str.s+str.l,0,len);
+ str.s[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,ialt,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ str.s[len] = 0;
+ bcf_update_info_string(args->hdr,dst,tag,str.s);
+ }
+ else if ( len==BCF_VL_G )
+ {
+ int i0a = bcf_alleles2gt(0,ialt+1), iaa = bcf_alleles2gt(ialt+1,ialt+1);
+ char *tmp = str.s;
+ int len = 0;
+ STR_MOVE_NTH(str.s,tmp,str.s+str.l,0,len);
+ str.s[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,i0a-1,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ str.s[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,iaa-i0a-1,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ str.s[len] = 0;
+ bcf_update_info_string(args->hdr,dst,tag,str.s);
+ }
+ else
+ bcf_update_info_string(args->hdr,dst,tag,str.s);
+}
+static void split_info_flag(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
+{
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
+ int ret = bcf_get_info_flag(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1);
+ bcf_update_info_flag(args->hdr,dst,tag,NULL,ret);
+}
+
+static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
+{
+ int ntmp = args->ntmp_arr1 / 4;
+ int ngts = bcf_get_genotypes(args->hdr,src,&args->tmp_arr1,&ntmp);
+ args->ntmp_arr1 = ntmp * 4;
+ assert( ngts >0 );
+
+ int32_t *gt = (int32_t*) args->tmp_arr1;
+ int i, j, nsmpl = bcf_hdr_nsamples(args->hdr);
+ ngts /= nsmpl;
+ for (i=0; i<nsmpl; i++)
+ {
+ for (j=0; j<ngts; j++)
+ {
+ if ( gt[j]==bcf_int32_vector_end ) break;
+ if ( bcf_gt_is_missing(gt[j]) || bcf_gt_allele(gt[j])==0 ) continue; // missing allele or ref: leave as is
+ if ( bcf_gt_allele(gt[j])==ialt+1 )
+ gt[j] = bcf_gt_unphased(1) | bcf_gt_is_phased(gt[j]); // set to first ALT
+ else
+ gt[j] = bcf_gt_unphased(0) | bcf_gt_is_phased(gt[j]); // set to REF
+ }
+ gt += ngts;
+ }
+ bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl);
+}
+static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
+{
+ #define BRANCH_NUMERIC(type,type_t,is_vector_end,set_vector_end) \
+ { \
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \
+ int ntmp = args->ntmp_arr1 / sizeof(type_t); \
+ int nvals = bcf_get_format_##type(args->hdr,src,tag,&args->tmp_arr1,&ntmp); \
+ args->ntmp_arr1 = ntmp * sizeof(type_t); \
+ assert( nvals>0 ); \
+ type_t *vals = (type_t *) args->tmp_arr1; \
+ int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id); \
+ int i, nsmpl = bcf_hdr_nsamples(args->hdr); \
+ if ( nvals==nsmpl ) /* all values are missing */ \
+ { \
+ bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \
+ return; \
+ } \
+ if ( len==BCF_VL_A ) \
+ { \
+ assert( nvals==(src->n_allele-1)*nsmpl); \
+ nvals /= nsmpl; \
+ type_t *src_vals = vals, *dst_vals = vals; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ dst_vals[0] = src_vals[ialt]; \
+ dst_vals += 1; \
+ src_vals += nvals; \
+ } \
+ bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \
+ } \
+ else if ( len==BCF_VL_R ) \
+ { \
+ assert( nvals==src->n_allele*nsmpl); \
+ nvals /= nsmpl; \
+ type_t *src_vals = vals, *dst_vals = vals; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ dst_vals[0] = src_vals[0]; \
+ dst_vals[1] = src_vals[ialt+1]; \
+ dst_vals += 2; \
+ src_vals += nvals; \
+ } \
+ bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl*2); \
+ } \
+ else if ( len==BCF_VL_G ) \
+ { \
+ if ( nvals!=src->n_allele*(src->n_allele+1)/2*nsmpl && nvals!=src->n_allele*nsmpl ) \
+ error("Error at %s:%d, the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \
+ nvals /= nsmpl; \
+ int all_haploid = nvals==src->n_allele ? 1 : 0; \
+ type_t *src_vals = vals, *dst_vals = vals; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ int haploid = all_haploid; \
+ if ( !haploid ) \
+ { \
+ int j; \
+ for (j=0; j<nvals; j++) if ( is_vector_end ) break; \
+ if ( j!=nvals ) haploid = 1; \
+ } \
+ dst_vals[0] = src_vals[0]; \
+ if ( haploid ) \
+ { \
+ dst_vals[1] = src_vals[ialt+1]; \
+ if ( !all_haploid ) set_vector_end; \
+ } \
+ else \
+ { \
+ dst_vals[1] = src_vals[bcf_alleles2gt(0,ialt+1)]; \
+ dst_vals[2] = src_vals[bcf_alleles2gt(ialt+1,ialt+1)]; \
+ } \
+ dst_vals += all_haploid ? 2 : 3; \
+ src_vals += nvals; \
+ } \
+ bcf_update_format_##type(args->hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \
+ } \
+ else \
+ bcf_update_format_##type(args->hdr,dst,tag,vals,nvals); \
+ }
+ switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
+ {
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, dst_vals[2]=bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break;
+ }
+ #undef BRANCH_NUMERIC
+}
+static void squeeze_format_char(char *str, int src_blen, int dst_blen, int n)
+{
+ int i, isrc = 0, idst = 0;
+ for (i=0; i<n; i++)
+ {
+ memmove(str+idst,str+isrc,dst_blen);
+ idst += dst_blen;
+ isrc += src_blen;
+ }
+}
+static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
+{
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id);
+ int ret = bcf_get_format_char(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1);
+ assert( ret>0 );
+
+ kstring_t str;
+ str.m = args->ntmp_arr1;
+ str.l = ret;
+ str.s = (char*) args->tmp_arr1;
+
+ int nsmpl = bcf_hdr_nsamples(args->hdr);
+ int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id);
+ if ( len==BCF_VL_A )
+ {
+ int i, blen = ret/nsmpl, maxlen = 0;
+ char *ptr = str.s;
+ for (i=0; i<nsmpl; i++)
+ {
+ char *tmp = ptr;
+ int len = 0;
+ STR_MOVE_NTH(tmp,tmp,ptr+blen,ialt,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ if ( maxlen < len ) maxlen = len;
+ ptr += blen;
+ }
+ if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
+ bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+ }
+ else if ( len==BCF_VL_R )
+ {
+ int i, blen = ret/nsmpl, maxlen = 0;
+ char *ptr = str.s;
+ for (i=0; i<nsmpl; i++)
+ {
+ char *tmp = ptr;
+ int len = 0;
+ STR_MOVE_NTH(ptr,tmp,ptr+blen,0,len);
+ ptr[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,ialt,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ if ( maxlen < len ) maxlen = len;
+ ptr += blen;
+ }
+ if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
+ bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+ }
+ else if ( len==BCF_VL_G )
+ {
+ int i, blen = ret/nsmpl, maxlen = 0, i0a = bcf_alleles2gt(0,ialt+1), iaa = bcf_alleles2gt(ialt+1,ialt+1);
+ char *ptr = str.s;
+ for (i=0; i<nsmpl; i++)
+ {
+ char *se = ptr, *sx = ptr+blen;
+ int nfields = 1;
+ while ( *se && se<sx )
+ {
+ if ( *se==',' ) nfields++;
+ se++;
+ }
+ assert( nfields==src->n_allele*(src->n_allele+1)/2 || nfields==src->n_allele );
+ int len = 0;
+ if ( nfields==src->n_allele ) // haploid
+ {
+ char *tmp = ptr;
+ STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,0,len);
+ ptr[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,ialt,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ }
+ else // diploid
+ {
+ char *tmp = ptr;
+ STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,0,len);
+ ptr[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,i0a-1,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ ptr[len]=','; tmp++; len++;
+ STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,iaa-i0a-1,len);
+ if ( len<0 ) return; // wrong number of fields: skip
+ }
+ if ( maxlen < len ) maxlen = len;
+ ptr += blen;
+ }
+ if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
+ bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+ }
+ else
+ bcf_update_format_char(args->hdr,dst,tag,str.s,str.l);
+}
+
+
+static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
+{
+ int i;
+
+ bcf_unpack(line, BCF_UN_ALL);
+
+ // Init the target biallelic lines
+ args->ntmp_lines = line->n_allele-1;
+ if ( args->mtmp_lines < args->ntmp_lines )
+ {
+ args->tmp_lines = (bcf1_t **)realloc(args->tmp_lines,sizeof(bcf1_t*)*args->ntmp_lines);
+ for (i=args->mtmp_lines; i<args->ntmp_lines; i++)
+ args->tmp_lines[i] = NULL;
+ args->mtmp_lines = args->ntmp_lines;
+ }
+ kstring_t tmp = {0,0,0};
+ kputs(line->d.allele[0], &tmp);
+ kputc(',', &tmp);
+ int rlen = tmp.l;
+ int gt_id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"GT");
+ for (i=0; i<args->ntmp_lines; i++) // for each ALT allele
+ {
+ if ( !args->tmp_lines[i] ) args->tmp_lines[i] = bcf_init1();
+ bcf1_t *dst = args->tmp_lines[i];
+ bcf_clear(dst);
+
+ dst->rid = line->rid;
+ dst->pos = line->pos;
+ dst->qual = line->qual;
+
+ // Not quite sure how to handle IDs; they can be assigned to a specific
+ // ALT. For now we leave the ID unchanged for all of the new lines.
+ bcf_update_id(args->hdr, dst, line->d.id ? line->d.id : ".");
+
+ tmp.l = rlen;
+ kputs(line->d.allele[i+1],&tmp);
+ bcf_update_alleles_str(args->hdr,dst,tmp.s);
+
+ if ( line->d.n_flt ) bcf_update_filter(args->hdr, dst, line->d.flt, line->d.n_flt);
+
+ int j;
+ for (j=0; j<line->n_info; j++)
+ {
+ bcf_info_t *info = &line->d.info[j];
+ int type = bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key);
+ if ( type==BCF_HT_INT || type==BCF_HT_REAL ) split_info_numeric(args, line, info, i, dst);
+ else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst);
+ else split_info_string(args, line, info, i, dst);
+ }
+
+ dst->n_sample = line->n_sample;
+ for (j=0; j<line->n_fmt; j++)
+ {
+ bcf_fmt_t *fmt = &line->d.fmt[j];
+ int type = bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id);
+ if ( fmt->id==gt_id ) split_format_genotype(args, line, fmt, i, dst);
+ else if ( type==BCF_HT_INT || type==BCF_HT_REAL ) split_format_numeric(args, line, fmt, i, dst);
+ else split_format_string(args, line, fmt, i, dst);
+ }
+ }
+ free(tmp.s);
+}
+
+// Enlarge a FORMAT array containing nsmpl samples, each with nvals_ori values,
+// to accommodate nvals values per sample, filling the gaps with missing
+// values. Also works for INFO arrays, with nsmpl set to 1.
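+// For example, growing a Number=A INFO array from two to three ALT alleles
+// turns {x1,x2} into {x1,x2,.}; a FORMAT array is expanded in the same way
+// sample by sample, starting from the last sample so that the moves do not
+// overwrite data that has not been copied yet.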
+#define ENLARGE_ARRAY(type_t,set_missing,arr,narr_bytes,nsmpl,nvals_ori,nvals) \
+{ \
+ int nbytes_new = (nsmpl)*(nvals)*sizeof(type_t); \
+ hts_expand(uint8_t,nbytes_new,narr_bytes,arr); \
+ int ismpl, k; \
+ for (ismpl=nsmpl-1; ismpl>=0; ismpl--) \
+ { \
+ type_t *dst_ptr = ((type_t*)arr) + ismpl*(nvals); \
+ type_t *src_ptr = ((type_t*)arr) + ismpl*nvals_ori; \
+ memmove(dst_ptr,src_ptr,sizeof(type_t)*nvals_ori); \
+ for (k=nvals_ori; k<nvals; k++) set_missing; \
+ } \
+}
+static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_info_t *info, bcf1_t *dst)
+{
+ #define BRANCH_NUMERIC(type,type_t,set_missing,is_vector_end) \
+ { \
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); \
+ int ntmp = args->ntmp_arr1 / sizeof(type_t); \
+ int nvals_ori = bcf_get_info_##type(args->hdr,lines[0],tag,&args->tmp_arr1,&ntmp); \
+ args->ntmp_arr1 = ntmp * sizeof(type_t); \
+ assert( nvals_ori>0 ); \
+ type_t *vals = (type_t*) args->tmp_arr1, *vals2; \
+ int i,k,len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key); \
+ if ( len==BCF_VL_A ) \
+ { \
+ if (nvals_ori!=lines[0]->n_allele - 1) \
+ error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele-1); \
+ int nvals = dst->n_allele - 1; \
+ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \
+ vals = (type_t*) args->tmp_arr1; \
+ for (i=1; i<nlines; i++) \
+ { \
+ int ntmp2 = args->ntmp_arr2 / sizeof(type_t); \
+ int nvals2 = bcf_get_info_##type(args->hdr,lines[i],tag,&args->tmp_arr2,&ntmp2); \
+ if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \
+ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \
+ if (nvals2!=lines[i]->n_allele-1) \
+ error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \
+ vals2 = (type_t*) args->tmp_arr2; \
+ for (k=0; k<nvals2; k++) \
+ { \
+ if ( is_vector_end ) break; \
+ vals[ args->maps[i].map[k+1] - 1 ] = vals2[k]; \
+ } \
+ } \
+ bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+ } \
+ else if ( len==BCF_VL_R ) \
+ { \
+ if (nvals_ori!=lines[0]->n_allele) \
+ error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele); \
+ int nvals = dst->n_allele; \
+ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \
+ vals = (type_t*) args->tmp_arr1; \
+ for (i=1; i<nlines; i++) \
+ { \
+ int ntmp2 = args->ntmp_arr2 / sizeof(type_t); \
+ int nvals2 = bcf_get_info_##type(args->hdr,lines[i],tag,&args->tmp_arr2,&ntmp2); \
+ if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \
+ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \
+ if (nvals2!=lines[i]->n_allele) \
+ error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \
+ vals2 = (type_t*) args->tmp_arr2; \
+ for (k=0; k<nvals2; k++) \
+ { \
+ if ( is_vector_end ) break; \
+ vals[ args->maps[i].map[k] ] = vals2[k]; \
+ } \
+ } \
+ bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+ } \
+ else if ( len==BCF_VL_G ) \
+ { \
+ /* expecting diploid gt in INFO */ \
+ if (nvals_ori!=lines[0]->n_allele*(lines[0]->n_allele+1)/2) { \
+ fprintf(pysamerr, "todo: merge Number=G INFO fields for haploid sites\n"); \
+ error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele*(lines[0]->n_allele+1)/2); \
+ } \
+ int nvals = dst->n_allele*(dst->n_allele+1)/2; \
+ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \
+ vals = (type_t*) args->tmp_arr1; \
+ for (i=1; i<nlines; i++) \
+ { \
+ int ntmp2 = args->ntmp_arr2 / sizeof(type_t); \
+ int nvals2 = bcf_get_info_##type(args->hdr,lines[i],tag,&args->tmp_arr2,&ntmp2); \
+ if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \
+ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \
+ if (nvals2!=lines[i]->n_allele*(lines[i]->n_allele+1)/2) \
+ error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \
+ vals2 = (type_t*) args->tmp_arr2; \
+ int ia,ib; \
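+ /* remap each (ia,ib) genotype of this line into the merged allele numbering */ \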
+ k = 0; \
+ for (ia=0; ia<lines[i]->n_allele; ia++) \
+ { \
+ for (ib=0; ib<=ia; ib++) \
+ { \
+ if ( is_vector_end ) break; \
+ int l = bcf_alleles2gt(args->maps[i].map[ia],args->maps[i].map[ib]); \
+ vals[l] = vals2[k]; \
+ k++; \
+ } \
+ } \
+ } \
+ bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+ } \
+ else \
+ bcf_update_info_##type(args->hdr,dst,tag,vals,nvals_ori); \
+ }
+ switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key))
+ {
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, dst_ptr[k]=bcf_int32_missing, vals2[k]==bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_set_missing(dst_ptr[k]), bcf_float_is_vector_end(vals2[k])); break;
+ }
+ #undef BRANCH_NUMERIC
+}
+static void merge_info_flag(args_t *args, bcf1_t **lines, int nlines, bcf_info_t *info, bcf1_t *dst)
+{
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
+ int ret = bcf_get_info_flag(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
+ bcf_update_info_flag(args->hdr,dst,tag,NULL,ret);
+}
+int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
+static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info_t *info, bcf1_t *dst)
+{
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
+
+ kstring_t str;
+ str.m = args->ntmp_arr1;
+ str.l = 0;
+ str.s = (char*) args->tmp_arr1;
+
+ int i, j, len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key);
+ if ( len==BCF_VL_A || len==BCF_VL_R )
+ {
+ int jfrom = len==BCF_VL_A ? 1 : 0;
+ kputc('.',&str);
+ for (i=jfrom+1; i<dst->n_allele; i++) kputs(",.",&str);
+ for (i=0; i<nlines; i++)
+ {
+ bcf_info_t *src = bcf_get_info(args->hdr,lines[i],tag);
+ if (!src) continue;
+ for (j=jfrom; j<lines[i]->n_allele; j++)
+ copy_string_field((char*)src->vptr, j-jfrom, src->len, &str, args->maps[i].map[j]-jfrom);
+ }
+ str.s[str.l] = 0;
+ args->tmp_arr1 = (uint8_t*) str.s;
+ args->ntmp_arr1 = str.m;
+ bcf_update_info_string(args->hdr,dst,tag,str.s);
+ }
+ else if ( len==BCF_VL_G )
+ {
+ int ngts = dst->n_allele*(dst->n_allele+1)/2;
+ kputc('.',&str);
+ for (i=1; i<ngts; i++) kputs(",.",&str);
+ for (i=0; i<nlines; i++)
+ {
+ bcf_info_t *src = bcf_get_info(args->hdr,lines[i],tag);
+ if (!src) continue;
+ int iori, jori, kori = 0;
+ for (iori=0; iori<lines[i]->n_allele; iori++)
+ {
+ int inew = args->maps[i].map[iori];
+ for (jori=0; jori<=iori; jori++)
+ {
+ int jnew = args->maps[i].map[jori];
+ int knew = bcf_alleles2gt(inew,jnew);
+ copy_string_field((char*)src->vptr,kori,src->len,&str,knew);
+ kori++;
+ }
+ }
+ }
+ str.s[str.l] = 0;
+ args->tmp_arr1 = (uint8_t*) str.s;
+ args->ntmp_arr1 = str.m;
+ bcf_update_info_string(args->hdr,dst,tag,str.s);
+ }
+ else
+ {
+ bcf_get_info_string(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
+ bcf_update_info_string(args->hdr,dst,tag,args->tmp_arr1);
+ }
+}
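+// Merge per-sample genotypes of biallelic lines into the multiallelic record:
+// allele indices of every additional line are rewritten through args->maps[i]
+// so that non-reference calls point at the correct ALT of the merged site.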
+static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst)
+{
+ int ntmp = args->ntmp_arr1 / 4;
+ int ngts = bcf_get_genotypes(args->hdr,lines[0],&args->tmp_arr1,&ntmp);
+ args->ntmp_arr1 = ntmp * 4;
+ assert( ngts >0 );
+
+ int nsmpl = bcf_hdr_nsamples(args->hdr);
+ ngts /= nsmpl;
+
+ int i, j, k;
+ for (i=1; i<nlines; i++)
+ {
+ int ntmp2 = args->ntmp_arr2 / 4;
+ int ngts2 = bcf_get_genotypes(args->hdr,lines[i],&args->tmp_arr2,&ntmp2);
+ args->ntmp_arr2 = ntmp2 * 4;
+ ngts2 /= nsmpl;
+ if ( ngts!=ngts2 ) error("Error at %s:%d: cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1);
+
+ int32_t *gt = (int32_t*) args->tmp_arr1;
+ int32_t *gt2 = (int32_t*) args->tmp_arr2;
+ for (j=0; j<nsmpl; j++)
+ {
+ for (k=0; k<ngts; k++)
+ {
+ if ( gt2[k]==bcf_int32_vector_end ) break;
+ if ( bcf_gt_is_missing(gt2[k]) || bcf_gt_allele(gt2[k])==0 ) continue;
+ if ( gt2[k]==0 ) gt[k] = 0; // missing genotype
+ else
+ {
+ int ial = bcf_gt_allele(gt2[k]);
+ assert( ial<args->maps[i].nals );
+ gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]);
+ }
+ }
+ gt += ngts;
+ gt2 += ngts;
+ }
+ }
+ bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl);
+}
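+// Shrink a Number=G array laid out with the diploid stride (nals*(nals+1)/2
+// values per sample) down to the haploid stride of nals values per sample;
+// returns the new number of values per sample.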
+static int diploid_to_haploid(int size, int nsmpl, int nals, uint8_t *vals)
+{
+ int i, dsrc = size*nals*(nals+1)/2, ddst = size*nals;
+ uint8_t *src_ptr = vals + dsrc, *dst_ptr = vals + ddst;
+ for (i=1; i<nsmpl; i++)
+ {
+ memmove(dst_ptr,src_ptr,ddst);
+ dst_ptr += ddst;
+ src_ptr += dsrc;
+ }
+ return nals;
+}
+static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst)
+{
+ #define BRANCH_NUMERIC(type,type_t,set_missing,is_vector_end,set_vector_end) \
+ { \
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \
+ int ntmp = args->ntmp_arr1 / sizeof(type_t); \
+ int nvals_ori = bcf_get_format_##type(args->hdr,lines[0],tag,&args->tmp_arr1,&ntmp); \
+ args->ntmp_arr1 = ntmp * sizeof(type_t); \
+ assert( nvals_ori>0 ); \
+ type_t *vals2, *vals = (type_t *) args->tmp_arr1; \
+ int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id); \
+ int i, j, k, nsmpl = bcf_hdr_nsamples(args->hdr); \
+ nvals_ori /= nsmpl; \
+ if ( len==BCF_VL_A ) \
+ { \
+ int nvals = dst->n_allele - 1; \
+ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,nsmpl,nvals_ori,nvals); \
+ for (i=1; i<nlines; i++) \
+ { \
+ int ntmp2 = args->ntmp_arr2 / sizeof(type_t); \
+ int nvals2 = bcf_get_format_##type(args->hdr,lines[i],tag,&args->tmp_arr2,&ntmp2); \
+ if (nvals2<0) continue; /* format tag does not exist in this record, skip */ \
+ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \
+ nvals2 /= nsmpl; \
+ if (nvals2!=lines[i]->n_allele-1) \
+ error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \
+ vals = (type_t*) args->tmp_arr1; \
+ vals2 = (type_t*) args->tmp_arr2; \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ for (k=0; k<nvals2; k++) \
+ { \
+ if ( is_vector_end ) break; \
+ vals[ args->maps[i].map[k+1] - 1 ] = vals2[k]; \
+ } \
+ vals += nvals; \
+ vals2 += nvals2; \
+ } \
+ } \
+ bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+ } \
+ else if ( len==BCF_VL_R ) \
+ { \
+ int nvals = dst->n_allele; \
+ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,nsmpl,nvals_ori,nvals); \
+ for (i=1; i<nlines; i++) \
+ { \
+ int ntmp2 = args->ntmp_arr2 / sizeof(type_t); \
+ int nvals2 = bcf_get_format_##type(args->hdr,lines[i],tag,&args->tmp_arr2,&ntmp2); \
+ if (nvals2<0) continue; /* format tag does not exist in this record, skip */ \
+ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \
+ nvals2 /= nsmpl; \
+ if (nvals2!=lines[i]->n_allele) \
+ error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \
+ vals = (type_t*) args->tmp_arr1; \
+ vals2 = (type_t*) args->tmp_arr2; \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ for (k=0; k<nvals2; k++) \
+ { \
+ if ( is_vector_end ) break; \
+ vals[ args->maps[i].map[k] ] = vals2[k]; \
+ } \
+ vals += nvals; \
+ vals2 += nvals2; \
+ } \
+ } \
+ bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+ } \
+ else if ( len==BCF_VL_G ) \
+ { \
+ /* which samples are diploid */ \
+ memset(args->diploid,0,nsmpl); \
+ int all_haploid = 1; \
+ if ( nvals_ori > lines[0]->n_allele ) /* line possibly diploid */ \
+ { \
+ vals2 = (type_t*) args->tmp_arr1; \
+ int ndiploid = lines[0]->n_allele*(lines[0]->n_allele+1)/2; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ if ( !args->diploid[i] ) \
+ { \
+ for (k=0; k<nvals_ori; k++) if ( is_vector_end ) break; \
+ if ( k==ndiploid ) { args->diploid[i] = 1; all_haploid = 0; }\
+ } \
+ vals2 += nvals_ori; \
+ } \
+ } \
+ int nvals = dst->n_allele*(dst->n_allele+1)/2; \
+ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,nsmpl,nvals_ori,nvals); \
+ for (i=1; i<nlines; i++) \
+ { \
+ int ntmp2 = args->ntmp_arr2 / sizeof(type_t); \
+ int nvals2 = bcf_get_format_##type(args->hdr,lines[i],tag,&args->tmp_arr2,&ntmp2); \
+ if (nvals2<0) continue; /* format tag does not exist in this record, skip */ \
+ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \
+ nvals2 /= nsmpl; \
+ int ndiploid = lines[i]->n_allele*(lines[i]->n_allele+1)/2; \
+ int line_diploid = nvals2==ndiploid ? 1 : 0; \
+ if (!(nvals2==1 || nvals2==lines[i]->n_allele || nvals2==lines[i]->n_allele*(lines[i]->n_allele+1)/2)) \
+ error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \
+ vals = (type_t*) args->tmp_arr1; \
+ vals2 = (type_t*) args->tmp_arr2; \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ int smpl_diploid = line_diploid; \
+ if ( smpl_diploid ) \
+ { \
+ for (k=0; k<nvals2; k++) if ( is_vector_end ) break; \
+ if ( k!=ndiploid ) smpl_diploid = 0; \
+ } \
+ if ( smpl_diploid && !args->diploid[j] ) { args->diploid[j] = 1; all_haploid = 0; } \
+ if ( !smpl_diploid ) \
+ { \
+ for (k=0; k<lines[i]->n_allele; k++) vals[args->maps[i].map[k]] = vals2[k]; \
+ } \
+ else \
+ { \
+ k = 0; \
+ int ia,ib; \
+ for (ia=0; ia<lines[i]->n_allele; ia++) \
+ { \
+ for (ib=0; ib<=ia; ib++) \
+ { \
+ int l = bcf_alleles2gt(args->maps[i].map[ia],args->maps[i].map[ib]); \
+ vals[l] = vals2[k]; \
+ k++; \
+ } \
+ } \
+ } \
+ vals += nvals; \
+ vals2 += nvals2; \
+ } \
+ } \
+ if ( all_haploid ) \
+ nvals = diploid_to_haploid(sizeof(type_t),nsmpl,dst->n_allele,args->tmp_arr1); \
+ else \
+ {\
+ k = dst->n_allele;\
+ vals2 = (type_t*) args->tmp_arr1;\
+ for (i=0; i<nsmpl; i++)\
+ {\
+ if ( !args->diploid[i] ) set_vector_end;\
+ vals2 += nvals;\
+ }\
+ }\
+ bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+ } \
+ else \
+ bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \
+ }
+ switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
+ {
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, dst_ptr[k]=bcf_int32_missing, vals2[k]==bcf_int32_vector_end, vals2[k]=bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_set_missing(dst_ptr[k]), bcf_float_is_vector_end(vals2[k]), bcf_float_set_vector_end(vals2[k])); break;
+ }
+ #undef BRANCH_NUMERIC
+}
+static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst)
+{
+ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id);
+
+ int i, j, k, len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id);
+ if ( len!=BCF_VL_A && len!=BCF_VL_R && len!=BCF_VL_G )
+ {
+ int nret = bcf_get_format_char(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
+ bcf_update_format_char(args->hdr,dst,tag,args->tmp_arr1,nret);
+ return;
+ }
+
+ int nsmpl = bcf_hdr_nsamples(args->hdr);
+ for (i=0; i<nsmpl; i++) args->tmp_str[i].l = 0;
+
+ if ( len==BCF_VL_A || len==BCF_VL_R )
+ {
+ int jfrom = len==BCF_VL_A ? 1 : 0;
+ for (i=0; i<nsmpl; i++)
+ {
+ kstring_t *tmp = &args->tmp_str[i];
+ kputc('.',tmp);
+ for (k=jfrom+1; k<dst->n_allele; k++) kputs(",.",tmp);
+ }
+ for (i=0; i<nlines; i++)
+ {
+ int nret = bcf_get_format_char(args->hdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1);
+ if (nret<0) continue; /* format tag does not exist in this record, skip */
+ nret /= nsmpl;
+ for (k=0; k<nsmpl; k++)
+ {
+ kstring_t *tmp = &args->tmp_str[k];
+ char *src = (char*)args->tmp_arr1 + k*nret;
+ for (j=jfrom; j<lines[i]->n_allele; j++)
+ copy_string_field(src, j-jfrom, nret, tmp, args->maps[i].map[j]-jfrom);
+ }
+ }
+ }
+ else if ( len==BCF_VL_G )
+ {
+ hts_expand(uint8_t,nsmpl,args->ntmp_arr2,args->tmp_arr2);
+ uint8_t *haploid = args->tmp_arr2;
+ int nret = bcf_get_format_char(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
+ nret /= nsmpl;
+ for (i=0; i<nsmpl; i++)
+ {
+ char *ss = (char*)args->tmp_arr1 + i*nret, *se = ss+nret;
+ int nfields = 1;
+ while ( *ss && ss<se )
+ {
+ if ( *ss==',' ) nfields++;
+ ss++;
+ }
+ if ( nfields==lines[0]->n_allele )
+ {
+ haploid[i] = 1;
+ nfields = dst->n_allele;
+ }
+ else if ( nfields==lines[0]->n_allele*(lines[0]->n_allele+1)/2 )
+ {
+ haploid[i] = 0;
+ nfields = dst->n_allele*(dst->n_allele+1)/2;
+ }
+ else error("The field %s at %s:%d neither diploid nor haploid?\n", tag,bcf_seqname(args->hdr,dst),dst->pos+1);
+
+ kstring_t *tmp = &args->tmp_str[i];
+ kputc('.',tmp);
+ for (j=1; j<nfields; j++) kputs(",.",tmp);
+ }
+ for (i=0; i<nlines; i++)
+ {
+ if ( i ) // we already have a copy
+ {
+ nret = bcf_get_format_char(args->hdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1);
+ if (nret<0) continue; /* format tag does not exist in this record, skip */
+ nret /= nsmpl;
+ }
+ for (k=0; k<nsmpl; k++)
+ {
+ kstring_t *tmp = &args->tmp_str[k];
+ char *src = (char*)args->tmp_arr1 + k*nret;
+ if ( haploid[k] )
+ {
+ for (j=0; j<lines[i]->n_allele; j++)
+ copy_string_field(src,j,nret, tmp, args->maps[i].map[j]);
+ }
+ else
+ {
+ int iori, jori, kori = 0;
+ for (iori=0; iori<lines[i]->n_allele; iori++)
+ {
+ int inew = args->maps[i].map[iori];
+ for (jori=0; jori<=iori; jori++)
+ {
+ int jnew = args->maps[i].map[jori];
+ int knew = bcf_alleles2gt(inew,jnew);
+ copy_string_field(src,kori,nret,tmp,knew);
+ kori++;
+ }
+ }
+ }
+ }
+ }
+ }
+ kstring_t str;
+ str.m = args->ntmp_arr2;
+ str.l = 0;
+ str.s = (char*) args->tmp_arr2;
+
+ int max_len = 0;
+ for (i=0; i<nsmpl; i++)
+ if ( max_len < args->tmp_str[i].l ) max_len = args->tmp_str[i].l;
+ for (i=0; i<nsmpl; i++)
+ {
+ kstring_t *tmp = &args->tmp_str[i];
+ kputsn(tmp->s,tmp->l,&str);
+ for (j=tmp->l; j<max_len; j++) kputc(0,tmp);
+ }
+ args->ntmp_arr2 = str.m;
+ args->tmp_arr2 = (uint8_t*)str.s;
+ bcf_update_format_char(args->hdr,dst,tag,str.s,str.l);
+}
+
+char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb); // see vcfmerge.c
+static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t **lines, int nlines)
+{
+ int i;
+ for (i=0; i<nlines; i++)
+ bcf_unpack(lines[i], BCF_UN_ALL);
+
+ dst->rid = lines[0]->rid;
+ dst->pos = lines[0]->pos;
+
+ // take max for QUAL
+ bcf_float_set_missing(dst->qual);
+ for (i=0; i<nlines; i++) {
+ if (bcf_float_is_missing(lines[i]->qual)) continue;
+ if (bcf_float_is_missing(dst->qual) || dst->qual<lines[i]->qual)
+ dst->qual = lines[i]->qual;
+ }
+
+ bcf_update_id(args->hdr, dst, lines[0]->d.id);
+
+ // Merge and set the alleles, creating a mapping from source allele indices to destination indices
+ hts_expand0(map_t,nlines,args->mmaps,args->maps); // a mapping for each line
+ args->nals = args->maps[0].nals = lines[0]->n_allele;
+ hts_expand(int,args->maps[0].nals,args->maps[0].mals,args->maps[0].map);
+ hts_expand(char*,args->nals,args->mals,args->als);
+ for (i=0; i<args->maps[0].nals; i++)
+ {
+ args->maps[0].map[i] = i;
+ args->als[i] = strdup(lines[0]->d.allele[i]);
+ }
+ for (i=1; i<nlines; i++)
+ {
+ if (lines[i]->d.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->hdr, dst, lines[i]->d.id);
+ args->maps[i].nals = lines[i]->n_allele;
+ hts_expand(int,args->maps[i].nals,args->maps[i].mals,args->maps[i].map);
+ args->als = merge_alleles(lines[i]->d.allele, lines[i]->n_allele, args->maps[i].map, args->als, &args->nals, &args->mals);
+ if ( !args->als ) error("Failed to merge alleles at %s:%d\n", bcf_seqname(args->hdr,dst),dst->pos+1);
+ }
+ bcf_update_alleles(args->hdr, dst, (const char**)args->als, args->nals);
+ for (i=0; i<args->nals; i++)
+ {
+ free(args->als[i]);
+ args->als[i] = NULL;
+ }
+
+ if ( lines[0]->d.n_flt ) bcf_update_filter(args->hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt);
+ for (i=1; i<nlines; i++) {
+ int j;
+ for (j=0; j<lines[i]->d.n_flt; j++) {
+ // if strict_filter, set FILTER to PASS if any site PASS
+ // otherwise accumulate FILTERs
+ if (lines[i]->d.flt[j] == bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PASS")) {
+ if (args->strict_filter) {
+ bcf_update_filter(args->hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt);
+ break;
+ }
+ else
+ continue;
+ }
+ bcf_add_filter(args->hdr, dst, lines[i]->d.flt[j]);
+ }
+ }
+
+ // merge info
+ for (i=0; i<lines[0]->n_info; i++)
+ {
+ bcf_info_t *info = &lines[0]->d.info[i];
+ int type = bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key);
+ if ( type==BCF_HT_INT || type==BCF_HT_REAL ) merge_info_numeric(args, lines, nlines, info, dst);
+ else if ( type==BCF_HT_FLAG ) merge_info_flag(args, lines, nlines, info, dst);
+ else merge_info_string(args, lines, nlines, info, dst);
+ }
+
+ // merge format
+ int gt_id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"GT");
+ dst->n_sample = lines[0]->n_sample;
+ for (i=0; i<lines[0]->n_fmt; i++)
+ {
+ bcf_fmt_t *fmt = &lines[0]->d.fmt[i];
+ int type = bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id);
+ if ( fmt->id==gt_id ) merge_format_genotype(args, lines, nlines, fmt, dst);
+ else if ( type==BCF_HT_INT || type==BCF_HT_REAL ) merge_format_numeric(args, lines, nlines, fmt, dst);
+ else merge_format_string(args, lines, nlines, fmt, dst);
+ }
+}
+
+#define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
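+// Queue a record for merging into a multiallelic site: SNPs and REF-only
+// records (or everything when merging type 'any') go into the alines buffer,
+// all other types into blines; the buffers are flushed once the position changes.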
+static void mrows_schedule(args_t *args, bcf1_t **line)
+{
+ int i,m;
+ if ( args->mrows_collapse==COLLAPSE_ANY // merge all record types together
+ || bcf_get_variant_types(*line)&VCF_SNP // SNP, put into alines
+ || bcf_get_variant_types(*line)==VCF_REF ) // ref
+ {
+ args->nalines++;
+ m = args->malines;
+ hts_expand(bcf1_t*,args->nalines,args->malines,args->alines);
+ for (i=m; i<args->malines; i++) args->alines[i] = bcf_init1();
+ SWAP(bcf1_t*, args->alines[args->nalines-1], *line);
+ }
+ else
+ {
+ args->nblines++;
+ m = args->mblines;
+ hts_expand(bcf1_t*,args->nblines,args->mblines,args->blines);
+ for (i=m; i<args->mblines; i++) args->blines[i] = bcf_init1();
+ SWAP(bcf1_t*, args->blines[args->nblines-1], *line);
+ }
+}
+static int mrows_ready_to_flush(args_t *args, bcf1_t *line)
+{
+ if ( args->nalines && (args->alines[0]->rid!=line->rid || args->alines[0]->pos!=line->pos) ) return 1;
+ if ( args->nblines && (args->blines[0]->rid!=line->rid || args->blines[0]->pos!=line->pos) ) return 1;
+ return 0;
+}
+static bcf1_t *mrows_flush(args_t *args)
+{
+ if ( args->nblines && args->nalines==1 && bcf_get_variant_types(args->alines[0])==VCF_REF )
+ {
+ // By default, REF lines are merged with SNPs if SNPs and indels are to be kept separately.
+ // However, if there are only indels and a single REF line, merge it with the indels.
+ args->nblines++;
+ int i,m = args->mblines;
+ hts_expand(bcf1_t*,args->nblines,args->mblines,args->blines);
+ for (i=m; i<args->mblines; i++) args->blines[i] = bcf_init1();
+ SWAP(bcf1_t*, args->blines[args->nblines-1], args->alines[0]);
+ args->nalines--;
+ }
+ if ( args->nalines )
+ {
+ if ( args->nalines==1 )
+ {
+ args->nalines = 0;
+ return args->alines[0];
+ }
+ bcf_clear(args->mrow_out);
+ merge_biallelics_to_multiallelic(args, args->mrow_out, args->alines, args->nalines);
+ args->nalines = 0;
+ return args->mrow_out;
+ }
+ else if ( args->nblines )
+ {
+ if ( args->nblines==1 )
+ {
+ args->nblines = 0;
+ return args->blines[0];
+ }
+ bcf_clear(args->mrow_out);
+ merge_biallelics_to_multiallelic(args, args->mrow_out, args->blines, args->nblines);
+ args->nblines = 0;
+ return args->mrow_out;
+ }
+ return NULL;
+}
+static void flush_buffer(args_t *args, htsFile *file, int n)
+{
+ bcf1_t *line;
+ int i, k;
+ for (i=0; i<n; i++)
+ {
+ k = rbuf_shift(&args->rbuf);
+ if ( args->mrows_op==MROWS_MERGE )
+ {
+ if ( mrows_ready_to_flush(args, args->lines[k]) )
+ {
+ while ( (line=mrows_flush(args)) ) bcf_write1(file, args->hdr, line);
+ }
+ int merge = 1;
+ if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
+ {
+ if ( !(bcf_get_variant_types(args->lines[k]) & args->mrows_collapse) ) merge = 0;
+ }
+ if ( merge )
+ {
+ mrows_schedule(args, &args->lines[k]);
+ continue;
+ }
+ }
+ bcf_write1(file, args->hdr, args->lines[k]);
+ }
+ if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n )
+ {
+ while ( (line=mrows_flush(args)) ) bcf_write1(file, args->hdr, line);
+ }
+}
+
+static void init_data(args_t *args)
+{
+ args->hdr = args->files->readers[0].header;
+ rbuf_init(&args->rbuf, 100);
+ args->lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
+ if ( args->ref_fname )
+ {
+ args->fai = fai_load(args->ref_fname);
+ if ( !args->fai ) error("Failed to load the fai index: %s\n", args->ref_fname);
+ }
+ if ( args->mrows_op==MROWS_MERGE )
+ {
+ args->mrow_out = bcf_init1();
+ args->tmp_str = (kstring_t*) calloc(bcf_hdr_nsamples(args->hdr),sizeof(kstring_t));
+ args->diploid = (uint8_t*) malloc(bcf_hdr_nsamples(args->hdr));
+ }
+}
+
+static void destroy_data(args_t *args)
+{
+ int i;
+ for (i=0; i<args->rbuf.m; i++)
+ if ( args->lines[i] ) bcf_destroy1(args->lines[i]);
+ free(args->lines);
+ for (i=0; i<args->mtmp_lines; i++)
+ if ( args->tmp_lines[i] ) bcf_destroy1(args->tmp_lines[i]);
+ free(args->tmp_lines);
+ for (i=0; i<args->malines; i++)
+ bcf_destroy1(args->alines[i]);
+ free(args->alines);
+ for (i=0; i<args->mblines; i++)
+ bcf_destroy1(args->blines[i]);
+ free(args->blines);
+ for (i=0; i<args->mmaps; i++)
+ free(args->maps[i].map);
+ for (i=0; i<args->ntmp_als; i++)
+ free(args->tmp_als[i].s);
+ free(args->tmp_als);
+ free(args->tmp_als_str.s);
+ if ( args->tmp_str )
+ {
+ for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) free(args->tmp_str[i].s);
+ free(args->tmp_str);
+ }
+ free(args->maps);
+ free(args->als);
+ free(args->tmp_arr1);
+ free(args->tmp_arr2);
+ free(args->diploid);
+ if ( args->mrow_out ) bcf_destroy1(args->mrow_out);
+ if ( args->fai ) fai_destroy(args->fai);
+ if ( args->mseq ) free(args->seq);
+}
+
+
+static void normalize_line(args_t *args, bcf1_t **line_ptr)
+{
+ bcf1_t *line = *line_ptr;
+ if ( args->fai )
+ {
+ if ( args->check_ref & CHECK_REF_FIX ) fix_ref(args, line);
+ if ( args->do_indels )
+ {
+ int ret = realign(args, line);
+
+ // exclude broken VCF lines
+ if ( ret==ERR_REF_MISMATCH && args->check_ref & CHECK_REF_SKIP )
+ {
+ args->nskipped++;
+ return;
+ }
+ if ( ret==ERR_DUP_ALLELE )
+ {
+ if ( args->check_ref & CHECK_REF_FIX )
+ fix_dup_alt(args, line);
+ else if ( args->check_ref==CHECK_REF_EXIT )
+ error("Duplicate alleles at %s:%d; run with -cw to turn the error into warning or with -cs to fix.\n", bcf_seqname(args->hdr,line),line->pos+1);
+ else if ( args->check_ref & CHECK_REF_WARN )
+ fprintf(pysamerr,"ALT_DUP\t%s\t%d\n", bcf_seqname(args->hdr,line),line->pos+1);
+ }
+ }
+ }
+
+ // insert into sorted buffer
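+ // realignment may have moved the record to a smaller position, so bubble it
+ // backwards through the buffer until the records are ordered by POS again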
+ rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n+1,args->lines);
+ int i,j;
+ i = j = rbuf_append(&args->rbuf);
+ if ( !args->lines[i] ) args->lines[i] = bcf_init1();
+ SWAP(bcf1_t*, (*line_ptr), args->lines[i]);
+ while ( rbuf_prev(&args->rbuf,&i) )
+ {
+ if ( args->lines[i]->pos > args->lines[j]->pos ) SWAP(bcf1_t*, args->lines[i], args->lines[j]);
+ j = i;
+ }
+}
+
+static void normalize_vcf(args_t *args)
+{
+ htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(out, args->n_threads);
+ bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
+ bcf_hdr_write(out, args->hdr);
+
+ int prev_rid = -1, prev_pos = -1, prev_type = 0;
+ while ( bcf_sr_next_line(args->files) )
+ {
+ args->ntotal++;
+
+ bcf1_t *line = args->files->readers[0].buffer[0];
+ if ( args->rmdup )
+ {
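+ // the requested collapse type is stored shifted left by one (see the -d/-D
+ // option parsing), so shift back before testing which duplicates to drop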
+ int line_type = bcf_get_variant_types(line);
+ if ( prev_rid>=0 && prev_rid==line->rid && prev_pos==line->pos )
+ {
+ if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
+ if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+ if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+ }
+ else
+ {
+ prev_rid = line->rid;
+ prev_pos = line->pos;
+ prev_type = 0;
+ }
+ prev_type |= line_type;
+ }
+
+ // still on the same chromosome?
+ int i,j,ilast = rbuf_last(&args->rbuf);
+ if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, out, args->rbuf.n); // new chromosome
+
+ int split = 0;
+ if ( args->mrows_op==MROWS_SPLIT )
+ {
+ split = 1;
+ if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
+ {
+ if ( !(bcf_get_variant_types(line) & args->mrows_collapse) ) split = 0;
+ }
+ if ( split && line->n_allele>2 )
+ {
+ args->nsplit++;
+ split_multiallelic_to_biallelics(args, line);
+ for (j=0; j<args->ntmp_lines; j++)
+ normalize_line(args, &args->tmp_lines[j]);
+ }
+ else
+ split = 0;
+ }
+ if ( !split )
+ normalize_line(args, &args->files->readers[0].buffer[0]);
+
+ // find out how many sites to flush
+ ilast = rbuf_last(&args->rbuf);
+ j = 0;
+ for (i=-1; rbuf_next(&args->rbuf,&i); )
+ {
+ if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break;
+ j++;
+ }
+ if ( args->rbuf.n==args->rbuf.m ) j = 1;
+ if ( j>0 ) flush_buffer(args, out, j);
+ }
+ flush_buffer(args, out, args->rbuf.n);
+ hts_close(out);
+
+ fprintf(pysamerr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
+ if ( args->check_ref & CHECK_REF_FIX )
+ fprintf(pysamerr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set);
+}
+
+static void usage(void)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Left-align and normalize indels; check if REF alleles match the reference;\n");
+ fprintf(pysamerr, " split multiallelic sites into multiple rows; recover multiallelics from\n");
+ fprintf(pysamerr, " multiple rows.\n");
+ fprintf(pysamerr, "Usage: bcftools norm [options] <in.vcf.gz>\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Options:\n");
+ fprintf(pysamerr, " -c, --check-ref <e|w|x|s> check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
+ fprintf(pysamerr, " -D, --remove-duplicates remove duplicate lines of the same type.\n");
+ fprintf(pysamerr, " -d, --rm-dup <type> remove duplicate snps|indels|both|any\n");
+ fprintf(pysamerr, " -f, --fasta-ref <file> reference sequence\n");
+ fprintf(pysamerr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
+ fprintf(pysamerr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n");
+ fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysamerr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
+ fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
+ fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_vcfnorm(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->files = bcf_sr_init();
+ args->output_fname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ args->aln_win = 100;
+ args->buf_win = 1000;
+ args->mrows_collapse = COLLAPSE_BOTH;
+ args->do_indels = 1;
+ int region_is_file = 0;
+ int targets_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"help",no_argument,NULL,'h'},
+ {"fasta-ref",required_argument,NULL,'f'},
+ {"do-not-normalize",no_argument,NULL,'N'},
+ {"multiallelics",required_argument,NULL,'m'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {"site-win",required_argument,NULL,'w'},
+ {"remove-duplicates",no_argument,NULL,'D'},
+ {"rm-dup",required_argument,NULL,'d'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"check-ref",required_argument,NULL,'c'},
+ {"strict-filter",no_argument,NULL,'s'},
+ {NULL,0,NULL,0}
+ };
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sN",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'N': args->do_indels = 0; break;
+ case 'd':
+ if ( !strcmp("snps",optarg) ) args->rmdup = COLLAPSE_SNPS<<1;
+ else if ( !strcmp("indels",optarg) ) args->rmdup = COLLAPSE_INDELS<<1;
+ else if ( !strcmp("both",optarg) ) args->rmdup = COLLAPSE_BOTH<<1;
+ else if ( !strcmp("any",optarg) ) args->rmdup = COLLAPSE_ANY<<1;
+ else error("The argument to -d not recognised: %s\n", optarg);
+ break;
+ case 'm':
+ if ( optarg[0]=='-' ) args->mrows_op = MROWS_SPLIT;
+ else if ( optarg[0]=='+' ) args->mrows_op = MROWS_MERGE;
+ else error("Expected '+' or '-' with -m\n");
+ if ( optarg[1]!=0 )
+ {
+ if ( !strcmp("snps",optarg+1) ) args->mrows_collapse = COLLAPSE_SNPS;
+ else if ( !strcmp("indels",optarg+1) ) args->mrows_collapse = COLLAPSE_INDELS;
+ else if ( !strcmp("both",optarg+1) ) args->mrows_collapse = COLLAPSE_BOTH;
+ else if ( !strcmp("any",optarg+1) ) args->mrows_collapse = COLLAPSE_ANY;
+ else error("The argument to -m not recognised: %s\n", optarg);
+ }
+ break;
+ case 'c':
+ if ( strchr(optarg,'w') ) args->check_ref |= CHECK_REF_WARN;
+ if ( strchr(optarg,'x') ) args->check_ref |= CHECK_REF_SKIP;
+ if ( strchr(optarg,'s') ) args->check_ref |= CHECK_REF_FIX;
+ if ( strchr(optarg,'e') ) args->check_ref = CHECK_REF_EXIT; // overrides the above
+ break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'o': args->output_fname = optarg; break;
+ case 'D':
+ fprintf(pysamerr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n");
+ args->rmdup = COLLAPSE_NONE<<1;
+ break;
+ case 's': args->strict_filter = 1; break;
+ case 'f': args->ref_fname = optarg; break;
+ case 'r': args->region = optarg; break;
+ case 'R': args->region = optarg; region_is_file = 1; break;
+ case 't': args->targets = optarg; break;
+ case 'T': args->targets = optarg; targets_is_file = 1; break;
+ case 'w':
+ args->buf_win = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --site-win %s\n", optarg);
+ break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ if ( argc>optind+1 ) usage();
+ if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) usage();
+ if ( !args->ref_fname && args->check_ref&CHECK_REF_FIX ) error("Expected --fasta-ref with --check-ref s\n");
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage();
+ }
+ else fname = argv[optind];
+
+ if ( args->region )
+ {
+ if ( bcf_sr_set_regions(args->files, args->region,region_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->region);
+ }
+ if ( args->targets )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets,targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets);
+ }
+
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+ if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n");
+ init_data(args);
+ normalize_vcf(args);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
+
diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c
new file mode 100644
index 0000000..e2ca04a
--- /dev/null
+++ b/bcftools/vcfplugin.c
@@ -0,0 +1,614 @@
+/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/kseq.h>
+#include <htslib/khash_str2int.h>
+#include <dlfcn.h>
+#include "bcftools.h"
+#include "vcmp.h"
+#include "filter.h"
+
+typedef struct _plugin_t plugin_t;
+
+/**
+ * Plugin API:
+ * ----------
+ * const char *about(void)
+ * - short description used by 'bcftools plugin -l'
+ *
+ * const char *usage(void)
+ * - longer description used by 'bcftools +name -h'
+ *
+ * int run(int argc, char **argv)
+ * - if implemented, control is handed over to the plugin immediately and
+ * none of the init/process/destroy functions is called. Return 0 on
+ * success or a non-zero value on error.
+ *
+ * int init(int argc, char **argv, bcf_hdr_t *in_hdr, bcf_hdr_t *out_hdr)
+ * - called once at startup; allows local variables to be initialized.
+ * Return 1 to suppress normal VCF/BCF header output, -1 on critical
+ * errors, 0 otherwise.
+ *
+ * bcf1_t *process(bcf1_t *rec)
+ * - called for each VCF record, return NULL for no output
+ *
+ * void destroy(void)
+ * - called after all lines have been processed to clean up
+ */
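+/*
+ * A minimal sketch of a plugin following this API (illustrative only, not a
+ * plugin shipped with bcftools): count the records seen and report the total.
+ *
+ *     static int nrec = 0;
+ *     const char *about(void) { return "Count VCF records.\n"; }
+ *     void version(const char **bcf, const char **hts) { *bcf = "?"; *hts = "?"; }
+ *     int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) { return 1; }
+ *     bcf1_t *process(bcf1_t *rec) { nrec++; return NULL; }
+ *     void destroy(void) { fprintf(stderr, "%d records\n", nrec); }
+ */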
+typedef void (*dl_version_f) (const char **, const char **);
+typedef int (*dl_run_f) (int, char **);
+typedef int (*dl_init_f) (int, char **, bcf_hdr_t *, bcf_hdr_t *);
+typedef char* (*dl_about_f) (void);
+typedef char* (*dl_usage_f) (void);
+typedef bcf1_t* (*dl_process_f) (bcf1_t *);
+typedef void (*dl_destroy_f) (void);
+
+struct _plugin_t
+{
+ int argc;
+ char *name, **argv;
+ dl_version_f version;
+ dl_run_f run;
+ dl_init_f init;
+ dl_about_f about;
+ dl_usage_f usage;
+ dl_process_f process;
+ dl_destroy_f destroy;
+ void *handle;
+};
+
+
+struct _args_t;
+
+typedef struct _rm_tag_t
+{
+ char *key;
+ int hdr_id;
+ void (*handler)(struct _args_t *, bcf1_t *, struct _rm_tag_t *);
+}
+rm_tag_t;
+
+typedef struct
+{
+ char **cols;
+ int ncols, mcols;
+ char **als;
+ int nals, mals;
+ kstring_t line;
+ int rid, start, end;
+}
+annot_line_t;
+
+typedef struct _annot_col_t
+{
+ int icol, replace;
+ char *hdr_key;
+ int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*);
+}
+annot_col_t;
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+typedef struct _args_t
+{
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr, *hdr_out;
+ htsFile *out_fh;
+ int output_type, n_threads;
+
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+
+ plugin_t plugin;
+ int nplugin_paths;
+ char **plugin_paths;
+
+ char **argv, *output_fname, *regions_list, *targets_list;
+ int argc, drop_header, verbose;
+}
+args_t;
+
+char *msprintf(const char *fmt, ...);
+
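+// Split the colon-separated list in `path` and register every component that
+// names an existing directory; an empty component falls back to the
+// compile-time default PLUGINPATH, if one was configured.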
+static void add_plugin_paths(args_t *args, const char *path)
+{
+ while (1)
+ {
+ size_t len = strcspn(path, ":");
+
+ if ( len == 0 )
+ {
+#ifdef PLUGINPATH
+ add_plugin_paths(args, PLUGINPATH);
+#endif
+ }
+ else
+ {
+ char *dir = (char *) malloc(len + 1);
+ strncpy(dir, path, len);
+ dir[len] = '\0';
+
+ struct stat st;
+ if ( stat(dir, &st) == 0 )
+ {
+ args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1));
+ args->plugin_paths[args->nplugin_paths] = dir;
+ args->nplugin_paths++;
+ if ( args->verbose ) fprintf(stderr, "plugin directory %s .. ok\n", dir);
+ }
+ else
+ {
+ if ( args->verbose ) fprintf(stderr, "plugin directory %s .. %s\n", dir, strerror(errno));
+ free(dir);
+ }
+
+ }
+
+ path += len;
+ if ( *path == ':' ) path++;
+ else break;
+ }
+}
+
+static void init_plugin_paths(args_t *args)
+{
+ if ( args->nplugin_paths!=-1 ) return;
+
+ args->nplugin_paths = 0;
+ args->plugin_paths = NULL;
+
+ char *path = getenv("BCFTOOLS_PLUGINS");
+ add_plugin_paths(args, path ? path : "");
+}
+
+static void *dlopen_plugin(args_t *args, const char *fname)
+{
+ init_plugin_paths(args);
+
+ void *handle;
+ char *tmp;
+ if ( fname[0]!='/' ) // not an absolute path
+ {
+ int i;
+ for (i=0; i<args->nplugin_paths; i++)
+ {
+ tmp = msprintf("%s/%s.so", args->plugin_paths[i],fname);
+ handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though
+ if ( args->verbose )
+ {
+ if ( !handle ) fprintf(stderr,"%s:\n\tdlopen .. %s\n", tmp,dlerror());
+ else fprintf(stderr,"%s:\n\tdlopen .. ok\n", tmp);
+ }
+ free(tmp);
+ if ( handle ) return handle;
+ }
+ }
+
+ handle = dlopen(fname, RTLD_NOW);
+ if ( args->verbose )
+ {
+ if ( !handle ) fprintf(stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror());
+ else fprintf(stderr,"%s:\n\tdlopen .. ok\n", fname);
+ }
+
+ return handle;
+}
+
+static void print_plugin_usage_hint(void)
+{
+ fprintf(stderr, "\nNo functional bcftools plugins were found");
+ if ( !getenv("BCFTOOLS_PLUGINS") )
+ fprintf(stderr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n");
+ else
+ fprintf(stderr,
+ " in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n"
+ "- Is the plugin path correct?\n\n"
+ "- Are all shared libraries, namely libhts.so, accessible? Verify with\n"
+ " on Mac OS X: `otool -L your/plugin.so` and set DYLD_LIBRARY_PATH if they are not\n"
+ " on Linux: `ldd your/plugin.so` and set LD_LIBRARY_PATH if they are not\n"
+ "\n"
+ "- If not installed systemwide, set the environment variable LD_LIBRARY_PATH (linux) or\n"
+ "DYLD_LIBRARY_PATH (mac) to include directory where *libhts.so* is located.\n"
+ "\n"
+ "- Run \"bcftools plugin -lv\" for more detailed error output.\n"
+ "\n",
+ getenv("BCFTOOLS_PLUGINS")
+ );
+}
+
+static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugin_t *plugin)
+{
+ plugin->name = strdup(fname);
+
+ plugin->handle = dlopen_plugin(args, fname);
+ if ( !plugin->handle )
+ {
+ if ( exit_on_error )
+ {
+ print_plugin_usage_hint();
+ error("Could not load \"%s\".\n\n", fname);
+ }
+ return -1;
+ }
+
+ dlerror();
+ plugin->init = (dl_init_f) dlsym(plugin->handle, "init");
+ char *ret = dlerror();
+ if ( ret )
+ plugin->init = NULL;
+ else
+ if ( args->verbose ) fprintf(stderr,"\tinit .. ok\n");
+
+ plugin->run = (dl_run_f) dlsym(plugin->handle, "run");
+ ret = dlerror();
+ if ( ret )
+ plugin->run = NULL;
+ else
+ if ( args->verbose ) fprintf(stderr,"\trun .. ok\n");
+
+ if ( !plugin->init && !plugin->run )
+ {
+ if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name);
+ else if ( args->verbose ) fprintf(stderr,"\tinit/run .. not found\n");
+ return -1;
+ }
+
+ plugin->version = (dl_version_f) dlsym(plugin->handle, "version");
+ ret = dlerror();
+ if ( ret )
+ {
+ if ( exit_on_error ) error("Could not initialize %s, version string not found\n", plugin->name);
+ else if ( args->verbose ) fprintf(stderr,"\tversion .. not found\n");
+ return -1;
+ }
+
+ plugin->about = (dl_about_f) dlsym(plugin->handle, "about");
+ ret = dlerror();
+ if ( ret )
+ {
+ if ( exit_on_error ) error("Could not initialize %s: %s\n", plugin->name, ret);
+ return -1;
+ }
+
+ plugin->usage = (dl_about_f) dlsym(plugin->handle, "usage");
+ ret = dlerror();
+ if ( ret )
+ plugin->usage = plugin->about;
+
+ if ( plugin->run ) return 0;
+
+ plugin->process = (dl_process_f) dlsym(plugin->handle, "process");
+ ret = dlerror();
+ if ( ret )
+ {
+ if ( exit_on_error ) error("Could not initialize %s: %s\n", plugin->name, ret);
+ return -1;
+ }
+
+ plugin->destroy = (dl_destroy_f) dlsym(plugin->handle, "destroy");
+ ret = dlerror();
+ if ( ret )
+ {
+ if ( exit_on_error ) error("Could not initialize %s: %s\n", plugin->name, ret);
+ return -1;
+ }
+
+ return 0;
+}
+
+static void init_plugin(args_t *args)
+{
+ static int warned_bcftools = 0, warned_htslib = 0;
+
+ int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out);
+ if ( ret<0 ) error("The plugin exited with an error.\n");
+ const char *bver, *hver;
+ args->plugin.version(&bver, &hver);
+ if ( strcmp(bver,bcftools_version()) && !warned_bcftools )
+ {
+ fprintf(stderr,"WARNING: bcftools version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", bcftools_version(),args->plugin.name,bver);
+ warned_bcftools = 1;
+ }
+ if ( strcmp(hver,hts_version()) && !warned_htslib )
+ {
+ fprintf(stderr,"WARNING: htslib version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver);
+ warned_htslib = 1;
+ }
+ args->drop_header += ret;
+}
+
+static int cmp_plugin_name(const void *p1, const void *p2)
+{
+ plugin_t *a = (plugin_t*) p1;
+ plugin_t *b = (plugin_t*) p2;
+ return strcmp(a->name,b->name);
+}
+
+static int list_plugins(args_t *args)
+{
+ plugin_t *plugins = NULL;
+ int nplugins = 0, mplugins = 0;
+
+ init_plugin_paths(args);
+
+ kstring_t str = {0,0,0};
+ int i;
+ for (i=0; i<args->nplugin_paths; i++)
+ {
+ DIR *dp = opendir(args->plugin_paths[i]);
+ if ( dp==NULL ) continue;
+
+ struct dirent *ep;
+ while ( (ep=readdir(dp)) )
+ {
+ int len = strlen(ep->d_name);
+ if ( strcasecmp(".so",ep->d_name+len-3) ) continue;
+ str.l = 0;
+ ksprintf(&str,"%s/%s", args->plugin_paths[i],ep->d_name);
+ hts_expand(plugin_t, nplugins+1, mplugins, plugins);
+ if ( load_plugin(args, str.s, 0, &plugins[nplugins]) < 0 ) continue;
+ nplugins++;
+ str.l = 0;
+ kputs(ep->d_name, &str);
+ int l = str.l - 1;
+ while ( l>=0 && str.s[l]!='.' ) l--;
+ if ( l>=0 ) str.s[l] = 0;
+ free(plugins[nplugins-1].name);
+ plugins[nplugins-1].name = strdup(str.s); // use a short name
+ }
+ closedir(dp);
+ }
+ if ( nplugins )
+ {
+ qsort(plugins, nplugins, sizeof(plugins[0]), cmp_plugin_name);
+
+ for (i=0; i<nplugins; i++)
+ printf("\n-- %s --\n%s", plugins[i].name, plugins[i].about());
+ printf("\n");
+ }
+ else
+ print_plugin_usage_hint();
+ free(str.s);
+ return nplugins ? 0 : 1;
+}
+
+static void init_data(args_t *args)
+{
+ args->hdr = args->files->readers[0].header;
+ args->hdr_out = bcf_hdr_dup(args->hdr);
+
+ init_plugin(args);
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
+
+ bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
+ if ( !args->drop_header )
+ {
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ bcf_hdr_write(args->out_fh, args->hdr_out);
+ }
+}
+
+static void destroy_data(args_t *args)
+{
+ free(args->plugin.name);
+ if ( args->plugin.destroy ) args->plugin.destroy();
+ dlclose(args->plugin.handle);
+ if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
+ if ( args->nplugin_paths>0 )
+ {
+ int i;
+ for (i=0; i<args->nplugin_paths; i++) free(args->plugin_paths[i]);
+ free(args->plugin_paths);
+ }
+ if ( args->filter )
+ filter_destroy(args->filter);
+ if (args->out_fh) hts_close(args->out_fh);
+}
+
+static void usage(args_t *args)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Run user defined plugin\n");
+ fprintf(stderr, "Usage: bcftools plugin <name> [OPTIONS] <file> [-- PLUGIN_OPTIONS]\n");
+ fprintf(stderr, " bcftools +name [OPTIONS] <file> [-- PLUGIN_OPTIONS]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "VCF input options:\n");
+ fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
+ fprintf(stderr, " -i, --include <expr> select sites for which the expression is true\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, "VCF output options:\n");
+ fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, "Plugin options:\n");
+ fprintf(stderr, " -h, --help list plugin's options\n");
+ fprintf(stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
+ fprintf(stderr, " -v, --verbose print debugging information on plugin failure\n");
+ fprintf(stderr, " -V, --version print version string and exit\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+int main_plugin(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->output_fname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ args->nplugin_paths = -1;
+ int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0;
+
+ if ( argc==1 ) usage(args);
+ char *plugin_name = NULL;
+ if ( argv[1][0]!='-' ) { plugin_name = argv[1]; argc--; argv++; }
+
+ static struct option loptions[] =
+ {
+ {"version",no_argument,NULL,'V'},
+ {"verbose",no_argument,NULL,'v'},
+ {"help",no_argument,NULL,'h'},
+ {"list-plugins",no_argument,NULL,'l'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"include",required_argument,NULL,'i'},
+ {"exclude",required_argument,NULL,'e'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {NULL,0,NULL,0}
+ };
+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0)
+ {
+ switch (c) {
+ case 'V': version_only = 1; break;
+ case 'v': args->verbose = 1; break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ };
+ break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'l': plist_only = 1; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case '?':
+ case 'h': usage_only = 1; break;
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ if ( plist_only ) return list_plugins(args);
+ if ( usage_only && ! plugin_name ) usage(args);
+
+ load_plugin(args, plugin_name, 1, &args->plugin);
+ if ( version_only )
+ {
+ const char *bver, *hver;
+ args->plugin.version(&bver, &hver);
+ printf("bcftools %s using htslib %s\n", bcftools_version(), hts_version());
+ printf("plugin at %s using htslib %s\n\n", bver, hver);
+ return 0;
+ }
+
+ if ( usage_only )
+ {
+ if ( args->plugin.usage )
+ fprintf(stderr,"%s",args->plugin.usage());
+ else
+ fprintf(stderr,"Usage: bcftools +%s [General Options] -- [Plugin Options]\n",plugin_name);
+ return 0;
+ }
+
+ if ( args->plugin.run )
+ {
+ int iopt = optind; optind = 0;
+ int ret = args->plugin.run(argc-iopt, argv+iopt);
+ destroy_data(args);
+ free(args);
+ return ret;
+ }
+
+ char *fname = NULL;
+ if ( optind>=argc || argv[optind][0]=='-' )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(args);
+ args->plugin.argc = argc - optind + 1;
+ args->plugin.argv = argv + optind - 1;
+ }
+ else
+ {
+ fname = argv[optind];
+ args->plugin.argc = argc - optind;
+ args->plugin.argv = argv + optind;
+ }
+ optind = 0;
+
+ args->files = bcf_sr_init();
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ args->files->collapse |= COLLAPSE_SOME;
+ }
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+
+ init_data(args);
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) continue;
+ }
+ line = args->plugin.process(line);
+ if ( line ) bcf_write1(args->out_fh, args->hdr_out, line);
+ }
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
+
diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c
new file mode 100644
index 0000000..5c29993
--- /dev/null
+++ b/bcftools/vcfplugin.c.pysam.c
@@ -0,0 +1,616 @@
+#include "pysam.h"
+
+/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/kseq.h>
+#include <htslib/khash_str2int.h>
+#include <dlfcn.h>
+#include "bcftools.h"
+#include "vcmp.h"
+#include "filter.h"
+
+typedef struct _plugin_t plugin_t;
+
+/**
+ * Plugin API:
+ * ----------
+ * const char *about(void)
+ * - short description used by 'bcftools plugin -l'
+ *
+ * const char *usage(void)
+ * - longer description used by 'bcftools +name -h'
+ *
+ * int run(int argc, char **argv)
+ * - if implemented, the control is immediately handed over to the plugin,
+ *      - if implemented, control is immediately handed over to the plugin and
+ * success or non-zero value on error.
+ *
+ * int init(int argc, char **argv, bcf_hdr_t *in_hdr, bcf_hdr_t *out_hdr)
+ *      - called once at startup, allowing the plugin to initialize local variables.
+ * Return 1 to suppress normal VCF/BCF header output, -1 on critical
+ * errors, 0 otherwise.
+ *
+ * bcf1_t *process(bcf1_t *rec)
+ * - called for each VCF record, return NULL for no output
+ *
+ * void destroy(void)
+ * - called after all lines have been processed to clean up
+ */
+typedef void (*dl_version_f) (const char **, const char **);
+typedef int (*dl_run_f) (int, char **);
+typedef int (*dl_init_f) (int, char **, bcf_hdr_t *, bcf_hdr_t *);
+typedef char* (*dl_about_f) (void);
+typedef char* (*dl_usage_f) (void);
+typedef bcf1_t* (*dl_process_f) (bcf1_t *);
+typedef void (*dl_destroy_f) (void);
+
+struct _plugin_t
+{
+ int argc;
+ char *name, **argv;
+ dl_version_f version;
+ dl_run_f run;
+ dl_init_f init;
+ dl_about_f about;
+ dl_usage_f usage;
+ dl_process_f process;
+ dl_destroy_f destroy;
+ void *handle;
+};
+
+
+struct _args_t;
+
+typedef struct _rm_tag_t
+{
+ char *key;
+ int hdr_id;
+ void (*handler)(struct _args_t *, bcf1_t *, struct _rm_tag_t *);
+}
+rm_tag_t;
+
+typedef struct
+{
+ char **cols;
+ int ncols, mcols;
+ char **als;
+ int nals, mals;
+ kstring_t line;
+ int rid, start, end;
+}
+annot_line_t;
+
+typedef struct _annot_col_t
+{
+ int icol, replace;
+ char *hdr_key;
+ int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*);
+}
+annot_col_t;
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+typedef struct _args_t
+{
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr, *hdr_out;
+ htsFile *out_fh;
+ int output_type, n_threads;
+
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+
+ plugin_t plugin;
+ int nplugin_paths;
+ char **plugin_paths;
+
+ char **argv, *output_fname, *regions_list, *targets_list;
+ int argc, drop_header, verbose;
+}
+args_t;
+
+char *msprintf(const char *fmt, ...);
+
+static void add_plugin_paths(args_t *args, const char *path)
+{
+ while (1)
+ {
+ size_t len = strcspn(path, ":");
+
+ if ( len == 0 )
+ {
+#ifdef PLUGINPATH
+ add_plugin_paths(args, PLUGINPATH);
+#endif
+ }
+ else
+ {
+ char *dir = (char *) malloc(len + 1);
+ strncpy(dir, path, len);
+ dir[len] = '\0';
+
+ struct stat st;
+ if ( stat(dir, &st) == 0 )
+ {
+ args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1));
+ args->plugin_paths[args->nplugin_paths] = dir;
+ args->nplugin_paths++;
+ if ( args->verbose ) fprintf(pysamerr, "plugin directory %s .. ok\n", dir);
+ }
+ else
+ {
+ if ( args->verbose ) fprintf(pysamerr, "plugin directory %s .. %s\n", dir, strerror(errno));
+ free(dir);
+ }
+
+ }
+
+ path += len;
+ if ( *path == ':' ) path++;
+ else break;
+ }
+}
+
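/* Illustrative note, not part of the upstream file: add_plugin_paths() above
 * treats its argument as a colon-separated list, so a (hypothetical) setting
 * such as
 *
 *     BCFTOOLS_PLUGINS=/opt/bcftools/plugins:/usr/local/lib/bcftools
 *
 * registers both directories, while an empty element (len==0) falls back to
 * the compile-time PLUGINPATH, when defined.
 */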
+static void init_plugin_paths(args_t *args)
+{
+ if ( args->nplugin_paths!=-1 ) return;
+
+ args->nplugin_paths = 0;
+ args->plugin_paths = NULL;
+
+ char *path = getenv("BCFTOOLS_PLUGINS");
+ add_plugin_paths(args, path ? path : "");
+}
+
+static void *dlopen_plugin(args_t *args, const char *fname)
+{
+ init_plugin_paths(args);
+
+ void *handle;
+ char *tmp;
+ if ( fname[0]!='/' ) // not an absolute path
+ {
+ int i;
+ for (i=0; i<args->nplugin_paths; i++)
+ {
+ tmp = msprintf("%s/%s.so", args->plugin_paths[i],fname);
+ handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though
+ if ( args->verbose )
+ {
+ if ( !handle ) fprintf(pysamerr,"%s:\n\tdlopen .. %s\n", tmp,dlerror());
+ else fprintf(pysamerr,"%s:\n\tdlopen .. ok\n", tmp);
+ }
+ free(tmp);
+ if ( handle ) return handle;
+ }
+ }
+
+ handle = dlopen(fname, RTLD_NOW);
+ if ( args->verbose )
+ {
+ if ( !handle ) fprintf(pysamerr,"%s:\n\tdlopen .. %s\n", fname,dlerror());
+ else fprintf(pysamerr,"%s:\n\tdlopen .. ok\n", fname);
+ }
+
+ return handle;
+}
+
+static void print_plugin_usage_hint(void)
+{
+ fprintf(pysamerr, "\nNo functional bcftools plugins were found");
+ if ( !getenv("BCFTOOLS_PLUGINS") )
+ fprintf(pysamerr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n");
+ else
+ fprintf(pysamerr,
+ " in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n"
+ "- Is the plugin path correct?\n\n"
+ "- Are all shared libraries, namely libhts.so, accessible? Verify with\n"
+ " on Mac OS X: `otool -L your/plugin.so` and set DYLD_LIBRARY_PATH if they are not\n"
+ " on Linux: `ldd your/plugin.so` and set LD_LIBRARY_PATH if they are not\n"
+ "\n"
+ "- If not installed systemwide, set the environment variable LD_LIBRARY_PATH (linux) or\n"
+ "DYLD_LIBRARY_PATH (mac) to include directory where *libhts.so* is located.\n"
+ "\n"
+ "- Run \"bcftools plugin -lv\" for more detailed error output.\n"
+ "\n",
+ getenv("BCFTOOLS_PLUGINS")
+ );
+}
+
+static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugin_t *plugin)
+{
+ plugin->name = strdup(fname);
+
+ plugin->handle = dlopen_plugin(args, fname);
+ if ( !plugin->handle )
+ {
+ if ( exit_on_error )
+ {
+ print_plugin_usage_hint();
+ error("Could not load \"%s\".\n\n", fname);
+ }
+ return -1;
+ }
+
+ dlerror();
+ plugin->init = (dl_init_f) dlsym(plugin->handle, "init");
+ char *ret = dlerror();
+ if ( ret )
+ plugin->init = NULL;
+ else
+ if ( args->verbose ) fprintf(pysamerr,"\tinit .. ok\n");
+
+ plugin->run = (dl_run_f) dlsym(plugin->handle, "run");
+ ret = dlerror();
+ if ( ret )
+ plugin->run = NULL;
+ else
+ if ( args->verbose ) fprintf(pysamerr,"\trun .. ok\n");
+
+ if ( !plugin->init && !plugin->run )
+ {
+        if ( exit_on_error ) error("Could not initialize %s, neither run nor init found\n", plugin->name);
+ else if ( args->verbose ) fprintf(pysamerr,"\tinit/run .. not found\n");
+ return -1;
+ }
+
+ plugin->version = (dl_version_f) dlsym(plugin->handle, "version");
+ ret = dlerror();
+ if ( ret )
+ {
+ if ( exit_on_error ) error("Could not initialize %s, version string not found\n", plugin->name);
+ else if ( args->verbose ) fprintf(pysamerr,"\tversion .. not found\n");
+ return -1;
+ }
+
+ plugin->about = (dl_about_f) dlsym(plugin->handle, "about");
+ ret = dlerror();
+ if ( ret )
+ {
+ if ( exit_on_error ) error("Could not initialize %s: %s\n", plugin->name, ret);
+ return -1;
+ }
+
+ plugin->usage = (dl_about_f) dlsym(plugin->handle, "usage");
+ ret = dlerror();
+ if ( ret )
+ plugin->usage = plugin->about;
+
+ if ( plugin->run ) return 0;
+
+ plugin->process = (dl_process_f) dlsym(plugin->handle, "process");
+ ret = dlerror();
+ if ( ret )
+ {
+ if ( exit_on_error ) error("Could not initialize %s: %s\n", plugin->name, ret);
+ return -1;
+ }
+
+ plugin->destroy = (dl_destroy_f) dlsym(plugin->handle, "destroy");
+ ret = dlerror();
+ if ( ret )
+ {
+ if ( exit_on_error ) error("Could not initialize %s: %s\n", plugin->name, ret);
+ return -1;
+ }
+
+ return 0;
+}
+
+static void init_plugin(args_t *args)
+{
+ static int warned_bcftools = 0, warned_htslib = 0;
+
+ int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out);
+ if ( ret<0 ) error("The plugin exited with an error.\n");
+ const char *bver, *hver;
+ args->plugin.version(&bver, &hver);
+ if ( strcmp(bver,bcftools_version()) && !warned_bcftools )
+ {
+ fprintf(pysamerr,"WARNING: bcftools version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", bcftools_version(),args->plugin.name,bver);
+ warned_bcftools = 1;
+ }
+ if ( strcmp(hver,hts_version()) && !warned_htslib )
+ {
+ fprintf(pysamerr,"WARNING: htslib version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver);
+ warned_htslib = 1;
+ }
+ args->drop_header += ret;
+}
+
+static int cmp_plugin_name(const void *p1, const void *p2)
+{
+ plugin_t *a = (plugin_t*) p1;
+ plugin_t *b = (plugin_t*) p2;
+ return strcmp(a->name,b->name);
+}
+
+static int list_plugins(args_t *args)
+{
+ plugin_t *plugins = NULL;
+ int nplugins = 0, mplugins = 0;
+
+ init_plugin_paths(args);
+
+ kstring_t str = {0,0,0};
+ int i;
+ for (i=0; i<args->nplugin_paths; i++)
+ {
+ DIR *dp = opendir(args->plugin_paths[i]);
+ if ( dp==NULL ) continue;
+
+ struct dirent *ep;
+ while ( (ep=readdir(dp)) )
+ {
+ int len = strlen(ep->d_name);
+ if ( strcasecmp(".so",ep->d_name+len-3) ) continue;
+ str.l = 0;
+ ksprintf(&str,"%s/%s", args->plugin_paths[i],ep->d_name);
+ hts_expand(plugin_t, nplugins+1, mplugins, plugins);
+ if ( load_plugin(args, str.s, 0, &plugins[nplugins]) < 0 ) continue;
+ nplugins++;
+ str.l = 0;
+ kputs(ep->d_name, &str);
+ int l = str.l - 1;
+ while ( l>=0 && str.s[l]!='.' ) l--;
+ if ( l>=0 ) str.s[l] = 0;
+ free(plugins[nplugins-1].name);
+ plugins[nplugins-1].name = strdup(str.s); // use a short name
+ }
+ closedir(dp);
+ }
+ if ( nplugins )
+ {
+ qsort(plugins, nplugins, sizeof(plugins[0]), cmp_plugin_name);
+
+ for (i=0; i<nplugins; i++)
+ printf("\n-- %s --\n%s", plugins[i].name, plugins[i].about());
+ printf("\n");
+ }
+ else
+ print_plugin_usage_hint();
+ free(str.s);
+ return nplugins ? 0 : 1;
+}
+
+static void init_data(args_t *args)
+{
+ args->hdr = args->files->readers[0].header;
+ args->hdr_out = bcf_hdr_dup(args->hdr);
+
+ init_plugin(args);
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
+
+ bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
+ if ( !args->drop_header )
+ {
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ bcf_hdr_write(args->out_fh, args->hdr_out);
+ }
+}
+
+static void destroy_data(args_t *args)
+{
+ free(args->plugin.name);
+ if ( args->plugin.destroy ) args->plugin.destroy();
+ dlclose(args->plugin.handle);
+ if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
+ if ( args->nplugin_paths>0 )
+ {
+ int i;
+ for (i=0; i<args->nplugin_paths; i++) free(args->plugin_paths[i]);
+ free(args->plugin_paths);
+ }
+ if ( args->filter )
+ filter_destroy(args->filter);
+ if (args->out_fh) hts_close(args->out_fh);
+}
+
+static void usage(args_t *args)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Run user defined plugin\n");
+ fprintf(pysamerr, "Usage: bcftools plugin <name> [OPTIONS] <file> [-- PLUGIN_OPTIONS]\n");
+ fprintf(pysamerr, " bcftools +name [OPTIONS] <file> [-- PLUGIN_OPTIONS]\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "VCF input options:\n");
+ fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
+ fprintf(pysamerr, " -i, --include <expr> select sites for which the expression is true\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysamerr, "VCF output options:\n");
+ fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysamerr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysamerr, "Plugin options:\n");
+ fprintf(pysamerr, " -h, --help list plugin's options\n");
+ fprintf(pysamerr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
+ fprintf(pysamerr, " -v, --verbose print debugging information on plugin failure\n");
+ fprintf(pysamerr, " -V, --version print version string and exit\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_plugin(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->output_fname = "-";
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ args->nplugin_paths = -1;
+ int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0;
+
+ if ( argc==1 ) usage(args);
+ char *plugin_name = NULL;
+ if ( argv[1][0]!='-' ) { plugin_name = argv[1]; argc--; argv++; }
+
+ static struct option loptions[] =
+ {
+ {"version",no_argument,NULL,'V'},
+ {"verbose",no_argument,NULL,'v'},
+ {"help",no_argument,NULL,'h'},
+ {"list-plugins",no_argument,NULL,'l'},
+ {"output",required_argument,NULL,'o'},
+ {"output-type",required_argument,NULL,'O'},
+ {"threads",required_argument,NULL,9},
+ {"include",required_argument,NULL,'i'},
+ {"exclude",required_argument,NULL,'e'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {NULL,0,NULL,0}
+ };
+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0)
+ {
+ switch (c) {
+ case 'V': version_only = 1; break;
+ case 'v': args->verbose = 1; break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ };
+ break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'l': plist_only = 1; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case '?':
+ case 'h': usage_only = 1; break;
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ if ( plist_only ) return list_plugins(args);
+ if ( usage_only && ! plugin_name ) usage(args);
+
+ load_plugin(args, plugin_name, 1, &args->plugin);
+ if ( version_only )
+ {
+ const char *bver, *hver;
+ args->plugin.version(&bver, &hver);
+ printf("bcftools %s using htslib %s\n", bcftools_version(), hts_version());
+ printf("plugin at %s using htslib %s\n\n", bver, hver);
+ return 0;
+ }
+
+ if ( usage_only )
+ {
+ if ( args->plugin.usage )
+ fprintf(pysamerr,"%s",args->plugin.usage());
+ else
+ fprintf(pysamerr,"Usage: bcftools +%s [General Options] -- [Plugin Options]\n",plugin_name);
+ return 0;
+ }
+
+ if ( args->plugin.run )
+ {
+ int iopt = optind; optind = 0;
+ int ret = args->plugin.run(argc-iopt, argv+iopt);
+ destroy_data(args);
+ free(args);
+ return ret;
+ }
+
+ char *fname = NULL;
+ if ( optind>=argc || argv[optind][0]=='-' )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(args);
+ args->plugin.argc = argc - optind + 1;
+ args->plugin.argv = argv + optind - 1;
+ }
+ else
+ {
+ fname = argv[optind];
+ args->plugin.argc = argc - optind;
+ args->plugin.argv = argv + optind;
+ }
+ optind = 0;
+
+ args->files = bcf_sr_init();
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ args->files->collapse |= COLLAPSE_SOME;
+ }
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+
+ init_data(args);
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) continue;
+ }
+ line = args->plugin.process(line);
+ if ( line ) bcf_write1(args->out_fh, args->hdr_out, line);
+ }
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
+
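For readers unfamiliar with the plugin API documented in the comment block near the top of vcfplugin.c.pysam.c above (about/usage/version/run/init/process/destroy, resolved with dlsym() in load_plugin()), the following sketch shows what a minimal plugin could look like. It is an illustration only, not part of this import: the site-counting behaviour and the hard-coded version string are invented.

    /* count_sites.c -- hypothetical minimal bcftools plugin (illustration only) */
    #include <stdio.h>
    #include <htslib/vcf.h>

    static long nsites = 0;

    /* short description shown by 'bcftools plugin -l' */
    const char *about(void)
    {
        return "Count the number of sites passed through the plugin.\n";
    }

    /* bcftools compares these against its own versions and warns on mismatch */
    void version(const char **bcftools_ver, const char **htslib_ver)
    {
        *bcftools_ver = "1.3";          /* hard-coded here for illustration only */
        *htslib_ver   = hts_version();
    }

    /* return 1 to suppress the normal VCF/BCF header output (see init_plugin) */
    int init(int argc, char **argv, bcf_hdr_t *in_hdr, bcf_hdr_t *out_hdr)
    {
        return 1;
    }

    /* called for every record; returning NULL means "write nothing" */
    bcf1_t *process(bcf1_t *rec)
    {
        nsites++;
        return NULL;
    }

    /* called once after the last record */
    void destroy(void)
    {
        fprintf(stderr, "processed %ld sites\n", nsites);
    }

Built as a shared object linked against htslib and placed in one of the BCFTOOLS_PLUGINS directories, such a plugin would be invoked as "bcftools +count_sites in.vcf".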
diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c
new file mode 100644
index 0000000..ab4c100
--- /dev/null
+++ b/bcftools/vcfquery.c
@@ -0,0 +1,373 @@
+/* vcfquery.c -- Extracts fields from VCF/BCF file.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+#include "filter.h"
+#include "convert.h"
+
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+typedef struct
+{
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+ convert_t *convert;
+ bcf_srs_t *files;
+ bcf_hdr_t *header;
+ int nsamples, *samples, sample_is_file;
+ char **argv, *format_str, *sample_list, *targets_list, *regions_list, *vcf_list, *fn_out;
+ int argc, list_columns, print_header, allow_undef_tags;
+ FILE *out;
+}
+args_t;
+
+static void destroy_list(char **list, int n)
+{
+ int i;
+ for (i=0; i<n; i++)
+ free(list[i]);
+ free(list);
+}
+
+static void init_data(args_t *args)
+{
+ args->header = args->files->readers[0].header;
+
+ int i, nsamples = 0, *samples = NULL;
+ if ( args->sample_list && strcmp("-",args->sample_list) )
+ {
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file);
+ if ( ret<0 ) error("Error parsing the sample list\n");
+ else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret);
+ }
+
+ if ( args->sample_list[0]!='^' )
+ {
+ // the sample ordering may be different if not negated
+ int n;
+ char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
+ if ( !smpls ) error("Could not parse %s\n", args->sample_list);
+ if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) )
+ error("The number of samples does not match, perhaps some are present multiple times?\n");
+ nsamples = bcf_hdr_nsamples(args->files->readers[0].header);
+ samples = (int*) malloc(sizeof(int)*nsamples);
+ for (i=0; i<n; i++)
+ {
+ samples[i] = bcf_hdr_id2int(args->files->readers[0].header, BCF_DT_SAMPLE,smpls[i]);
+ free(smpls[i]);
+ }
+ free(smpls);
+ }
+ }
+ args->convert = convert_init(args->header, samples, nsamples, args->format_str);
+ if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
+ free(samples);
+
+ int max_unpack = convert_max_unpack(args->convert);
+ if ( args->filter_str )
+ {
+ args->filter = filter_init(args->header, args->filter_str);
+ max_unpack |= filter_max_unpack(args->filter);
+ }
+ args->files->max_unpack = max_unpack;
+}
+
+static void destroy_data(args_t *args)
+{
+ convert_destroy(args->convert);
+ if ( args->filter )
+ filter_destroy(args->filter);
+ free(args->samples);
+}
+
+static void query_vcf(args_t *args)
+{
+ kstring_t str = {0,0,0};
+
+ if ( args->print_header )
+ {
+ convert_header(args->convert,&str);
+ fwrite(str.s, str.l, 1, args->out);
+ }
+
+ while ( bcf_sr_next_line(args->files) )
+ {
+ if ( !bcf_sr_has_line(args->files,0) ) continue;
+ bcf1_t *line = args->files->readers[0].buffer[0];
+ bcf_unpack(line, args->files->max_unpack);
+
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) continue;
+ }
+
+ str.l = 0;
+ convert_line(args->convert, line, &str);
+ if ( str.l )
+ fwrite(str.s, str.l, 1, args->out);
+ }
+ if ( str.m ) free(str.s);
+}
+
+static void list_columns(args_t *args)
+{
+ int i;
+ bcf_sr_t *reader = &args->files->readers[0];
+ for (i=0; i<bcf_hdr_nsamples(reader->header); i++)
+ printf("%s\n", reader->header->samples[i]);
+}
+
+static char **copy_header(bcf_hdr_t *hdr, char **src, int nsrc)
+{
+ char **dst = (char**) malloc(sizeof(char*)*nsrc);
+ int i;
+ for (i=0; i<nsrc; i++) dst[i] = strdup(src[i]);
+ return dst;
+}
+static int compare_header(bcf_hdr_t *hdr, char **a, int na, char **b, int nb)
+{
+ if ( na!=nb ) return na-nb;
+ int i;
+ for (i=0; i<na; i++)
+ if ( strcmp(a[i],b[i]) ) return 1;
+ return 0;
+}
+
+
+static void usage(void)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Extracts fields from VCF/BCF file and prints them in user-defined format\n");
+ fprintf(stderr, "Usage: bcftools query [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -c, --collapse <string> collapse lines with duplicate positions for <snps|indels|both|all|some|none>, see man page [none]\n");
+ fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -f, --format <string> see man page for details\n");
+ fprintf(stderr, " -H, --print-header print header\n");
+ fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -l, --list-samples print the list of samples and exit\n");
+ fprintf(stderr, " -o, --output-file <file> output file name [stdout]\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " -s, --samples <list> list of samples to include\n");
+ fprintf(stderr, " -S, --samples-file <file> file of samples to include\n");
+ fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " -u, --allow-undef-tags print \".\" for undefined tags\n");
+ fprintf(stderr, " -v, --vcf-list <file> process multiple VCFs listed in the file\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Examples:\n");
+ fprintf(stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+int main_vcfquery(int argc, char *argv[])
+{
+ int c, collapse = 0;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ int regions_is_file = 0, targets_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"help",0,0,'h'},
+ {"list-samples",0,0,'l'},
+ {"include",1,0,'i'},
+ {"exclude",1,0,'e'},
+ {"format",1,0,'f'},
+ {"output-file",1,0,'o'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {"annots",1,0,'a'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
+ {"print-header",0,0,'H'},
+ {"collapse",1,0,'c'},
+ {"vcf-list",1,0,'v'},
+ {"allow-undef-tags",0,0,'u'},
+ {0,0,0,0}
+ };
+ while ((c = getopt_long(argc, argv, "hlr:R:f:a:s:S:Ht:T:c:v:i:e:o:u",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'o': args->fn_out = optarg; break;
+ case 'f': args->format_str = strdup(optarg); break;
+ case 'H': args->print_header = 1; break;
+ case 'v': args->vcf_list = optarg; break;
+ case 'c':
+ if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME;
+ else error("The --collapse string \"%s\" not recognised.\n", optarg);
+ break;
+ case 'a':
+ {
+ kstring_t str = {0,0,0};
+ kputs("%CHROM\t%POS\t%MASK\t%REF\t%ALT\t%", &str);
+ char *p = optarg;
+ while ( *p )
+ {
+ if ( *p==',' )
+ kputs("\t%", &str);
+ else
+ kputc(*p, &str);
+ p++;
+ }
+ kputc('\n', &str);
+ args->format_str = str.s;
+ break;
+ }
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'l': args->list_columns = 1; break;
+ case 'u': args->allow_undef_tags = 1; break;
+ case 's': args->sample_list = optarg; break;
+ case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";
+ }
+ else fname = argv[optind];
+
+ if ( args->list_columns )
+ {
+ if ( !fname ) error("Missing the VCF file name\n");
+ args->files = bcf_sr_init();
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+ list_columns(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+ }
+
+ if ( !args->format_str ) usage();
+ args->out = args->fn_out ? fopen(args->fn_out, "w") : stdout;
+ if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
+
+ if ( !args->vcf_list )
+ {
+ if ( !fname ) usage();
+ args->files = bcf_sr_init();
+ args->files->collapse = collapse;
+ if ( optind+1 < argc ) args->files->require_index = 1;
+ if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ while ( fname )
+ {
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+ fname = ++optind < argc ? argv[optind] : NULL;
+ }
+ init_data(args);
+ query_vcf(args);
+ free(args->format_str);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ fclose(args->out);
+ free(args);
+ return 0;
+ }
+
+ // multiple VCFs
+ int i, k, nfiles, prev_nsamples = 0;
+ char **fnames, **prev_samples = NULL;
+ fnames = hts_readlist(args->vcf_list, 1, &nfiles);
+ if ( !nfiles ) error("No files in %s?\n", args->vcf_list);
+ for (i=0; i<nfiles; i++)
+ {
+ args->files = bcf_sr_init();
+ args->files->collapse = collapse;
+ if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ if ( optind < argc ) args->files->require_index = 1;
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( !bcf_sr_add_reader(args->files, fnames[i]) ) error("Failed to open %s: %s\n", fnames[i],bcf_sr_strerror(args->files->errnum));
+ for (k=optind; k<argc; k++)
+ if ( !bcf_sr_add_reader(args->files, argv[k]) ) error("Failed to open %s: %s\n", argv[k],bcf_sr_strerror(args->files->errnum));
+ init_data(args);
+ if ( i==0 )
+ prev_samples = copy_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header));
+ else
+ {
+ args->print_header = 0;
+ if ( compare_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header), prev_samples, prev_nsamples) )
+ error("Different samples in %s and %s\n", fnames[i-1],fnames[i]);
+ }
+ query_vcf(args);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ }
+ fclose(args->out);
+ destroy_list(fnames, nfiles);
+ destroy_list(prev_samples, prev_nsamples);
+ free(args->format_str);
+ free(args);
+ return 0;
+}
+
+
diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c
new file mode 100644
index 0000000..1265b57
--- /dev/null
+++ b/bcftools/vcfquery.c.pysam.c
@@ -0,0 +1,375 @@
+#include "pysam.h"
+
+/* vcfquery.c -- Extracts fields from VCF/BCF file.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+#include "filter.h"
+#include "convert.h"
+
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+typedef struct
+{
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+ convert_t *convert;
+ bcf_srs_t *files;
+ bcf_hdr_t *header;
+ int nsamples, *samples, sample_is_file;
+ char **argv, *format_str, *sample_list, *targets_list, *regions_list, *vcf_list, *fn_out;
+ int argc, list_columns, print_header, allow_undef_tags;
+ FILE *out;
+}
+args_t;
+
+static void destroy_list(char **list, int n)
+{
+ int i;
+ for (i=0; i<n; i++)
+ free(list[i]);
+ free(list);
+}
+
+static void init_data(args_t *args)
+{
+ args->header = args->files->readers[0].header;
+
+ int i, nsamples = 0, *samples = NULL;
+ if ( args->sample_list && strcmp("-",args->sample_list) )
+ {
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file);
+ if ( ret<0 ) error("Error parsing the sample list\n");
+ else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret);
+ }
+
+ if ( args->sample_list[0]!='^' )
+ {
+ // the sample ordering may be different if not negated
+ int n;
+ char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
+ if ( !smpls ) error("Could not parse %s\n", args->sample_list);
+ if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) )
+ error("The number of samples does not match, perhaps some are present multiple times?\n");
+ nsamples = bcf_hdr_nsamples(args->files->readers[0].header);
+ samples = (int*) malloc(sizeof(int)*nsamples);
+ for (i=0; i<n; i++)
+ {
+ samples[i] = bcf_hdr_id2int(args->files->readers[0].header, BCF_DT_SAMPLE,smpls[i]);
+ free(smpls[i]);
+ }
+ free(smpls);
+ }
+ }
+ args->convert = convert_init(args->header, samples, nsamples, args->format_str);
+ if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
+ free(samples);
+
+ int max_unpack = convert_max_unpack(args->convert);
+ if ( args->filter_str )
+ {
+ args->filter = filter_init(args->header, args->filter_str);
+ max_unpack |= filter_max_unpack(args->filter);
+ }
+ args->files->max_unpack = max_unpack;
+}
+
+static void destroy_data(args_t *args)
+{
+ convert_destroy(args->convert);
+ if ( args->filter )
+ filter_destroy(args->filter);
+ free(args->samples);
+}
+
+static void query_vcf(args_t *args)
+{
+ kstring_t str = {0,0,0};
+
+ if ( args->print_header )
+ {
+ convert_header(args->convert,&str);
+ fwrite(str.s, str.l, 1, args->out);
+ }
+
+ while ( bcf_sr_next_line(args->files) )
+ {
+ if ( !bcf_sr_has_line(args->files,0) ) continue;
+ bcf1_t *line = args->files->readers[0].buffer[0];
+ bcf_unpack(line, args->files->max_unpack);
+
+ if ( args->filter )
+ {
+ int pass = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+ if ( !pass ) continue;
+ }
+
+ str.l = 0;
+ convert_line(args->convert, line, &str);
+ if ( str.l )
+ fwrite(str.s, str.l, 1, args->out);
+ }
+ if ( str.m ) free(str.s);
+}
+
+static void list_columns(args_t *args)
+{
+ int i;
+ bcf_sr_t *reader = &args->files->readers[0];
+ for (i=0; i<bcf_hdr_nsamples(reader->header); i++)
+ printf("%s\n", reader->header->samples[i]);
+}
+
+static char **copy_header(bcf_hdr_t *hdr, char **src, int nsrc)
+{
+ char **dst = (char**) malloc(sizeof(char*)*nsrc);
+ int i;
+ for (i=0; i<nsrc; i++) dst[i] = strdup(src[i]);
+ return dst;
+}
+static int compare_header(bcf_hdr_t *hdr, char **a, int na, char **b, int nb)
+{
+ if ( na!=nb ) return na-nb;
+ int i;
+ for (i=0; i<na; i++)
+ if ( strcmp(a[i],b[i]) ) return 1;
+ return 0;
+}
+
+
+static void usage(void)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Extracts fields from VCF/BCF file and prints them in user-defined format\n");
+ fprintf(pysamerr, "Usage: bcftools query [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Options:\n");
+ fprintf(pysamerr, " -c, --collapse <string> collapse lines with duplicate positions for <snps|indels|both|all|some|none>, see man page [none]\n");
+ fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(pysamerr, " -f, --format <string> see man page for details\n");
+ fprintf(pysamerr, " -H, --print-header print header\n");
+ fprintf(pysamerr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
+ fprintf(pysamerr, " -l, --list-samples print the list of samples and exit\n");
+ fprintf(pysamerr, " -o, --output-file <file> output file name [stdout]\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " -s, --samples <list> list of samples to include\n");
+ fprintf(pysamerr, " -S, --samples-file <file> file of samples to include\n");
+ fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -u, --allow-undef-tags print \".\" for undefined tags\n");
+ fprintf(pysamerr, " -v, --vcf-list <file> process multiple VCFs listed in the file\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Examples:\n");
+ fprintf(pysamerr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_vcfquery(int argc, char *argv[])
+{
+ int c, collapse = 0;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ int regions_is_file = 0, targets_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"help",0,0,'h'},
+ {"list-samples",0,0,'l'},
+ {"include",1,0,'i'},
+ {"exclude",1,0,'e'},
+ {"format",1,0,'f'},
+ {"output-file",1,0,'o'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {"annots",1,0,'a'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
+ {"print-header",0,0,'H'},
+ {"collapse",1,0,'c'},
+ {"vcf-list",1,0,'v'},
+ {"allow-undef-tags",0,0,'u'},
+ {0,0,0,0}
+ };
+ while ((c = getopt_long(argc, argv, "hlr:R:f:a:s:S:Ht:T:c:v:i:e:o:u",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'o': args->fn_out = optarg; break;
+ case 'f': args->format_str = strdup(optarg); break;
+ case 'H': args->print_header = 1; break;
+ case 'v': args->vcf_list = optarg; break;
+ case 'c':
+ if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME;
+ else error("The --collapse string \"%s\" not recognised.\n", optarg);
+ break;
+ case 'a':
+ {
+ kstring_t str = {0,0,0};
+ kputs("%CHROM\t%POS\t%MASK\t%REF\t%ALT\t%", &str);
+ char *p = optarg;
+ while ( *p )
+ {
+ if ( *p==',' )
+ kputs("\t%", &str);
+ else
+ kputc(*p, &str);
+ p++;
+ }
+ kputc('\n', &str);
+ args->format_str = str.s;
+ break;
+ }
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'l': args->list_columns = 1; break;
+ case 'u': args->allow_undef_tags = 1; break;
+ case 's': args->sample_list = optarg; break;
+ case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";
+ }
+ else fname = argv[optind];
+
+ if ( args->list_columns )
+ {
+ if ( !fname ) error("Missing the VCF file name\n");
+ args->files = bcf_sr_init();
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+ list_columns(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+ }
+
+ if ( !args->format_str ) usage();
+ args->out = args->fn_out ? fopen(args->fn_out, "w") : stdout;
+ if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
+
+ if ( !args->vcf_list )
+ {
+ if ( !fname ) usage();
+ args->files = bcf_sr_init();
+ args->files->collapse = collapse;
+ if ( optind+1 < argc ) args->files->require_index = 1;
+ if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ while ( fname )
+ {
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+ fname = ++optind < argc ? argv[optind] : NULL;
+ }
+ init_data(args);
+ query_vcf(args);
+ free(args->format_str);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ fclose(args->out);
+ free(args);
+ return 0;
+ }
+
+ // multiple VCFs
+ int i, k, nfiles, prev_nsamples = 0;
+ char **fnames, **prev_samples = NULL;
+ fnames = hts_readlist(args->vcf_list, 1, &nfiles);
+ if ( !nfiles ) error("No files in %s?\n", args->vcf_list);
+ for (i=0; i<nfiles; i++)
+ {
+ args->files = bcf_sr_init();
+ args->files->collapse = collapse;
+ if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ if ( optind < argc ) args->files->require_index = 1;
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( !bcf_sr_add_reader(args->files, fnames[i]) ) error("Failed to open %s: %s\n", fnames[i],bcf_sr_strerror(args->files->errnum));
+ for (k=optind; k<argc; k++)
+ if ( !bcf_sr_add_reader(args->files, argv[k]) ) error("Failed to open %s: %s\n", argv[k],bcf_sr_strerror(args->files->errnum));
+ init_data(args);
+ if ( i==0 )
+ prev_samples = copy_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header));
+ else
+ {
+ args->print_header = 0;
+ if ( compare_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header), prev_samples, prev_nsamples) )
+ error("Different samples in %s and %s\n", fnames[i-1],fnames[i]);
+ }
+ query_vcf(args);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ }
+ fclose(args->out);
+ destroy_list(fnames, nfiles);
+ destroy_list(prev_samples, prev_nsamples);
+ free(args->format_str);
+ free(args);
+ return 0;
+}
+
+
diff --git a/bcftools/vcfroh.c b/bcftools/vcfroh.c
new file mode 100644
index 0000000..fa64b79
--- /dev/null
+++ b/bcftools/vcfroh.c
@@ -0,0 +1,794 @@
+/* vcfroh.c -- HMM model for detecting runs of autozygosity.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/kstring.h>
+#include <htslib/kseq.h>
+#include "bcftools.h"
+#include "HMM.h"
+
+#define STATE_HW 0 // normal state, follows Hardy-Weinberg allele frequencies
+#define STATE_AZ 1 // autozygous state
+
+/** Genetic map */
+typedef struct
+{
+ int pos;
+ double rate;
+}
+genmap_t;
+
+typedef struct _args_t
+{
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr;
+ double t2AZ, t2HW; // P(AZ|HW) and P(HW|AZ) parameters
+ double unseen_PL, dflt_AF;
+
+ char *genmap_fname;
+ genmap_t *genmap;
+ int ngenmap, mgenmap, igenmap;
+ double rec_rate; // constant recombination rate if > 0
+
+ hmm_t *hmm;
+ double *eprob; // emission probs [2*nsites,msites]
+ uint32_t *sites; // positions [nsites,msites]
+ int nsites, msites;
+ int nrids, *rids, *rid_offs; // multiple chroms with vi_training
+
+ int32_t *itmp;
+ int nitmp, mitmp;
+ float *AFs;
+ int mAFs;
+
+ double pl2p[256], *pdg;
+ int32_t skip_rid, prev_rid, prev_pos;
+
+    int ntot, nused;            // some stats to check that things did not go awfully wrong
+ int ismpl, nsmpl; // index of query sample
+ char *estimate_AF, *sample; // list of samples for AF estimate and query sample
+ char **argv, *targets_list, *regions_list, *af_fname, *af_tag;
+ int argc, fake_PLs, snps_only, vi_training;
+}
+args_t;
+
+void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
+void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
+
+void *smalloc(size_t size)
+{
+ void *mem = malloc(size);
+ if ( !mem ) error("malloc: Could not allocate %d bytes\n", (int)size);
+ return mem;
+}
+
+static void init_data(args_t *args)
+{
+ args->prev_rid = args->skip_rid = -1;
+ args->hdr = args->files->readers[0].header;
+
+ if ( !args->sample )
+ {
+ if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n");
+ args->sample = strdup(args->hdr->samples[0]);
+ }
+ if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");
+
+ // Set samples
+ kstring_t str = {0,0,0};
+ if ( args->estimate_AF && strcmp("-",args->estimate_AF) )
+ {
+ int i, n;
+ char **smpls = hts_readlist(args->estimate_AF, 1, &n);
+
+ // Make sure the query sample is included
+ for (i=0; i<n; i++)
+ if ( !strcmp(args->sample,smpls[i]) ) break;
+
+ // Add the query sample if not present
+ if ( i!=n ) kputs(args->sample, &str);
+
+ for (i=0; i<n; i++)
+ {
+ if ( str.l ) kputc(',', &str);
+ kputs(smpls[i], &str);
+ free(smpls[i]);
+ }
+ free(smpls);
+ }
+ else if ( !args->estimate_AF )
+ kputs(args->sample, &str);
+
+ if ( str.l )
+ {
+ int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
+ if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
+ else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
+ }
+
+ if ( args->af_tag )
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) )
+ error("No such INFO tag in the VCF: %s\n", args->af_tag);
+
+ args->nsmpl = bcf_hdr_nsamples(args->hdr);
+ args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample);
+ free(str.s);
+
+ int i;
+ for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.);
+
+ // Init transition matrix and HMM
+ double tprob[4];
+ MAT(tprob,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
+ MAT(tprob,2,STATE_HW,STATE_AZ) = args->t2HW;
+ MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ;
+ MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
+
+ if ( args->genmap_fname )
+ {
+ args->hmm = hmm_init(2, tprob, 0);
+ hmm_set_tprob_func(args->hmm, set_tprob_genmap, args);
+ }
+ else if ( args->rec_rate > 0 )
+ {
+ args->hmm = hmm_init(2, tprob, 0);
+ hmm_set_tprob_func(args->hmm, set_tprob_recrate, args);
+
+ }
+ else
+ args->hmm = hmm_init(2, tprob, 10000);
+
+ // print header
+ printf("# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
+ printf("# The command line was:\tbcftools %s", args->argv[0]);
+ for (i=1; i<args->argc; i++)
+ printf(" %s",args->argv[i]);
+ printf("\n#\n");
+ printf("# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
+}
+
+static void destroy_data(args_t *args)
+{
+ free(args->sites);
+ free(args->eprob);
+ free(args->sample);
+ free(args->rids);
+ free(args->rid_offs);
+ hmm_destroy(args->hmm);
+ bcf_sr_destroy(args->files);
+ free(args->itmp); free(args->AFs); free(args->pdg);
+ free(args->genmap);
+}
+
+static int load_genmap(args_t *args, bcf1_t *line)
+{
+ if ( !args->genmap_fname ) { args->ngenmap = 0; return 0; }
+
+ kstring_t str = {0,0,0};
+ char *fname = strstr(args->genmap_fname,"{CHROM}");
+ if ( fname )
+ {
+ kputsn(args->genmap_fname, fname - args->genmap_fname, &str);
+ kputs(bcf_seqname(args->hdr,line), &str);
+ kputs(fname+7,&str);
+ fname = str.s;
+ }
+ else
+ fname = args->genmap_fname;
+
+ htsFile *fp = hts_open(fname, "rb");
+ if ( !fp )
+ {
+ args->ngenmap = 0;
+ return -1;
+ }
+
+ hts_getline(fp, KS_SEP_LINE, &str);
+ if ( strcmp(str.s,"position COMBINED_rate(cM/Mb) Genetic_Map(cM)") )
+        error("Unexpected header in %s, found:\n\t[%s], but expected:\n\t[position COMBINED_rate(cM/Mb) Genetic_Map(cM)]\n", fname, str.s);
+
+ args->ngenmap = args->igenmap = 0;
+ while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+ {
+ args->ngenmap++;
+ hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap);
+ genmap_t *gm = &args->genmap[args->ngenmap-1];
+
+ char *tmp, *end;
+ gm->pos = strtol(str.s, &tmp, 10);
+ if ( str.s==tmp ) error("Could not parse %s: %s\n", fname, str.s);
+
+ // skip second column
+ tmp++;
+ while ( *tmp && !isspace(*tmp) ) tmp++;
+
+ // read the genetic map in cM
+ gm->rate = strtod(tmp+1, &end);
+ if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s);
+ }
+ if ( !args->ngenmap ) error("Genetic map empty?\n");
+ int i;
+ for (i=0; i<args->ngenmap; i++) args->genmap[i].rate /= args->genmap[args->ngenmap-1].rate; // scale to 1
+ if ( hts_close(fp) ) error("Close failed\n");
+ free(str.s);
+ return 0;
+}
+
+static double get_genmap_rate(args_t *args, int start, int end)
+{
+ // position i to be equal to or smaller than start
+ int i = args->igenmap;
+ if ( args->genmap[i].pos > start )
+ {
+ while ( i>0 && args->genmap[i].pos > start ) i--;
+ }
+ else
+ {
+ while ( i+1<args->ngenmap && args->genmap[i+1].pos < start ) i++;
+ }
+    // position j to be equal to or larger than end
+ int j = i;
+ while ( j+1<args->ngenmap && args->genmap[j].pos < end ) j++;
+
+ if ( i==j )
+ {
+ args->igenmap = i;
+ return 0;
+ }
+
+ if ( start < args->genmap[i].pos ) start = args->genmap[i].pos;
+ if ( end > args->genmap[j].pos ) end = args->genmap[j].pos;
+ double rate = (args->genmap[j].rate - args->genmap[i].rate)/(args->genmap[j].pos - args->genmap[i].pos) * (end-start);
+ args->igenmap = j;
+ return rate;
+}
+
+void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
+{
+ args_t *args = (args_t*) data;
+ double ci = get_genmap_rate(args, pos - prev_pos, pos);
+ MAT(tprob,2,STATE_HW,STATE_AZ) *= ci;
+ MAT(tprob,2,STATE_AZ,STATE_HW) *= ci;
+ MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ);
+ MAT(tprob,2,STATE_HW,STATE_HW) = 1 - MAT(tprob,2,STATE_AZ,STATE_HW);
+}
+
+void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
+{
+ args_t *args = (args_t*) data;
+ double ci = (pos - prev_pos) * args->rec_rate;
+ MAT(tprob,2,STATE_HW,STATE_AZ) *= ci;
+ MAT(tprob,2,STATE_AZ,STATE_HW) *= ci;
+ MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ);
+ MAT(tprob,2,STATE_HW,STATE_HW) = 1 - MAT(tprob,2,STATE_AZ,STATE_HW);
+}
+
+
+/**
+ * This function implements the HMM model:
+ * D = Data, AZ = autozygosity, HW = Hardy-Weinberg (non-autozygosity),
+ * f = non-ref allele frequency
+ *
+ * Emission probabilities:
+ * oAZ = P_i(D|AZ) = (1-f)*P(D|RR) + f*P(D|AA)
+ * oHW = P_i(D|HW) = (1-f)^2 * P(D|RR) + f^2 * P(D|AA) + 2*f*(1-f)*P(D|RA)
+ *
+ * Transition probabilities:
+ * tAZ = P(AZ|HW) .. parameter
+ * tHW = P(HW|AZ) .. parameter
+ *
+ * ci = P_i(C) .. probability of cross-over at site i, from genetic map
+ *
+ * AZi = P_i(AZ) .. probability of site i being AZ/non-AZ, scaled so that AZi+HWi = 1
+ * HWi = P_i(HW)
+ *
+ * P_i(AZ|HW) = P(AZ|HW) * ci * HW{i-1} = tAZ * ci * (1 - AZ{i-1})
+ * P_i(HW|AZ) = P(HW|AZ) * ci * AZ{i-1} = tHW * ci * AZ{i-1}
+ * P_i(AZ|AZ) = 1 - P_i(HW|AZ)
+ * P_i(HW|HW) = 1 - P_i(AZ|HW)
+ *
+ */
+
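/* Illustrative sketch, not the upstream implementation: the emission
 * probabilities defined in the comment above map directly to code. Assuming
 * pdg[] holds the genotype likelihoods P(D|RR), P(D|RA), P(D|AA) (derived
 * from PLs via pl2p[]) and f is the ALT allele frequency set by parse_line()
 * below, the two per-site emissions would be
 *
 *     double oAZ = (1-f)*pdg[0] + f*pdg[2];
 *     double oHW = (1-f)*(1-f)*pdg[0] + 2*f*(1-f)*pdg[1] + f*f*pdg[2];
 */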
+static void flush_viterbi(args_t *args)
+{
+ int i,j;
+
+ if ( !args->nsites ) return;
+
+ if ( !args->vi_training )
+ {
+        // single viterbi pass, one chromosome
+ hmm_run_viterbi(args->hmm, args->nsites, args->eprob, args->sites);
+ hmm_run_fwd_bwd(args->hmm, args->nsites, args->eprob, args->sites);
+ double *fwd = hmm_get_fwd_bwd_prob(args->hmm);
+
+ const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
+ uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
+ for (i=0; i<args->nsites; i++)
+ {
+ int state = vpath[i*2]==STATE_AZ ? 1 : 0;
+ double *pval = fwd + i*2;
+ printf("%s\t%d\t%d\t%.1f\n", chr,args->sites[i]+1, state, phred_score(1.0-pval[state]));
+ }
+ return;
+ }
+
+ // viterbi training, multiple chromosomes
+ double t2az_prev, t2hw_prev;
+ double deltaz, delthw;
+ int niter = 0;
+ do
+ {
+ double *tprob_arr = hmm_get_tprob(args->hmm);
+ t2az_prev = MAT(tprob_arr,2,1,0); //args->t2AZ;
+ t2hw_prev = MAT(tprob_arr,2,0,1); //args->t2HW;
+ double tcounts[] = { 0,0,0,0 };
+ for (i=0; i<args->nrids; i++)
+ {
+            // run viterbi for each chromosome. eprob and sites contain
+ // multiple chromosomes, rid_offs mark the boundaries
+ int ioff = args->rid_offs[i];
+ int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
+ hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
+
+ // what transitions were observed: add to the total counts
+ uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
+ for (j=1; j<nsites; j++)
+ {
+ // count the number of transitions
+ int prev_state = vpath[2*(j-1)];
+ int curr_state = vpath[2*j];
+ MAT(tcounts,2,curr_state,prev_state) += 1;
+ }
+ }
+
+ // update the transition matrix tprob
+ for (i=0; i<2; i++)
+ {
+ int n = 0;
+ for (j=0; j<2; j++) n += MAT(tcounts,2,i,j);
+ if ( !n) error("fixme: state %d not observed\n", i+1);
+ for (j=0; j<2; j++) MAT(tcounts,2,i,j) /= n;
+ }
+ if ( args->genmap_fname || args->rec_rate > 0 )
+ hmm_set_tprob(args->hmm, tcounts, 0);
+ else
+ hmm_set_tprob(args->hmm, tcounts, 10000);
+
+ tprob_arr = hmm_get_tprob(args->hmm);
+ deltaz = fabs(MAT(tprob_arr,2,1,0)-t2az_prev);
+ delthw = fabs(MAT(tprob_arr,2,0,1)-t2hw_prev);
+ niter++;
+
+ fprintf(stderr,"%d: %f %f\n", niter,deltaz,delthw);
+ }
+ while ( deltaz > 0.0 || delthw > 0.0 );
+ fprintf(stderr, "Viterbi training converged in %d iterations to", niter);
+ double *tprob_arr = hmm_get_tprob(args->hmm);
+ for (i=0; i<2; i++) for (j=0; j<2; j++) fprintf(stderr, " %f", MAT(tprob_arr,2,i,j));
+ fprintf(stderr, "\n");
+
+ // output the results
+ for (i=0; i<args->nrids; i++)
+ {
+ int ioff = args->rid_offs[i];
+ int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
+ hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
+ uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
+
+ const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]);
+ for (j=0; j<nsites; j++)
+ {
+ printf("%s\t%d\t%d\t..\n", chr,args->sites[ioff+j]+1,vpath[j*2]==STATE_AZ ? 1 : 0);
+ }
+ }
+}
+
+static void push_rid(args_t *args, int rid)
+{
+ args->nrids++;
+ args->rids = (int*) realloc(args->rids, args->nrids*sizeof(int));
+ args->rid_offs = (int*) realloc(args->rid_offs, args->nrids*sizeof(int));
+ args->rids[ args->nrids-1 ] = rid;
+ args->rid_offs[ args->nrids-1 ] = args->nsites;
+}
+
+int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
+{
+ if ( tgt->nals != line->n_allele ) return -1; // number of alleles does not match
+
+ int i;
+ for (i=0; i<tgt->nals; i++)
+ if ( strcmp(line->d.allele[i],tgt->als[i]) ) break; // we could be smarter, see vcmp
+ if ( i<tgt->nals ) return -1;
+
+ char *tmp, *str = tgt->line.s;
+ i = 0;
+ while ( *str && i<3 )
+ {
+ if ( *str=='\t' ) i++;
+ str++;
+ }
+ *alt_freq = strtod(str, &tmp);
+ if ( *tmp && !isspace(*tmp) )
+ {
+ if ( str[0]=='.' && (!str[1] || isspace(str[1])) ) return -1; // missing value
+ error("Could not parse: [%s]\n", tgt->line.s);
+ }
+ if ( *alt_freq<0 || *alt_freq>1 ) error("Could not parse AF: [%s]\n", tgt->line.s);
+ return 0;
+}
+
+int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq)
+{
+ if ( !args->nitmp )
+ {
+ args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
+ if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid?
+ args->nitmp /= args->nsmpl;
+ }
+
+ int i, nalt = 0, nref = 0;
+ for (i=0; i<args->nsmpl; i++)
+ {
+ int32_t *gt = &args->itmp[i*args->nitmp];
+
+ if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+
+ if ( bcf_gt_allele(gt[0]) ) nalt++;
+ else nref++;
+
+ if ( bcf_gt_allele(gt[1]) ) nalt++;
+ else nref++;
+ }
+ if ( !nalt && !nref ) return -1;
+
+ *alt_freq = (double)nalt / (nalt + nref);
+ return 0;
+}
+
+
+int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
+{
+ args->nitmp = 0;
+
+ // Set allele frequency
+ int ret;
+ if ( args->af_tag )
+ {
+ // Use an INFO tag provided by the user
+ ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs);
+ if ( ret==1 )
+ *alt_freq = args->AFs[0];
+ if ( ret==-2 )
+ error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1);
+ }
+ else if ( args->af_fname )
+ {
+ // Read AF from a file
+ ret = read_AF(args->files->targets, line, alt_freq);
+ }
+ else
+ {
+ // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF
+ ret = -1;
+ if ( !args->estimate_AF )
+ {
+ int AC = -1, AN = 0;
+ ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
+ if ( ret==1 )
+ {
+ AN = args->itmp[0];
+ ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
+ if ( ret>0 )
+ AC = args->itmp[0];
+ }
+ if ( AN<=0 || AC<0 )
+ ret = -1;
+ else
+ *alt_freq = (double) AC/AN;
+ }
+ if ( ret==-1 )
+ ret = estimate_AF(args, line, alt_freq); // reads GTs into args->itmp
+ }
+
+ if ( ret<0 ) return ret;
+ if ( *alt_freq==0.0 )
+ {
+ if ( args->dflt_AF==0 ) return -1; // we skip sites with AF=0
+ *alt_freq = args->dflt_AF;
+ }
+
+ // Set P(D|G)
+ if ( args->fake_PLs )
+ {
+ if ( !args->nitmp )
+ {
+ args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
+ if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid?
+ args->nitmp /= args->nsmpl;
+ }
+
+ int32_t *gt = &args->itmp[args->ismpl*args->nitmp];
+ if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1;
+
+ int a = bcf_gt_allele(gt[0]);
+ int b = bcf_gt_allele(gt[1]);
+ if ( a!=b )
+ {
+ pdg[0] = pdg[2] = args->unseen_PL;
+ pdg[1] = 1 - 2*args->unseen_PL;
+ }
+ else if ( a==0 )
+ {
+ pdg[0] = 1 - 2*args->unseen_PL;
+ pdg[1] = pdg[2] = args->unseen_PL;
+ }
+ else
+ {
+ pdg[0] = pdg[1] = args->unseen_PL;
+ pdg[2] = 1 - 2*args->unseen_PL;
+ }
+ }
+ else
+ {
+ args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp);
+ if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2. ) return -1; // not diploid?
+ args->nitmp /= args->nsmpl;
+
+ int32_t *pl = &args->itmp[args->ismpl*args->nitmp];
+ pdg[0] = pl[0] < 256 ? args->pl2p[ pl[0] ] : 1.0;
+ pdg[1] = pl[1] < 256 ? args->pl2p[ pl[1] ] : 1.0;
+ pdg[2] = pl[2] < 256 ? args->pl2p[ pl[2] ] : 1.0;
+
+ double sum = pdg[0] + pdg[1] + pdg[2];
+ if ( !sum ) return -1;
+ pdg[0] /= sum;
+ pdg[1] /= sum;
+ pdg[2] /= sum;
+ }
+
+ return 0;
+}
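
The PL branch above boils down to pdg[i] = 10^(-PL[i]/10) followed by normalisation. A tiny standalone sketch with invented PL values (again, not upstream code) shows the effect:

    /* pl2p.c -- PL to normalised genotype likelihoods; compile with -lm */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        int pl[3] = { 0, 30, 50 };   /* invented PLs for RR, RA, AA */
        double pdg[3], sum = 0;
        int i;
        for (i=0; i<3; i++) { pdg[i] = pow(10., -pl[i]/10.); sum += pdg[i]; }
        for (i=0; i<3; i++) pdg[i] /= sum;
        printf("%g %g %g\n", pdg[0], pdg[1], pdg[2]);   /* ~0.999 ~0.000999 ~1e-05 */
        return 0;
    }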
+
+static void vcfroh(args_t *args, bcf1_t *line)
+{
+ // Are we done?
+ if ( !line )
+ {
+ flush_viterbi(args);
+ return;
+ }
+ args->ntot++;
+
+ // Skip unwanted lines
+ if ( line->rid == args->skip_rid ) return;
+ if ( line->n_allele==1 ) return; // no ALT allele
+ if ( line->n_allele!=2 ) return; // only biallelic sites
+ if ( args->snps_only && !bcf_is_snp(line) ) return;
+
+ // Initialize genetic map
+ int skip_rid = 0;
+ if ( args->prev_rid<0 )
+ {
+ args->prev_rid = line->rid;
+ args->prev_pos = line->pos;
+ skip_rid = load_genmap(args, line);
+ if ( !skip_rid && args->vi_training ) push_rid(args, line->rid);
+ }
+
+ // New chromosome?
+ if ( args->prev_rid!=line->rid )
+ {
+ skip_rid = load_genmap(args, line);
+ if ( args->vi_training )
+ {
+ if ( !skip_rid ) push_rid(args, line->rid);
+ }
+ else
+ {
+ flush_viterbi(args);
+ args->nsites = 0;
+ }
+ args->prev_rid = line->rid;
+ args->prev_pos = line->pos;
+ }
+
+ if ( skip_rid )
+ {
+ fprintf(stderr,"Skipping the sequence, no genmap for %s\n", bcf_seqname(args->hdr,line));
+ args->skip_rid = line->rid;
+ return;
+ }
+ if ( args->prev_pos > line->pos ) error("The file is not sorted?!\n");
+
+ args->prev_rid = line->rid;
+ args->prev_pos = line->pos;
+
+
+ // Ready for the new site
+ int m = args->msites;
+ hts_expand(uint32_t,args->nsites+1,args->msites,args->sites);
+ if ( args->msites!=m )
+ args->eprob = (double*) realloc(args->eprob,sizeof(double)*args->msites*2);
+
+ // Set likelihoods and alternate allele frequencies
+ double alt_freq, pdg[3];
+ if ( parse_line(args, line, &alt_freq, pdg)<0 ) return; // something went wrong
+
+ args->nused++;
+
+ // Calculate emission probabilities P(D|AZ) and P(D|HW)
+ double *eprob = &args->eprob[2*args->nsites];
+ eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq;
+ eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq;
+
+ args->sites[args->nsites] = line->pos;
+ args->nsites++;
+}
+
+static void usage(args_t *args)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: HMM model for detecting runs of autozygosity.\n");
+ fprintf(stderr, "Usage: bcftools roh [options] <in.vcf.gz>\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "General Options:\n");
+ fprintf(stderr, " --AF-dflt <float> if AF is not known, use this allele frequency [skip]\n");
+ fprintf(stderr, " --AF-tag <TAG> use TAG for allele frequency\n");
+ fprintf(stderr, " --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
+ fprintf(stderr, " -e, --estimate-AF <file> calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in <file>\n");
+ fprintf(stderr, " -G, --GTs-only <float> use GTs, ignore PLs, use <float> for PL of unseen genotypes. Safe value to use is 30 to account for GT errors.\n");
+ fprintf(stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n");
+ fprintf(stderr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\" is replaced with chromosome name\n");
+ fprintf(stderr, " -M, --rec-rate <float> constant recombination rate per bp\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " -s, --sample <sample> sample to analyze\n");
+ fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "HMM Options:\n");
+ fprintf(stderr, " -a, --hw-to-az <float> P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n");
+ fprintf(stderr, " -H, --az-to-hw <float> P(HW|AZ) transition probability from AZ to HW state [5e-9]\n");
+ fprintf(stderr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
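
Putting the options together, a plausible invocation (the file and sample names are placeholders, not taken from the sources) would be:

    bcftools roh -s SampleA --AF-tag AF -G 30 input.vcf.gz > SampleA.roh.txt

The output then carries the four columns announced in the header printed by init_data(): chromosome, position, state (0:HW, 1:AZ) and quality.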
+
+int main_vcfroh(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->files = bcf_sr_init();
+ args->t2AZ = 6.7e-8;
+ args->t2HW = 5e-9;
+ args->rec_rate = 0;
+ int regions_is_file = 0, targets_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"AF-tag",1,0,0},
+ {"AF-file",1,0,1},
+ {"AF-dflt",1,0,2},
+ {"estimate-AF",1,0,'e'},
+ {"GTs-only",1,0,'G'},
+ {"sample",1,0,'s'},
+ {"hw-to-az",1,0,'a'},
+ {"az-to-hw",1,0,'H'},
+ {"viterbi-training",0,0,'V'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"genetic-map",1,0,'m'},
+ {"rec-rate",1,0,'M'},
+ {"skip-indels",0,0,'I'},
+ {0,0,0,0}
+ };
+
+ int naf_opts = 0;
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:m:M:G:Ia:e:V",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 0: args->af_tag = optarg; naf_opts++; break;
+ case 1: args->af_fname = optarg; naf_opts++; break;
+ case 2:
+ args->dflt_AF = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
+ break;
+ case 'e': args->estimate_AF = optarg; naf_opts++; break;
+ case 'I': args->snps_only = 1; break;
+ case 'G':
+ args->fake_PLs = 1;
+ args->unseen_PL = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -G %s\n", optarg);
+ args->unseen_PL = pow(10,-args->unseen_PL/10.);
+ break;
+ case 'm': args->genmap_fname = optarg; break;
+ case 'M':
+ args->rec_rate = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -M %s\n", optarg);
+ break;
+ case 's': args->sample = strdup(optarg); break;
+ case 'a':
+ args->t2AZ = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -a %s\n", optarg);
+ break;
+ case 'H':
+ args->t2HW = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -H %s\n", optarg);
+ break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 'V': args->vi_training = 1; break;
+ case 'h':
+ case '?': usage(args); break;
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ if ( argc<optind+1 ) usage(args);
+ if ( args->t2AZ<0 || args->t2AZ>1 ) error("Error: The parameter --hw-to-az is not in [0,1]: %e\n", args->t2AZ);
+ if ( args->t2HW<0 || args->t2HW>1 ) error("Error: The parameter --az-to-hw is not in [0,1]: %e\n", args->t2HW);
+ if ( naf_opts>1 ) error("Error: The options --AF-tag, --AF-file and -e are mutually exclusive\n");
+ if ( args->af_fname && args->targets_list ) error("Error: The options --AF-file and -t are mutually exclusive\n");
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( args->af_fname )
+ {
+ if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 )
+ error("Failed to read the targets: %s\n", args->af_fname);
+ }
+ if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
+
+ init_data(args);
+ while ( bcf_sr_next_line(args->files) )
+ {
+ vcfroh(args, args->files->readers[0].buffer[0]);
+ }
+ vcfroh(args, NULL);
+ fprintf(stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused);
+ destroy_data(args);
+ free(args);
+ return 0;
+}
+
+
diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c
new file mode 100644
index 0000000..92a9a4f
--- /dev/null
+++ b/bcftools/vcfroh.c.pysam.c
@@ -0,0 +1,796 @@
+#include "pysam.h"
+
+/* vcfroh.c -- HMM model for detecting runs of autozygosity.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/kstring.h>
+#include <htslib/kseq.h>
+#include "bcftools.h"
+#include "HMM.h"
+
+#define STATE_HW 0 // normal state, follows Hardy-Weinberg allele frequencies
+#define STATE_AZ 1 // autozygous state
+
+/** Genetic map */
+typedef struct
+{
+ int pos;
+ double rate;
+}
+genmap_t;
+
+typedef struct _args_t
+{
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr;
+ double t2AZ, t2HW; // P(AZ|HW) and P(HW|AZ) parameters
+ double unseen_PL, dflt_AF;
+
+ char *genmap_fname;
+ genmap_t *genmap;
+ int ngenmap, mgenmap, igenmap;
+ double rec_rate; // constant recombination rate if > 0
+
+ hmm_t *hmm;
+ double *eprob; // emission probs [2*nsites,msites]
+ uint32_t *sites; // positions [nsites,msites]
+ int nsites, msites;
+ int nrids, *rids, *rid_offs; // multiple chroms with vi_training
+
+ int32_t *itmp;
+ int nitmp, mitmp;
+ float *AFs;
+ int mAFs;
+
+ double pl2p[256], *pdg;
+ int32_t skip_rid, prev_rid, prev_pos;
+
+ int ntot, nused; // some stats to detect if things didn't go awfully wrong
+ int ismpl, nsmpl; // index of query sample
+ char *estimate_AF, *sample; // list of samples for AF estimate and query sample
+ char **argv, *targets_list, *regions_list, *af_fname, *af_tag;
+ int argc, fake_PLs, snps_only, vi_training;
+}
+args_t;
+
+void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
+void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
+
+void *smalloc(size_t size)
+{
+ void *mem = malloc(size);
+ if ( !mem ) error("malloc: Could not allocate %d bytes\n", (int)size);
+ return mem;
+}
+
+static void init_data(args_t *args)
+{
+ args->prev_rid = args->skip_rid = -1;
+ args->hdr = args->files->readers[0].header;
+
+ if ( !args->sample )
+ {
+ if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n");
+ args->sample = strdup(args->hdr->samples[0]);
+ }
+ if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");
+
+ // Set samples
+ kstring_t str = {0,0,0};
+ if ( args->estimate_AF && strcmp("-",args->estimate_AF) )
+ {
+ int i, n;
+ char **smpls = hts_readlist(args->estimate_AF, 1, &n);
+
+ // Make sure the query sample is included
+ for (i=0; i<n; i++)
+ if ( !strcmp(args->sample,smpls[i]) ) break;
+
+ // Add the query sample if not present
+ if ( i!=n ) kputs(args->sample, &str);
+
+ for (i=0; i<n; i++)
+ {
+ if ( str.l ) kputc(',', &str);
+ kputs(smpls[i], &str);
+ free(smpls[i]);
+ }
+ free(smpls);
+ }
+ else if ( !args->estimate_AF )
+ kputs(args->sample, &str);
+
+ if ( str.l )
+ {
+ int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
+ if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
+ else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
+ }
+
+ if ( args->af_tag )
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) )
+ error("No such INFO tag in the VCF: %s\n", args->af_tag);
+
+ args->nsmpl = bcf_hdr_nsamples(args->hdr);
+ args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample);
+ free(str.s);
+
+ int i;
+ for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.);
+
+ // Init transition matrix and HMM
+ double tprob[4];
+ MAT(tprob,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
+ MAT(tprob,2,STATE_HW,STATE_AZ) = args->t2HW;
+ MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ;
+ MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
+
+ if ( args->genmap_fname )
+ {
+ args->hmm = hmm_init(2, tprob, 0);
+ hmm_set_tprob_func(args->hmm, set_tprob_genmap, args);
+ }
+ else if ( args->rec_rate > 0 )
+ {
+ args->hmm = hmm_init(2, tprob, 0);
+ hmm_set_tprob_func(args->hmm, set_tprob_recrate, args);
+
+ }
+ else
+ args->hmm = hmm_init(2, tprob, 10000);
+
+ // print header
+ printf("# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
+ printf("# The command line was:\tbcftools %s", args->argv[0]);
+ for (i=1; i<args->argc; i++)
+ printf(" %s",args->argv[i]);
+ printf("\n#\n");
+ printf("# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
+}
+
+static void destroy_data(args_t *args)
+{
+ free(args->sites);
+ free(args->eprob);
+ free(args->sample);
+ free(args->rids);
+ free(args->rid_offs);
+ hmm_destroy(args->hmm);
+ bcf_sr_destroy(args->files);
+ free(args->itmp); free(args->AFs); free(args->pdg);
+ free(args->genmap);
+}
+
+static int load_genmap(args_t *args, bcf1_t *line)
+{
+ if ( !args->genmap_fname ) { args->ngenmap = 0; return 0; }
+
+ kstring_t str = {0,0,0};
+ char *fname = strstr(args->genmap_fname,"{CHROM}");
+ if ( fname )
+ {
+ kputsn(args->genmap_fname, fname - args->genmap_fname, &str);
+ kputs(bcf_seqname(args->hdr,line), &str);
+ kputs(fname+7,&str);
+ fname = str.s;
+ }
+ else
+ fname = args->genmap_fname;
+
+ htsFile *fp = hts_open(fname, "rb");
+ if ( !fp )
+ {
+ args->ngenmap = 0;
+ return -1;
+ }
+
+ hts_getline(fp, KS_SEP_LINE, &str);
+ if ( strcmp(str.s,"position COMBINED_rate(cM/Mb) Genetic_Map(cM)") )
+ error("Unexpected header, found:\n\t[%s], but expected:\n\t[position COMBINED_rate(cM/Mb) Genetic_Map(cM)]\n", fname, str.s);
+
+ args->ngenmap = args->igenmap = 0;
+ while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+ {
+ args->ngenmap++;
+ hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap);
+ genmap_t *gm = &args->genmap[args->ngenmap-1];
+
+ char *tmp, *end;
+ gm->pos = strtol(str.s, &tmp, 10);
+ if ( str.s==tmp ) error("Could not parse %s: %s\n", fname, str.s);
+
+ // skip second column
+ tmp++;
+ while ( *tmp && !isspace(*tmp) ) tmp++;
+
+ // read the genetic map in cM
+ gm->rate = strtod(tmp+1, &end);
+ if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s);
+ }
+ if ( !args->ngenmap ) error("Genetic map empty?\n");
+ int i;
+ for (i=0; i<args->ngenmap; i++) args->genmap[i].rate /= args->genmap[args->ngenmap-1].rate; // scale to 1
+ if ( hts_close(fp) ) error("Close failed\n");
+ free(str.s);
+ return 0;
+}
+
+static double get_genmap_rate(args_t *args, int start, int end)
+{
+ // position i to be equal to or smaller than start
+ int i = args->igenmap;
+ if ( args->genmap[i].pos > start )
+ {
+ while ( i>0 && args->genmap[i].pos > start ) i--;
+ }
+ else
+ {
+ while ( i+1<args->ngenmap && args->genmap[i+1].pos < start ) i++;
+ }
+ // position j to be equal or larger than end
+ int j = i;
+ while ( j+1<args->ngenmap && args->genmap[j].pos < end ) j++;
+
+ if ( i==j )
+ {
+ args->igenmap = i;
+ return 0;
+ }
+
+ if ( start < args->genmap[i].pos ) start = args->genmap[i].pos;
+ if ( end > args->genmap[j].pos ) end = args->genmap[j].pos;
+ double rate = (args->genmap[j].rate - args->genmap[i].rate)/(args->genmap[j].pos - args->genmap[i].pos) * (end-start);
+ args->igenmap = j;
+ return rate;
+}
+
+void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
+{
+ args_t *args = (args_t*) data;
+ double ci = get_genmap_rate(args, pos - prev_pos, pos);
+ MAT(tprob,2,STATE_HW,STATE_AZ) *= ci;
+ MAT(tprob,2,STATE_AZ,STATE_HW) *= ci;
+ MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ);
+ MAT(tprob,2,STATE_HW,STATE_HW) = 1 - MAT(tprob,2,STATE_AZ,STATE_HW);
+}
+
+void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
+{
+ args_t *args = (args_t*) data;
+ double ci = (pos - prev_pos) * args->rec_rate;
+ MAT(tprob,2,STATE_HW,STATE_AZ) *= ci;
+ MAT(tprob,2,STATE_AZ,STATE_HW) *= ci;
+ MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ);
+ MAT(tprob,2,STATE_HW,STATE_HW) = 1 - MAT(tprob,2,STATE_AZ,STATE_HW);
+}
+
+
+/**
+ * This function implements the HMM model:
+ * D = Data, AZ = autozygosity, HW = Hardy-Weinberg (non-autozygosity),
+ * f = non-ref allele frequency
+ *
+ * Emission probabilities:
+ * oAZ = P_i(D|AZ) = (1-f)*P(D|RR) + f*P(D|AA)
+ * oHW = P_i(D|HW) = (1-f)^2 * P(D|RR) + f^2 * P(D|AA) + 2*f*(1-f)*P(D|RA)
+ *
+ * Transition probabilities:
+ * tAZ = P(AZ|HW) .. parameter
+ * tHW = P(HW|AZ) .. parameter
+ *
+ * ci = P_i(C) .. probability of cross-over at site i, from genetic map
+ *
+ * AZi = P_i(AZ) .. probability of site i being AZ/non-AZ, scaled so that AZi+HWi = 1
+ * HWi = P_i(HW)
+ *
+ * P_i(AZ|HW) = P(AZ|HW) * ci * HW{i-1} = tAZ * ci * (1 - AZ{i-1})
+ * P_i(HW|AZ) = P(HW|AZ) * ci * AZ{i-1} = tHW * ci * AZ{i-1}
+ * P_i(AZ|AZ) = 1 - P_i(HW|AZ)
+ * P_i(HW|HW) = 1 - P_i(AZ|HW)
+ *
+ */
+
+static void flush_viterbi(args_t *args)
+{
+ int i,j;
+
+ if ( !args->nsites ) return;
+
+ if ( !args->vi_training )
+ {
+ // single viterbi pass, one chromosome
+ hmm_run_viterbi(args->hmm, args->nsites, args->eprob, args->sites);
+ hmm_run_fwd_bwd(args->hmm, args->nsites, args->eprob, args->sites);
+ double *fwd = hmm_get_fwd_bwd_prob(args->hmm);
+
+ const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
+ uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
+ for (i=0; i<args->nsites; i++)
+ {
+ int state = vpath[i*2]==STATE_AZ ? 1 : 0;
+ double *pval = fwd + i*2;
+ printf("%s\t%d\t%d\t%.1f\n", chr,args->sites[i]+1, state, phred_score(1.0-pval[state]));
+ }
+ return;
+ }
+
+ // viterbi training, multiple chromosomes
+ double t2az_prev, t2hw_prev;
+ double deltaz, delthw;
+ int niter = 0;
+ do
+ {
+ double *tprob_arr = hmm_get_tprob(args->hmm);
+ t2az_prev = MAT(tprob_arr,2,1,0); //args->t2AZ;
+ t2hw_prev = MAT(tprob_arr,2,0,1); //args->t2HW;
+ double tcounts[] = { 0,0,0,0 };
+ for (i=0; i<args->nrids; i++)
+ {
+ // run viterbi for each chromosome. eprob and sites contain
+ // multiple chromosomes, rid_offs mark the boundaries
+ int ioff = args->rid_offs[i];
+ int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
+ hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
+
+ // what transitions were observed: add to the total counts
+ uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
+ for (j=1; j<nsites; j++)
+ {
+ // count the number of transitions
+ int prev_state = vpath[2*(j-1)];
+ int curr_state = vpath[2*j];
+ MAT(tcounts,2,curr_state,prev_state) += 1;
+ }
+ }
+
+ // update the transition matrix tprob
+ for (i=0; i<2; i++)
+ {
+ int n = 0;
+ for (j=0; j<2; j++) n += MAT(tcounts,2,i,j);
+ if ( !n) error("fixme: state %d not observed\n", i+1);
+ for (j=0; j<2; j++) MAT(tcounts,2,i,j) /= n;
+ }
+ if ( args->genmap_fname || args->rec_rate > 0 )
+ hmm_set_tprob(args->hmm, tcounts, 0);
+ else
+ hmm_set_tprob(args->hmm, tcounts, 10000);
+
+ tprob_arr = hmm_get_tprob(args->hmm);
+ deltaz = fabs(MAT(tprob_arr,2,1,0)-t2az_prev);
+ delthw = fabs(MAT(tprob_arr,2,0,1)-t2hw_prev);
+ niter++;
+
+ fprintf(pysamerr,"%d: %f %f\n", niter,deltaz,delthw);
+ }
+ while ( deltaz > 0.0 || delthw > 0.0 );
+ fprintf(pysamerr, "Viterbi training converged in %d iterations to", niter);
+ double *tprob_arr = hmm_get_tprob(args->hmm);
+ for (i=0; i<2; i++) for (j=0; j<2; j++) fprintf(pysamerr, " %f", MAT(tprob_arr,2,i,j));
+ fprintf(pysamerr, "\n");
+
+ // output the results
+ for (i=0; i<args->nrids; i++)
+ {
+ int ioff = args->rid_offs[i];
+ int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
+ hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
+ uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
+
+ const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]);
+ for (j=0; j<nsites; j++)
+ {
+ printf("%s\t%d\t%d\t..\n", chr,args->sites[ioff+j]+1,vpath[j*2]==STATE_AZ ? 1 : 0);
+ }
+ }
+}
+
+static void push_rid(args_t *args, int rid)
+{
+ args->nrids++;
+ args->rids = (int*) realloc(args->rids, args->nrids*sizeof(int));
+ args->rid_offs = (int*) realloc(args->rid_offs, args->nrids*sizeof(int));
+ args->rids[ args->nrids-1 ] = rid;
+ args->rid_offs[ args->nrids-1 ] = args->nsites;
+}
+
+int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
+{
+ if ( tgt->nals != line->n_allele ) return -1; // number of alleles does not match
+
+ int i;
+ for (i=0; i<tgt->nals; i++)
+ if ( strcmp(line->d.allele[i],tgt->als[i]) ) break; // we could be smarter, see vcmp
+ if ( i<tgt->nals ) return -1;
+
+ char *tmp, *str = tgt->line.s;
+ i = 0;
+ while ( *str && i<3 )
+ {
+ if ( *str=='\t' ) i++;
+ str++;
+ }
+ *alt_freq = strtod(str, &tmp);
+ if ( *tmp && !isspace(*tmp) )
+ {
+ if ( str[0]=='.' && (!str[1] || isspace(str[1])) ) return -1; // missing value
+ error("Could not parse: [%s]\n", tgt->line.s);
+ }
+ if ( *alt_freq<0 || *alt_freq>1 ) error("Could not parse AF: [%s]\n", tgt->line.s);
+ return 0;
+}
+
+int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq)
+{
+ if ( !args->nitmp )
+ {
+ args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
+ if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid?
+ args->nitmp /= args->nsmpl;
+ }
+
+ int i, nalt = 0, nref = 0;
+ for (i=0; i<args->nsmpl; i++)
+ {
+ int32_t *gt = &args->itmp[i*args->nitmp];
+
+ if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+
+ if ( bcf_gt_allele(gt[0]) ) nalt++;
+ else nref++;
+
+ if ( bcf_gt_allele(gt[1]) ) nalt++;
+ else nref++;
+ }
+ if ( !nalt && !nref ) return -1;
+
+ *alt_freq = (double)nalt / (nalt + nref);
+ return 0;
+}
+
+
+int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
+{
+ args->nitmp = 0;
+
+ // Set allele frequency
+ int ret;
+ if ( args->af_tag )
+ {
+ // Use an INFO tag provided by the user
+ ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs);
+ if ( ret==1 )
+ *alt_freq = args->AFs[0];
+ if ( ret==-2 )
+ error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1);
+ }
+ else if ( args->af_fname )
+ {
+ // Read AF from a file
+ ret = read_AF(args->files->targets, line, alt_freq);
+ }
+ else
+ {
+ // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF
+ ret = -1;
+ if ( !args->estimate_AF )
+ {
+ int AC = -1, AN = 0;
+ ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
+ if ( ret==1 )
+ {
+ AN = args->itmp[0];
+ ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
+ if ( ret>0 )
+ AC = args->itmp[0];
+ }
+ if ( AN<=0 || AC<0 )
+ ret = -1;
+ else
+ *alt_freq = (double) AC/AN;
+ }
+ if ( ret==-1 )
+ ret = estimate_AF(args, line, alt_freq); // reads GTs into args->itmp
+ }
+
+ if ( ret<0 ) return ret;
+ if ( *alt_freq==0.0 )
+ {
+ if ( args->dflt_AF==0 ) return -1; // we skip sites with AF=0
+ *alt_freq = args->dflt_AF;
+ }
+
+ // Set P(D|G)
+ if ( args->fake_PLs )
+ {
+ if ( !args->nitmp )
+ {
+ args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
+ if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid?
+ args->nitmp /= args->nsmpl;
+ }
+
+ int32_t *gt = &args->itmp[args->ismpl*args->nitmp];
+ if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1;
+
+ int a = bcf_gt_allele(gt[0]);
+ int b = bcf_gt_allele(gt[1]);
+ if ( a!=b )
+ {
+ pdg[0] = pdg[2] = args->unseen_PL;
+ pdg[1] = 1 - 2*args->unseen_PL;
+ }
+ else if ( a==0 )
+ {
+ pdg[0] = 1 - 2*args->unseen_PL;
+ pdg[1] = pdg[2] = args->unseen_PL;
+ }
+ else
+ {
+ pdg[0] = pdg[1] = args->unseen_PL;
+ pdg[2] = 1 - 2*args->unseen_PL;
+ }
+ }
+ else
+ {
+ args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp);
+ if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2. ) return -1; // not diploid?
+ args->nitmp /= args->nsmpl;
+
+ int32_t *pl = &args->itmp[args->ismpl*args->nitmp];
+ pdg[0] = pl[0] < 256 ? args->pl2p[ pl[0] ] : 1.0;
+ pdg[1] = pl[1] < 256 ? args->pl2p[ pl[1] ] : 1.0;
+ pdg[2] = pl[2] < 256 ? args->pl2p[ pl[2] ] : 1.0;
+
+ double sum = pdg[0] + pdg[1] + pdg[2];
+ if ( !sum ) return -1;
+ pdg[0] /= sum;
+ pdg[1] /= sum;
+ pdg[2] /= sum;
+ }
+
+ return 0;
+}
+
+static void vcfroh(args_t *args, bcf1_t *line)
+{
+ // Are we done?
+ if ( !line )
+ {
+ flush_viterbi(args);
+ return;
+ }
+ args->ntot++;
+
+ // Skip unwanted lines
+ if ( line->rid == args->skip_rid ) return;
+ if ( line->n_allele==1 ) return; // no ALT allele
+ if ( line->n_allele!=2 ) return; // only biallelic sites
+ if ( args->snps_only && !bcf_is_snp(line) ) return;
+
+ // Initialize genetic map
+ int skip_rid = 0;
+ if ( args->prev_rid<0 )
+ {
+ args->prev_rid = line->rid;
+ args->prev_pos = line->pos;
+ skip_rid = load_genmap(args, line);
+ if ( !skip_rid && args->vi_training ) push_rid(args, line->rid);
+ }
+
+ // New chromosome?
+ if ( args->prev_rid!=line->rid )
+ {
+ skip_rid = load_genmap(args, line);
+ if ( args->vi_training )
+ {
+ if ( !skip_rid ) push_rid(args, line->rid);
+ }
+ else
+ {
+ flush_viterbi(args);
+ args->nsites = 0;
+ }
+ args->prev_rid = line->rid;
+ args->prev_pos = line->pos;
+ }
+
+ if ( skip_rid )
+ {
+ fprintf(pysamerr,"Skipping the sequence, no genmap for %s\n", bcf_seqname(args->hdr,line));
+ args->skip_rid = line->rid;
+ return;
+ }
+ if ( args->prev_pos > line->pos ) error("The file is not sorted?!\n");
+
+ args->prev_rid = line->rid;
+ args->prev_pos = line->pos;
+
+
+ // Ready for the new site
+ int m = args->msites;
+ hts_expand(uint32_t,args->nsites+1,args->msites,args->sites);
+ if ( args->msites!=m )
+ args->eprob = (double*) realloc(args->eprob,sizeof(double)*args->msites*2);
+
+ // Set likelihoods and alternate allele frequencies
+ double alt_freq, pdg[3];
+ if ( parse_line(args, line, &alt_freq, pdg)<0 ) return; // something went wrong
+
+ args->nused++;
+
+ // Calculate emission probabilities P(D|AZ) and P(D|HW)
+ double *eprob = &args->eprob[2*args->nsites];
+ eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq;
+ eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq;
+
+ args->sites[args->nsites] = line->pos;
+ args->nsites++;
+}
+
+static void usage(args_t *args)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: HMM model for detecting runs of autozygosity.\n");
+ fprintf(pysamerr, "Usage: bcftools roh [options] <in.vcf.gz>\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "General Options:\n");
+ fprintf(pysamerr, " --AF-dflt <float> if AF is not known, use this allele frequency [skip]\n");
+ fprintf(pysamerr, " --AF-tag <TAG> use TAG for allele frequency\n");
+ fprintf(pysamerr, " --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
+ fprintf(pysamerr, " -e, --estimate-AF <file> calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in <file>\n");
+ fprintf(pysamerr, " -G, --GTs-only <float> use GTs, ignore PLs, use <float> for PL of unseen genotypes. Safe value to use is 30 to account for GT errors.\n");
+ fprintf(pysamerr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n");
+ fprintf(pysamerr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\" is replaced with chromosome name\n");
+ fprintf(pysamerr, " -M, --rec-rate <float> constant recombination rate per bp\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " -s, --sample <sample> sample to analyze\n");
+ fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "HMM Options:\n");
+ fprintf(pysamerr, " -a, --hw-to-az <float> P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n");
+ fprintf(pysamerr, " -H, --az-to-hw <float> P(HW|AZ) transition probability from AZ to HW state [5e-9]\n");
+ fprintf(pysamerr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_vcfroh(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->files = bcf_sr_init();
+ args->t2AZ = 6.7e-8;
+ args->t2HW = 5e-9;
+ args->rec_rate = 0;
+ int regions_is_file = 0, targets_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"AF-tag",1,0,0},
+ {"AF-file",1,0,1},
+ {"AF-dflt",1,0,2},
+ {"estimate-AF",1,0,'e'},
+ {"GTs-only",1,0,'G'},
+ {"sample",1,0,'s'},
+ {"hw-to-az",1,0,'a'},
+ {"az-to-hw",1,0,'H'},
+ {"viterbi-training",0,0,'V'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"genetic-map",1,0,'m'},
+ {"rec-rate",1,0,'M'},
+ {"skip-indels",0,0,'I'},
+ {0,0,0,0}
+ };
+
+ int naf_opts = 0;
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:m:M:G:Ia:e:V",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 0: args->af_tag = optarg; naf_opts++; break;
+ case 1: args->af_fname = optarg; naf_opts++; break;
+ case 2:
+ args->dflt_AF = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
+ break;
+ case 'e': args->estimate_AF = optarg; naf_opts++; break;
+ case 'I': args->snps_only = 1; break;
+ case 'G':
+ args->fake_PLs = 1;
+ args->unseen_PL = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -G %s\n", optarg);
+ args->unseen_PL = pow(10,-args->unseen_PL/10.);
+ break;
+ case 'm': args->genmap_fname = optarg; break;
+ case 'M':
+ args->rec_rate = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -M %s\n", optarg);
+ break;
+ case 's': args->sample = strdup(optarg); break;
+ case 'a':
+ args->t2AZ = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -a %s\n", optarg);
+ break;
+ case 'H':
+ args->t2HW = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: -H %s\n", optarg);
+ break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 'V': args->vi_training = 1; break;
+ case 'h':
+ case '?': usage(args); break;
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ if ( argc<optind+1 ) usage(args);
+ if ( args->t2AZ<0 || args->t2AZ>1 ) error("Error: The parameter --hw-to-az is not in [0,1]: %e\n", args->t2AZ);
+ if ( args->t2HW<0 || args->t2HW>1 ) error("Error: The parameter --az-to-hw is not in [0,1]: %e\n", args->t2HW);
+ if ( naf_opts>1 ) error("Error: The options --AF-tag, --AF-file and -e are mutually exclusive\n");
+ if ( args->af_fname && args->targets_list ) error("Error: The options --AF-file and -t are mutually exclusive\n");
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( args->af_fname )
+ {
+ if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 )
+ error("Failed to read the targets: %s\n", args->af_fname);
+ }
+ if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
+
+ init_data(args);
+ while ( bcf_sr_next_line(args->files) )
+ {
+ vcfroh(args, args->files->readers[0].buffer[0]);
+ }
+ vcfroh(args, NULL);
+ fprintf(pysamerr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused);
+ destroy_data(args);
+ free(args);
+ return 0;
+}
+
+
diff --git a/bcftools/vcfsom.c b/bcftools/vcfsom.c
new file mode 100644
index 0000000..03181e9
--- /dev/null
+++ b/bcftools/vcfsom.c
@@ -0,0 +1,715 @@
+/* vcfsom.c -- SOM (Self-Organizing Map) filtering.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <math.h>
+#include <time.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include <inttypes.h>
+#include "bcftools.h"
+
+#define SOM_TRAIN 1
+#define SOM_CLASSIFY 2
+
+typedef struct
+{
+ int ndim; // dimension of the map (2D, 3D, ...)
+ int nbin; // number of bins in the map
+ int size; // pow(nbin,ndim)
+ int kdim; // dimension of the input vectors
+ int nt, t; // total number of learning cycles and the current cycle
+ double *w, *c; // weights and counts (sum of learning influence)
+ double learn; // learning rate
+ double bmu_th; // best-matching unit threshold
+ int *a_idx, *b_idx; // temp arrays for traversing variable number of nested loops
+ double *div; // ditto
+}
+som_t;
+
+typedef struct
+{
+ // SOM parameters
+ double bmu_th, learn;
+ int ndim, nbin, ntrain, t;
+ int nfold; // n-fold cross validation = the number of SOMs
+ som_t **som;
+
+ // annots reader's data
+ htsFile *file; // reader
+ kstring_t str; // temporary string for the reader
+ int dclass, mvals;
+ double *vals;
+
+ // training data
+ double *train_dat;
+ int *train_class, mtrain_class, mtrain_dat;
+
+ int rand_seed, good_class, bad_class;
+ char **argv, *fname, *prefix;
+ int argc, action, train_bad, merge;
+}
+args_t;
+
+static void usage(void);
+FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
+void mkdir_p(const char *fmt, ...);
+
+char *msprintf(const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ int n = vsnprintf(NULL, 0, fmt, ap) + 2;
+ va_end(ap);
+
+ char *str = (char*)malloc(n);
+ va_start(ap, fmt);
+ vsnprintf(str, n, fmt, ap);
+ va_end(ap);
+
+ return str;
+}
+
+/*
+ * char *t, *p = str;
+ * t = column_next(p, '\t');
+ * if ( strlen("<something>")==t-p && !strncmp(p,"<something>",t-p) ) printf("found!\n");
+ *
+ * char *t;
+ * t = column_next(str, '\t'); if ( !*t ) error("expected field\n", str);
+ * t = column_next(t+1, '\t'); if ( !*t ) error("expected field\n", str);
+ */
+static inline char *column_next(char *start, char delim)
+{
+ char *end = start;
+ while (*end && *end!=delim) end++;
+ return end;
+}
+/**
+ * annots_reader_next() - reads next line from annots.tab.gz and sets: class, vals
+ * Returns 1 on successful read or 0 if no further record could be read.
+ */
+int annots_reader_next(args_t *args)
+{
+ args->str.l = 0;
+ if ( hts_getline(args->file,'\n',&args->str)<=0 ) return 0;
+
+ char *t, *line = args->str.s;
+
+ if ( !args->mvals )
+ {
+ t = line;
+ while ( *t )
+ {
+ if ( *t=='\t' ) args->mvals++;
+ t++;
+ }
+ args->vals = (double*) malloc(args->mvals*sizeof(double));
+ }
+
+ // class
+ args->dclass = atoi(line);
+ t = column_next(line, '\t');
+
+ // values
+ int i;
+ for (i=0; i<args->mvals; i++)
+ {
+ if ( !*t ) error("Could not parse %d-th data field: is the line truncated?\nThe line was: [%s]\n",i+2,line);
+ args->vals[i] = atof(++t);
+ t = column_next(t,'\t');
+ }
+ return 1;
+}
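
Judging from the parsing above, each line of annots.tab.gz is a class label followed by tab-separated numeric annotations. An invented two-line example, using the default classes set in main_vcfsom below (2 = good, 1 = bad):

    2	0.993	31.2	0.002
    1	0.412	9.7	0.480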
+void annots_reader_reset(args_t *args)
+{
+ if ( args->file ) hts_close(args->file);
+ if ( !args->fname ) error("annots_reader_reset: no fname\n");
+ args->file = hts_open(args->fname, "r");
+}
+void annots_reader_close(args_t *args)
+{
+ hts_close(args->file);
+}
+
+static void som_write_map(char *prefix, som_t **som, int nsom)
+{
+ FILE *fp = open_file(NULL,"w","%s.som",prefix);
+ fwrite("SOMv1",5,1,fp);
+ fwrite(&nsom,sizeof(int),1,fp);
+ int i;
+ for (i=0; i<nsom; i++)
+ {
+ fwrite(&som[i]->size,sizeof(int),1,fp);
+ fwrite(&som[i]->kdim,sizeof(int),1,fp);
+ fwrite(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp);
+ fwrite(som[i]->c,sizeof(double),som[i]->size,fp);
+ }
+ if ( fclose(fp) ) error("%s.som: fclose failed\n",prefix);
+}
+static som_t** som_load_map(char *prefix, int *nsom)
+{
+ FILE *fp = open_file(NULL,"r","%s.som",prefix);
+ char buf[5];
+ if ( fread(buf,5,1,fp)!=1 || strncmp(buf,"SOMv1",5) ) error("Could not parse %s.som\n", prefix);
+
+ if ( fread(nsom,sizeof(int),1,fp)!=1 ) error("Could not read %s.som\n", prefix);
+ som_t **som = (som_t**)malloc(*nsom*sizeof(som_t*));
+
+ int i;
+ for (i=0; i<*nsom; i++)
+ {
+ som[i] = (som_t*) calloc(1,sizeof(som_t));
+ if ( fread(&som[i]->size,sizeof(int),1,fp) != 1 ) error("Could not read %s.som\n", prefix);
+ if ( fread(&som[i]->kdim,sizeof(int),1,fp) != 1 ) error("Could not read %s.som\n", prefix);
+ som[i]->w = (double*) malloc(sizeof(double)*som[i]->size*som[i]->kdim);
+ som[i]->c = (double*) malloc(sizeof(double)*som[i]->size);
+ if ( fread(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp) != som[i]->size*som[i]->kdim ) error("Could not read from %s.som\n", prefix);
+ if ( fread(som[i]->c,sizeof(double),som[i]->size,fp) != som[i]->size ) error("Could not read from %s.som\n", prefix);
+ }
+ if ( fclose(fp) ) error("%s.som: fclose failed\n",prefix);
+ return som;
+}
+static void som_create_plot(som_t *som, char *prefix)
+{
+ if ( som->ndim!=2 ) return;
+
+ char *fname;
+ FILE *fp = open_file(&fname,"w","%s.py",prefix);
+ fprintf(fp,
+ "import matplotlib as mpl\n"
+ "mpl.use('Agg')\n"
+ "import matplotlib.pyplot as plt\n"
+ "\n"
+ "dat = [\n"
+ );
+ int i,j;
+ double *val = som->c;
+ for (i=0; i<som->nbin; i++)
+ {
+ fprintf(fp,"[");
+ for (j=0; j<som->nbin; j++)
+ {
+ if ( j>0 ) fprintf(fp,",");
+ fprintf(fp,"%e", *val);
+ val++;
+ }
+ fprintf(fp,"],\n");
+ }
+ fprintf(fp,
+ "]\n"
+ "fig = plt.figure()\n"
+ "ax1 = plt.subplot(111)\n"
+ "im1 = ax1.imshow(dat)\n"
+ "fig.colorbar(im1)\n"
+ "plt.savefig('%s.png')\n"
+ "plt.close()\n"
+ "\n", prefix
+ );
+ fclose(fp);
+ free(fname);
+}
+// Find the best matching unit: the node with minimum distance from the input vector
+static inline int som_find_bmu(som_t *som, double *vec, double *dist)
+{
+ double *ptr = som->w;
+ double min_dist = HUGE_VAL;
+ int min_idx = 0;
+
+ int i, k;
+ for (i=0; i<som->size; i++)
+ {
+ double dist = 0;
+ for (k=0; k<som->kdim; k++)
+ dist += (vec[k] - ptr[k]) * (vec[k] - ptr[k]);
+ if ( dist < min_dist )
+ {
+ min_dist = dist;
+ min_idx = i;
+ }
+ ptr += som->kdim;
+ }
+
+ if ( dist ) *dist = min_dist;
+ return min_idx;
+}
+static inline double som_get_score(som_t *som, double *vec, double bmu_th)
+{
+ double *ptr = som->w;
+ double min_dist = HUGE_VAL;
+
+ int i, k;
+ for (i=0; i<som->size; i++)
+ {
+ if ( som->c[i] >= bmu_th )
+ {
+ double dist = 0;
+ for (k=0; k<som->kdim; k++)
+ dist += (vec[k] - ptr[k]) * (vec[k] - ptr[k]);
+ if ( dist < min_dist ) min_dist = dist;
+ }
+ ptr += som->kdim;
+ }
+ return sqrt(min_dist);
+}
+// Convert flat index to that of a k-dimensional cube
+static inline void som_idx_to_ndim(som_t *som, int idx, int *ndim)
+{
+ int i;
+ double sub = 0;
+
+ ndim[0] = idx/som->div[0];
+ for (i=1; i<som->ndim; i++)
+ {
+ sub += ndim[i-1] * som->div[i-1];
+ ndim[i] = (idx - sub)/som->div[i];
+ }
+}
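
Worked example of this index conversion (assuming ndim=2 and nbin=20, so div = {20,1}): idx=45 gives ndim[0] = 45/20 = 2, then sub = 2*20 = 40 and ndim[1] = (45-40)/1 = 5, i.e. flat index 45 is cell (2,5) of the 20x20 map.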
+static void som_train_site(som_t *som, double *vec, int update_counts)
+{
+ // update learning rate and learning radius
+ som->t++;
+ double dt = exp(-som->t/som->nt);
+ double learning_rate = som->learn * dt;
+ double radius = som->nbin * dt; radius *= radius;
+
+ // find the best matching unit and its indexes
+ int min_idx = som_find_bmu(som, vec, NULL);
+ som_idx_to_ndim(som, min_idx, som->a_idx);
+
+ // update the weights: traverse the map and make all nodes within the
+ // radius more similar to the input vector
+ double *ptr = som->w;
+ double *cnt = som->c;
+ int i, j, k;
+ for (i=0; i<som->size; i++)
+ {
+ som_idx_to_ndim(som, i, som->b_idx);
+ double dist = 0;
+ for (j=0; j<som->ndim; j++)
+ dist += (som->a_idx[j] - som->b_idx[j]) * (som->a_idx[j] - som->b_idx[j]);
+ if ( dist <= radius )
+ {
+ double influence = exp(-dist*dist*0.5/radius) * learning_rate;
+ for (k=0; k<som->kdim; k++)
+ ptr[k] += influence * (vec[k] - ptr[k]);
+
+ // Bad sites may help to shape the map, but only nodes with big enough
+ // influence will be used for classification.
+ if ( update_counts ) *cnt += influence;
+ }
+ ptr += som->kdim;
+ cnt++;
+ }
+}
+static void som_norm_counts(som_t *som)
+{
+ int i;
+ double max = 0;
+ for (i=0; i<som->size; i++)
+ if ( max < som->c[i] ) max = som->c[i];
+ for (i=0; i<som->size; i++)
+ som->c[i] /= max;
+}
+static som_t *som_init(args_t *args)
+{
+ som_t *som = (som_t*) calloc(1,sizeof(som_t));
+ som->ndim = args->ndim;
+ som->nbin = args->nbin;
+ som->kdim = args->mvals;
+ som->nt = args->ntrain;
+ som->learn = args->learn;
+ som->bmu_th = args->bmu_th;
+ som->size = pow(som->nbin,som->ndim);
+ som->w = (double*) malloc(sizeof(double)*som->size*som->kdim);
+ if ( !som->w ) error("Could not alloc %d bytes [nbin=%d ndim=%d kdim=%d]\n", sizeof(double)*som->size*som->kdim,som->nbin,som->ndim,som->kdim);
+ som->c = (double*) calloc(som->size,sizeof(double));
+ if ( !som->w ) error("Could not alloc %d bytes [nbin=%d ndim=%d]\n", sizeof(double)*som->size,som->nbin,som->ndim);
+ int i;
+ for (i=0; i<som->size*som->kdim; i++)
+ som->w[i] = (double)random()/RAND_MAX;
+ som->a_idx = (int*) malloc(sizeof(int)*som->ndim);
+ som->b_idx = (int*) malloc(sizeof(int)*som->ndim);
+ som->div = (double*) malloc(sizeof(double)*som->ndim);
+ for (i=0; i<som->ndim; i++)
+ som->div[i] = pow(som->nbin,som->ndim-i-1);
+ return som;
+}
+static void som_destroy(som_t *som)
+{
+ free(som->a_idx); free(som->b_idx); free(som->div);
+ free(som->w); free(som->c);
+ free(som);
+}
+
+static void init_data(args_t *args)
+{
+ // Get first line to learn the vector size
+ annots_reader_reset(args);
+ annots_reader_next(args);
+
+ if ( args->action==SOM_CLASSIFY )
+ args->som = som_load_map(args->prefix,&args->nfold);
+}
+static void destroy_data(args_t *args)
+{
+ int i;
+ if ( args->som )
+ {
+ for (i=0; i<args->nfold; i++) som_destroy(args->som[i]);
+ }
+ free(args->train_dat);
+ free(args->train_class);
+ free(args->som);
+ free(args->vals);
+ free(args->str.s);
+}
+
+#define MERGE_MIN 0
+#define MERGE_MAX 1
+#define MERGE_AVG 2
+static double get_min_score(args_t *args, int iskip)
+{
+ int i;
+ double score, min_score = HUGE_VAL;
+ for (i=0; i<args->nfold; i++)
+ {
+ if ( i==iskip ) continue;
+ score = som_get_score(args->som[i], args->vals, args->bmu_th);
+ if ( i==0 || score < min_score ) min_score = score;
+ }
+ return min_score;
+}
+static double get_max_score(args_t *args, int iskip)
+{
+ int i;
+ double score, max_score = -HUGE_VAL;
+ for (i=0; i<args->nfold; i++)
+ {
+ if ( i==iskip ) continue;
+ score = som_get_score(args->som[i], args->vals, args->bmu_th);
+ if ( i==0 || max_score < score ) max_score = score;
+ }
+ return max_score;
+}
+static double get_avg_score(args_t *args, int iskip)
+{
+ int i, n = 0;
+ double score = 0;
+ for (i=0; i<args->nfold; i++)
+ {
+ if ( i==iskip ) continue;
+ score += som_get_score(args->som[i], args->vals, args->bmu_th);
+ n++;
+ }
+ return score/n;
+}
+static int cmpfloat_desc(const void *a, const void *b)
+{
+ float fa = *((float*)a);
+ float fb = *((float*)b);
+ if ( fa<fb ) return 1;
+ if ( fa>fb ) return -1;
+ return 0;
+}
+
+static void create_eval_plot(args_t *args)
+{
+ FILE *fp = open_file(NULL,"w","%s.eval.py", args->prefix);
+ fprintf(fp,
+ "import matplotlib as mpl\n"
+ "mpl.use('Agg')\n"
+ "import matplotlib.pyplot as plt\n"
+ "\n"
+ "import csv\n"
+ "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
+ "dat = []\n"
+ "with open('%s.eval', 'rb') as f:\n"
+ "\treader = csv.reader(f, 'tab')\n"
+ "\tfor row in reader:\n"
+ "\t\tif row[0][0]!='#': dat.append(row)\n"
+ "\n"
+ "fig = plt.figure()\n"
+ "ax1 = plt.subplot(111)\n"
+ "ax1.plot([x[0] for x in dat],[x[1] for x in dat],'g',label='Good')\n"
+ "ax1.plot([x[0] for x in dat],[x[2] for x in dat],'r',label='Bad')\n"
+ "ax1.set_xlabel('SOM score')\n"
+ "ax1.set_ylabel('Number of training sites')\n"
+ "ax1.legend(loc='best',prop={'size':8},frameon=False)\n"
+ "plt.savefig('%s.eval.png')\n"
+ "plt.close()\n"
+ "\n", args->prefix,args->prefix
+ );
+ fclose(fp);
+}
+
+static void do_train(args_t *args)
+{
+ // read training sites
+ int i, igood = 0, ibad = 0, ngood = 0, nbad = 0, ntrain = 0;
+ annots_reader_reset(args);
+ while ( annots_reader_next(args) )
+ {
+ // determine which of the nfold's SOMs to train
+ int isom = 0;
+ if ( args->dclass == args->good_class )
+ {
+ if ( ++igood >= args->nfold ) igood = 0;
+ isom = igood;
+ ngood++;
+ }
+ else if ( args->dclass == args->bad_class )
+ {
+ if ( ++ibad >= args->nfold ) ibad = 0;
+ isom = ibad;
+ nbad++;
+ }
+ else
+ error("Could not determine the class: %d (vs %d and %d)\n", args->dclass,args->good_class,args->bad_class);
+
+ // save the values for evaluation
+ ntrain++;
+ hts_expand(double, ntrain*args->mvals, args->mtrain_dat, args->train_dat);
+ hts_expand(int, ntrain, args->mtrain_class, args->train_class);
+ memcpy(args->train_dat+(ntrain-1)*args->mvals, args->vals, args->mvals*sizeof(double));
+ args->train_class[ntrain-1] = (args->dclass==args->good_class ? 1 : 0) | isom<<1; // store class + chunk used for training
+ }
+ annots_reader_close(args);
+
+ // init maps
+ if ( !args->ntrain ) args->ntrain = ngood/args->nfold;
+ srandom(args->rand_seed);
+ args->som = (som_t**) malloc(sizeof(som_t*)*args->nfold);
+ for (i=0; i<args->nfold; i++) args->som[i] = som_init(args);
+
+ // train
+ for (i=0; i<ntrain; i++)
+ {
+ int is_good = args->train_class[i] & 1;
+ int isom = args->train_class[i] >> 1;
+ if ( is_good || args->train_bad )
+ som_train_site(args->som[isom], args->train_dat+i*args->mvals, is_good);
+ }
+
+ // norm and create plots
+ for (i=0; i<args->nfold; i++)
+ {
+ som_norm_counts(args->som[i]);
+ if ( args->prefix )
+ {
+ char *bname = msprintf("%s.som.%d", args->prefix,i);
+ som_create_plot(args->som[i], bname);
+ free(bname);
+ }
+ }
+
+ // evaluate
+ float *good = (float*) malloc(sizeof(float)*ngood); assert(good);
+ float *bad = (float*) malloc(sizeof(float)*nbad); assert(bad);
+ igood = ibad = 0;
+ double max_score = sqrt(args->som[0]->kdim);
+ for (i=0; i<ntrain; i++)
+ {
+ double score = 0;
+ int is_good = args->train_class[i] & 1;
+ int isom = args->train_class[i] >> 1; // this vector was used for training isom-th SOM, skip
+ if ( args->nfold==1 ) isom = -1;
+ memcpy(args->vals, args->train_dat+i*args->mvals, args->mvals*sizeof(double));
+ switch (args->merge)
+ {
+ case MERGE_MIN: score = get_min_score(args, isom); break;
+ case MERGE_MAX: score = get_max_score(args, isom); break;
+ case MERGE_AVG: score = get_avg_score(args, isom); break;
+ }
+ score = 1.0 - score/max_score;
+ if ( is_good )
+ good[igood++] = score;
+ else
+ bad[ibad++] = score;
+ }
+ qsort(good, ngood, sizeof(float), cmpfloat_desc);
+ qsort(bad, nbad, sizeof(float), cmpfloat_desc);
+ FILE *fp = NULL;
+ if ( args->prefix ) fp = open_file(NULL,"w","%s.eval", args->prefix);
+ igood = 0;
+ ibad = 0;
+ float prev_score = good[0]>bad[0] ? good[0] : bad[0];
+ int printed = 0;
+ while ( igood<ngood || ibad<nbad )
+ {
+ if ( igood<ngood && good[igood]==prev_score ) { igood++; continue; }
+ if ( ibad<nbad && bad[ibad]==prev_score ) { ibad++; continue; }
+ if ( fp )
+ fprintf(fp,"%e\t%f\t%f\n", prev_score, (float)igood/ngood, (float)ibad/nbad);
+ if ( !printed && (float)igood/ngood > 0.9 )
+ {
+ printf("%.2f\t%.2f\t%e\t# %% of bad [1] and good [2] sites at a cutoff [3]\n", 100.*ibad/nbad,100.*igood/ngood,prev_score);
+ printed = 1;
+ }
+
+ if ( igood<ngood && ibad<nbad ) prev_score = good[igood]>bad[ibad] ? good[igood] : bad[ibad];
+ else if ( igood<ngood ) prev_score = good[igood];
+ else prev_score = bad[ibad];
+ }
+ if ( !printed ) printf("%.2f\t%.2f\t%e\t# %% of bad [1] and good [2] sites at a cutoff [3]\n", 100.*ibad/nbad,100.*igood/ngood,prev_score);
+ if ( fp )
+ {
+ if ( fclose(fp) ) error("%s.eval: fclose failed: %s\n",args->prefix,strerror(errno));
+ create_eval_plot(args);
+ som_write_map(args->prefix, args->som, args->nfold);
+ }
+
+ free(good);
+ free(bad);
+}
+
+static void do_classify(args_t *args)
+{
+ annots_reader_reset(args);
+ double max_score = sqrt(args->som[0]->kdim);
+ while ( annots_reader_next(args) )
+ {
+ double score = 0;
+ switch (args->merge)
+ {
+ case MERGE_MIN: score = get_min_score(args, -1); break;
+ case MERGE_MAX: score = get_max_score(args, -1); break;
+ case MERGE_AVG: score = get_avg_score(args, -1); break;
+ }
+ printf("%e\n", 1.0 - score/max_score);
+ }
+ annots_reader_close(args);
+}
+
+static void usage(void)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: SOM (Self-Organizing Map) filtering.\n");
+ fprintf(stderr, "Usage: bcftools som --train [options] <annots.tab.gz>\n");
+ fprintf(stderr, " bcftools som --classify [options]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Model training options:\n");
+ fprintf(stderr, " -f, --nfold <int> n-fold cross-validation (number of maps) [5]\n");
+ fprintf(stderr, " -p, --prefix <string> prefix of output files\n");
+ fprintf(stderr, " -s, --size <int> map size [20]\n");
+ fprintf(stderr, " -t, --train \n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Classifying options:\n");
+ fprintf(stderr, " -c, --classify \n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Experimental training options (no reason to change):\n");
+ fprintf(stderr, " -b, --bmu-threshold <float> threshold for selection of best-matching unit [0.9]\n");
+ fprintf(stderr, " -d, --som-dimension <int> SOM dimension [2]\n");
+ fprintf(stderr, " -e, --exclude-bad exclude bad sites from training, use for evaluation only\n");
+ fprintf(stderr, " -l, --learning-rate <float> learning rate [1.0]\n");
+ fprintf(stderr, " -m, --merge <min|max|avg> -f merge algorithm [avg]\n");
+ fprintf(stderr, " -n, --ntrain-sites <int> effective number of training sites [number of good sites]\n");
+ fprintf(stderr, " -r, --random-seed <int> random seed, 0 for time() [1]\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
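
A plausible train-then-classify session with the options above (the prefix and file names are placeholders):

    bcftools som -t -p my_som annots.tab.gz            # writes my_som.som plus my_som.eval and plotting scripts
    bcftools som -c -p my_som new_annots.tab.gz > scores.txt

Classification prints one score per input line to stdout; roughly, values closer to 1 mean the site resembles the good training sites.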
+
+int main_vcfsom(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->nbin = 20;
+ args->learn = 1.0;
+ args->bmu_th = 0.9;
+ args->nfold = 5;
+ args->rand_seed = 1;
+ args->ndim = 2;
+ args->bad_class = 1;
+ args->good_class = 2;
+ args->merge = MERGE_AVG;
+ args->train_bad = 1;
+
+ static struct option loptions[] =
+ {
+ {"help",0,0,'h'},
+ {"prefix",1,0,'p'},
+ {"ntrain-sites",1,0,'n'},
+ {"random-seed",1,0,'r'},
+ {"bmu-threshold",1,0,'b'},
+ {"exclude-bad",0,0,'e'},
+ {"learning-rate",1,0,'l'},
+ {"size",1,0,'s'},
+ {"som-dimension",1,0,'d'},
+ {"nfold",1,0,'f'},
+ {"merge",1,0,'m'},
+ {"train",0,0,'t'},
+ {"classify",0,0,'c'},
+ {0,0,0,0}
+ };
+ while ((c = getopt_long(argc, argv, "htcp:n:r:b:l:s:f:d:m:e",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'e': args->train_bad = 0; break;
+ case 'm':
+ if ( !strcmp(optarg,"min") ) args->merge = MERGE_MIN;
+ else if ( !strcmp(optarg,"max") ) args->merge = MERGE_MAX;
+ else if ( !strcmp(optarg,"avg") ) args->merge = MERGE_AVG;
+ else error("The -m method not recognised: %s\n", optarg);
+ break;
+ case 'p': args->prefix = optarg; break;
+ case 'n': args->ntrain = atoi(optarg); break;
+ case 'r': args->rand_seed = atoi(optarg); break;
+ case 'b': args->bmu_th = atof(optarg); break;
+ case 'l': args->learn = atof(optarg); break;
+ case 's': args->nbin = atoi(optarg); break;
+ case 'f': args->nfold = atoi(optarg); break;
+ case 'd':
+ args->ndim = atoi(optarg);
+ if ( args->ndim<2 ) error("Expected -d >=2, got %d\n", args->ndim);
+ if ( args->ndim>3 ) fprintf(stderr,"Warning: This will take a long time and is not going to make the results better: -d %d\n", args->ndim);
+ break;
+ case 't': args->action = SOM_TRAIN; break;
+ case 'c': args->action = SOM_CLASSIFY; break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ if ( !args->rand_seed ) args->rand_seed = time(NULL);
+ if ( argc!=optind+1 ) usage();
+ args->fname = argv[optind];
+ init_data(args);
+
+ if ( args->action == SOM_TRAIN ) do_train(args);
+ else if ( args->action == SOM_CLASSIFY ) do_classify(args);
+
+ destroy_data(args);
+ free(args);
+ return 0;
+}
+
diff --git a/bcftools/vcfsom.c.pysam.c b/bcftools/vcfsom.c.pysam.c
new file mode 100644
index 0000000..32e7213
--- /dev/null
+++ b/bcftools/vcfsom.c.pysam.c
@@ -0,0 +1,717 @@
+#include "pysam.h"
+
+/* vcfsom.c -- SOM (Self-Organizing Map) filtering.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <math.h>
+#include <time.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include <inttypes.h>
+#include "bcftools.h"
+
+#define SOM_TRAIN 1
+#define SOM_CLASSIFY 2
+
+typedef struct
+{
+ int ndim; // dimension of the map (2D, 3D, ...)
+ int nbin; // number of bins in the map
+ int size; // pow(nbin,ndim)
+ int kdim; // dimension of the input vectors
+ int nt, t; // total number of learning cycles and the current cycle
+ double *w, *c; // weights and counts (sum of learning influence)
+ double learn; // learning rate
+ double bmu_th; // best-matching unit threshold
+ int *a_idx, *b_idx; // temp arrays for traversing variable number of nested loops
+ double *div; // ditto
+}
+som_t;
+
+typedef struct
+{
+ // SOM parameters
+ double bmu_th, learn;
+ int ndim, nbin, ntrain, t;
+ int nfold; // n-fold cross validation = the number of SOMs
+ som_t **som;
+
+ // annots reader's data
+ htsFile *file; // reader
+ kstring_t str; // temporary string for the reader
+ int dclass, mvals;
+ double *vals;
+
+ // training data
+ double *train_dat;
+ int *train_class, mtrain_class, mtrain_dat;
+
+ int rand_seed, good_class, bad_class;
+ char **argv, *fname, *prefix;
+ int argc, action, train_bad, merge;
+}
+args_t;
+
+static void usage(void);
+FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
+void mkdir_p(const char *fmt, ...);
+
+char *msprintf(const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ int n = vsnprintf(NULL, 0, fmt, ap) + 2;
+ va_end(ap);
+
+ char *str = (char*)malloc(n);
+ va_start(ap, fmt);
+ vsnprintf(str, n, fmt, ap);
+ va_end(ap);
+
+ return str;
+}
+
+/*
+ * char *t, *p = str;
+ * t = column_next(p, '\t');
+ * if ( strlen("<something>")==t-p && !strncmp(p,"<something>",t-p) ) printf("found!\n");
+ *
+ * char *t;
+ * t = column_next(str, '\t'); if ( !*t ) error("expected field: %s\n", str);
+ * t = column_next(t+1, '\t'); if ( !*t ) error("expected field: %s\n", str);
+ */
+static inline char *column_next(char *start, char delim)
+{
+ char *end = start;
+ while (*end && *end!=delim) end++;
+ return end;
+}
+/**
+ * annots_reader_next() - reads next line from annots.tab.gz and sets: class, vals
+ * Returns 1 on successful read or 0 if no further record could be read.
+ */
+int annots_reader_next(args_t *args)
+{
+ args->str.l = 0;
+ if ( hts_getline(args->file,'\n',&args->str)<=0 ) return 0;
+
+ char *t, *line = args->str.s;
+
+ if ( !args->mvals )
+ {
+ t = line;
+ while ( *t )
+ {
+ if ( *t=='\t' ) args->mvals++;
+ t++;
+ }
+ args->vals = (double*) malloc(args->mvals*sizeof(double));
+ }
+
+ // class
+ args->dclass = atoi(line);
+ t = column_next(line, '\t');
+
+ // values
+ int i;
+ for (i=0; i<args->mvals; i++)
+ {
+ if ( !*t ) error("Could not parse %d-th data field: is the line truncated?\nThe line was: [%s]\n",i+2,line);
+ args->vals[i] = atof(++t);
+ t = column_next(t,'\t');
+ }
+ return 1;
+}
+void annots_reader_reset(args_t *args)
+{
+ if ( args->file ) hts_close(args->file);
+ if ( !args->fname ) error("annots_reader_reset: no fname\n");
+ args->file = hts_open(args->fname, "r");
+}
+void annots_reader_close(args_t *args)
+{
+ hts_close(args->file);
+}
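+/*
+ * The reader above expects one tab-delimited line per site: the first column
+ * is the integer class (with the defaults set in main_vcfsom below, 2 = good
+ * and 1 = bad) followed by the numeric annotations. For example, a line such as
+ *   2<TAB>0.93<TAB>0.18<TAB>0.05
+ * yields dclass=2 and vals={0.93,0.18,0.05}; the number of annotation columns
+ * seen on the first line fixes mvals for the whole file.
+ */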
+
+static void som_write_map(char *prefix, som_t **som, int nsom)
+{
+ FILE *fp = open_file(NULL,"w","%s.som",prefix);
+ fwrite("SOMv1",5,1,fp);
+ fwrite(&nsom,sizeof(int),1,fp);
+ int i;
+ for (i=0; i<nsom; i++)
+ {
+ fwrite(&som[i]->size,sizeof(int),1,fp);
+ fwrite(&som[i]->kdim,sizeof(int),1,fp);
+ fwrite(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp);
+ fwrite(som[i]->c,sizeof(double),som[i]->size,fp);
+ }
+ if ( fclose(fp) ) error("%s.som: fclose failed\n",prefix);
+}
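+/*
+ * On-disk layout written above (and read back by som_load_map below), assuming
+ * the native endianness and int/double sizes of the writing machine:
+ *   bytes 0-4            magic "SOMv1" (no terminating NUL)
+ *   int                  nsom, the number of maps (= nfold)
+ *   repeated nsom times:
+ *     int                size (= pow(nbin,ndim))
+ *     int                kdim (input vector dimension)
+ *     double[size*kdim]  node weights
+ *     double[size]       node counts (sum of learning influence, normalised to max 1)
+ */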
+static som_t** som_load_map(char *prefix, int *nsom)
+{
+ FILE *fp = open_file(NULL,"r","%s.som",prefix);
+ char buf[5];
+ if ( fread(buf,5,1,fp)!=1 || strncmp(buf,"SOMv1",5) ) error("Could not parse %s.som\n", prefix);
+
+ if ( fread(nsom,sizeof(int),1,fp)!=1 ) error("Could not read %s.som\n", prefix);
+ som_t **som = (som_t**)malloc(*nsom*sizeof(som_t*));
+
+ int i;
+ for (i=0; i<*nsom; i++)
+ {
+ som[i] = (som_t*) calloc(1,sizeof(som_t));
+ if ( fread(&som[i]->size,sizeof(int),1,fp) != 1 ) error("Could not read %s.som\n", prefix);
+ if ( fread(&som[i]->kdim,sizeof(int),1,fp) != 1 ) error("Could not read %s.som\n", prefix);
+ som[i]->w = (double*) malloc(sizeof(double)*som[i]->size*som[i]->kdim);
+ som[i]->c = (double*) malloc(sizeof(double)*som[i]->size);
+ if ( fread(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp) != som[i]->size*som[i]->kdim ) error("Could not read from %s.som\n", prefix);
+ if ( fread(som[i]->c,sizeof(double),som[i]->size,fp) != som[i]->size ) error("Could not read from %s.som\n", prefix);
+ }
+ if ( fclose(fp) ) error("%s.som: fclose failed\n",prefix);
+ return som;
+}
+static void som_create_plot(som_t *som, char *prefix)
+{
+ if ( som->ndim!=2 ) return;
+
+ char *fname;
+ FILE *fp = open_file(&fname,"w","%s.py",prefix);
+ fprintf(fp,
+ "import matplotlib as mpl\n"
+ "mpl.use('Agg')\n"
+ "import matplotlib.pyplot as plt\n"
+ "\n"
+ "dat = [\n"
+ );
+ int i,j;
+ double *val = som->c;
+ for (i=0; i<som->nbin; i++)
+ {
+ fprintf(fp,"[");
+ for (j=0; j<som->nbin; j++)
+ {
+ if ( j>0 ) fprintf(fp,",");
+ fprintf(fp,"%e", *val);
+ val++;
+ }
+ fprintf(fp,"],\n");
+ }
+ fprintf(fp,
+ "]\n"
+ "fig = plt.figure()\n"
+ "ax1 = plt.subplot(111)\n"
+ "im1 = ax1.imshow(dat)\n"
+ "fig.colorbar(im1)\n"
+ "plt.savefig('%s.png')\n"
+ "plt.close()\n"
+ "\n", prefix
+ );
+ fclose(fp);
+ free(fname);
+}
+// Find the best matching unit: the node with minimum distance from the input vector
+static inline int som_find_bmu(som_t *som, double *vec, double *dist)
+{
+ double *ptr = som->w;
+ double min_dist = HUGE_VAL;
+ int min_idx = 0;
+
+ int i, k;
+ for (i=0; i<som->size; i++)
+ {
+ double dist = 0;
+ for (k=0; k<som->kdim; k++)
+ dist += (vec[k] - ptr[k]) * (vec[k] - ptr[k]);
+ if ( dist < min_dist )
+ {
+ min_dist = dist;
+ min_idx = i;
+ }
+ ptr += som->kdim;
+ }
+
+ if ( dist ) *dist = min_dist;
+ return min_idx;
+}
+static inline double som_get_score(som_t *som, double *vec, double bmu_th)
+{
+ double *ptr = som->w;
+ double min_dist = HUGE_VAL;
+
+ int i, k;
+ for (i=0; i<som->size; i++)
+ {
+ if ( som->c[i] >= bmu_th )
+ {
+ double dist = 0;
+ for (k=0; k<som->kdim; k++)
+ dist += (vec[k] - ptr[k]) * (vec[k] - ptr[k]);
+ if ( dist < min_dist ) min_dist = dist;
+ }
+ ptr += som->kdim;
+ }
+ return sqrt(min_dist);
+}
+// Convert flat index to that of a k-dimensional cube
+static inline void som_idx_to_ndim(som_t *som, int idx, int *ndim)
+{
+ int i;
+ double sub = 0;
+
+ ndim[0] = idx/som->div[0];
+ for (i=1; i<som->ndim; i++)
+ {
+ sub += ndim[i-1] * som->div[i-1];
+ ndim[i] = (idx - sub)/som->div[i];
+ }
+}
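+/*
+ * Worked example of the index conversion above, assuming the defaults
+ * nbin=20 and ndim=2 (so div = {20, 1}):
+ *   idx=45  ->  ndim[0] = 45/20 = 2,  ndim[1] = (45 - 2*20)/1 = 5,
+ * i.e. flat node 45 sits at grid coordinates (2,5) of the 20x20 map.
+ */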
+static void som_train_site(som_t *som, double *vec, int update_counts)
+{
+ // update learning rate and learning radius
+ som->t++;
+ double dt = exp(-som->t/som->nt);
+ double learning_rate = som->learn * dt;
+ double radius = som->nbin * dt; radius *= radius;
+
+ // find the best matching unit and its indexes
+ int min_idx = som_find_bmu(som, vec, NULL);
+ som_idx_to_ndim(som, min_idx, som->a_idx);
+
+ // update the weights: traverse the map and make all nodes within the
+ // radius more similar to the input vector
+ double *ptr = som->w;
+ double *cnt = som->c;
+ int i, j, k;
+ for (i=0; i<som->size; i++)
+ {
+ som_idx_to_ndim(som, i, som->b_idx);
+ double dist = 0;
+ for (j=0; j<som->ndim; j++)
+ dist += (som->a_idx[j] - som->b_idx[j]) * (som->a_idx[j] - som->b_idx[j]);
+ if ( dist <= radius )
+ {
+ double influence = exp(-dist*dist*0.5/radius) * learning_rate;
+ for (k=0; k<som->kdim; k++)
+ ptr[k] += influence * (vec[k] - ptr[k]);
+
+ // Bad sites may help to shape the map, but only nodes with big enough
+ // influence will be used for classification.
+ if ( update_counts ) *cnt += influence;
+ }
+ ptr += som->kdim;
+ cnt++;
+ }
+}
+static void som_norm_counts(som_t *som)
+{
+ int i;
+ double max = 0;
+ for (i=0; i<som->size; i++)
+ if ( max < som->c[i] ) max = som->c[i];
+ for (i=0; i<som->size; i++)
+ som->c[i] /= max;
+}
+static som_t *som_init(args_t *args)
+{
+ som_t *som = (som_t*) calloc(1,sizeof(som_t));
+ som->ndim = args->ndim;
+ som->nbin = args->nbin;
+ som->kdim = args->mvals;
+ som->nt = args->ntrain;
+ som->learn = args->learn;
+ som->bmu_th = args->bmu_th;
+ som->size = pow(som->nbin,som->ndim);
+ som->w = (double*) malloc(sizeof(double)*som->size*som->kdim);
+ if ( !som->w ) error("Could not alloc %zu bytes [nbin=%d ndim=%d kdim=%d]\n", sizeof(double)*som->size*som->kdim,som->nbin,som->ndim,som->kdim);
+ som->c = (double*) calloc(som->size,sizeof(double));
+ if ( !som->c ) error("Could not alloc %zu bytes [nbin=%d ndim=%d]\n", sizeof(double)*som->size,som->nbin,som->ndim);
+ int i;
+ for (i=0; i<som->size*som->kdim; i++)
+ som->w[i] = (double)random()/RAND_MAX;
+ som->a_idx = (int*) malloc(sizeof(int)*som->ndim);
+ som->b_idx = (int*) malloc(sizeof(int)*som->ndim);
+ som->div = (double*) malloc(sizeof(double)*som->ndim);
+ for (i=0; i<som->ndim; i++)
+ som->div[i] = pow(som->nbin,som->ndim-i-1);
+ return som;
+}
+static void som_destroy(som_t *som)
+{
+ free(som->a_idx); free(som->b_idx); free(som->div);
+ free(som->w); free(som->c);
+ free(som);
+}
+
+static void init_data(args_t *args)
+{
+ // Get first line to learn the vector size
+ annots_reader_reset(args);
+ annots_reader_next(args);
+
+ if ( args->action==SOM_CLASSIFY )
+ args->som = som_load_map(args->prefix,&args->nfold);
+}
+static void destroy_data(args_t *args)
+{
+ int i;
+ if ( args->som )
+ {
+ for (i=0; i<args->nfold; i++) som_destroy(args->som[i]);
+ }
+ free(args->train_dat);
+ free(args->train_class);
+ free(args->som);
+ free(args->vals);
+ free(args->str.s);
+}
+
+#define MERGE_MIN 0
+#define MERGE_MAX 1
+#define MERGE_AVG 2
+static double get_min_score(args_t *args, int iskip)
+{
+ int i;
+ double score, min_score = HUGE_VAL;
+ for (i=0; i<args->nfold; i++)
+ {
+ if ( i==iskip ) continue;
+ score = som_get_score(args->som[i], args->vals, args->bmu_th);
+ if ( i==0 || score < min_score ) min_score = score;
+ }
+ return min_score;
+}
+static double get_max_score(args_t *args, int iskip)
+{
+ int i;
+ double score, max_score = -HUGE_VAL;
+ for (i=0; i<args->nfold; i++)
+ {
+ if ( i==iskip ) continue;
+ score = som_get_score(args->som[i], args->vals, args->bmu_th);
+ if ( i==0 || max_score < score ) max_score = score;
+ }
+ return max_score;
+}
+static double get_avg_score(args_t *args, int iskip)
+{
+ int i, n = 0;
+ double score = 0;
+ for (i=0; i<args->nfold; i++)
+ {
+ if ( i==iskip ) continue;
+ score += som_get_score(args->som[i], args->vals, args->bmu_th);
+ n++;
+ }
+ return score/n;
+}
+static int cmpfloat_desc(const void *a, const void *b)
+{
+ float fa = *((float*)a);
+ float fb = *((float*)b);
+ if ( fa<fb ) return 1;
+ if ( fa>fb ) return -1;
+ return 0;
+}
+
+static void create_eval_plot(args_t *args)
+{
+ FILE *fp = open_file(NULL,"w","%s.eval.py", args->prefix);
+ fprintf(fp,
+ "import matplotlib as mpl\n"
+ "mpl.use('Agg')\n"
+ "import matplotlib.pyplot as plt\n"
+ "\n"
+ "import csv\n"
+ "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
+ "dat = []\n"
+ "with open('%s.eval', 'rb') as f:\n"
+ "\treader = csv.reader(f, 'tab')\n"
+ "\tfor row in reader:\n"
+ "\t\tif row[0][0]!='#': dat.append(row)\n"
+ "\n"
+ "fig = plt.figure()\n"
+ "ax1 = plt.subplot(111)\n"
+ "ax1.plot([x[0] for x in dat],[x[1] for x in dat],'g',label='Good')\n"
+ "ax1.plot([x[0] for x in dat],[x[2] for x in dat],'r',label='Bad')\n"
+ "ax1.set_xlabel('SOM score')\n"
+ "ax1.set_ylabel('Number of training sites')\n"
+ "ax1.legend(loc='best',prop={'size':8},frameon=False)\n"
+ "plt.savefig('%s.eval.png')\n"
+ "plt.close()\n"
+ "\n", args->prefix,args->prefix
+ );
+ fclose(fp);
+}
+
+static void do_train(args_t *args)
+{
+ // read training sites
+ int i, igood = 0, ibad = 0, ngood = 0, nbad = 0, ntrain = 0;
+ annots_reader_reset(args);
+ while ( annots_reader_next(args) )
+ {
+ // determine which of the nfold's SOMs to train
+ int isom = 0;
+ if ( args->dclass == args->good_class )
+ {
+ if ( ++igood >= args->nfold ) igood = 0;
+ isom = igood;
+ ngood++;
+ }
+ else if ( args->dclass == args->bad_class )
+ {
+ if ( ++ibad >= args->nfold ) ibad = 0;
+ isom = ibad;
+ nbad++;
+ }
+ else
+ error("Could not determine the class: %d (vs %d and %d)\n", args->dclass,args->good_class,args->bad_class);
+
+ // save the values for evaluation
+ ntrain++;
+ hts_expand(double, ntrain*args->mvals, args->mtrain_dat, args->train_dat);
+ hts_expand(int, ntrain, args->mtrain_class, args->train_class);
+ memcpy(args->train_dat+(ntrain-1)*args->mvals, args->vals, args->mvals*sizeof(double));
+ args->train_class[ntrain-1] = (args->dclass==args->good_class ? 1 : 0) | isom<<1; // store class + chunk used for training
+ }
+ annots_reader_close(args);
+
+ // init maps
+ if ( !args->ntrain ) args->ntrain = ngood/args->nfold;
+ srandom(args->rand_seed);
+ args->som = (som_t**) malloc(sizeof(som_t*)*args->nfold);
+ for (i=0; i<args->nfold; i++) args->som[i] = som_init(args);
+
+ // train
+ for (i=0; i<ntrain; i++)
+ {
+ int is_good = args->train_class[i] & 1;
+ int isom = args->train_class[i] >> 1;
+ if ( is_good || args->train_bad )
+ som_train_site(args->som[isom], args->train_dat+i*args->mvals, is_good);
+ }
+
+ // norm and create plots
+ for (i=0; i<args->nfold; i++)
+ {
+ som_norm_counts(args->som[i]);
+ if ( args->prefix )
+ {
+ char *bname = msprintf("%s.som.%d", args->prefix,i);
+ som_create_plot(args->som[i], bname);
+ free(bname);
+ }
+ }
+
+ // evaluate
+ float *good = (float*) malloc(sizeof(float)*ngood); assert(good);
+ float *bad = (float*) malloc(sizeof(float)*nbad); assert(bad);
+ igood = ibad = 0;
+ double max_score = sqrt(args->som[0]->kdim);
+ for (i=0; i<ntrain; i++)
+ {
+ double score = 0;
+ int is_good = args->train_class[i] & 1;
+ int isom = args->train_class[i] >> 1; // this vector was used for training isom-th SOM, skip
+ if ( args->nfold==1 ) isom = -1;
+ memcpy(args->vals, args->train_dat+i*args->mvals, args->mvals*sizeof(double));
+ switch (args->merge)
+ {
+ case MERGE_MIN: score = get_min_score(args, isom); break;
+ case MERGE_MAX: score = get_max_score(args, isom); break;
+ case MERGE_AVG: score = get_avg_score(args, isom); break;
+ }
+ score = 1.0 - score/max_score;
+ if ( is_good )
+ good[igood++] = score;
+ else
+ bad[ibad++] = score;
+ }
+ qsort(good, ngood, sizeof(float), cmpfloat_desc);
+ qsort(bad, nbad, sizeof(float), cmpfloat_desc);
+ FILE *fp = NULL;
+ if ( args->prefix ) fp = open_file(NULL,"w","%s.eval", args->prefix);
+ igood = 0;
+ ibad = 0;
+ float prev_score = good[0]>bad[0] ? good[0] : bad[0];
+ int printed = 0;
+ while ( igood<ngood || ibad<nbad )
+ {
+ if ( igood<ngood && good[igood]==prev_score ) { igood++; continue; }
+ if ( ibad<nbad && bad[ibad]==prev_score ) { ibad++; continue; }
+ if ( fp )
+ fprintf(fp,"%e\t%f\t%f\n", prev_score, (float)igood/ngood, (float)ibad/nbad);
+ if ( !printed && (float)igood/ngood > 0.9 )
+ {
+ printf("%.2f\t%.2f\t%e\t# %% of bad [1] and good [2] sites at a cutoff [3]\n", 100.*ibad/nbad,100.*igood/ngood,prev_score);
+ printed = 1;
+ }
+
+ if ( igood<ngood && ibad<nbad ) prev_score = good[igood]>bad[ibad] ? good[igood] : bad[ibad];
+ else if ( igood<ngood ) prev_score = good[igood];
+ else prev_score = bad[ibad];
+ }
+ if ( !printed ) printf("%.2f\t%.2f\t%e\t# %% of bad [1] and good [2] sites at a cutoff [3]\n", 100.*ibad/nbad,100.*igood/ngood,prev_score);
+ if ( fp )
+ {
+ if ( fclose(fp) ) error("%s.eval: fclose failed: %s\n",args->prefix,strerror(errno));
+ create_eval_plot(args);
+ som_write_map(args->prefix, args->som, args->nfold);
+ }
+
+ free(good);
+ free(bad);
+}
+
+static void do_classify(args_t *args)
+{
+ annots_reader_reset(args);
+ double max_score = sqrt(args->som[0]->kdim);
+ while ( annots_reader_next(args) )
+ {
+ double score = 0;
+ switch (args->merge)
+ {
+ case MERGE_MIN: score = get_min_score(args, -1); break;
+ case MERGE_MAX: score = get_max_score(args, -1); break;
+ case MERGE_AVG: score = get_avg_score(args, -1); break;
+ }
+ printf("%e\n", 1.0 - score/max_score);
+ }
+ annots_reader_close(args);
+}
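+/*
+ * The value printed above is 1 - dist/sqrt(kdim), where dist is the Euclidean
+ * distance to the closest sufficiently supported node (normalised count >=
+ * bmu_th) and sqrt(kdim) is the diagonal of the kdim-dimensional unit
+ * hypercube, i.e. the largest distance possible assuming the input annotations
+ * are scaled to [0,1]. For example, with kdim=4 and dist=0.5 the printed score
+ * is 1 - 0.5/2 = 0.75; a score of 1.0 means the site sits exactly on a trained
+ * node, values near 0 mean it is far from all of them.
+ */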
+
+static void usage(void)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: SOM (Self-Organizing Map) filtering.\n");
+ fprintf(pysamerr, "Usage: bcftools som --train [options] <annots.tab.gz>\n");
+ fprintf(pysamerr, " bcftools som --classify [options]\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Model training options:\n");
+ fprintf(pysamerr, " -f, --nfold <int> n-fold cross-validation (number of maps) [5]\n");
+ fprintf(pysamerr, " -p, --prefix <string> prefix of output files\n");
+ fprintf(pysamerr, " -s, --size <int> map size [20]\n");
+ fprintf(pysamerr, " -t, --train \n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Classifying options:\n");
+ fprintf(pysamerr, " -c, --classify \n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Experimental training options (no reason to change):\n");
+ fprintf(pysamerr, " -b, --bmu-threshold <float> threshold for selection of best-matching unit [0.9]\n");
+ fprintf(pysamerr, " -d, --som-dimension <int> SOM dimension [2]\n");
+ fprintf(pysamerr, " -e, --exclude-bad exclude bad sites from training, use for evaluation only\n");
+ fprintf(pysamerr, " -l, --learning-rate <float> learning rate [1.0]\n");
+ fprintf(pysamerr, " -m, --merge <min|max|avg> -f merge algorithm [avg]\n");
+ fprintf(pysamerr, " -n, --ntrain-sites <int> effective number of training sites [number of good sites]\n");
+ fprintf(pysamerr, " -r, --random-seed <int> random seed, 0 for time() [1]\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_vcfsom(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->nbin = 20;
+ args->learn = 1.0;
+ args->bmu_th = 0.9;
+ args->nfold = 5;
+ args->rand_seed = 1;
+ args->ndim = 2;
+ args->bad_class = 1;
+ args->good_class = 2;
+ args->merge = MERGE_AVG;
+ args->train_bad = 1;
+
+ static struct option loptions[] =
+ {
+ {"help",0,0,'h'},
+ {"prefix",1,0,'p'},
+ {"ntrain-sites",1,0,'n'},
+ {"random-seed",1,0,'r'},
+ {"bmu-threshold",1,0,'b'},
+ {"exclude-bad",0,0,'e'},
+ {"learning-rate",1,0,'l'},
+ {"size",1,0,'s'},
+ {"som-dimension",1,0,'d'},
+ {"nfold",1,0,'f'},
+ {"merge",1,0,'m'},
+ {"train",0,0,'t'},
+ {"classify",0,0,'c'},
+ {0,0,0,0}
+ };
+ while ((c = getopt_long(argc, argv, "htcp:n:r:b:l:s:f:d:m:e",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'e': args->train_bad = 0; break;
+ case 'm':
+ if ( !strcmp(optarg,"min") ) args->merge = MERGE_MIN;
+ else if ( !strcmp(optarg,"max") ) args->merge = MERGE_MAX;
+ else if ( !strcmp(optarg,"avg") ) args->merge = MERGE_AVG;
+ else error("The -m method not recognised: %s\n", optarg);
+ break;
+ case 'p': args->prefix = optarg; break;
+ case 'n': args->ntrain = atoi(optarg); break;
+ case 'r': args->rand_seed = atoi(optarg); break;
+ case 'b': args->bmu_th = atof(optarg); break;
+ case 'l': args->learn = atof(optarg); break;
+ case 's': args->nbin = atoi(optarg); break;
+ case 'f': args->nfold = atoi(optarg); break;
+ case 'd':
+ args->ndim = atoi(optarg);
+ if ( args->ndim<2 ) error("Expected -d >=2, got %d\n", args->ndim);
+ if ( args->ndim>3 ) fprintf(pysamerr,"Warning: This will take a long time and is not going to make the results better: -d %d\n", args->ndim);
+ break;
+ case 't': args->action = SOM_TRAIN; break;
+ case 'c': args->action = SOM_CLASSIFY; break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ if ( !args->rand_seed ) args->rand_seed = time(NULL);
+ if ( argc!=optind+1 ) usage();
+ args->fname = argv[optind];
+ init_data(args);
+
+ if ( args->action == SOM_TRAIN ) do_train(args);
+ else if ( args->action == SOM_CLASSIFY ) do_classify(args);
+
+ destroy_data(args);
+ free(args);
+ return 0;
+}
+
diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c
new file mode 100644
index 0000000..1032bf8
--- /dev/null
+++ b/bcftools/vcfstats.c
@@ -0,0 +1,1590 @@
+/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
+
+ Copyright (C) 2012-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+/*
+ Notes and known issues:
+ - SN ts/tv calculation includes all non-ref alleles listed in ALT while per-sample ts/tv
+ takes the first non-ref allele only, something to consider with many non-ref HETs.
+*/
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include <htslib/faidx.h>
+#include <inttypes.h>
+#include "bcftools.h"
+#include "filter.h"
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+#define HWE_STATS 1
+#define QUAL_STATS 1
+#define IRC_STATS 1
+#define IRC_RLEN 10
+#define NA_STRING "0"
+
+typedef struct
+{
+ char *tag;
+ float min, max;
+ uint64_t *vals_ts, *vals_tv;
+ void *val;
+ int nbins, type, m_val;
+}
+user_stats_t;
+
+typedef struct
+{
+ int min, max, step, m_vals;
+ uint64_t *vals;
+}
+idist_t;
+
+typedef struct
+{
+ double x;
+ double x2;
+ double y;
+ double y2;
+ double xy;
+ double n;
+}
+smpl_r_t;
+
+typedef struct
+{
+ int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts;
+ int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons
+ #if HWE_STATS
+ int *af_hwe;
+ #endif
+ #if IRC_STATS
+ int n_repeat[IRC_RLEN][4], n_repeat_na; // number of indels which are repeat-consistent, repeat-inconsistent (dels and ins), and not applicable
+ int *af_repeats[3];
+ #endif
+ int ts_alt1, tv_alt1;
+ #if QUAL_STATS
+ int *qual_ts, *qual_tv, *qual_snps, *qual_indels;
+ #endif
+ int *insertions, *deletions, m_indel; // maximum indel length
+ int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
+ int subst[15];
+ int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
+ int *smpl_indel_hets, *smpl_indel_homs;
+ int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
+ unsigned long int *smpl_dp;
+ idist_t dp, dp_sites;
+ int nusr;
+ user_stats_t *usr;
+}
+stats_t;
+
+typedef struct
+{
+ uint64_t m[3], mm[3]; // number of hom, het and non-ref hom matches and mismatches
+ float r2sum;
+ uint32_t r2n;
+}
+gtcmp_t;
+
+typedef struct
+{
+ char *seq;
+ int pos, cnt, len;
+}
+_idc1_t;
+typedef struct
+{
+ faidx_t *ref;
+ _idc1_t *dat;
+ int ndat, mdat;
+}
+indel_ctx_t;
+
+typedef struct
+{
+ // stats
+ stats_t stats[3];
+ int *tmp_iaf, ntmp_iaf, m_af, m_qual, naf_hwe, mtmp_frm;
+ uint8_t *tmp_frm;
+ int dp_min, dp_max, dp_step;
+ gtcmp_t *af_gts_snps, *af_gts_indels, *smpl_gts_snps, *smpl_gts_indels; // first bin of af_* stats are singletons
+
+ // indel context
+ indel_ctx_t *indel_ctx;
+ char *ref_fname;
+
+ // user stats
+ int nusr;
+ user_stats_t *usr;
+
+ // other
+ bcf_srs_t *files;
+ bcf_sr_regions_t *exons;
+ char **argv, *exons_fname, *regions_list, *samples_list, *targets_list;
+ int argc, verbose_sites, first_allele_only, samples_is_file;
+ int split_by_id, nstats;
+
+ filter_t *filter[2];
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+
+ // Per Sample r working data arrays of size equal to number of samples
+ smpl_r_t* smpl_r_snps;
+ smpl_r_t* smpl_r_indels;
+}
+args_t;
+
+static int type2dosage[6], type2ploidy[6], type2stats[6];
+
+static void idist_init(idist_t *d, int min, int max, int step)
+{
+ d->min = min; d->max = max; d->step = step;
+ d->m_vals = 4 + (d->max - d->min)/d->step;
+ d->vals = (uint64_t*) calloc(d->m_vals,sizeof(uint64_t));
+}
+static void idist_destroy(idist_t *d)
+{
+ if ( d->vals ) free(d->vals);
+}
+static inline uint64_t *idist(idist_t *d, int val)
+{
+ if ( val < d->min ) return &d->vals[0];
+ if ( val > d->max ) return &d->vals[d->m_vals-1];
+ return &d->vals[1 + (val - d->min) / d->step];
+}
+static inline int idist_i2bin(idist_t *d, int i)
+{
+ if ( i<=0 ) return d->min;
+ if ( i>= d->m_vals ) return d->max;
+ return i-1+d->min;
+}
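+/*
+ * Example of the binning above, assuming min=0, max=500 and step=1 (so
+ * m_vals=504): a value below 0 is counted in bin 0, DP=37 falls into bin
+ * 1+37=38, and any value above 500 goes to the overflow bin 503;
+ * idist_i2bin(d,38) maps the bin back to the depth 37 when the distribution
+ * is printed.
+ */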
+
+
+#define IC_DBG 0
+#if IC_DBG
+static void _indel_ctx_print1(_idc1_t *idc)
+{
+ int i;
+ fprintf(stdout, "%d\t", idc->cnt);
+ for (i=0; i<idc->len; i++)
+ fputc(idc->seq[i], stdout);
+ fputc('\n', stdout);
+}
+static void _indel_ctx_print(indel_ctx_t *ctx)
+{
+ int i;
+ for (i=0; i<ctx->ndat; i++)
+ _indel_ctx_print1(&ctx->dat[i]);
+ fputc('\n',stdout);
+}
+#endif
+static int _indel_ctx_lookup(indel_ctx_t *ctx, char *seq, int seq_len, int *hit)
+{
+ // binary search
+ int min = 0, max = ctx->ndat - 1;
+ while ( min<=max )
+ {
+ int i = (min+max)/2;
+ int cmp = strncmp(seq, ctx->dat[i].seq, seq_len);
+ if ( cmp<0 ) max = i - 1;
+ else if ( cmp>0 ) min = i + 1;
+ else
+ {
+ if ( seq_len==ctx->dat[i].len )
+ {
+ *hit = 1;
+ return i;
+ }
+ else if ( seq_len<ctx->dat[i].len ) max = i - 1;
+ else min = i + 1;
+ }
+ }
+ *hit = 0;
+ return max;
+}
+static void _indel_ctx_insert(indel_ctx_t *ctx, char *seq, int seq_len, int pos)
+{
+ int idat, hit, i;
+ idat = _indel_ctx_lookup(ctx, seq, seq_len, &hit);
+ if ( !hit )
+ {
+ if ( pos>0 ) return;
+ idat++;
+ ctx->ndat++;
+ hts_expand(_idc1_t, ctx->ndat+1, ctx->mdat, ctx->dat);
+ if ( idat<ctx->ndat && ctx->ndat>1 )
+ memmove(&ctx->dat[idat+1], &ctx->dat[idat], (ctx->ndat - idat - 1)*sizeof(_idc1_t));
+ ctx->dat[idat].len = seq_len;
+ ctx->dat[idat].cnt = 1;
+ ctx->dat[idat].pos = pos;
+ ctx->dat[idat].seq = (char*) malloc(sizeof(char)*(seq_len+1));
+ for (i=0; i<seq_len; i++) ctx->dat[idat].seq[i] = seq[i];
+ ctx->dat[idat].seq[i] = 0;
+ return;
+ }
+ if ( ctx->dat[idat].pos + seq_len == pos )
+ {
+ ctx->dat[idat].cnt++;
+ ctx->dat[idat].pos = pos;
+ }
+}
+indel_ctx_t *indel_ctx_init(char *fa_ref_fname)
+{
+ indel_ctx_t *ctx = (indel_ctx_t *) calloc(1,sizeof(indel_ctx_t));
+ ctx->ref = fai_load(fa_ref_fname);
+ if ( !ctx->ref )
+ {
+ free(ctx);
+ return NULL;
+ }
+ return ctx;
+}
+void indel_ctx_destroy(indel_ctx_t *ctx)
+{
+ fai_destroy(ctx->ref);
+ if ( ctx->mdat ) free(ctx->dat);
+ free(ctx);
+}
+/**
+ * indel_ctx_type() - determine indel context type
+ * @ctx:
+ * @chr: chromosome name
+ * @pos: position of the first @ref base, 1-based
+ * @ref: reference allele
+ * @alt: alternate allele. Only first of multiple comma-separated alleles is
+ * considered
+ * @nrep: number of repeated elements (w)
+ * @nlen: length of a single repeat element (w)
+ *
+ * Returns the INDEL length, negative for deletions, positive for insertions
+ */
+int indel_ctx_type(indel_ctx_t *ctx, char *chr, int pos, char *ref, char *alt, int *nrep, int *nlen)
+{
+ const int win_size = 50; // hard-wired for now
+ const int rep_len = IRC_RLEN; // hard-wired for now
+
+ int ref_len = strlen(ref);
+ int alt_len = 0;
+ while ( alt[alt_len] && alt[alt_len]!=',' ) alt_len++;
+
+ int i, fai_ref_len;
+ char *fai_ref = faidx_fetch_seq(ctx->ref, chr, pos-1, pos+win_size, &fai_ref_len);
+ for (i=0; i<fai_ref_len; i++)
+ if ( (int)fai_ref[i]>96 ) fai_ref[i] -= 32;
+
+ // Sanity check: the reference sequence must match the REF allele
+ for (i=0; i<fai_ref_len && i<ref_len; i++)
+ if ( ref[i] != fai_ref[i] && ref[i] - 32 != fai_ref[i] )
+ error("\nSanity check failed, the reference sequence differs: %s:%d+%d .. %c vs %c\n", chr, pos, i, ref[i],fai_ref[i]);
+
+ // Count occurrences of all possible kmers
+ ctx->ndat = 0;
+ for (i=0; i<win_size; i++)
+ {
+ int k, kmax = rep_len <= i ? rep_len : i+1;
+ for (k=0; k<kmax; k++)
+ _indel_ctx_insert(ctx, &fai_ref[i-k+1], k+1, i-k);
+ }
+
+ #if IC_DBG
+ fprintf(stdout,"ref: %s\n", ref);
+ fprintf(stdout,"alt: %s\n", alt);
+ fprintf(stdout,"ctx: %s\n", fai_ref);
+ _indel_ctx_print(ctx);
+ #endif
+
+ int max_cnt = 0, max_len = 0;
+ for (i=0; i<ctx->ndat; i++)
+ {
+ if ( max_cnt < ctx->dat[i].cnt || (max_cnt==ctx->dat[i].cnt && max_len < ctx->dat[i].len) )
+ {
+ max_cnt = ctx->dat[i].cnt;
+ max_len = ctx->dat[i].len;
+ }
+ free(ctx->dat[i].seq);
+ }
+ free(fai_ref);
+
+ *nrep = max_cnt;
+ *nlen = max_len;
+ return alt_len - ref_len;
+}
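+/*
+ * Worked example for the classification driven by the return values above,
+ * assuming the reference context downstream of the indel is "ACACACACAC...":
+ * the most frequent consecutive repeat unit is "AC", so nlen=2 with nrep>1.
+ * A 2bp deletion (return value -2) satisfies abs(ndel)%nlen==0 and is counted
+ * as repeat-consistent, whereas a 3bp deletion is counted as
+ * repeat-inconsistent; indels in non-repetitive context (nrep<=1 or nlen<=1)
+ * go to the "not applicable" bin.
+ */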
+
+static void add_user_stats(args_t *args, char *str)
+{
+ args->nusr++;
+ args->usr = (user_stats_t*) realloc(args->usr,sizeof(user_stats_t)*args->nusr);
+ user_stats_t *usr = &args->usr[args->nusr-1];
+ memset(usr,0,sizeof(*usr));
+ usr->min = 0;
+ usr->max = 1;
+ usr->nbins = 100;
+
+ char *tmp = str;
+ while ( *tmp && *tmp!=':' ) tmp++;
+ usr->tag = (char*)calloc(tmp-str+2,sizeof(char));
+ memcpy(usr->tag,str,tmp-str);
+
+ if ( *tmp )
+ {
+ char *ptr = ++tmp;
+ usr->min = strtod(tmp, &ptr);
+ if ( tmp==ptr ) error("Could not parse %s\n", str);
+ tmp = ptr+1;
+ }
+ if ( *tmp )
+ {
+ char *ptr = tmp;
+ usr->max = strtod(tmp, &ptr);
+ if ( tmp==ptr ) error("Could not parse %s\n", str);
+ tmp = ptr+1;
+ }
+ if ( *tmp )
+ {
+ char *ptr = tmp;
+ usr->nbins = strtol(tmp, &ptr, 10);
+ if ( tmp==ptr ) error("Could not parse %s\n", str);
+ if ( usr->nbins<=0 ) error("Number of bins does not make sense (%d): %s.\n", usr->nbins, str);
+ }
+}
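+/*
+ * Example of the parsing above: an argument such as "DP:0:400:100" (the
+ * separator characters after the tag may be any single character) yields
+ * tag="DP", min=0, max=400 and nbins=100; when no range is given, the
+ * defaults min=0, max=1, nbins=100 are kept.
+ */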
+static void init_user_stats(args_t *args, bcf_hdr_t *hdr, stats_t *stats)
+{
+ stats->nusr = args->nusr;
+ stats->usr = (user_stats_t*)malloc(sizeof(user_stats_t)*args->nusr);
+ memcpy(stats->usr,args->usr,args->nusr*sizeof(user_stats_t));
+ int i;
+ for (i=0; i<stats->nusr; i++)
+ {
+ user_stats_t *usr = &stats->usr[i];
+ usr->vals_ts = (uint64_t*)calloc(usr->nbins,sizeof(uint64_t));
+ usr->vals_tv = (uint64_t*)calloc(usr->nbins,sizeof(uint64_t));
+ int id = bcf_hdr_id2int(hdr,BCF_DT_ID,usr->tag);
+ if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) error("The INFO tag \"%s\" is not defined in the header\n", usr->tag);
+ usr->type = bcf_hdr_id2type(hdr,BCF_HL_INFO,id);
+ if ( usr->type!=BCF_HT_REAL && usr->type!=BCF_HT_INT ) error("The INFO tag \"%s\" is not of Float or Integer type (%d)\n", usr->tag, usr->type);
+ }
+}
+static void init_stats(args_t *args)
+{
+ int i;
+ args->nstats = args->files->nreaders==1 ? 1 : 3;
+ if ( args->split_by_id ) args->nstats = 2;
+
+ if ( args->filter_str )
+ {
+ args->filter[0] = filter_init(bcf_sr_get_header(args->files,0), args->filter_str);
+ if ( args->files->nreaders==2 )
+ args->filter[1] = filter_init(bcf_sr_get_header(args->files,1), args->filter_str);
+ }
+
+ // AF corresponds to AC but is more robust for mixture of haploid and diploid GTs
+ args->m_af = 101;
+ for (i=0; i<args->files->nreaders; i++)
+ if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af )
+ args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1;
+
+ #if QUAL_STATS
+ args->m_qual = 999;
+ #endif
+ #if HWE_STATS
+ args->naf_hwe = 100;
+ #endif
+
+ if ( args->samples_list )
+ {
+ if ( !bcf_sr_set_samples(args->files,args->samples_list,args->samples_is_file) )
+ {
+ if ( !bcf_hdr_nsamples(args->files->readers[0].header) )
+ error("No sample columns in %s\n", args->files->readers[0].fname);
+ error("Unable to parse the samples: \"%s\"\n", args->samples_list);
+ }
+ args->af_gts_snps = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t));
+ args->af_gts_indels = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t));
+ args->smpl_gts_snps = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
+ args->smpl_gts_indels = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
+ args->smpl_r_snps = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
+ args->smpl_r_indels = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
+ }
+ for (i=0; i<args->nstats; i++)
+ {
+ stats_t *stats = &args->stats[i];
+ stats->m_indel = 60;
+ stats->insertions = (int*) calloc(stats->m_indel,sizeof(int));
+ stats->deletions = (int*) calloc(stats->m_indel,sizeof(int));
+ stats->af_ts = (int*) calloc(args->m_af,sizeof(int));
+ stats->af_tv = (int*) calloc(args->m_af,sizeof(int));
+ stats->af_snps = (int*) calloc(args->m_af,sizeof(int));
+ int j;
+ for (j=0; j<3; j++) stats->af_repeats[j] = (int*) calloc(args->m_af,sizeof(int));
+ #if QUAL_STATS
+ stats->qual_ts = (int*) calloc(args->m_qual,sizeof(int));
+ stats->qual_tv = (int*) calloc(args->m_qual,sizeof(int));
+ stats->qual_snps = (int*) calloc(args->m_qual,sizeof(int));
+ stats->qual_indels = (int*) calloc(args->m_qual,sizeof(int));
+ #endif
+ if ( args->files->n_smpl )
+ {
+ stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_ts = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_tv = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_indels = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_dp = (unsigned long int *) calloc(args->files->n_smpl,sizeof(unsigned long int));
+ stats->smpl_ndp = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_sngl = (int *) calloc(args->files->n_smpl,sizeof(int));
+ #if HWE_STATS
+ stats->af_hwe = (int*) calloc(args->m_af*args->naf_hwe,sizeof(int));
+ #endif
+ if ( args->exons_fname )
+ stats->smpl_frm_shifts = (int*) calloc(args->files->n_smpl*3,sizeof(int));
+ }
+ idist_init(&stats->dp, args->dp_min,args->dp_max,args->dp_step);
+ idist_init(&stats->dp_sites, args->dp_min,args->dp_max,args->dp_step);
+ init_user_stats(args, i!=1 ? args->files->readers[0].header : args->files->readers[1].header, stats);
+ }
+
+ if ( args->exons_fname )
+ {
+ args->exons = bcf_sr_regions_init(args->exons_fname,1,0,1,2);
+ if ( !args->exons )
+ error("Error occurred while reading, was the file compressed with bgzip: %s?\n", args->exons_fname);
+ }
+
+ #if IRC_STATS
+ if ( args->ref_fname )
+ args->indel_ctx = indel_ctx_init(args->ref_fname);
+ #endif
+
+ type2dosage[GT_HOM_RR] = 0;
+ type2dosage[GT_HET_RA] = 1;
+ type2dosage[GT_HOM_AA] = 2;
+ type2dosage[GT_HET_AA] = 2;
+ type2dosage[GT_HAPL_R] = 0;
+ type2dosage[GT_HAPL_A] = 1;
+
+ type2ploidy[GT_HOM_RR] = 1;
+ type2ploidy[GT_HET_RA] = 1;
+ type2ploidy[GT_HOM_AA] = 1;
+ type2ploidy[GT_HET_AA] = 1;
+ type2ploidy[GT_HAPL_R] = -1;
+ type2ploidy[GT_HAPL_A] = -1;
+
+ type2stats[GT_HOM_RR] = 0;
+ type2stats[GT_HET_RA] = 1;
+ type2stats[GT_HOM_AA] = 2;
+ type2stats[GT_HET_AA] = 1;
+ type2stats[GT_HAPL_R] = 0;
+ type2stats[GT_HAPL_A] = 2;
+
+}
+static void destroy_stats(args_t *args)
+{
+ int id, j;
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ if (stats->af_ts) free(stats->af_ts);
+ if (stats->af_tv) free(stats->af_tv);
+ if (stats->af_snps) free(stats->af_snps);
+ for (j=0; j<3; j++)
+ if (stats->af_repeats[j]) free(stats->af_repeats[j]);
+ #if QUAL_STATS
+ if (stats->qual_ts) free(stats->qual_ts);
+ if (stats->qual_tv) free(stats->qual_tv);
+ if (stats->qual_snps) free(stats->qual_snps);
+ if (stats->qual_indels) free(stats->qual_indels);
+ #endif
+ #if HWE_STATS
+ //if ( args->files->n_smpl ) free(stats->af_hwe);
+ free(stats->af_hwe);
+ #endif
+ free(stats->insertions);
+ free(stats->deletions);
+ if (stats->smpl_hets) free(stats->smpl_hets);
+ if (stats->smpl_homAA) free(stats->smpl_homAA);
+ if (stats->smpl_homRR) free(stats->smpl_homRR);
+ if (stats->smpl_indel_homs) free(stats->smpl_indel_homs);
+ if (stats->smpl_indel_hets) free(stats->smpl_indel_hets);
+ if (stats->smpl_ts) free(stats->smpl_ts);
+ if (stats->smpl_tv) free(stats->smpl_tv);
+ if (stats->smpl_indels) free(stats->smpl_indels);
+ if (stats->smpl_dp) free(stats->smpl_dp);
+ if (stats->smpl_ndp) free(stats->smpl_ndp);
+ if (stats->smpl_sngl) free(stats->smpl_sngl);
+ idist_destroy(&stats->dp);
+ idist_destroy(&stats->dp_sites);
+ for (j=0; j<stats->nusr; j++)
+ {
+ free(stats->usr[j].vals_ts);
+ free(stats->usr[j].vals_tv);
+ free(stats->usr[j].val);
+ }
+ free(stats->usr);
+ if ( args->exons ) free(stats->smpl_frm_shifts);
+ }
+ for (j=0; j<args->nusr; j++) free(args->usr[j].tag);
+ free(args->usr);
+ free(args->tmp_frm);
+ free(args->tmp_iaf);
+ if (args->exons) bcf_sr_regions_destroy(args->exons);
+ free(args->af_gts_snps);
+ free(args->af_gts_indels);
+ free(args->smpl_gts_snps);
+ free(args->smpl_gts_indels);
+ free(args->smpl_r_snps);
+ free(args->smpl_r_indels);
+ if (args->indel_ctx) indel_ctx_destroy(args->indel_ctx);
+ if (args->filter[0]) filter_destroy(args->filter[0]);
+ if (args->filter[1]) filter_destroy(args->filter[1]);
+}
+
+static void init_iaf(args_t *args, bcf_sr_t *reader)
+{
+ bcf1_t *line = reader->buffer[0];
+ if ( args->ntmp_iaf < line->n_allele )
+ {
+ args->tmp_iaf = (int*)realloc(args->tmp_iaf, line->n_allele*sizeof(int));
+ args->ntmp_iaf = line->n_allele;
+ }
+ // tmp_iaf is first filled with AC counts in calc_ac and then transformed to
+ // an index to af_gts_snps
+ int i, ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO);
+ if ( ret )
+ {
+ int an=0;
+ for (i=0; i<line->n_allele; i++)
+ an += args->tmp_iaf[i];
+
+ args->tmp_iaf[0] = 0;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( args->tmp_iaf[i]==1 )
+ args->tmp_iaf[i] = 0; // singletons into the first bin
+ else if ( !an )
+ args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin
+ else
+ args->tmp_iaf[i] = 1 + args->tmp_iaf[i] * (args->m_af-2.0) / an;
+ }
+ }
+ else
+ for (i=0; i<line->n_allele; i++)
+ args->tmp_iaf[i] = 0;
+
+ // todo: otherwise use AF
+}
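+/*
+ * Example of the binning above, assuming the default m_af=101 and a site with
+ * AN=200 total alleles: an ALT allele seen once (AC=1) goes to bin 0
+ * (singletons); AC=50 maps to 1 + 50*99/200 = 25 (truncated), i.e. roughly
+ * bin ~ 1 + 100*AF, so the remaining bins 1..100 cover allele frequencies
+ * from 0 to ~1 in approximately 1% steps.
+ */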
+
+static inline void do_mnp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
+{
+ stats->n_mnps++;
+}
+
+static inline void do_other_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
+{
+ stats->n_others++;
+}
+
+static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
+{
+ stats->n_indels++;
+
+ bcf1_t *line = reader->buffer[0];
+
+ #if QUAL_STATS
+ int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual;
+ stats->qual_indels[iqual]++;
+ #endif
+
+ // Check if the indel is near an exon for the frameshift statistics
+ int i, exon_overlap = 0;
+ if ( args->exons )
+ {
+ if ( !bcf_sr_regions_overlap(args->exons, bcf_seqname(reader->header,line),line->pos,line->pos) ) exon_overlap = 1;
+ hts_expand(uint8_t,line->n_allele,args->mtmp_frm,args->tmp_frm);
+ for (i=0; i<line->n_allele; i++) args->tmp_frm[i] = 0;
+ }
+
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( args->first_allele_only && i>1 ) break;
+ if ( bcf_get_variant_type(line,i)!=VCF_INDEL ) continue;
+ int len = line->d.var[i].n;
+
+ #if IRC_STATS
+ // Indel repeat consistency
+ if ( args->indel_ctx )
+ {
+ int nrep, nlen, ndel;
+ ndel = indel_ctx_type(args->indel_ctx, (char*)reader->header->id[BCF_DT_CTG][line->rid].key, line->pos+1, line->d.allele[0], line->d.allele[i], &nrep, &nlen);
+ if ( nlen<=1 || nrep<=1 )
+ {
+ // not a repeat or a single base repeat
+ stats->n_repeat_na++;
+ stats->af_repeats[2][ args->tmp_iaf[i] ]++;
+ }
+ else
+ {
+ if ( abs(ndel) % nlen )
+ {
+ // the length of the inserted/deleted sequence is not consistent with the repeat element
+ stats->n_repeat[nlen-1][ndel<0 ? 1 : 3]++;
+ stats->af_repeats[1][ args->tmp_iaf[i] ]++;
+ }
+ else
+ {
+ // the length consistent with the repeat
+ stats->n_repeat[nlen-1][ndel<0 ? 0 : 2]++;
+ stats->af_repeats[0][ args->tmp_iaf[i] ]++;
+ }
+ }
+ }
+ else
+ stats->af_repeats[2][ args->tmp_iaf[i] ]++;
+ #endif
+
+ // Check the frameshifts
+ int tlen = 0;
+ if ( args->exons && exon_overlap ) // there is an exon
+ {
+ if ( len>0 )
+ {
+ // insertion
+ if ( args->exons->start <= line->pos && args->exons->end > line->pos ) tlen = abs(len);
+ }
+ else if ( args->exons->start <= line->pos + abs(len) )
+ {
+ // deletion
+ tlen = abs(len);
+ if ( line->pos < args->exons->start ) // trim the beginning
+ tlen -= args->exons->start - line->pos + 1;
+ if ( args->exons->end < line->pos + abs(len) ) // trim the end
+ tlen -= line->pos + abs(len) - args->exons->end;
+ }
+ }
+ if ( tlen ) // there are some deleted/inserted bases in the exon
+ {
+ if ( tlen%3 ) { stats->out_frame++; args->tmp_frm[i] = 2; }
+ else { stats->in_frame++; args->tmp_frm[i] = 1; }
+
+ if ( i==1 )
+ {
+ if ( tlen%3 ) stats->out_frame_alt1++;
+ else stats->in_frame_alt1++;
+ }
+ }
+ else // no exon affected
+ {
+ if ( i==1 ) stats->na_frame_alt1++;
+ stats->na_frame++;
+ }
+
+
+ // Indel length distribution
+ int *ptr = stats->insertions;
+ if ( len<0 )
+ {
+ len *= -1;
+ ptr = stats->deletions;
+ }
+ if ( --len >= stats->m_indel ) len = stats->m_indel-1;
+ ptr[len]++;
+ }
+}
+
+static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts)
+{
+ int i;
+ for (i=0; i<stats->nusr; i++)
+ {
+ user_stats_t *usr = &stats->usr[i];
+ uint64_t *vals = is_ts ? usr->vals_ts : usr->vals_tv;
+ float val;
+ if ( usr->type==BCF_HT_REAL )
+ {
+ if ( bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue;
+ val = ((float*)usr->val)[0];
+ }
+ else
+ {
+ if ( bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue;
+ val = ((int32_t*)usr->val)[0];
+ }
+ int idx;
+ if ( val<=usr->min ) idx = 0;
+ else if ( val>=usr->max ) idx = usr->nbins - 1;
+ else idx = (val - usr->min)/(usr->max - usr->min) * (usr->nbins-1);
+ vals[idx]++;
+ }
+}
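+/*
+ * Binning used above, assuming the default range min=0, max=1 and nbins=100:
+ * a value of 0.37 maps to index (0.37-0)/(1-0)*99 = 36 (truncated), values at
+ * or below the minimum go to bin 0 and values at or above the maximum go to
+ * the last bin, 99.
+ */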
+
+static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
+{
+ stats->n_snps++;
+
+ bcf1_t *line = reader->buffer[0];
+ int ref = bcf_acgt2int(*line->d.allele[0]);
+ if ( ref<0 ) return;
+
+ #if QUAL_STATS
+ int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual;
+ stats->qual_snps[iqual]++;
+ #endif
+
+ int i;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( args->first_allele_only && i>1 ) break;
+ if ( !(bcf_get_variant_type(line,i)&VCF_SNP) ) continue;
+ int alt = bcf_acgt2int(*line->d.allele[i]);
+ if ( alt<0 || ref==alt ) continue;
+ stats->subst[ref<<2|alt]++;
+ int iaf = args->tmp_iaf[i];
+ stats->af_snps[iaf]++;
+ if ( abs(ref-alt)==2 )
+ {
+ if (i==1)
+ {
+ stats->ts_alt1++;
+ #if QUAL_STATS
+ stats->qual_ts[iqual]++;
+ #endif
+ do_user_stats(stats, reader, 1);
+ }
+ stats->af_ts[iaf]++;
+ }
+ else
+ {
+ if (i==1)
+ {
+ stats->tv_alt1++;
+ #if QUAL_STATS
+ stats->qual_tv[iqual]++;
+ #endif
+ do_user_stats(stats, reader, 0);
+ }
+ stats->af_tv[iaf]++;
+ }
+ }
+}
+
+static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched)
+{
+ bcf_srs_t *files = args->files;
+ bcf1_t *line = reader->buffer[0];
+ bcf_fmt_t *fmt_ptr;
+ int nref_tot = 0, nhet_tot = 0, nalt_tot = 0;
+ int line_type = bcf_get_variant_types(line);
+
+ if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) )
+ {
+ int ref = bcf_acgt2int(*line->d.allele[0]);
+ int is, n_nref = 0, i_nref = 0;
+ for (is=0; is<args->files->n_smpl; is++)
+ {
+ int ial, jal;
+ int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal);
+ if ( gt==GT_UNKN ) continue;
+ if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
+ {
+ if ( line_type&VCF_INDEL && stats->smpl_frm_shifts )
+ {
+ assert( ial<line->n_allele );
+ stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
+ }
+ continue;
+ }
+ if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; }
+ #if HWE_STATS
+ switch (gt)
+ {
+ case GT_HOM_RR: nref_tot++; break;
+ case GT_HET_RA: nhet_tot++; break;
+ case GT_HET_AA:
+ case GT_HOM_AA: nalt_tot++; break;
+ }
+ #endif
+ if ( line_type&VCF_SNP || line_type==VCF_REF ) // count ALT=. as SNP
+ {
+ if ( gt == GT_HET_RA ) stats->smpl_hets[is]++;
+ else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++;
+ else if ( gt == GT_HOM_RR ) stats->smpl_homRR[is]++;
+ else if ( gt == GT_HOM_AA ) stats->smpl_homAA[is]++;
+ if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called
+ {
+ int alt = bcf_acgt2int(*line->d.allele[ial]);
+ if ( alt<0 ) continue;
+ if ( abs(ref-alt)==2 )
+ stats->smpl_ts[is]++;
+ else
+ stats->smpl_tv[is]++;
+ }
+ }
+ if ( line_type&VCF_INDEL )
+ {
+ if ( gt != GT_HOM_RR )
+ {
+ stats->smpl_indels[is]++;
+ if ( gt==GT_HET_RA || gt==GT_HET_AA ) stats->smpl_indel_hets[is]++;
+ else if ( gt==GT_HOM_AA ) stats->smpl_indel_homs[is]++;
+ }
+ if ( stats->smpl_frm_shifts )
+ {
+ assert( ial<line->n_allele && jal<line->n_allele );
+ stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
+ stats->smpl_frm_shifts[is*3 + args->tmp_frm[jal]]++;
+ }
+ }
+ }
+ if ( n_nref==1 ) stats->smpl_sngl[i_nref]++;
+ }
+
+ #if HWE_STATS
+ if ( nhet_tot + nref_tot + nalt_tot )
+ {
+ float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot);
+ int idx = het_frac*(args->naf_hwe - 1);
+ if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1];
+ stats->af_hwe[idx]++;
+ }
+ #endif
+
+ if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP")) )
+ {
+ #define BRANCH_INT(type_t,missing,vector_end) { \
+ int is; \
+ for (is=0; is<args->files->n_smpl; is++) \
+ { \
+ type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \
+ if ( *p==vector_end ) continue; \
+ if ( *p!=missing ) \
+ { \
+ (*idist(&stats->dp, *p))++; \
+ stats->smpl_ndp[is]++; \
+ stats->smpl_dp[is] += *p; \
+ } \
+ } \
+ }
+ switch (fmt_ptr->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+ default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
+ }
+ #undef BRANCH_INT
+ }
+
+ if ( matched==3 )
+ {
+ int is;
+ bcf_fmt_t *fmt0, *fmt1;
+ fmt0 = bcf_get_fmt(files->readers[0].header,files->readers[0].buffer[0],"GT"); if ( !fmt0 ) return;
+ fmt1 = bcf_get_fmt(files->readers[1].header,files->readers[1].buffer[0],"GT"); if ( !fmt1 ) return;
+
+ // only the first ALT allele is considered
+ int iaf = line->n_allele>1 ? args->tmp_iaf[1] : 1;
+ int line_type = bcf_get_variant_types(files->readers[0].buffer[0]);
+ gtcmp_t *af_stats = line_type&VCF_SNP ? args->af_gts_snps : args->af_gts_indels;
+ gtcmp_t *smpl_stats = line_type&VCF_SNP ? args->smpl_gts_snps : args->smpl_gts_indels;
+
+ //
+ // Calculates r squared
+ // x is mean dosage of x at given site
+ // x2 is mean squared dosage of x at given site
+ // y is mean dosage of y at given site
+ // y2 is mean squared dosage of y at given site
+ // xy is mean dosage of x*y at given site
+ // r2sum += (xy - x*y)^2 / ( (x2 - x^2) * (y2 - y^2) )
+ // r2n is number of sites considered
+ // output as r2sum/r2n for each AF bin
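+ // Example, assuming three overlapping samples with dosages x={0,1,2} in the
+ // first file and y={0,1,2} in the second: mean xy=5/3, x=y=1, x2=y2=5/3, so
+ // cov = 5/3-1 = 2/3 and the variance product is (2/3)^2, giving r2 = 1
+ // (perfect agreement); a discordant pair lowers cov relative to the
+ // variances and hence lowers r2.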
+ int r2n = 0;
+ float x = 0, y = 0, xy = 0, x2 = 0, y2 = 0;
+ // Select smpl_r
+ smpl_r_t *smpl_r = NULL;
+ if (line_type&VCF_SNP)
+ {
+ smpl_r = args->smpl_r_snps;
+ }
+ else if (line_type&VCF_INDEL)
+ {
+ smpl_r = args->smpl_r_indels;
+ }
+ for (is=0; is<files->n_smpl; is++)
+ {
+ // Simplified comparison: only the genotype classes 0/0, 0/1, 1/1 are compared,
+ // as the identity of the actual alleles can be enforced by running without the -c option.
+ int gt0 = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL);
+ if ( gt0 == GT_UNKN ) continue;
+
+ int gt1 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL);
+ if ( gt1 == GT_UNKN ) continue;
+
+ if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes
+
+ int dsg0 = type2dosage[gt0];
+ int dsg1 = type2dosage[gt1];
+ x += dsg0;
+ x2 += dsg0*dsg0;
+ y += dsg1;
+ y2 += dsg1*dsg1;
+ xy += dsg0*dsg1;
+ r2n++;
+
+ int idx = type2stats[gt0];
+ if ( gt0==gt1 )
+ {
+ af_stats[iaf].m[idx]++;
+ smpl_stats[is].m[idx]++;
+ }
+ else
+ {
+ af_stats[iaf].mm[idx]++;
+ smpl_stats[is].mm[idx]++;
+ }
+
+ // Now do it across samples
+
+ if (smpl_r) {
+ smpl_r[is].xy += dsg0*dsg1;
+ smpl_r[is].x += dsg0;
+ smpl_r[is].x2 += dsg0*dsg0;
+ smpl_r[is].y += dsg1;
+ smpl_r[is].y2 += dsg1*dsg1;
+ ++(smpl_r[is].n);
+ }
+ }
+
+ if ( r2n )
+ {
+ x /= r2n; y /= r2n; x2 /= r2n; y2 /= r2n; xy /= r2n;
+ float cov = xy - x*y;
+ float var2 = (x2 - x*x) * (y2 - y*y);
+ if ( var2!=0 )
+ {
+ af_stats[iaf].r2sum += cov*cov/var2;
+ af_stats[iaf].r2n++;
+ }
+ }
+
+ if ( args->verbose_sites )
+ {
+ int nm = 0, nmm = 0, nrefm = 0;
+ for (is=0; is<files->n_smpl; is++)
+ {
+ int gt = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL);
+ if ( gt == GT_UNKN ) continue;
+ int gt2 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL);
+ if ( gt2 == GT_UNKN ) continue;
+ if ( gt != gt2 )
+ {
+ nmm++;
+ bcf_sr_t *reader = &files->readers[0];
+ printf("DBG\t%s\t%d\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,files->samples[is],gt,gt2);
+ }
+ else
+ {
+ if ( gt!=GT_HOM_RR ) nrefm++;
+ nm++;
+ }
+ }
+ float nrd = nrefm+nmm ? 100.*nmm/(nrefm+nmm) : 0;
+ printf("PSD\t%s\t%d\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,nm,nmm,nrd);
+ }
+ }
+}
+
+static void do_vcf_stats(args_t *args)
+{
+ bcf_srs_t *files = args->files;
+ assert( sizeof(int)>files->nreaders );
+ while ( bcf_sr_next_line(files) )
+ {
+ bcf_sr_t *reader = NULL;
+ bcf1_t *line = NULL;
+ int ret = 0, i, pass = 1;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !bcf_sr_has_line(files,i) ) continue;
+ if ( args->filter[i] )
+ {
+ int is_ok = filter_test(args->filter[i], bcf_sr_get_line(files,i), NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1;
+ if ( !is_ok ) { pass = 0; break; }
+ }
+ ret |= 1<<i;
+ if ( !reader )
+ {
+ reader = &files->readers[i];
+ line = bcf_sr_get_line(files,i);
+ }
+
+ }
+ if ( !pass ) continue;
+
+ int line_type = bcf_get_variant_types(line);
+ init_iaf(args, reader);
+
+ stats_t *stats = &args->stats[ret-1];
+ if ( args->split_by_id && line->d.id[0]=='.' && !line->d.id[1] )
+ stats = &args->stats[1];
+
+ stats->n_records++;
+
+ if ( line_type==VCF_REF )
+ stats->n_noalts++;
+ if ( line_type&VCF_SNP )
+ do_snp_stats(args, stats, reader);
+ if ( line_type&VCF_INDEL )
+ do_indel_stats(args, stats, reader);
+ if ( line_type&VCF_MNP )
+ do_mnp_stats(args, stats, reader);
+ if ( line_type&VCF_OTHER )
+ do_other_stats(args, stats, reader);
+
+ if ( line->n_allele>2 )
+ {
+ stats->n_mals++;
+ if ( line_type == VCF_SNP ) stats->n_snp_mals++;
+ }
+
+ if ( files->n_smpl )
+ do_sample_stats(args, stats, reader, ret);
+
+ if ( bcf_get_info_int32(reader->header,line,"DP",&args->tmp_iaf,&args->ntmp_iaf)==1 )
+ (*idist(&stats->dp_sites, args->tmp_iaf[0]))++;
+ }
+}
+
+static void print_header(args_t *args)
+{
+ int i;
+ printf("# This file was produced by bcftools stats (%s+htslib-%s) and can be plotted using plot-vcfstats.\n", bcftools_version(),hts_version());
+ printf("# The command line was:\tbcftools %s ", args->argv[0]);
+ for (i=1; i<args->argc; i++)
+ printf(" %s",args->argv[i]);
+ printf("\n#\n");
+
+ printf("# Definition of sets:\n# ID\t[2]id\t[3]tab-separated file names\n");
+ if ( args->files->nreaders==1 )
+ {
+ const char *fname = strcmp("-",args->files->readers[0].fname) ? args->files->readers[0].fname : "<STDIN>";
+ if ( args->split_by_id )
+ {
+ printf("ID\t0\t%s:known (sites with ID different from \".\")\n", fname);
+ printf("ID\t1\t%s:novel (sites where ID column is \".\")\n", fname);
+ }
+ else
+ printf("ID\t0\t%s\n", fname);
+ }
+ else
+ {
+ const char *fname0 = strcmp("-",args->files->readers[0].fname) ? args->files->readers[0].fname : "<STDIN>";
+ const char *fname1 = strcmp("-",args->files->readers[1].fname) ? args->files->readers[1].fname : "<STDIN>";
+ printf("ID\t0\t%s\n", fname0);
+ printf("ID\t1\t%s\n", fname1);
+ printf("ID\t2\t%s\t%s\n", fname0,fname1);
+
+ if ( args->verbose_sites )
+ {
+ printf(
+ "# Verbose per-site discordance output.\n"
+ "# PSD\t[2]CHROM\t[3]POS\t[4]Number of matches\t[5]Number of mismatches\t[6]NRD\n");
+ printf(
+ "# Verbose per-site and per-sample output. Genotype codes: %d:HomRefRef, %d:HomAltAlt, %d:HetAltRef, %d:HetAltAlt, %d:haploidRef, %d:haploidAlt\n"
+ "# DBG\t[2]CHROM\t[3]POS\t[4]Sample\t[5]GT in %s\t[6]GT in %s\n",
+ GT_HOM_RR, GT_HOM_AA, GT_HET_RA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A, fname0,fname1);
+ }
+ }
+}
+
+#define T2S(x) type2stats[x]
+static void print_stats(args_t *args)
+{
+ int i, id;
+ printf("# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n");
+ for (id=0; id<args->files->nreaders; id++)
+ printf("SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header));
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ printf("SN\t%d\tnumber of records:\t%d\n", id, stats->n_records);
+ printf("SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts);
+ printf("SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps);
+ printf("SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps);
+ printf("SN\t%d\tnumber of indels:\t%d\n", id, stats->n_indels);
+ printf("SN\t%d\tnumber of others:\t%d\n", id, stats->n_others);
+ printf("SN\t%d\tnumber of multiallelic sites:\t%d\n", id, stats->n_mals);
+ printf("SN\t%d\tnumber of multiallelic SNP sites:\t%d\n", id, stats->n_snp_mals);
+ }
+ printf("# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ int ts=0,tv=0;
+ for (i=0; i<args->m_af; i++) { ts += stats->af_ts[i]; tv += stats->af_tv[i]; }
+ printf("TSTV\t%d\t%d\t%d\t%.2f\t%d\t%d\t%.2f\n", id,ts,tv,tv?(float)ts/tv:0, stats->ts_alt1,stats->tv_alt1,stats->tv_alt1?(float)stats->ts_alt1/stats->tv_alt1:0);
+ }
+ if ( args->exons_fname )
+ {
+ printf("# FS, Indel frameshifts:\n# FS\t[2]id\t[3]in-frame\t[4]out-frame\t[5]not applicable\t[6]out/(in+out) ratio\t[7]in-frame (1st ALT)\t[8]out-frame (1st ALT)\t[9]not applicable (1st ALT)\t[10]out/(in+out) ratio (1st ALT)\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ int in=args->stats[id].in_frame, out=args->stats[id].out_frame, na=args->stats[id].na_frame;
+ int in1=args->stats[id].in_frame_alt1, out1=args->stats[id].out_frame_alt1, na1=args->stats[id].na_frame_alt1;
+ printf("FS\t%d\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%.2f\n", id, in,out,na,out?(float)out/(in+out):0,in1,out1,na1,out1?(float)out1/(in1+out1):0);
+ }
+ }
+ if ( args->indel_ctx )
+ {
+ printf("# ICS, Indel context summary:\n# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ int nc = 0, ni = 0, na = args->stats[id].n_repeat_na;
+ for (i=0; i<IRC_RLEN; i++)
+ {
+ nc += args->stats[id].n_repeat[i][0] + args->stats[id].n_repeat[i][2];
+ ni += args->stats[id].n_repeat[i][1] + args->stats[id].n_repeat[i][3];
+ }
+ printf("ICS\t%d\t%d\t%d\t%d\t%.4f\n", id, nc,ni,na,nc+ni ? (float)nc/(nc+ni) : 0.0);
+ }
+ printf("# ICL, Indel context by length:\n# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ for (i=1; i<IRC_RLEN; i++)
+ {
+ int nc = args->stats[id].n_repeat[i][0]+args->stats[id].n_repeat[i][2], ni = args->stats[id].n_repeat[i][1]+args->stats[id].n_repeat[i][3];
+ printf("ICL\t%d\t%d\t%d\t%d\t%d\t%d\t%.4f\n", id, i+1,
+ args->stats[id].n_repeat[i][0],args->stats[id].n_repeat[i][1],args->stats[id].n_repeat[i][2],args->stats[id].n_repeat[i][3],
+ nc+ni ? (float)nc/(nc+ni) : 0.0);
+ }
+ }
+ }
+ printf("# SiS, Singleton stats:\n# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ printf("SiS\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,1,stats->af_snps[0],stats->af_ts[0],stats->af_tv[0],
+ stats->af_repeats[0][0]+stats->af_repeats[1][0]+stats->af_repeats[2][0],stats->af_repeats[0][0],stats->af_repeats[1][0],stats->af_repeats[2][0]);
+ // put the singleton stats into the first AF bin; note that not all of the stats are transferred (e.g. nrd mismatches)
+ stats->af_snps[1] += stats->af_snps[0];
+ stats->af_ts[1] += stats->af_ts[0];
+ stats->af_tv[1] += stats->af_tv[0];
+ stats->af_repeats[0][1] += stats->af_repeats[0][0];
+ stats->af_repeats[1][1] += stats->af_repeats[1][0];
+ stats->af_repeats[2][1] += stats->af_repeats[2][0];
+ }
+ printf("# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=1; i<args->m_af; i++) // note that af[1] now contains also af[0], see SiS stats output above
+ {
+ if ( stats->af_snps[i]+stats->af_ts[i]+stats->af_tv[i]+stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i] == 0 ) continue;
+ printf("AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,100.*(i-1)/(args->m_af-1),stats->af_snps[i],stats->af_ts[i],stats->af_tv[i],
+ stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i],stats->af_repeats[0][i],stats->af_repeats[1][i],stats->af_repeats[2][i]);
+ }
+ }
+ #if QUAL_STATS
+ printf("# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=0; i<args->m_qual; i++)
+ {
+ if ( stats->qual_snps[i]+stats->qual_ts[i]+stats->qual_tv[i]+stats->qual_indels[i] == 0 ) continue;
+ printf("QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]);
+ }
+ }
+ #endif
+ for (i=0; i<args->nusr; i++)
+ {
+ printf("# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n",
+ args->usr[i].tag,args->usr[i].tag,args->usr[i].tag,args->usr[i].tag);
+ for (id=0; id<args->nstats; id++)
+ {
+ user_stats_t *usr = &args->stats[id].usr[i];
+ int j;
+ for (j=0; j<usr->nbins; j++)
+ {
+ if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins
+ float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1);
+ const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s\t%d\t%.0f\t%d\t%d\t%d\n";
+ printf(fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
+ }
+ }
+ }
+ printf("# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]count\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=stats->m_indel-1; i>=0; i--)
+ if ( stats->deletions[i] ) printf("IDD\t%d\t%d\t%d\n", id,-i-1,stats->deletions[i]);
+ for (i=0; i<stats->m_indel; i++)
+ if ( stats->insertions[i] ) printf("IDD\t%d\t%d\t%d\n", id,i+1,stats->insertions[i]);
+ }
+ printf("# ST, Substitution types:\n# ST\t[2]id\t[3]type\t[4]count\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ int t;
+ for (t=0; t<15; t++)
+ {
+ if ( t>>2 == (t&3) ) continue;
+ printf("ST\t%d\t%c>%c\t%d\n", id, bcf_int2acgt(t>>2),bcf_int2acgt(t&3),args->stats[id].subst[t]);
+ }
+ }
+ if ( args->files->nreaders>1 && args->files->n_smpl )
+ {
+ printf("SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl);
+
+ int x;
+ for (x=0; x<2; x++)
+ {
+ gtcmp_t *stats;
+ if ( x==0 )
+ {
+ printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ stats = args->af_gts_snps;
+ }
+ else
+ {
+ printf("# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ stats = args->af_gts_indels;
+ }
+ uint64_t nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0};
+ for (i=0; i<args->m_af; i++)
+ {
+ int j, n = 0;
+ for (j=0; j<3; j++)
+ {
+ n += stats[i].m[j] + stats[i].mm[j];
+ nrd_m[j] += stats[i].m[j];
+ nrd_mm[j] += stats[i].mm[j];
+ }
+ if ( !i || !n ) continue; // skip singleton stats and empty bins
+ printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', 100.*(i-1)/(args->m_af-1));
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
+ printf("\t%f\t%"PRId32"\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n);
+ }
+
+ if ( x==0 )
+ {
+ printf("# NRD and discordance is calculated as follows:\n");
+ printf("# m .. number of matches\n");
+ printf("# x .. number of mismatches\n");
+ printf("# NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n");
+ printf("# RR discordance = xRR / (xRR + mRR)\n");
+ printf("# RA discordance = xRA / (xRA + mRA)\n");
+ printf("# AA discordance = xAA / (xAA + mAA)\n");
+ printf("# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
+ }
+ else
+ printf("# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
+ uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)];
+ uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)];
+ printf("NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i',
+ m+mm ? mm*100.0/(m+mm) : 0,
+ nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)] ? nrd_mm[T2S(GT_HOM_RR)]*100.0/(nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)]) : 0,
+ nrd_m[T2S(GT_HET_RA)]+nrd_mm[T2S(GT_HET_RA)] ? nrd_mm[T2S(GT_HET_RA)]*100.0/(nrd_m[T2S(GT_HET_RA)]+nrd_mm[T2S(GT_HET_RA)]) : 0,
+ nrd_m[T2S(GT_HOM_AA)]+nrd_mm[T2S(GT_HOM_AA)] ? nrd_mm[T2S(GT_HOM_AA)]*100.0/(nrd_m[T2S(GT_HOM_AA)]+nrd_mm[T2S(GT_HOM_AA)]) : 0
+ );
+ }
+
+ for (x=0; x<2; x++)
+ {
+ gtcmp_t *stats;
+ smpl_r_t *smpl_r_array;
+ if ( x==0 )
+ {
+ printf("# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
+ stats = args->smpl_gts_snps;
+ smpl_r_array = args->smpl_r_snps;
+ }
+ else
+ {
+ printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
+ stats = args->smpl_gts_indels;
+ smpl_r_array = args->smpl_r_indels;
+ }
+ for (i=0; i<args->files->n_smpl; i++)
+ {
+ uint64_t m = stats[i].m[T2S(GT_HET_RA)] + stats[i].m[T2S(GT_HOM_AA)];
+ uint64_t mm = stats[i].mm[T2S(GT_HOM_RR)] + stats[i].mm[T2S(GT_HET_RA)] + stats[i].mm[T2S(GT_HOM_AA)];
+ // Calculate r by formula 19.2 - Biostatistical Analysis 4th edition - Jerrold H. Zar
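+ // In terms of the accumulated per-sample sums this is the Pearson correlation
+ // of the two files' genotype dosages,
+ //   r = (Sxy - Sx*Sy/n) / sqrt( (Sx2 - Sx*Sx/n) * (Sy2 - Sy*Sy/n) ),
+ // and r*r (the dosage r-squared column) is what gets printed below.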
+ smpl_r_t *smpl_r = smpl_r_array + i;
+ double r = 0.0;
+ if (smpl_r->n) {
+ double sum_crossprod = smpl_r->xy-(smpl_r->x*smpl_r->y)/smpl_r->n;//per 17.3 machine formula
+ double x2_xx = smpl_r->x2-(smpl_r->x*smpl_r->x)/smpl_r->n;
+ double y2_yy = smpl_r->y2-(smpl_r->y*smpl_r->y)/smpl_r->n;
+ r = (sum_crossprod)/sqrt(x2_xx*y2_yy);
+ }
+ printf("GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
+ if (smpl_r->n && !isnan(r)) printf("\t%f\n", r*r);
+ else printf("\t"NA_STRING"\n");
+ }
+ }
+ }
+
+ printf("# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ long unsigned int sum = 0, sum_sites = 0;
+ for (i=0; i<stats->dp.m_vals; i++) { sum += stats->dp.vals[i]; sum_sites += stats->dp_sites.vals[i]; }
+ for (i=0; i<stats->dp.m_vals; i++)
+ {
+ if ( stats->dp.vals[i]==0 && stats->dp_sites.vals[i]==0 ) continue;
+ printf("DP\t%d\t", id);
+ if ( i==0 ) printf("<%d", stats->dp.min);
+ else if ( i+1==stats->dp.m_vals ) printf(">%d", stats->dp.max);
+ else printf("%d", idist_i2bin(&stats->dp,i));
+ printf("\t%"PRId64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0);
+ printf("\t%"PRId64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? stats->dp_sites.vals[i]*100./sum_sites : 0);
+ }
+ }
+
+ if ( args->files->n_smpl )
+ {
+ printf("# PSC, Per-sample counts\n# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=0; i<args->files->n_smpl; i++)
+ {
+ float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0;
+ printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\n", id,args->files->samples[i],
+ stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i],
+ stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i]);
+ }
+ }
+
+
+ printf("# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=0; i<args->files->n_smpl; i++)
+ {
+ int na = 0, in = 0, out = 0;
+ if ( args->exons )
+ {
+ na = stats->smpl_frm_shifts[i*3 + 0];
+ in = stats->smpl_frm_shifts[i*3 + 1];
+ out = stats->smpl_frm_shifts[i*3 + 2];
+ }
+ int nhom = stats->smpl_indel_homs[i];
+ int nhet = stats->smpl_indel_hets[i];
+ printf("PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom);
+ }
+ }
+
+ #ifdef HWE_STATS
+ printf("# HWE\n# HWE\t[2]id\t[3]1st ALT allele frequency\t[4]Number of observations\t[5]25th percentile\t[6]median\t[7]75th percentile\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=0; i<args->naf_hwe; i++) stats->af_hwe[i+args->naf_hwe] += stats->af_hwe[i]; // singletons
+ for (i=1; i<args->m_af; i++)
+ {
+ unsigned int sum_tot = 0, sum_tmp = 0;
+ int j, *ptr = &stats->af_hwe[i*args->naf_hwe];
+ for (j=0; j<args->naf_hwe; j++) sum_tot += ptr[j];
+ if ( !sum_tot ) continue;
+
+ int nprn = 3;
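+ // nprn tracks how many of the three percentile columns (25th, median, 75th)
+ // remain to be printed; each time the cumulative fraction crosses 0.25/0.5/0.75
+ // the loop emits any still-pending columns of that rank or lower.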
+ printf("HWE\t%d\t%f\t%d",id,100.*(i-1)/(args->m_af-1),sum_tot);
+ for (j=0; j<args->naf_hwe; j++)
+ {
+ sum_tmp += ptr[j];
+ float frac = (float)sum_tmp/sum_tot;
+ if ( frac >= 0.75 )
+ {
+ while (nprn>0) { printf("\t%f", (float)j/args->naf_hwe); nprn--; }
+ break;
+ }
+ if ( frac >= 0.5 )
+ {
+ while (nprn>1) { printf("\t%f", (float)j/args->naf_hwe); nprn--; }
+ continue;
+ }
+ if ( frac >= 0.25 )
+ {
+ while (nprn>2) { printf("\t%f", (float)j/args->naf_hwe); nprn--; }
+ }
+ }
+ assert(nprn==0);
+ printf("\n");
+ }
+ }
+ #endif
+ }
+}
+
+static void usage(void)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Parses VCF or BCF and produces stats which can be plotted using plot-vcfstats.\n");
+ fprintf(stderr, " When two files are given, the program generates separate stats for intersection\n");
+ fprintf(stderr, " and the complements. By default only sites are compared, -s/-S must given to include\n");
+ fprintf(stderr, " also sample columns.\n");
+ fprintf(stderr, "Usage: bcftools stats [options] <A.vcf.gz> [<B.vcf.gz>]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n");
+ fprintf(stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+ fprintf(stderr, " -d, --depth <int,int,int> depth distribution: min,max,bin size [0,500,1]\n");
+ fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -E, --exons <file.gz> tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n");
+ fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(stderr, " -F, --fasta-ref <file> faidx indexed reference sequence file to determine INDEL context\n");
+ fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " -s, --samples <list> list of samples for sample stats, \"-\" to include all samples\n");
+ fprintf(stderr, " -S, --samples-file <file> file of samples to include\n");
+ fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " -u, --user-tstv <TAG[:min:max:n]> collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
+ fprintf(stderr, " -v, --verbose produce verbose per-site and per-sample output\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+int main_vcfstats(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->files = bcf_sr_init();
+ args->argc = argc; args->argv = argv;
+ args->dp_min = 0; args->dp_max = 500; args->dp_step = 1;
+ int regions_is_file = 0, targets_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"1st-allele-only",0,0,'1'},
+ {"include",1,0,'i'},
+ {"exclude",1,0,'e'},
+ {"help",0,0,'h'},
+ {"collapse",1,0,'c'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"verbose",0,0,'v'},
+ {"depth",1,0,'d'},
+ {"apply-filters",1,0,'f'},
+ {"exons",1,0,'E'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
+ {"split-by-ID",0,0,'I'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {"fasta-ref",1,0,'F'},
+ {"user-tstv",1,0,'u'},
+ {0,0,0,0}
+ };
+ while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:vIE:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'u': add_user_stats(args,optarg); break;
+ case '1': args->first_allele_only = 1; break;
+ case 'F': args->ref_fname = optarg; break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'c':
+ if ( !strcmp(optarg,"snps") ) args->files->collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(optarg,"indels") ) args->files->collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"any") ) args->files->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"all") ) args->files->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"some") ) args->files->collapse |= COLLAPSE_SOME;
+ else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE;
+ else error("The --collapse string \"%s\" not recognised.\n", optarg);
+ break;
+ case 'v': args->verbose_sites = 1; break;
+ case 'd':
+ if ( sscanf(optarg,"%d,%d,%d",&args->dp_min,&args->dp_max,&args->dp_step)!=3 )
+ error("Could not parse --depth %s\n", optarg);
+ if ( args->dp_min<0 || args->dp_min >= args->dp_max || args->dp_step > args->dp_max - args->dp_min + 1 )
+ error("Is this a typo? --depth %s\n", optarg);
+ break;
+ case 'f': args->files->apply_filters = optarg; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 'E': args->exons_fname = optarg; break;
+ case 's': args->samples_list = optarg; break;
+ case 'S': args->samples_list = optarg; args->samples_is_file = 1; break;
+ case 'I': args->split_by_id = 1; break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ char *fname = NULL;
+ if ( optind==argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage();
+ }
+ else fname = argv[optind];
+
+ if ( argc-optind>2 ) usage();
+ if ( argc-optind>1 )
+ {
+ args->files->require_index = 1;
+ if ( args->split_by_id ) error("Only one file can be given with -I.\n");
+ }
+ if ( !args->samples_list ) args->files->max_unpack = BCF_UN_INFO;
+ if ( args->targets_list && bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ while (fname)
+ {
+ if ( !bcf_sr_add_reader(args->files, fname) )
+ error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+ fname = ++optind < argc ? argv[optind] : NULL;
+ }
+
+ init_stats(args);
+ print_header(args);
+ do_vcf_stats(args);
+ print_stats(args);
+ destroy_stats(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
+
diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c
new file mode 100644
index 0000000..fcbc15b
--- /dev/null
+++ b/bcftools/vcfstats.c.pysam.c
@@ -0,0 +1,1592 @@
+#include "pysam.h"
+
+/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
+
+ Copyright (C) 2012-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+/*
+ Notes and known issues:
+ - SN ts/tv calculation includes all non-ref alleles listed in ALT while per-sample ts/tv
+ takes the first non-ref allele only, something to consider with many non-ref HETs.
+*/
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include <htslib/faidx.h>
+#include <inttypes.h>
+#include "bcftools.h"
+#include "filter.h"
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+#define HWE_STATS 1
+#define QUAL_STATS 1
+#define IRC_STATS 1
+#define IRC_RLEN 10
+#define NA_STRING "0"
+
+typedef struct
+{
+ char *tag;
+ float min, max;
+ uint64_t *vals_ts, *vals_tv;
+ void *val;
+ int nbins, type, m_val;
+}
+user_stats_t;
+
+typedef struct
+{
+ int min, max, step, m_vals;
+ uint64_t *vals;
+}
+idist_t;
+
+typedef struct
+{
+ double x;
+ double x2;
+ double y;
+ double y2;
+ double xy;
+ double n;
+}
+smpl_r_t;
+
+typedef struct
+{
+ int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts;
+ int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons
+ #if HWE_STATS
+ int *af_hwe;
+ #endif
+ #if IRC_STATS
+ int n_repeat[IRC_RLEN][4], n_repeat_na; // number of indels which are repeat-consistent, repeat-inconsistent (dels and ins), and not applicable
+ int *af_repeats[3];
+ #endif
+ int ts_alt1, tv_alt1;
+ #if QUAL_STATS
+ int *qual_ts, *qual_tv, *qual_snps, *qual_indels;
+ #endif
+ int *insertions, *deletions, m_indel; // maximum indel length
+ int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
+ int subst[15];
+ int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
+ int *smpl_indel_hets, *smpl_indel_homs;
+ int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
+ unsigned long int *smpl_dp;
+ idist_t dp, dp_sites;
+ int nusr;
+ user_stats_t *usr;
+}
+stats_t;
+
+typedef struct
+{
+ uint64_t m[3], mm[3]; // number of hom, het and non-ref hom matches and mismatches
+ float r2sum;
+ uint32_t r2n;
+}
+gtcmp_t;
+
+typedef struct
+{
+ char *seq;
+ int pos, cnt, len;
+}
+_idc1_t;
+typedef struct
+{
+ faidx_t *ref;
+ _idc1_t *dat;
+ int ndat, mdat;
+}
+indel_ctx_t;
+
+typedef struct
+{
+ // stats
+ stats_t stats[3];
+ int *tmp_iaf, ntmp_iaf, m_af, m_qual, naf_hwe, mtmp_frm;
+ uint8_t *tmp_frm;
+ int dp_min, dp_max, dp_step;
+ gtcmp_t *af_gts_snps, *af_gts_indels, *smpl_gts_snps, *smpl_gts_indels; // first bin of af_* stats are singletons
+
+ // indel context
+ indel_ctx_t *indel_ctx;
+ char *ref_fname;
+
+ // user stats
+ int nusr;
+ user_stats_t *usr;
+
+ // other
+ bcf_srs_t *files;
+ bcf_sr_regions_t *exons;
+ char **argv, *exons_fname, *regions_list, *samples_list, *targets_list;
+ int argc, verbose_sites, first_allele_only, samples_is_file;
+ int split_by_id, nstats;
+
+ filter_t *filter[2];
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+
+ // Per Sample r working data arrays of size equal to number of samples
+ smpl_r_t* smpl_r_snps;
+ smpl_r_t* smpl_r_indels;
+}
+args_t;
+
+static int type2dosage[6], type2ploidy[6], type2stats[6];
+
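+// idist_t is a simple bounded histogram used for the DP distributions below. For
+// example, with the default "-d 0,500,1" binning vals[0] counts depths below the
+// minimum, vals[1 + (val-min)/step] counts depths 0..500, and anything above the
+// maximum goes to the last slot; idist_i2bin() maps an index back to the depth
+// label printed in the DP section.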
+static void idist_init(idist_t *d, int min, int max, int step)
+{
+ d->min = min; d->max = max; d->step = step;
+ d->m_vals = 4 + (d->max - d->min)/d->step;
+ d->vals = (uint64_t*) calloc(d->m_vals,sizeof(uint64_t));
+}
+static void idist_destroy(idist_t *d)
+{
+ if ( d->vals ) free(d->vals);
+}
+static inline uint64_t *idist(idist_t *d, int val)
+{
+ if ( val < d->min ) return &d->vals[0];
+ if ( val > d->max ) return &d->vals[d->m_vals-1];
+ return &d->vals[1 + (val - d->min) / d->step];
+}
+static inline int idist_i2bin(idist_t *d, int i)
+{
+ if ( i<=0 ) return d->min;
+ if ( i>= d->m_vals ) return d->max;
+ return i-1+d->min;
+}
+
+
+#define IC_DBG 0
+#if IC_DBG
+static void _indel_ctx_print1(_idc1_t *idc)
+{
+ int i;
+ fprintf(stdout, "%d\t", idc->cnt);
+ for (i=0; i<idc->len; i++)
+ fputc(idc->seq[i], stdout);
+ fputc('\n', stdout);
+}
+static void _indel_ctx_print(indel_ctx_t *ctx)
+{
+ int i;
+ for (i=0; i<ctx->ndat; i++)
+ _indel_ctx_print1(&ctx->dat[i]);
+ fputc('\n',stdout);
+}
+#endif
+static int _indel_ctx_lookup(indel_ctx_t *ctx, char *seq, int seq_len, int *hit)
+{
+ // binary search
+ int min = 0, max = ctx->ndat - 1;
+ while ( min<=max )
+ {
+ int i = (min+max)/2;
+ int cmp = strncmp(seq, ctx->dat[i].seq, seq_len);
+ if ( cmp<0 ) max = i - 1;
+ else if ( cmp>0 ) min = i + 1;
+ else
+ {
+ if ( seq_len==ctx->dat[i].len )
+ {
+ *hit = 1;
+ return i;
+ }
+ else if ( seq_len<ctx->dat[i].len ) max = i - 1;
+ else min = i + 1;
+ }
+ }
+ *hit = 0;
+ return max;
+}
+static void _indel_ctx_insert(indel_ctx_t *ctx, char *seq, int seq_len, int pos)
+{
+ int idat, hit, i;
+ idat = _indel_ctx_lookup(ctx, seq, seq_len, &hit);
+ if ( !hit )
+ {
+ if ( pos>0 ) return;
+ idat++;
+ ctx->ndat++;
+ hts_expand(_idc1_t, ctx->ndat+1, ctx->mdat, ctx->dat);
+ if ( idat<ctx->ndat && ctx->ndat>1 )
+ memmove(&ctx->dat[idat+1], &ctx->dat[idat], (ctx->ndat - idat - 1)*sizeof(_idc1_t));
+ ctx->dat[idat].len = seq_len;
+ ctx->dat[idat].cnt = 1;
+ ctx->dat[idat].pos = pos;
+ ctx->dat[idat].seq = (char*) malloc(sizeof(char)*(seq_len+1));
+ for (i=0; i<seq_len; i++) ctx->dat[idat].seq[i] = seq[i];
+ ctx->dat[idat].seq[i] = 0;
+ return;
+ }
+ if ( ctx->dat[idat].pos + seq_len == pos )
+ {
+ ctx->dat[idat].cnt++;
+ ctx->dat[idat].pos = pos;
+ }
+}
+indel_ctx_t *indel_ctx_init(char *fa_ref_fname)
+{
+ indel_ctx_t *ctx = (indel_ctx_t *) calloc(1,sizeof(indel_ctx_t));
+ ctx->ref = fai_load(fa_ref_fname);
+ if ( !ctx->ref )
+ {
+ free(ctx);
+ return NULL;
+ }
+ return ctx;
+}
+void indel_ctx_destroy(indel_ctx_t *ctx)
+{
+ fai_destroy(ctx->ref);
+ if ( ctx->mdat ) free(ctx->dat);
+ free(ctx);
+}
+/**
+ * indel_ctx_type() - determine indel context type
+ * @ctx:
+ * @chr: chromosome name
+ * @pos: position of the first @ref base, 1-based
+ * @ref: reference allele
+ * @alt: alternate allele. Only first of multiple comma-separated alleles is
+ * considered
+ * @nrep: number of repeated elements (w)
+ * @nlen: length of a single repeat element (w)
+ *
+ * Returns the INDEL length, negative for deletions, positive for insertions
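+ *
+ * Example (hypothetical site): if the reference context reads ACACAC..., the
+ * insertion REF=A, ALT=ACA returns +2 with *nlen=2 (repeat unit "AC") and *nrep
+ * set to the number of tandem copies found in the ~50 bp window; the caller then
+ * classifies it as repeat-consistent because 2 % 2 == 0.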
+ */
+int indel_ctx_type(indel_ctx_t *ctx, char *chr, int pos, char *ref, char *alt, int *nrep, int *nlen)
+{
+ const int win_size = 50; // hard-wired for now
+ const int rep_len = IRC_RLEN; // hard-wired for now
+
+ int ref_len = strlen(ref);
+ int alt_len = 0;
+ while ( alt[alt_len] && alt[alt_len]!=',' ) alt_len++;
+
+ int i, fai_ref_len;
+ char *fai_ref = faidx_fetch_seq(ctx->ref, chr, pos-1, pos+win_size, &fai_ref_len);
+ for (i=0; i<fai_ref_len; i++)
+ if ( (int)fai_ref[i]>96 ) fai_ref[i] -= 32;
+
+ // Sanity check: the reference sequence must match the REF allele
+ for (i=0; i<fai_ref_len && i<ref_len; i++)
+ if ( ref[i] != fai_ref[i] && ref[i] - 32 != fai_ref[i] )
+ error("\nSanity check failed, the reference sequence differs: %s:%d+%d .. %c vs %c\n", chr, pos, i, ref[i],fai_ref[i]);
+
+ // Count occurrences of all possible kmers
+ ctx->ndat = 0;
+ for (i=0; i<win_size; i++)
+ {
+ int k, kmax = rep_len <= i ? rep_len : i+1;
+ for (k=0; k<kmax; k++)
+ _indel_ctx_insert(ctx, &fai_ref[i-k+1], k+1, i-k);
+ }
+
+ #if IC_DBG
+ fprintf(stdout,"ref: %s\n", ref);
+ fprintf(stdout,"alt: %s\n", alt);
+ fprintf(stdout,"ctx: %s\n", fai_ref);
+ _indel_ctx_print(ctx);
+ #endif
+
+ int max_cnt = 0, max_len = 0;
+ for (i=0; i<ctx->ndat; i++)
+ {
+ if ( max_cnt < ctx->dat[i].cnt || (max_cnt==ctx->dat[i].cnt && max_len < ctx->dat[i].len) )
+ {
+ max_cnt = ctx->dat[i].cnt;
+ max_len = ctx->dat[i].len;
+ }
+ free(ctx->dat[i].seq);
+ }
+ free(fai_ref);
+
+ *nrep = max_cnt;
+ *nlen = max_len;
+ return alt_len - ref_len;
+}
+
+static void add_user_stats(args_t *args, char *str)
+{
+ args->nusr++;
+ args->usr = (user_stats_t*) realloc(args->usr,sizeof(user_stats_t)*args->nusr);
+ user_stats_t *usr = &args->usr[args->nusr-1];
+ memset(usr,0,sizeof(*usr));
+ usr->min = 0;
+ usr->max = 1;
+ usr->nbins = 100;
+
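+ // Parses the -u/--user-tstv argument TAG[:min:max:n]. For example (hypothetical
+ // invocation), "-u DP:0:400:100" bins the Ts/Tv counts by INFO/DP over [0,400]
+ // in 100 bins; omitted fields keep the defaults set above.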
+ char *tmp = str;
+ while ( *tmp && *tmp!=':' ) tmp++;
+ usr->tag = (char*)calloc(tmp-str+2,sizeof(char));
+ memcpy(usr->tag,str,tmp-str);
+
+ if ( *tmp )
+ {
+ char *ptr = ++tmp;
+ usr->min = strtod(tmp, &ptr);
+ if ( tmp==ptr ) error("Could not parse %s\n", str);
+ tmp = ptr+1;
+ }
+ if ( *tmp )
+ {
+ char *ptr = tmp;
+ usr->max = strtod(tmp, &ptr);
+ if ( tmp==ptr ) error("Could not parse %s\n", str);
+ tmp = ptr+1;
+ }
+ if ( *tmp )
+ {
+ char *ptr = tmp;
+ usr->nbins = strtol(tmp, &ptr, 10);
+ if ( tmp==ptr ) error("Could not parse %s\n", str);
+ if ( usr->nbins<=0 ) error("Number of bins does not make sense (%d): %s.\n", usr->nbins, str);
+ }
+}
+static void init_user_stats(args_t *args, bcf_hdr_t *hdr, stats_t *stats)
+{
+ stats->nusr = args->nusr;
+ stats->usr = (user_stats_t*)malloc(sizeof(user_stats_t)*args->nusr);
+ memcpy(stats->usr,args->usr,args->nusr*sizeof(user_stats_t));
+ int i;
+ for (i=0; i<stats->nusr; i++)
+ {
+ user_stats_t *usr = &stats->usr[i];
+ usr->vals_ts = (uint64_t*)calloc(usr->nbins,sizeof(uint64_t));
+ usr->vals_tv = (uint64_t*)calloc(usr->nbins,sizeof(uint64_t));
+ int id = bcf_hdr_id2int(hdr,BCF_DT_ID,usr->tag);
+ if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) error("The INFO tag \"%s\" is not defined in the header\n", usr->tag);
+ usr->type = bcf_hdr_id2type(hdr,BCF_HL_INFO,id);
+ if ( usr->type!=BCF_HT_REAL && usr->type!=BCF_HT_INT ) error("The INFO tag \"%s\" is not of Float or Integer type (%d)\n", usr->tag, usr->type);
+ }
+}
+static void init_stats(args_t *args)
+{
+ int i;
+ args->nstats = args->files->nreaders==1 ? 1 : 3;
+ if ( args->split_by_id ) args->nstats = 2;
+
+ if ( args->filter_str )
+ {
+ args->filter[0] = filter_init(bcf_sr_get_header(args->files,0), args->filter_str);
+ if ( args->files->nreaders==2 )
+ args->filter[1] = filter_init(bcf_sr_get_header(args->files,1), args->filter_str);
+ }
+
+ // AF corresponds to AC but is more robust for mixture of haploid and diploid GTs
+ args->m_af = 101;
+ for (i=0; i<args->files->nreaders; i++)
+ if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1 > args->m_af )
+ args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1;
+
+ #if QUAL_STATS
+ args->m_qual = 999;
+ #endif
+ #if HWE_STATS
+ args->naf_hwe = 100;
+ #endif
+
+ if ( args->samples_list )
+ {
+ if ( !bcf_sr_set_samples(args->files,args->samples_list,args->samples_is_file) )
+ {
+ if ( !bcf_hdr_nsamples(args->files->readers[0].header) )
+ error("No sample columns in %s\n", args->files->readers[0].fname);
+ error("Unable to parse the samples: \"%s\"\n", args->samples_list);
+ }
+ args->af_gts_snps = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t));
+ args->af_gts_indels = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t));
+ args->smpl_gts_snps = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
+ args->smpl_gts_indels = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
+ args->smpl_r_snps = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
+ args->smpl_r_indels = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
+ }
+ for (i=0; i<args->nstats; i++)
+ {
+ stats_t *stats = &args->stats[i];
+ stats->m_indel = 60;
+ stats->insertions = (int*) calloc(stats->m_indel,sizeof(int));
+ stats->deletions = (int*) calloc(stats->m_indel,sizeof(int));
+ stats->af_ts = (int*) calloc(args->m_af,sizeof(int));
+ stats->af_tv = (int*) calloc(args->m_af,sizeof(int));
+ stats->af_snps = (int*) calloc(args->m_af,sizeof(int));
+ int j;
+ for (j=0; j<3; j++) stats->af_repeats[j] = (int*) calloc(args->m_af,sizeof(int));
+ #if QUAL_STATS
+ stats->qual_ts = (int*) calloc(args->m_qual,sizeof(int));
+ stats->qual_tv = (int*) calloc(args->m_qual,sizeof(int));
+ stats->qual_snps = (int*) calloc(args->m_qual,sizeof(int));
+ stats->qual_indels = (int*) calloc(args->m_qual,sizeof(int));
+ #endif
+ if ( args->files->n_smpl )
+ {
+ stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_ts = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_tv = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_indels = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_dp = (unsigned long int *) calloc(args->files->n_smpl,sizeof(unsigned long int));
+ stats->smpl_ndp = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_sngl = (int *) calloc(args->files->n_smpl,sizeof(int));
+ #if HWE_STATS
+ stats->af_hwe = (int*) calloc(args->m_af*args->naf_hwe,sizeof(int));
+ #endif
+ if ( args->exons_fname )
+ stats->smpl_frm_shifts = (int*) calloc(args->files->n_smpl*3,sizeof(int));
+ }
+ idist_init(&stats->dp, args->dp_min,args->dp_max,args->dp_step);
+ idist_init(&stats->dp_sites, args->dp_min,args->dp_max,args->dp_step);
+ init_user_stats(args, i!=1 ? args->files->readers[0].header : args->files->readers[1].header, stats);
+ }
+
+ if ( args->exons_fname )
+ {
+ args->exons = bcf_sr_regions_init(args->exons_fname,1,0,1,2);
+ if ( !args->exons )
+ error("Error occurred while reading, was the file compressed with bgzip: %s?\n", args->exons_fname);
+ }
+
+ #if IRC_STATS
+ if ( args->ref_fname )
+ args->indel_ctx = indel_ctx_init(args->ref_fname);
+ #endif
+
+ type2dosage[GT_HOM_RR] = 0;
+ type2dosage[GT_HET_RA] = 1;
+ type2dosage[GT_HOM_AA] = 2;
+ type2dosage[GT_HET_AA] = 2;
+ type2dosage[GT_HAPL_R] = 0;
+ type2dosage[GT_HAPL_A] = 1;
+
+ type2ploidy[GT_HOM_RR] = 1;
+ type2ploidy[GT_HET_RA] = 1;
+ type2ploidy[GT_HOM_AA] = 1;
+ type2ploidy[GT_HET_AA] = 1;
+ type2ploidy[GT_HAPL_R] = -1;
+ type2ploidy[GT_HAPL_A] = -1;
+
+ type2stats[GT_HOM_RR] = 0;
+ type2stats[GT_HET_RA] = 1;
+ type2stats[GT_HOM_AA] = 2;
+ type2stats[GT_HET_AA] = 1;
+ type2stats[GT_HAPL_R] = 0;
+ type2stats[GT_HAPL_A] = 2;
+
+}
+static void destroy_stats(args_t *args)
+{
+ int id, j;
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ if (stats->af_ts) free(stats->af_ts);
+ if (stats->af_tv) free(stats->af_tv);
+ if (stats->af_snps) free(stats->af_snps);
+ for (j=0; j<3; j++)
+ if (stats->af_repeats[j]) free(stats->af_repeats[j]);
+ #if QUAL_STATS
+ if (stats->qual_ts) free(stats->qual_ts);
+ if (stats->qual_tv) free(stats->qual_tv);
+ if (stats->qual_snps) free(stats->qual_snps);
+ if (stats->qual_indels) free(stats->qual_indels);
+ #endif
+ #if HWE_STATS
+ //if ( args->files->n_smpl ) free(stats->af_hwe);
+ free(stats->af_hwe);
+ #endif
+ free(stats->insertions);
+ free(stats->deletions);
+ if (stats->smpl_hets) free(stats->smpl_hets);
+ if (stats->smpl_homAA) free(stats->smpl_homAA);
+ if (stats->smpl_homRR) free(stats->smpl_homRR);
+ if (stats->smpl_indel_homs) free(stats->smpl_indel_homs);
+ if (stats->smpl_indel_hets) free(stats->smpl_indel_hets);
+ if (stats->smpl_ts) free(stats->smpl_ts);
+ if (stats->smpl_tv) free(stats->smpl_tv);
+ if (stats->smpl_indels) free(stats->smpl_indels);
+ if (stats->smpl_dp) free(stats->smpl_dp);
+ if (stats->smpl_ndp) free(stats->smpl_ndp);
+ if (stats->smpl_sngl) free(stats->smpl_sngl);
+ idist_destroy(&stats->dp);
+ idist_destroy(&stats->dp_sites);
+ for (j=0; j<stats->nusr; j++)
+ {
+ free(stats->usr[j].vals_ts);
+ free(stats->usr[j].vals_tv);
+ free(stats->usr[j].val);
+ }
+ free(stats->usr);
+ if ( args->exons ) free(stats->smpl_frm_shifts);
+ }
+ for (j=0; j<args->nusr; j++) free(args->usr[j].tag);
+ free(args->usr);
+ free(args->tmp_frm);
+ free(args->tmp_iaf);
+ if (args->exons) bcf_sr_regions_destroy(args->exons);
+ free(args->af_gts_snps);
+ free(args->af_gts_indels);
+ free(args->smpl_gts_snps);
+ free(args->smpl_gts_indels);
+ free(args->smpl_r_snps);
+ free(args->smpl_r_indels);
+ if (args->indel_ctx) indel_ctx_destroy(args->indel_ctx);
+ if (args->filter[0]) filter_destroy(args->filter[0]);
+ if (args->filter[1]) filter_destroy(args->filter[1]);
+}
+
+static void init_iaf(args_t *args, bcf_sr_t *reader)
+{
+ bcf1_t *line = reader->buffer[0];
+ if ( args->ntmp_iaf < line->n_allele )
+ {
+ args->tmp_iaf = (int*)realloc(args->tmp_iaf, line->n_allele*sizeof(int));
+ args->ntmp_iaf = line->n_allele;
+ }
+ // tmp_iaf is first filled with AC counts in calc_ac and then transformed to
+ // an index to af_gts_snps
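+ // Worked example (illustrative): with the default m_af=101 bins and AN=200, an
+ // ALT allele with AC=1 is a singleton and goes to bin 0, while AC=10 (frequency
+ // 5%) maps to 1 + 10*(101-2)/200 = 5 after truncation, which the AF section
+ // later labels as 100*(5-1)/(101-1) = 4%.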
+ int i, ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO);
+ if ( ret )
+ {
+ int an=0;
+ for (i=0; i<line->n_allele; i++)
+ an += args->tmp_iaf[i];
+
+ args->tmp_iaf[0] = 0;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( args->tmp_iaf[i]==1 )
+ args->tmp_iaf[i] = 0; // singletons into the first bin
+ else if ( !an )
+ args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin
+ else
+ args->tmp_iaf[i] = 1 + args->tmp_iaf[i] * (args->m_af-2.0) / an;
+ }
+ }
+ else
+ for (i=0; i<line->n_allele; i++)
+ args->tmp_iaf[i] = 0;
+
+ // todo: otherwise use AF
+}
+
+static inline void do_mnp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
+{
+ stats->n_mnps++;
+}
+
+static inline void do_other_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
+{
+ stats->n_others++;
+}
+
+static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
+{
+ stats->n_indels++;
+
+ bcf1_t *line = reader->buffer[0];
+
+ #if QUAL_STATS
+ int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual;
+ stats->qual_indels[iqual]++;
+ #endif
+
+ // Check if the indel is near an exon for the frameshift statistics
+ int i, exon_overlap = 0;
+ if ( args->exons )
+ {
+ if ( !bcf_sr_regions_overlap(args->exons, bcf_seqname(reader->header,line),line->pos,line->pos) ) exon_overlap = 1;
+ hts_expand(uint8_t,line->n_allele,args->mtmp_frm,args->tmp_frm);
+ for (i=0; i<line->n_allele; i++) args->tmp_frm[i] = 0;
+ }
+
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( args->first_allele_only && i>1 ) break;
+ if ( bcf_get_variant_type(line,i)!=VCF_INDEL ) continue;
+ int len = line->d.var[i].n;
+
+ #if IRC_STATS
+ // Indel repeat consistency
+ if ( args->indel_ctx )
+ {
+ int nrep, nlen, ndel;
+ ndel = indel_ctx_type(args->indel_ctx, (char*)reader->header->id[BCF_DT_CTG][line->rid].key, line->pos+1, line->d.allele[0], line->d.allele[i], &nrep, &nlen);
+ if ( nlen<=1 || nrep<=1 )
+ {
+ // not a repeat or a single base repeat
+ stats->n_repeat_na++;
+ stats->af_repeats[2][ args->tmp_iaf[i] ]++;
+ }
+ else
+ {
+ if ( abs(ndel) % nlen )
+ {
+ // the length of the inserted/deleted sequence is not consistent with the repeat element
+ stats->n_repeat[nlen-1][ndel<0 ? 1 : 3]++;
+ stats->af_repeats[1][ args->tmp_iaf[i] ]++;
+ }
+ else
+ {
+ // the length of the inserted/deleted sequence is consistent with the repeat element
+ stats->n_repeat[nlen-1][ndel<0 ? 0 : 2]++;
+ stats->af_repeats[0][ args->tmp_iaf[i] ]++;
+ }
+ }
+ }
+ else
+ stats->af_repeats[2][ args->tmp_iaf[i] ]++;
+ #endif
+
+ // Check the frameshifts
+ int tlen = 0;
+ if ( args->exons && exon_overlap ) // there is an exon
+ {
+ if ( len>0 )
+ {
+ // insertion
+ if ( args->exons->start <= line->pos && args->exons->end > line->pos ) tlen = abs(len);
+ }
+ else if ( args->exons->start <= line->pos + abs(len) )
+ {
+ // deletion
+ tlen = abs(len);
+ if ( line->pos < args->exons->start ) // trim the beginning
+ tlen -= args->exons->start - line->pos + 1;
+ if ( args->exons->end < line->pos + abs(len) ) // trim the end
+ tlen -= line->pos + abs(len) - args->exons->end;
+ }
+ }
+ if ( tlen ) // there are some deleted/inserted bases in the exon
+ {
+ if ( tlen%3 ) { stats->out_frame++; args->tmp_frm[i] = 2; }
+ else { stats->in_frame++; args->tmp_frm[i] = 1; }
+
+ if ( i==1 )
+ {
+ if ( tlen%3 ) stats->out_frame_alt1++;
+ else stats->in_frame_alt1++;
+ }
+ }
+ else // no exon affected
+ {
+ if ( i==1 ) stats->na_frame_alt1++;
+ stats->na_frame++;
+ }
+
+
+ // Indel length distribution
+ int *ptr = stats->insertions;
+ if ( len<0 )
+ {
+ len *= -1;
+ ptr = stats->deletions;
+ }
+ if ( --len >= stats->m_indel ) len = stats->m_indel-1;
+ ptr[len]++;
+ }
+}
+
+static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts)
+{
+ int i;
+ for (i=0; i<stats->nusr; i++)
+ {
+ user_stats_t *usr = &stats->usr[i];
+ uint64_t *vals = is_ts ? usr->vals_ts : usr->vals_tv;
+ float val;
+ if ( usr->type==BCF_HT_REAL )
+ {
+ if ( bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue;
+ val = ((float*)usr->val)[0];
+ }
+ else
+ {
+ if ( bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue;
+ val = ((int32_t*)usr->val)[0];
+ }
+ int idx;
+ if ( val<=usr->min ) idx = 0;
+ else if ( val>=usr->max ) idx = usr->nbins - 1;
+ else idx = (val - usr->min)/(usr->max - usr->min) * (usr->nbins-1);
+ vals[idx]++;
+ }
+}
+
+static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
+{
+ stats->n_snps++;
+
+ bcf1_t *line = reader->buffer[0];
+ int ref = bcf_acgt2int(*line->d.allele[0]);
+ if ( ref<0 ) return;
+
+ #if QUAL_STATS
+ int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual;
+ stats->qual_snps[iqual]++;
+ #endif
+
+ int i;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( args->first_allele_only && i>1 ) break;
+ if ( !(bcf_get_variant_type(line,i)&VCF_SNP) ) continue;
+ int alt = bcf_acgt2int(*line->d.allele[i]);
+ if ( alt<0 || ref==alt ) continue;
+ stats->subst[ref<<2|alt]++;
+ int iaf = args->tmp_iaf[i];
+ stats->af_snps[iaf]++;
+ if ( abs(ref-alt)==2 )
+ {
+ if (i==1)
+ {
+ stats->ts_alt1++;
+ #if QUAL_STATS
+ stats->qual_ts[iqual]++;
+ #endif
+ do_user_stats(stats, reader, 1);
+ }
+ stats->af_ts[iaf]++;
+ }
+ else
+ {
+ if (i==1)
+ {
+ stats->tv_alt1++;
+ #if QUAL_STATS
+ stats->qual_tv[iqual]++;
+ #endif
+ do_user_stats(stats, reader, 0);
+ }
+ stats->af_tv[iaf]++;
+ }
+ }
+}
+
+static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched)
+{
+ bcf_srs_t *files = args->files;
+ bcf1_t *line = reader->buffer[0];
+ bcf_fmt_t *fmt_ptr;
+ int nref_tot = 0, nhet_tot = 0, nalt_tot = 0;
+ int line_type = bcf_get_variant_types(line);
+
+ if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) )
+ {
+ int ref = bcf_acgt2int(*line->d.allele[0]);
+ int is, n_nref = 0, i_nref = 0;
+ for (is=0; is<args->files->n_smpl; is++)
+ {
+ int ial, jal;
+ int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal);
+ if ( gt==GT_UNKN ) continue;
+ if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
+ {
+ if ( line_type&VCF_INDEL && stats->smpl_frm_shifts )
+ {
+ assert( ial<line->n_allele );
+ stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
+ }
+ continue;
+ }
+ if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; }
+ #if HWE_STATS
+ switch (gt)
+ {
+ case GT_HOM_RR: nref_tot++; break;
+ case GT_HET_RA: nhet_tot++; break;
+ case GT_HET_AA:
+ case GT_HOM_AA: nalt_tot++; break;
+ }
+ #endif
+ if ( line_type&VCF_SNP || line_type==VCF_REF ) // count ALT=. as SNP
+ {
+ if ( gt == GT_HET_RA ) stats->smpl_hets[is]++;
+ else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++;
+ else if ( gt == GT_HOM_RR ) stats->smpl_homRR[is]++;
+ else if ( gt == GT_HOM_AA ) stats->smpl_homAA[is]++;
+ if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called
+ {
+ int alt = bcf_acgt2int(*line->d.allele[ial]);
+ if ( alt<0 ) continue;
+ if ( abs(ref-alt)==2 )
+ stats->smpl_ts[is]++;
+ else
+ stats->smpl_tv[is]++;
+ }
+ }
+ if ( line_type&VCF_INDEL )
+ {
+ if ( gt != GT_HOM_RR )
+ {
+ stats->smpl_indels[is]++;
+ if ( gt==GT_HET_RA || gt==GT_HET_AA ) stats->smpl_indel_hets[is]++;
+ else if ( gt==GT_HOM_AA ) stats->smpl_indel_homs[is]++;
+ }
+ if ( stats->smpl_frm_shifts )
+ {
+ assert( ial<line->n_allele && jal<line->n_allele );
+ stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
+ stats->smpl_frm_shifts[is*3 + args->tmp_frm[jal]]++;
+ }
+ }
+ }
+ if ( n_nref==1 ) stats->smpl_sngl[i_nref]++;
+ }
+
+ #if HWE_STATS
+ if ( nhet_tot + nref_tot + nalt_tot )
+ {
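+ // Bin this site by the fraction of het genotypes among the called samples; the
+ // histogram is indexed by (AF bin of the first ALT allele, het fraction) and
+ // print_stats() reports its quartiles per AF bin in the HWE section.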
+ float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot);
+ int idx = het_frac*(args->naf_hwe - 1);
+ if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1];
+ stats->af_hwe[idx]++;
+ }
+ #endif
+
+ if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP")) )
+ {
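+ // FORMAT/DP values may be stored as 8-, 16- or 32-bit integers in BCF, so the
+ // per-sample depth accumulation below is instantiated once per width via the
+ // BRANCH_INT macro.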
+ #define BRANCH_INT(type_t,missing,vector_end) { \
+ int is; \
+ for (is=0; is<args->files->n_smpl; is++) \
+ { \
+ type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \
+ if ( *p==vector_end ) continue; \
+ if ( *p!=missing ) \
+ { \
+ (*idist(&stats->dp, *p))++; \
+ stats->smpl_ndp[is]++; \
+ stats->smpl_dp[is] += *p; \
+ } \
+ } \
+ }
+ switch (fmt_ptr->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+ default: fprintf(pysamerr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
+ }
+ #undef BRANCH_INT
+ }
+
+ if ( matched==3 )
+ {
+ int is;
+ bcf_fmt_t *fmt0, *fmt1;
+ fmt0 = bcf_get_fmt(files->readers[0].header,files->readers[0].buffer[0],"GT"); if ( !fmt0 ) return;
+ fmt1 = bcf_get_fmt(files->readers[1].header,files->readers[1].buffer[0],"GT"); if ( !fmt1 ) return;
+
+ // only the first ALT allele is considered
+ int iaf = line->n_allele>1 ? args->tmp_iaf[1] : 1;
+ int line_type = bcf_get_variant_types(files->readers[0].buffer[0]);
+ gtcmp_t *af_stats = line_type&VCF_SNP ? args->af_gts_snps : args->af_gts_indels;
+ gtcmp_t *smpl_stats = line_type&VCF_SNP ? args->smpl_gts_snps : args->smpl_gts_indels;
+
+ //
+ // Calculates r squared
+ // x is mean dosage of x at given site
+ // x2 is mean squared dosage of x at given site
+ // y is mean dosage of y at given site
+ // y2 is mean squared dosage of y at given site
+ // xy is mean of x*y at given site
+ // r2sum += (xy - x*y)^2 / ( (x2 - x^2) * (y2 - y^2) )
+ // r2n is number of sites considered
+ // output as r2sum/r2n for each AF bin
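+ // In other words, per site this is the squared Pearson correlation between the
+ // genotype dosages (0/1/2) seen in the two files across samples; the AF table
+ // reports its mean over the sites falling into each frequency bin.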
+ int r2n = 0;
+ float x = 0, y = 0, xy = 0, x2 = 0, y2 = 0;
+ // Select smpl_r
+ smpl_r_t *smpl_r = NULL;
+ if (line_type&VCF_SNP)
+ {
+ smpl_r = args->smpl_r_snps;
+ }
+ else if (line_type&VCF_INDEL)
+ {
+ smpl_r = args->smpl_r_indels;
+ }
+ for (is=0; is<files->n_smpl; is++)
+ {
+ // Simplified comparison: only 0/0, 0/1, 1/1 are distinguished, since the identity
+ // of the actual alleles can be enforced by running without the -c option.
+ int gt0 = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL);
+ if ( gt0 == GT_UNKN ) continue;
+
+ int gt1 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL);
+ if ( gt1 == GT_UNKN ) continue;
+
+ if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes
+
+ int dsg0 = type2dosage[gt0];
+ int dsg1 = type2dosage[gt1];
+ x += dsg0;
+ x2 += dsg0*dsg0;
+ y += dsg1;
+ y2 += dsg1*dsg1;
+ xy += dsg0*dsg1;
+ r2n++;
+
+ int idx = type2stats[gt0];
+ if ( gt0==gt1 )
+ {
+ af_stats[iaf].m[idx]++;
+ smpl_stats[is].m[idx]++;
+ }
+ else
+ {
+ af_stats[iaf].mm[idx]++;
+ smpl_stats[is].mm[idx]++;
+ }
+
+ // Now do it across samples
+
+ if (smpl_r) {
+ smpl_r[is].xy += dsg0*dsg1;
+ smpl_r[is].x += dsg0;
+ smpl_r[is].x2 += dsg0*dsg0;
+ smpl_r[is].y += dsg1;
+ smpl_r[is].y2 += dsg1*dsg1;
+ ++(smpl_r[is].n);
+ }
+ }
+
+ if ( r2n )
+ {
+ x /= r2n; y /= r2n; x2 /= r2n; y2 /= r2n; xy /= r2n;
+ float cov = xy - x*y;
+ float var2 = (x2 - x*x) * (y2 - y*y);
+ if ( var2!=0 )
+ {
+ af_stats[iaf].r2sum += cov*cov/var2;
+ af_stats[iaf].r2n++;
+ }
+ }
+
+ if ( args->verbose_sites )
+ {
+ int nm = 0, nmm = 0, nrefm = 0;
+ for (is=0; is<files->n_smpl; is++)
+ {
+ int gt = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL);
+ if ( gt == GT_UNKN ) continue;
+ int gt2 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL);
+ if ( gt2 == GT_UNKN ) continue;
+ if ( gt != gt2 )
+ {
+ nmm++;
+ bcf_sr_t *reader = &files->readers[0];
+ printf("DBG\t%s\t%d\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,files->samples[is],gt,gt2);
+ }
+ else
+ {
+ if ( gt!=GT_HOM_RR ) nrefm++;
+ nm++;
+ }
+ }
+ float nrd = nrefm+nmm ? 100.*nmm/(nrefm+nmm) : 0;
+ printf("PSD\t%s\t%d\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,nm,nmm,nrd);
+ }
+ }
+}
+
+static void do_vcf_stats(args_t *args)
+{
+ bcf_srs_t *files = args->files;
+ assert( sizeof(int)>files->nreaders );
+ while ( bcf_sr_next_line(files) )
+ {
+ bcf_sr_t *reader = NULL;
+ bcf1_t *line = NULL;
+ int ret = 0, i, pass = 1;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !bcf_sr_has_line(files,i) ) continue;
+ if ( args->filter[i] )
+ {
+ int is_ok = filter_test(args->filter[i], bcf_sr_get_line(files,i), NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1;
+ if ( !is_ok ) { pass = 0; break; }
+ }
+ ret |= 1<<i;
+ if ( !reader )
+ {
+ reader = &files->readers[i];
+ line = bcf_sr_get_line(files,i);
+ }
+
+ }
+ if ( !pass ) continue;
+
+ int line_type = bcf_get_variant_types(line);
+ init_iaf(args, reader);
+
+ stats_t *stats = &args->stats[ret-1];
+ if ( args->split_by_id && line->d.id[0]=='.' && !line->d.id[1] )
+ stats = &args->stats[1];
+
+ stats->n_records++;
+
+ if ( line_type==VCF_REF )
+ stats->n_noalts++;
+ if ( line_type&VCF_SNP )
+ do_snp_stats(args, stats, reader);
+ if ( line_type&VCF_INDEL )
+ do_indel_stats(args, stats, reader);
+ if ( line_type&VCF_MNP )
+ do_mnp_stats(args, stats, reader);
+ if ( line_type&VCF_OTHER )
+ do_other_stats(args, stats, reader);
+
+ if ( line->n_allele>2 )
+ {
+ stats->n_mals++;
+ if ( line_type == VCF_SNP ) stats->n_snp_mals++;
+ }
+
+ if ( files->n_smpl )
+ do_sample_stats(args, stats, reader, ret);
+
+ if ( bcf_get_info_int32(reader->header,line,"DP",&args->tmp_iaf,&args->ntmp_iaf)==1 )
+ (*idist(&stats->dp_sites, args->tmp_iaf[0]))++;
+ }
+}
+
+static void print_header(args_t *args)
+{
+ int i;
+ printf("# This file was produced by bcftools stats (%s+htslib-%s) and can be plotted using plot-vcfstats.\n", bcftools_version(),hts_version());
+ printf("# The command line was:\tbcftools %s ", args->argv[0]);
+ for (i=1; i<args->argc; i++)
+ printf(" %s",args->argv[i]);
+ printf("\n#\n");
+
+ printf("# Definition of sets:\n# ID\t[2]id\t[3]tab-separated file names\n");
+ if ( args->files->nreaders==1 )
+ {
+ const char *fname = strcmp("-",args->files->readers[0].fname) ? args->files->readers[0].fname : "<STDIN>";
+ if ( args->split_by_id )
+ {
+ printf("ID\t0\t%s:known (sites with ID different from \".\")\n", fname);
+ printf("ID\t1\t%s:novel (sites where ID column is \".\")\n", fname);
+ }
+ else
+ printf("ID\t0\t%s\n", fname);
+ }
+ else
+ {
+ const char *fname0 = strcmp("-",args->files->readers[0].fname) ? args->files->readers[0].fname : "<STDIN>";
+ const char *fname1 = strcmp("-",args->files->readers[1].fname) ? args->files->readers[1].fname : "<STDIN>";
+ printf("ID\t0\t%s\n", fname0);
+ printf("ID\t1\t%s\n", fname1);
+ printf("ID\t2\t%s\t%s\n", fname0,fname1);
+
+ if ( args->verbose_sites )
+ {
+ printf(
+ "# Verbose per-site discordance output.\n"
+ "# PSD\t[2]CHROM\t[3]POS\t[4]Number of matches\t[5]Number of mismatches\t[6]NRD\n");
+ printf(
+ "# Verbose per-site and per-sample output. Genotype codes: %d:HomRefRef, %d:HomAltAlt, %d:HetAltRef, %d:HetAltAlt, %d:haploidRef, %d:haploidAlt\n"
+ "# DBG\t[2]CHROM\t[3]POS\t[4]Sample\t[5]GT in %s\t[6]GT in %s\n",
+ GT_HOM_RR, GT_HOM_AA, GT_HET_RA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A, fname0,fname1);
+ }
+ }
+}
+
+#define T2S(x) type2stats[x]
+static void print_stats(args_t *args)
+{
+ int i, id;
+ printf("# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n");
+ for (id=0; id<args->files->nreaders; id++)
+ printf("SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header));
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ printf("SN\t%d\tnumber of records:\t%d\n", id, stats->n_records);
+ printf("SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts);
+ printf("SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps);
+ printf("SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps);
+ printf("SN\t%d\tnumber of indels:\t%d\n", id, stats->n_indels);
+ printf("SN\t%d\tnumber of others:\t%d\n", id, stats->n_others);
+ printf("SN\t%d\tnumber of multiallelic sites:\t%d\n", id, stats->n_mals);
+ printf("SN\t%d\tnumber of multiallelic SNP sites:\t%d\n", id, stats->n_snp_mals);
+ }
+ printf("# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ int ts=0,tv=0;
+ for (i=0; i<args->m_af; i++) { ts += stats->af_ts[i]; tv += stats->af_tv[i]; }
+ printf("TSTV\t%d\t%d\t%d\t%.2f\t%d\t%d\t%.2f\n", id,ts,tv,tv?(float)ts/tv:0, stats->ts_alt1,stats->tv_alt1,stats->tv_alt1?(float)stats->ts_alt1/stats->tv_alt1:0);
+ }
+ if ( args->exons_fname )
+ {
+ printf("# FS, Indel frameshifts:\n# FS\t[2]id\t[3]in-frame\t[4]out-frame\t[5]not applicable\t[6]out/(in+out) ratio\t[7]in-frame (1st ALT)\t[8]out-frame (1st ALT)\t[9]not applicable (1st ALT)\t[10]out/(in+out) ratio (1st ALT)\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ int in=args->stats[id].in_frame, out=args->stats[id].out_frame, na=args->stats[id].na_frame;
+ int in1=args->stats[id].in_frame_alt1, out1=args->stats[id].out_frame_alt1, na1=args->stats[id].na_frame_alt1;
+ printf("FS\t%d\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%.2f\n", id, in,out,na,out?(float)out/(in+out):0,in1,out1,na1,out1?(float)out1/(in1+out1):0);
+ }
+ }
+ if ( args->indel_ctx )
+ {
+ printf("# ICS, Indel context summary:\n# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ int nc = 0, ni = 0, na = args->stats[id].n_repeat_na;
+ for (i=0; i<IRC_RLEN; i++)
+ {
+ nc += args->stats[id].n_repeat[i][0] + args->stats[id].n_repeat[i][2];
+ ni += args->stats[id].n_repeat[i][1] + args->stats[id].n_repeat[i][3];
+ }
+ printf("ICS\t%d\t%d\t%d\t%d\t%.4f\n", id, nc,ni,na,nc+ni ? (float)nc/(nc+ni) : 0.0);
+ }
+ printf("# ICL, Indel context by length:\n# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ for (i=1; i<IRC_RLEN; i++)
+ {
+ int nc = args->stats[id].n_repeat[i][0]+args->stats[id].n_repeat[i][2], ni = args->stats[id].n_repeat[i][1]+args->stats[id].n_repeat[i][3];
+ printf("ICL\t%d\t%d\t%d\t%d\t%d\t%d\t%.4f\n", id, i+1,
+ args->stats[id].n_repeat[i][0],args->stats[id].n_repeat[i][1],args->stats[id].n_repeat[i][2],args->stats[id].n_repeat[i][3],
+ nc+ni ? (float)nc/(nc+ni) : 0.0);
+ }
+ }
+ }
+ printf("# SiS, Singleton stats:\n# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ printf("SiS\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,1,stats->af_snps[0],stats->af_ts[0],stats->af_tv[0],
+ stats->af_repeats[0][0]+stats->af_repeats[1][0]+stats->af_repeats[2][0],stats->af_repeats[0][0],stats->af_repeats[1][0],stats->af_repeats[2][0]);
+ // put the singleton stats into the first AF bin; note that not all of the stats are transferred (e.g. NRD mismatches)
+ stats->af_snps[1] += stats->af_snps[0];
+ stats->af_ts[1] += stats->af_ts[0];
+ stats->af_tv[1] += stats->af_tv[0];
+ stats->af_repeats[0][1] += stats->af_repeats[0][0];
+ stats->af_repeats[1][1] += stats->af_repeats[1][0];
+ stats->af_repeats[2][1] += stats->af_repeats[2][0];
+ }
+ printf("# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=1; i<args->m_af; i++) // note that af[1] now contains also af[0], see SiS stats output above
+ {
+ if ( stats->af_snps[i]+stats->af_ts[i]+stats->af_tv[i]+stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i] == 0 ) continue;
+ printf("AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,100.*(i-1)/(args->m_af-1),stats->af_snps[i],stats->af_ts[i],stats->af_tv[i],
+ stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i],stats->af_repeats[0][i],stats->af_repeats[1][i],stats->af_repeats[2][i]);
+ }
+ }
+ #if QUAL_STATS
+ printf("# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=0; i<args->m_qual; i++)
+ {
+ if ( stats->qual_snps[i]+stats->qual_ts[i]+stats->qual_tv[i]+stats->qual_indels[i] == 0 ) continue;
+ printf("QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]);
+ }
+ }
+ #endif
+ for (i=0; i<args->nusr; i++)
+ {
+ printf("# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n",
+ args->usr[i].tag,args->usr[i].tag,args->usr[i].tag,args->usr[i].tag);
+ for (id=0; id<args->nstats; id++)
+ {
+ user_stats_t *usr = &args->stats[id].usr[i];
+ int j;
+ for (j=0; j<usr->nbins; j++)
+ {
+ if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins
+ float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1);
+ const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s\t%d\t%.0f\t%d\t%d\t%d\n";
+ printf(fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
+ }
+ }
+ }
+ printf("# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]count\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=stats->m_indel-1; i>=0; i--)
+ if ( stats->deletions[i] ) printf("IDD\t%d\t%d\t%d\n", id,-i-1,stats->deletions[i]);
+ for (i=0; i<stats->m_indel; i++)
+ if ( stats->insertions[i] ) printf("IDD\t%d\t%d\t%d\n", id,i+1,stats->insertions[i]);
+ }
+ printf("# ST, Substitution types:\n# ST\t[2]id\t[3]type\t[4]count\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ int t;
+ for (t=0; t<15; t++)
+ {
+ if ( t>>2 == (t&3) ) continue;
+ printf("ST\t%d\t%c>%c\t%d\n", id, bcf_int2acgt(t>>2),bcf_int2acgt(t&3),args->stats[id].subst[t]);
+ }
+ }
+ if ( args->files->nreaders>1 && args->files->n_smpl )
+ {
+ printf("SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl);
+
+ int x;
+ for (x=0; x<2; x++)
+ {
+ gtcmp_t *stats;
+ if ( x==0 )
+ {
+ printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ stats = args->af_gts_snps;
+ }
+ else
+ {
+ printf("# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ stats = args->af_gts_indels;
+ }
+ uint64_t nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0};
+ for (i=0; i<args->m_af; i++)
+ {
+ int j, n = 0;
+ for (j=0; j<3; j++)
+ {
+ n += stats[i].m[j] + stats[i].mm[j];
+ nrd_m[j] += stats[i].m[j];
+ nrd_mm[j] += stats[i].mm[j];
+ }
+ if ( !i || !n ) continue; // skip singleton stats and empty bins
+ printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', 100.*(i-1)/(args->m_af-1));
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
+ printf("\t%f\t%"PRId32"\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n);
+ }
+
+ if ( x==0 )
+ {
+ printf("# NRD and discordance is calculated as follows:\n");
+ printf("# m .. number of matches\n");
+ printf("# x .. number of mismatches\n");
+ printf("# NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n");
+ printf("# RR discordance = xRR / (xRR + mRR)\n");
+ printf("# RA discordance = xRA / (xRA + mRA)\n");
+ printf("# AA discordance = xAA / (xAA + mAA)\n");
+ printf("# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
+ }
+ else
+ printf("# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
+ uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)];
+ uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)];
+ printf("NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i',
+ m+mm ? mm*100.0/(m+mm) : 0,
+ nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)] ? nrd_mm[T2S(GT_HOM_RR)]*100.0/(nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)]) : 0,
+ nrd_m[T2S(GT_HET_RA)]+nrd_mm[T2S(GT_HET_RA)] ? nrd_mm[T2S(GT_HET_RA)]*100.0/(nrd_m[T2S(GT_HET_RA)]+nrd_mm[T2S(GT_HET_RA)]) : 0,
+ nrd_m[T2S(GT_HOM_AA)]+nrd_mm[T2S(GT_HOM_AA)] ? nrd_mm[T2S(GT_HOM_AA)]*100.0/(nrd_m[T2S(GT_HOM_AA)]+nrd_mm[T2S(GT_HOM_AA)]) : 0
+ );
+ }
+
+ for (x=0; x<2; x++)
+ {
+ gtcmp_t *stats;
+ smpl_r_t *smpl_r_array;
+ if ( x==0 )
+ {
+ printf("# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
+ stats = args->smpl_gts_snps;
+ smpl_r_array = args->smpl_r_snps;
+ }
+ else
+ {
+ printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
+ stats = args->smpl_gts_indels;
+ smpl_r_array = args->smpl_r_indels;
+ }
+ for (i=0; i<args->files->n_smpl; i++)
+ {
+ uint64_t m = stats[i].m[T2S(GT_HET_RA)] + stats[i].m[T2S(GT_HOM_AA)];
+ uint64_t mm = stats[i].mm[T2S(GT_HOM_RR)] + stats[i].mm[T2S(GT_HET_RA)] + stats[i].mm[T2S(GT_HOM_AA)];
+ // Calculate r by formula 19.2 - Biostatistical Analysis 4th edition - Jerrold H. Zar
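+ // The smpl_r_t accumulators hold n, Sx=sum(x), Sy=sum(y), Sxx=sum(x^2), Syy=sum(y^2)
+ // and Sxy=sum(x*y); the machine formula computed below is
+ //   r = (Sxy - Sx*Sy/n) / sqrt( (Sxx - Sx*Sx/n) * (Syy - Sy*Sy/n) )
+ // and the GCsS/GCiS rows report r^2 as the dosage r-squared column.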
+ smpl_r_t *smpl_r = smpl_r_array + i;
+ double r = 0.0;
+ if (smpl_r->n) {
+ double sum_crossprod = smpl_r->xy-(smpl_r->x*smpl_r->y)/smpl_r->n;//per 17.3 machine formula
+ double x2_xx = smpl_r->x2-(smpl_r->x*smpl_r->x)/smpl_r->n;
+ double y2_yy = smpl_r->y2-(smpl_r->y*smpl_r->y)/smpl_r->n;
+ r = (sum_crossprod)/sqrt(x2_xx*y2_yy);
+ }
+ printf("GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
+ if (smpl_r->n && !isnan(r)) printf("\t%f\n", r*r);
+ else printf("\t"NA_STRING"\n");
+ }
+ }
+ }
+
+ printf("# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ long unsigned int sum = 0, sum_sites = 0;
+ for (i=0; i<stats->dp.m_vals; i++) { sum += stats->dp.vals[i]; sum_sites += stats->dp_sites.vals[i]; }
+ for (i=0; i<stats->dp.m_vals; i++)
+ {
+ if ( stats->dp.vals[i]==0 && stats->dp_sites.vals[i]==0 ) continue;
+ printf("DP\t%d\t", id);
+ if ( i==0 ) printf("<%d", stats->dp.min);
+ else if ( i+1==stats->dp.m_vals ) printf(">%d", stats->dp.max);
+ else printf("%d", idist_i2bin(&stats->dp,i));
+ printf("\t%"PRId64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0);
+ printf("\t%"PRId64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? stats->dp_sites.vals[i]*100./sum_sites : 0);
+ }
+ }
+
+ if ( args->files->n_smpl )
+ {
+ printf("# PSC, Per-sample counts\n# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=0; i<args->files->n_smpl; i++)
+ {
+ float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0;
+ printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\n", id,args->files->samples[i],
+ stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i],
+ stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i]);
+ }
+ }
+
+
+ printf("# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=0; i<args->files->n_smpl; i++)
+ {
+ int na = 0, in = 0, out = 0;
+ if ( args->exons )
+ {
+ na = stats->smpl_frm_shifts[i*3 + 0];
+ in = stats->smpl_frm_shifts[i*3 + 1];
+ out = stats->smpl_frm_shifts[i*3 + 2];
+ }
+ int nhom = stats->smpl_indel_homs[i];
+ int nhet = stats->smpl_indel_hets[i];
+ printf("PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom);
+ }
+ }
+
+ #ifdef HWE_STATS
+ printf("# HWE\n# HWE\t[2]id\t[3]1st ALT allele frequency\t[4]Number of observations\t[5]25th percentile\t[6]median\t[7]75th percentile\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=0; i<args->naf_hwe; i++) stats->af_hwe[i+args->naf_hwe] += stats->af_hwe[i]; // singletons
+ for (i=1; i<args->m_af; i++)
+ {
+ unsigned int sum_tot = 0, sum_tmp = 0;
+ int j, *ptr = &stats->af_hwe[i*args->naf_hwe];
+ for (j=0; j<args->naf_hwe; j++) sum_tot += ptr[j];
+ if ( !sum_tot ) continue;
+
+ int nprn = 3;
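+ // nprn tracks how many of the three percentile columns (25th, median, 75th) remain to be
+ // printed; as the cumulative fraction crosses 0.25, 0.5 and 0.75 the current bin value
+ // j/naf_hwe is emitted for each remaining column, so exactly three values are printed per row.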
+ printf("HWE\t%d\t%f\t%d",id,100.*(i-1)/(args->m_af-1),sum_tot);
+ for (j=0; j<args->naf_hwe; j++)
+ {
+ sum_tmp += ptr[j];
+ float frac = (float)sum_tmp/sum_tot;
+ if ( frac >= 0.75 )
+ {
+ while (nprn>0) { printf("\t%f", (float)j/args->naf_hwe); nprn--; }
+ break;
+ }
+ if ( frac >= 0.5 )
+ {
+ while (nprn>1) { printf("\t%f", (float)j/args->naf_hwe); nprn--; }
+ continue;
+ }
+ if ( frac >= 0.25 )
+ {
+ while (nprn>2) { printf("\t%f", (float)j/args->naf_hwe); nprn--; }
+ }
+ }
+ assert(nprn==0);
+ printf("\n");
+ }
+ }
+ #endif
+ }
+}
+
+static void usage(void)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Parses VCF or BCF and produces stats which can be plotted using plot-vcfstats.\n");
+ fprintf(pysamerr, " When two files are given, the program generates separate stats for intersection\n");
+ fprintf(pysamerr, " and the complements. By default only sites are compared, -s/-S must given to include\n");
+ fprintf(pysamerr, " also sample columns.\n");
+ fprintf(pysamerr, "Usage: bcftools stats [options] <A.vcf.gz> [<B.vcf.gz>]\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Options:\n");
+ fprintf(pysamerr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n");
+ fprintf(pysamerr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+ fprintf(pysamerr, " -d, --depth <int,int,int> depth distribution: min,max,bin size [0,500,1]\n");
+ fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(pysamerr, " -E, --exons <file.gz> tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n");
+ fprintf(pysamerr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(pysamerr, " -F, --fasta-ref <file> faidx indexed reference sequence file to determine INDEL context\n");
+ fprintf(pysamerr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
+ fprintf(pysamerr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " -s, --samples <list> list of samples for sample stats, \"-\" to include all samples\n");
+ fprintf(pysamerr, " -S, --samples-file <file> file of samples to include\n");
+ fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysamerr, " -u, --user-tstv <TAG[:min:max:n]> collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
+ fprintf(pysamerr, " -v, --verbose produce verbose per-site and per-sample output\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
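+/* Illustrative invocations (file names are placeholders, not prescribed by this patch):
+     bcftools stats -s - A.vcf.gz > A.stats               # single file, stats for all samples
+     bcftools stats -s - A.vcf.gz B.vcf.gz > AB.stats     # two files: intersection and complements
+   Either output can then be rendered with plot-vcfstats. */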
+
+int main_vcfstats(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->files = bcf_sr_init();
+ args->argc = argc; args->argv = argv;
+ args->dp_min = 0; args->dp_max = 500; args->dp_step = 1;
+ int regions_is_file = 0, targets_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"1st-allele-only",0,0,'1'},
+ {"include",1,0,'i'},
+ {"exclude",1,0,'e'},
+ {"help",0,0,'h'},
+ {"collapse",1,0,'c'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"verbose",0,0,'v'},
+ {"depth",1,0,'d'},
+ {"apply-filters",1,0,'f'},
+ {"exons",1,0,'E'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
+ {"split-by-ID",0,0,'I'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {"fasta-ref",1,0,'F'},
+ {"user-tstv",1,0,'u'},
+ {0,0,0,0}
+ };
+ while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:vIE:",loptions,NULL)) >= 0) {
+ switch (c) {
+ case 'u': add_user_stats(args,optarg); break;
+ case '1': args->first_allele_only = 1; break;
+ case 'F': args->ref_fname = optarg; break;
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'c':
+ if ( !strcmp(optarg,"snps") ) args->files->collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(optarg,"indels") ) args->files->collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"any") ) args->files->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"all") ) args->files->collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"some") ) args->files->collapse |= COLLAPSE_SOME;
+ else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE;
+ else error("The --collapse string \"%s\" not recognised.\n", optarg);
+ break;
+ case 'v': args->verbose_sites = 1; break;
+ case 'd':
+ if ( sscanf(optarg,"%d,%d,%d",&args->dp_min,&args->dp_max,&args->dp_step)!=3 )
+ error("Could not parse --depth %s\n", optarg);
+ if ( args->dp_min<0 || args->dp_min >= args->dp_max || args->dp_step > args->dp_max - args->dp_min + 1 )
+ error("Is this a typo? --depth %s\n", optarg);
+ break;
+ case 'f': args->files->apply_filters = optarg; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 'E': args->exons_fname = optarg; break;
+ case 's': args->samples_list = optarg; break;
+ case 'S': args->samples_list = optarg; args->samples_is_file = 1; break;
+ case 'I': args->split_by_id = 1; break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'h':
+ case '?': usage();
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+ char *fname = NULL;
+ if ( optind==argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage();
+ }
+ else fname = argv[optind];
+
+ if ( argc-optind>2 ) usage();
+ if ( argc-optind>1 )
+ {
+ args->files->require_index = 1;
+ if ( args->split_by_id ) error("Only one file can be given with -i.\n");
+ }
+ if ( !args->samples_list ) args->files->max_unpack = BCF_UN_INFO;
+ if ( args->targets_list && bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ while (fname)
+ {
+ if ( !bcf_sr_add_reader(args->files, fname) )
+ error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+ fname = ++optind < argc ? argv[optind] : NULL;
+ }
+
+ init_stats(args);
+ print_header(args);
+ do_vcf_stats(args);
+ print_stats(args);
+ destroy_stats(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
+
diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c
new file mode 100644
index 0000000..ed41595
--- /dev/null
+++ b/bcftools/vcfview.c
@@ -0,0 +1,746 @@
+/* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Shane McCarthy <sm15 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+#include "filter.h"
+#include "htslib/khash_str2int.h"
+
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+#define ALLELE_NONREF 1
+#define ALLELE_MINOR 2
+#define ALLELE_ALT1 3
+#define ALLELE_MAJOR 4
+#define ALLELE_NONMAJOR 5
+
+#define GT_NEED_HOM 1
+#define GT_NEED_HET 2
+#define GT_NO_HOM 3
+#define GT_NO_HET 4
+#define GT_NEED_MISSING 5
+#define GT_NO_MISSING 6
+
+typedef struct _args_t
+{
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // one of FLT_INCLUDE/FLT_EXCLUDE (-i or -e)
+
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr, *hnull, *hsub; // original header, sites-only header, subset header
+ char **argv, *format, *sample_names, *subset_fname, *targets_list, *regions_list;
+ int argc, clevel, n_threads, output_type, print_header, update_info, header_only, n_samples, *imap, calc_ac;
+ int trim_alts, sites_only, known, novel, min_alleles, max_alleles, private_vars, uncalled, phased;
+ int min_ac, min_ac_type, max_ac, max_ac_type, min_af_type, max_af_type, gt_type;
+ int *ac, mac;
+ float min_af, max_af;
+ char *fn_ref, *fn_out, **samples;
+ int sample_is_file, force_samples;
+ char *include_types, *exclude_types;
+ int include, exclude;
+ htsFile *out;
+}
+args_t;
+
+static void init_data(args_t *args)
+{
+ int i;
+ args->hdr = args->files->readers[0].header;
+
+ if (args->calc_ac && args->update_info)
+ {
+ bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">");
+ bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
+ }
+ bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");
+
+ // setup sample data
+ if (args->sample_names)
+ {
+ void *hdr_samples = khash_str2int_init();
+ for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
+ khash_str2int_inc(hdr_samples, bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
+
+ void *exclude = (args->sample_names[0]=='^') ? khash_str2int_init() : NULL;
+ int nsmpl;
+ char **smpl = NULL;
+ args->samples = NULL; args->n_samples = 0;
+ smpl = hts_readlist(exclude ? &args->sample_names[1] : args->sample_names, args->sample_is_file, &nsmpl);
+ if ( !smpl )
+ {
+ error("Could not read the list: \"%s\"\n", exclude ? &args->sample_names[1] : args->sample_names);
+ }
+
+ if ( exclude )
+ {
+ for (i=0; i<nsmpl; i++) {
+ if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
+ if (args->force_samples) {
+ fprintf(stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
+ } else {
+ error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
+ }
+ }
+ khash_str2int_inc(exclude, smpl[i]);
+ }
+
+ for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
+ {
+ if ( exclude && khash_str2int_has_key(exclude,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)) ) continue;
+ args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
+ args->samples[args->n_samples++] = strdup(bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
+ }
+ khash_str2int_destroy(exclude);
+ }
+ else
+ {
+ for (i=0; i<nsmpl; i++) {
+ if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
+ if (args->force_samples) {
+ fprintf(stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
+ continue;
+ } else {
+ error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
+ }
+ }
+ args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
+ args->samples[args->n_samples++] = strdup(smpl[i]);
+ }
+ }
+ for (i=0; i<nsmpl; i++) free(smpl[i]);
+ free(smpl);
+ khash_str2int_destroy(hdr_samples);
+ if (args->n_samples == 0) {
+ fprintf(stderr, "Warn: subsetting has removed all samples\n");
+ args->sites_only = 1;
+ }
+ }
+
+ if (args->n_samples)
+ args->imap = (int*)malloc(args->n_samples * sizeof(int));
+
+ // determine variant types to include/exclude
+ if (args->include_types || args->exclude_types) {
+ if (args->include_types && args->exclude_types) {
+ fprintf(stderr, "Error: only supply one of --include-types, --exclude-types options\n");
+ exit(1);
+ }
+ char **type_list = 0;
+ int m = 0, n = 0;
+ const char *q, *p;
+ for (q = p = args->include_types ? args->include_types : args->exclude_types;; ++p) {
+ if (*p == ',' || *p == 0) {
+ if (m == n) {
+ m = m? m<<1 : 16;
+ type_list = (char**)realloc(type_list, m * sizeof(char*));
+ }
+ type_list[n] = (char*)calloc(p - q + 1, 1);
+ strncpy(type_list[n++], q, p - q);
+ q = p + 1;
+ if (*p == 0) break;
+ }
+ }
+ type_list = (char**)realloc(type_list, n * sizeof(char*));
+
+ if (args->include_types) {
+ args->include = 0;
+ for (i = 0; i < n; ++i) {
+ if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP;
+ else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL;
+ else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP;
+ else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER;
+ else {
+ fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
+ fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
+ exit(1);
+ }
+ }
+ }
+ if (args->exclude_types) {
+ args->exclude = 0;
+ for (i = 0; i < n; ++i) {
+ if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP;
+ else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL;
+ else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP;
+ else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER;
+ else {
+ fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
+ fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
+ exit(1);
+ }
+ }
+ }
+ for (i = 0; i < n; ++i)
+ free(type_list[i]);
+ free(type_list);
+ }
+
+ // setup output
+ char modew[8];
+ strcpy(modew, "w");
+ if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel);
+ if (args->output_type==FT_BCF) strcat(modew, "bu"); // uncompressed BCF
+ else if (args->output_type & FT_BCF) strcat(modew, "b"); // compressed BCF
+ else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF
+ args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
+ if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out, args->n_threads);
+
+ // headers: hdr=full header, hsub=subset header, hnull=sites only header
+ if (args->sites_only){
+ args->hnull = bcf_hdr_subset(args->hdr, 0, 0, 0);
+ bcf_hdr_remove(args->hnull, BCF_HL_FMT, NULL);
+ }
+ if (args->n_samples > 0)
+ {
+ args->hsub = bcf_hdr_subset(args->hdr, args->n_samples, args->samples, args->imap);
+ if ( !args->hsub ) error("Error occurred while subsetting samples\n");
+ if ( args->n_samples != bcf_hdr_nsamples(args->hsub) )
+ {
+ int i;
+ for (i=0; i<args->n_samples; i++)
+ if ( args->imap[i]<0 ) error("Error: No such sample: \"%s\"\n", args->samples[i]);
+ }
+ }
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
+}
+
+static void destroy_data(args_t *args)
+{
+ int i;
+ if ( args->imap ) {
+ for (i = 0; i < args->n_samples; ++i)
+ free(args->samples[i]);
+ free(args->samples);
+ free(args->imap);
+ }
+ if (args->hnull) bcf_hdr_destroy(args->hnull);
+ if (args->hsub) bcf_hdr_destroy(args->hsub);
+ if ( args->filter )
+ filter_destroy(args->filter);
+ free(args->ac);
+}
+
+// true if all samples are phased.
+// haploid genotypes are considered phased
+// ./. => not phased, .|. => phased
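+// e.g. 0|1 or 1|1 counts as phased here, 0/1 or 1/1 does not (no phase bit set)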
+int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line)
+{
+ bcf_unpack(line, BCF_UN_FMT);
+ bcf_fmt_t *fmt_ptr = bcf_get_fmt(header, line, "GT");
+ int all_phased = 1;
+ if ( fmt_ptr )
+ {
+ int i, isample;
+ for (isample=0; isample<line->n_sample; isample++)
+ {
+ int sample_phased = 0;
+ #define BRANCH_INT(type_t,vector_end) { \
+ type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
+ for (i=0; i<fmt_ptr->n; i++) \
+ { \
+ if (fmt_ptr->n == 1 || (p[i] == vector_end && i == 1)) { sample_phased = 1; break; } /* haploid phased by definition */ \
+ if ( p[i] == vector_end ) { break; }; /* smaller ploidy */ \
+ if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \
+ if ((p[i])&1) { \
+ sample_phased = 1; \
+ break; \
+ } \
+ } \
+ }
+ switch (fmt_ptr->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
+ default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
+ }
+ #undef BRANCH_INT
+ if (!sample_phased) {
+ all_phased = 0;
+ break;
+ }
+ }
+ }
+ return all_phased;
+}
+
+int subset_vcf(args_t *args, bcf1_t *line)
+{
+ if ( args->min_alleles && line->n_allele < args->min_alleles ) return 0; // min alleles
+ if ( args->max_alleles && line->n_allele > args->max_alleles ) return 0; // max alleles
+ if (args->novel || args->known)
+ {
+ if ( args->novel && (line->d.id[0]!='.' || line->d.id[1]!=0) ) return 0; // skip sites which are known, ID != '.'
+ if ( args->known && line->d.id[0]=='.' && line->d.id[1]==0 ) return 0; // skip sites which are novel, ID == '.'
+ }
+
+ if (args->include || args->exclude)
+ {
+ int line_type = bcf_get_variant_types(line);
+ if ( args->include && !(line_type&args->include) ) return 0; // include only given variant types
+ if ( args->exclude && line_type&args->exclude ) return 0; // exclude given variant types
+ }
+
+ if ( args->filter )
+ {
+ int ret = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return 0; }
+ else if ( ret ) return 0;
+ }
+
+ hts_expand(int, line->n_allele, args->mac, args->ac);
+ int i, an = 0, non_ref_ac = 0;
+ if (args->calc_ac) {
+ bcf_calc_ac(args->hdr, line, args->ac, BCF_UN_INFO|BCF_UN_FMT); // get original AC and AN values from INFO field if available, otherwise calculate
+ for (i=1; i<line->n_allele; i++)
+ non_ref_ac += args->ac[i];
+ for (i=0; i<line->n_allele; i++)
+ an += args->ac[i];
+ }
+
+ if (args->n_samples)
+ {
+ int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
+ bcf_subset(args->hdr, line, args->n_samples, args->imap);
+ if (args->calc_ac) {
+ bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
+ an = 0;
+ for (i=0; i<line->n_allele; i++) {
+ args->ac[i] = ac_sub[i];
+ an += ac_sub[i];
+ }
+ for (i=1; i<line->n_allele; i++)
+ non_ref_ac_sub += ac_sub[i];
+ if (args->private_vars) {
+ if (args->private_vars == FLT_INCLUDE && !(non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub)) { free(ac_sub); return 0; } // select private sites
+ if (args->private_vars == FLT_EXCLUDE && non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub) { free(ac_sub); return 0; } // exclude private sites
+ }
+ non_ref_ac = non_ref_ac_sub;
+ }
+ free(ac_sub);
+ }
+
+ bcf_fmt_t *gt_fmt;
+ if ( args->gt_type && (gt_fmt=bcf_get_fmt(args->hdr,line,"GT")) )
+ {
+ int nhet = 0, nhom = 0, nmiss = 0;
+ for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
+ {
+ int type = bcf_gt_type(gt_fmt,i,NULL,NULL);
+ if ( type==GT_HET_RA || type==GT_HET_AA )
+ {
+ if ( args->gt_type==GT_NO_HET ) return 0;
+ nhet = 1;
+ }
+ else if ( type==GT_UNKN )
+ {
+ if ( args->gt_type==GT_NO_MISSING ) return 0;
+ nmiss = 1;
+ }
+ else
+ {
+ if ( args->gt_type==GT_NO_HOM ) return 0;
+ nhom = 1;
+ }
+ }
+ if ( args->gt_type==GT_NEED_HOM && !nhom ) return 0;
+ else if ( args->gt_type==GT_NEED_HET && !nhet ) return 0;
+ else if ( args->gt_type==GT_NEED_MISSING && !nmiss ) return 0;
+ }
+
+ int minor_ac = 0;
+ int major_ac = 0;
+ if ( args->calc_ac )
+ {
+ minor_ac = args->ac[0];
+ major_ac = args->ac[0];
+ for (i=1; i<line->n_allele; i++){
+ if (args->ac[i] < minor_ac) { minor_ac = args->ac[i]; }
+ if (args->ac[i] > major_ac) { major_ac = args->ac[i]; }
+ }
+ }
+
+ if (args->min_ac)
+ {
+ if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
+ else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
+ else if (args->min_ac_type == ALLELE_ALT1 && args->min_ac>args->ac[1]) return 0; // min 1st alternate AC
+ else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
+ else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
+ }
+ if (args->max_ac)
+ {
+ if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
+ else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
+ else if (args->max_ac_type == ALLELE_ALT1 && args->max_ac<args->ac[1]) return 0; // max 1st alternate AC
+ else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
+ else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
+ }
+ if (args->min_af)
+ {
+ if (an == 0) return 0; // freq not defined, skip site
+ if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
+ else if (args->min_af_type == ALLELE_MINOR && args->min_af>minor_ac/(double)an) return 0; // min minor AF
+ else if (args->min_af_type == ALLELE_ALT1 && args->min_af>args->ac[1]/(double)an) return 0; // min 1st alternate AF
+ else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
+ else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
+ }
+ if (args->max_af)
+ {
+ if (an == 0) return 0; // freq not defined, skip site
+ if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
+ else if (args->max_af_type == ALLELE_MINOR && args->max_af<minor_ac/(double)an) return 0; // max minor AF
+ else if (args->max_af_type == ALLELE_ALT1 && args->max_af<args->ac[1]/(double)an) return 0; // max 1st alternate AF
+ else if (args->max_af_type == ALLELE_MAJOR && args->max_af < major_ac/(double)an) return 0; // max major AF
+ else if (args->max_af_type == ALLELE_NONMAJOR && args->max_af < (an-major_ac)/(double)an) return 0; // max non-major AF
+ }
+ if (args->uncalled) {
+ if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
+ if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
+ }
+ if (args->calc_ac && args->update_info) {
+ bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
+ bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
+ }
+ if (args->trim_alts)
+ {
+ int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
+ if ( ret==-1 ) error("Error: some GT index is out of bounds at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
+ }
+ if (args->phased) {
+ int phased = bcf_all_phased(args->hdr, line);
+ if (args->phased == FLT_INCLUDE && !phased) { return 0; } // skip unphased
+ if (args->phased == FLT_EXCLUDE && phased) { return 0; } // skip phased
+ }
+ if (args->sites_only) bcf_subset(args->hsub ? args->hsub : args->hdr, line, 0, 0);
+ return 1;
+}
+
+void set_allele_type (int *atype, char *atype_string)
+{
+ *atype = ALLELE_NONREF;
+ if (strcmp(atype_string, "minor") == 0) {
+ *atype = ALLELE_MINOR;
+ }
+ else if (strcmp(atype_string, "alt1") == 0) {
+ *atype = ALLELE_ALT1;
+ }
+ else if (strcmp(atype_string, "nref") == 0) {
+ *atype = ALLELE_NONREF;
+ }
+ else if (strcmp(atype_string, "major") == 0) {
+ *atype = ALLELE_MAJOR;
+ }
+ else if (strcmp(atype_string, "nonmajor") == 0) {
+ *atype = ALLELE_NONMAJOR;
+ }
+ else {
+ error("Error: allele type not recognised. Expected one of nref|alt1|minor|major|nonmajor, got \"%s\".\n", atype_string);
+ }
+}
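+// Note: the <int>[:<type>] / <float>[:<type>] arguments to -c/-C/-q/-Q are split in
+// main_vcfview() with sscanf("%d:%s" / "%f:%s"); a bare number defaults to the "nref" type.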
+
+static void usage(args_t *args)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: VCF/BCF conversion, view, subset and filter VCF/BCF files.\n");
+ fprintf(stderr, "Usage: bcftools view [options] <in.vcf.gz> [region1 [...]]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Output options:\n");
+ fprintf(stderr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n");
+ fprintf(stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n");
+ fprintf(stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
+ fprintf(stderr, " -o, --output-file <file> output file name [stdout]\n");
+ fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " -t, --targets [^]<region> similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
+ fprintf(stderr, " -T, --targets-file [^]<file> similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
+ fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Subset options:\n");
+ fprintf(stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n");
+ fprintf(stderr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
+ fprintf(stderr, " -s, --samples [^]<list> comma separated list of samples to include (or exclude with \"^\" prefix)\n");
+ fprintf(stderr, " -S, --samples-file [^]<file> file of samples to include (or exclude with \"^\" prefix)\n");
+ fprintf(stderr, " --force-samples only warn about unknown subset samples\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Filter options:\n");
+ fprintf(stderr, " -c/C, --min-ac/--max-ac <int>[:<type>] minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n");
+ fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
+ fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(stderr, " -g, --genotype [^]<hom|het|miss> require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n");
+ fprintf(stderr, " -i/e, --include/--exclude <expr> select/exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -k/n, --known/--novel select known/novel sites only (ID is not/is '.')\n");
+ fprintf(stderr, " -m/M, --min-alleles/--max-alleles <int> minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n");
+ fprintf(stderr, " -p/P, --phased/--exclude-phased select/exclude sites where all samples are phased\n");
+ fprintf(stderr, " -q/Q, --min-af/--max-af <float>[:<type>] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
+ fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
+ fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n");
+ fprintf(stderr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,other [null]\n");
+ fprintf(stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
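+/* Illustrative invocations (file and sample names are placeholders):
+     bcftools view -m2 -M2 -v snps in.vcf.gz                 # biallelic SNPs only
+     bcftools view -s NA0001,NA0002 -a -Oz -o sub.vcf.gz in.vcf.gz
+                                                             # subset two samples, trim unused ALTs, bgzipped VCF
+   See the option list above for the full set of filters. */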
+
+int main_vcfview(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->files = bcf_sr_init();
+ args->clevel = -1;
+ args->print_header = 1;
+ args->update_info = 1;
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ int targets_is_file = 0, regions_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"genotype",required_argument,NULL,'g'},
+ {"compression-level",required_argument,NULL,'l'},
+ {"threads",required_argument,NULL,9},
+ {"header-only",no_argument,NULL,'h'},
+ {"no-header",no_argument,NULL,'H'},
+ {"exclude",required_argument,NULL,'e'},
+ {"include",required_argument,NULL,'i'},
+ {"trim-alt-alleles",no_argument,NULL,'a'},
+ {"no-update",no_argument,NULL,'I'},
+ {"drop-genotypes",no_argument,NULL,'G'},
+ {"private",no_argument,NULL,'x'},
+ {"exclude-private",no_argument,NULL,'X'},
+ {"uncalled",no_argument,NULL,'u'},
+ {"exclude-uncalled",no_argument,NULL,'U'},
+ {"apply-filters",required_argument,NULL,'f'},
+ {"known",no_argument,NULL,'k'},
+ {"novel",no_argument,NULL,'n'},
+ {"min-alleles",required_argument,NULL,'m'},
+ {"max-alleles",required_argument,NULL,'M'},
+ {"samples",required_argument,NULL,'s'},
+ {"samples-file",required_argument,NULL,'S'},
+ {"force-samples",no_argument,NULL,1},
+ {"output-type",required_argument,NULL,'O'},
+ {"output-file",required_argument,NULL,'o'},
+ {"types",required_argument,NULL,'v'},
+ {"exclude-types",required_argument,NULL,'V'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"min-ac",required_argument,NULL,'c'},
+ {"max-ac",required_argument,NULL,'C'},
+ {"min-af",required_argument,NULL,'q'},
+ {"max-af",required_argument,NULL,'Q'},
+ {"phased",no_argument,NULL,'p'},
+ {"exclude-phased",no_argument,NULL,'P'},
+ {NULL,0,NULL,0}
+ };
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:auUhHc:C:Ii:e:xXpPq:Q:g:",loptions,NULL)) >= 0)
+ {
+ char allele_type[8] = "nref";
+ switch (c)
+ {
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ };
+ break;
+ case 'l':
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg);
+ args->output_type |= FT_GZ;
+ break;
+ case 'o': args->fn_out = optarg; break;
+ case 'H': args->print_header = 0; break;
+ case 'h': args->header_only = 1; break;
+
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+
+ case 's': args->sample_names = optarg; break;
+ case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
+ case 1 : args->force_samples = 1; break;
+ case 'a': args->trim_alts = 1; args->calc_ac = 1; break;
+ case 'I': args->update_info = 0; break;
+ case 'G': args->sites_only = 1; break;
+
+ case 'f': args->files->apply_filters = optarg; break;
+ case 'k': args->known = 1; break;
+ case 'n': args->novel = 1; break;
+ case 'm':
+ args->min_alleles = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg);
+ break;
+ case 'M':
+ args->max_alleles = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg);
+ break;
+ case 'v': args->include_types = optarg; break;
+ case 'V': args->exclude_types = optarg; break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+
+ case 'c':
+ {
+ args->min_ac_type = ALLELE_NONREF;
+ if ( sscanf(optarg,"%d:%s",&args->min_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->min_ac)!=1 )
+ error("Error: Could not parse --min-ac %s\n", optarg);
+ set_allele_type(&args->min_ac_type, allele_type);
+ args->calc_ac = 1;
+ break;
+ }
+ case 'C':
+ {
+ args->max_ac_type = ALLELE_NONREF;
+ if ( sscanf(optarg,"%d:%s",&args->max_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->max_ac)!=1 )
+ error("Error: Could not parse --max-ac %s\n", optarg);
+ set_allele_type(&args->max_ac_type, allele_type);
+ args->calc_ac = 1;
+ break;
+ }
+ case 'q':
+ {
+ args->min_af_type = ALLELE_NONREF;
+ if ( sscanf(optarg,"%f:%s",&args->min_af, allele_type)!=2 && sscanf(optarg,"%f",&args->min_af)!=1 )
+ error("Error: Could not parse --min_af %s\n", optarg);
+ set_allele_type(&args->min_af_type, allele_type);
+ args->calc_ac = 1;
+ break;
+ }
+ case 'Q':
+ {
+ args->max_af_type = ALLELE_NONREF;
+ if ( sscanf(optarg,"%f:%s",&args->max_af, allele_type)!=2 && sscanf(optarg,"%f",&args->max_af)!=1 )
+ error("Error: Could not parse --min_af %s\n", optarg);
+ set_allele_type(&args->max_af_type, allele_type);
+ args->calc_ac = 1;
+ break;
+ }
+
+ case 'x': args->private_vars |= FLT_INCLUDE; args->calc_ac = 1; break;
+ case 'X': args->private_vars |= FLT_EXCLUDE; args->calc_ac = 1; break;
+ case 'u': args->uncalled |= FLT_INCLUDE; args->calc_ac = 1; break;
+ case 'U': args->uncalled |= FLT_EXCLUDE; args->calc_ac = 1; break;
+ case 'p': args->phased |= FLT_INCLUDE; break; // phased
+ case 'P': args->phased |= FLT_EXCLUDE; break; // exclude-phased
+ case 'g':
+ {
+ if ( !strcasecmp(optarg,"hom") ) args->gt_type = GT_NEED_HOM;
+ else if ( !strcasecmp(optarg,"het") ) args->gt_type = GT_NEED_HET;
+ else if ( !strcasecmp(optarg,"miss") ) args->gt_type = GT_NEED_MISSING;
+ else if ( !strcasecmp(optarg,"^hom") ) args->gt_type = GT_NO_HOM;
+ else if ( !strcasecmp(optarg,"^het") ) args->gt_type = GT_NO_HET;
+ else if ( !strcasecmp(optarg,"^miss") ) args->gt_type = GT_NO_MISSING;
+ else error("The argument to -g not recognised. Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg);
+ break;
+ }
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case '?': usage(args);
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n");
+ if ( args->private_vars > FLT_EXCLUDE ) error("Only one of -x or -X can be given.\n");
+ if ( args->uncalled > FLT_EXCLUDE ) error("Only one of -u or -U can be given.\n");
+ if ( args->phased > FLT_EXCLUDE ) error("Only one of -p or -P can be given.\n");
+
+ if ( args->sample_names && args->update_info) args->calc_ac = 1;
+
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(args);
+ }
+ else fname = argv[optind];
+
+ // read in the regions from the command line
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ else if ( optind+1 < argc )
+ {
+ int i;
+ kstring_t tmp = {0,0,0};
+ kputs(argv[optind+1],&tmp);
+ for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); }
+ if ( bcf_sr_set_regions(args->files, tmp.s, 0)<0 )
+ error("Failed to read the regions: %s\n", tmp.s);
+ free(tmp.s);
+ }
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+
+ init_data(args);
+ bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? args->hsub : args->hdr);
+ if (args->print_header)
+ bcf_hdr_write(args->out, out_hdr);
+ else if ( args->output_type & FT_BCF )
+ error("BCF output requires header, cannot proceed with -H\n");
+ if (!args->header_only)
+ {
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = args->files->readers[0].buffer[0];
+ if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n");
+ if ( subset_vcf(args, line) )
+ bcf_write1(args->out, out_hdr, line);
+ }
+ }
+ hts_close(args->out);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c
new file mode 100644
index 0000000..a6a0cc0
--- /dev/null
+++ b/bcftools/vcfview.c.pysam.c
@@ -0,0 +1,748 @@
+#include "pysam.h"
+
+/* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Shane McCarthy <sm15 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+#include "filter.h"
+#include "htslib/khash_str2int.h"
+
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+#define ALLELE_NONREF 1
+#define ALLELE_MINOR 2
+#define ALLELE_ALT1 3
+#define ALLELE_MAJOR 4
+#define ALLELE_NONMAJOR 5
+
+#define GT_NEED_HOM 1
+#define GT_NEED_HET 2
+#define GT_NO_HOM 3
+#define GT_NO_HET 4
+#define GT_NEED_MISSING 5
+#define GT_NO_MISSING 6
+
+typedef struct _args_t
+{
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // one of FLT_INCLUDE/FLT_EXCLUDE (-i or -e)
+
+ bcf_srs_t *files;
+ bcf_hdr_t *hdr, *hnull, *hsub; // original header, sites-only header, subset header
+ char **argv, *format, *sample_names, *subset_fname, *targets_list, *regions_list;
+ int argc, clevel, n_threads, output_type, print_header, update_info, header_only, n_samples, *imap, calc_ac;
+ int trim_alts, sites_only, known, novel, min_alleles, max_alleles, private_vars, uncalled, phased;
+ int min_ac, min_ac_type, max_ac, max_ac_type, min_af_type, max_af_type, gt_type;
+ int *ac, mac;
+ float min_af, max_af;
+ char *fn_ref, *fn_out, **samples;
+ int sample_is_file, force_samples;
+ char *include_types, *exclude_types;
+ int include, exclude;
+ htsFile *out;
+}
+args_t;
+
+static void init_data(args_t *args)
+{
+ int i;
+ args->hdr = args->files->readers[0].header;
+
+ if (args->calc_ac && args->update_info)
+ {
+ bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">");
+ bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
+ }
+ bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");
+
+ // setup sample data
+ if (args->sample_names)
+ {
+ void *hdr_samples = khash_str2int_init();
+ for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
+ khash_str2int_inc(hdr_samples, bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
+
+ void *exclude = (args->sample_names[0]=='^') ? khash_str2int_init() : NULL;
+ int nsmpl;
+ char **smpl = NULL;
+ args->samples = NULL; args->n_samples = 0;
+ smpl = hts_readlist(exclude ? &args->sample_names[1] : args->sample_names, args->sample_is_file, &nsmpl);
+ if ( !smpl )
+ {
+ error("Could not read the list: \"%s\"\n", exclude ? &args->sample_names[1] : args->sample_names);
+ }
+
+ if ( exclude )
+ {
+ for (i=0; i<nsmpl; i++) {
+ if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
+ if (args->force_samples) {
+ fprintf(pysamerr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
+ } else {
+ error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
+ }
+ }
+ khash_str2int_inc(exclude, smpl[i]);
+ }
+
+ for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
+ {
+ if ( exclude && khash_str2int_has_key(exclude,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)) ) continue;
+ args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
+ args->samples[args->n_samples++] = strdup(bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
+ }
+ khash_str2int_destroy(exclude);
+ }
+ else
+ {
+ for (i=0; i<nsmpl; i++) {
+ if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
+ if (args->force_samples) {
+ fprintf(pysamerr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
+ continue;
+ } else {
+ error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
+ }
+ }
+ args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
+ args->samples[args->n_samples++] = strdup(smpl[i]);
+ }
+ }
+ for (i=0; i<nsmpl; i++) free(smpl[i]);
+ free(smpl);
+ khash_str2int_destroy(hdr_samples);
+ if (args->n_samples == 0) {
+ fprintf(pysamerr, "Warn: subsetting has removed all samples\n");
+ args->sites_only = 1;
+ }
+ }
+
+ if (args->n_samples)
+ args->imap = (int*)malloc(args->n_samples * sizeof(int));
+
+ // determine variant types to include/exclude
+ if (args->include_types || args->exclude_types) {
+ if (args->include_types && args->exclude_types) {
+ fprintf(pysamerr, "Error: only supply one of --include-types, --exclude-types options\n");
+ exit(1);
+ }
+ char **type_list = 0;
+ int m = 0, n = 0;
+ const char *q, *p;
+ for (q = p = args->include_types ? args->include_types : args->exclude_types;; ++p) {
+ if (*p == ',' || *p == 0) {
+ if (m == n) {
+ m = m? m<<1 : 16;
+ type_list = (char**)realloc(type_list, m * sizeof(char*));
+ }
+ type_list[n] = (char*)calloc(p - q + 1, 1);
+ strncpy(type_list[n++], q, p - q);
+ q = p + 1;
+ if (*p == 0) break;
+ }
+ }
+ type_list = (char**)realloc(type_list, n * sizeof(char*));
+
+ if (args->include_types) {
+ args->include = 0;
+ for (i = 0; i < n; ++i) {
+ if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP;
+ else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL;
+ else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP;
+ else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER;
+ else {
+ fprintf(pysamerr, "[E::%s] unknown type\n", type_list[i]);
+ fprintf(pysamerr, "Accepted types are snps, indels, mnps, other\n");
+ exit(1);
+ }
+ }
+ }
+ if (args->exclude_types) {
+ args->exclude = 0;
+ for (i = 0; i < n; ++i) {
+ if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP;
+ else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL;
+ else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP;
+ else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER;
+ else {
+ fprintf(pysamerr, "[E::%s] unknown type\n", type_list[i]);
+ fprintf(pysamerr, "Accepted types are snps, indels, mnps, other\n");
+ exit(1);
+ }
+ }
+ }
+ for (i = 0; i < n; ++i)
+ free(type_list[i]);
+ free(type_list);
+ }
+
+ // setup output
+ char modew[8];
+ strcpy(modew, "w");
+ if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel);
+ if (args->output_type==FT_BCF) strcat(modew, "bu"); // uncompressed BCF
+ else if (args->output_type & FT_BCF) strcat(modew, "b"); // compressed BCF
+ else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF
+ args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
+ if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
+ if ( args->n_threads ) hts_set_threads(args->out, args->n_threads);
+
+ // headers: hdr=full header, hsub=subset header, hnull=sites only header
+ if (args->sites_only){
+ args->hnull = bcf_hdr_subset(args->hdr, 0, 0, 0);
+ bcf_hdr_remove(args->hnull, BCF_HL_FMT, NULL);
+ }
+ if (args->n_samples > 0)
+ {
+ args->hsub = bcf_hdr_subset(args->hdr, args->n_samples, args->samples, args->imap);
+ if ( !args->hsub ) error("Error occurred while subsetting samples\n");
+ if ( args->n_samples != bcf_hdr_nsamples(args->hsub) )
+ {
+ int i;
+ for (i=0; i<args->n_samples; i++)
+ if ( args->imap[i]<0 ) error("Error: No such sample: \"%s\"\n", args->samples[i]);
+ }
+ }
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
+}
+
+static void destroy_data(args_t *args)
+{
+ int i;
+ if ( args->imap ) {
+ for (i = 0; i < args->n_samples; ++i)
+ free(args->samples[i]);
+ free(args->samples);
+ free(args->imap);
+ }
+ if (args->hnull) bcf_hdr_destroy(args->hnull);
+ if (args->hsub) bcf_hdr_destroy(args->hsub);
+ if ( args->filter )
+ filter_destroy(args->filter);
+ free(args->ac);
+}
+
+// true if all samples are phased.
+// haploid genotypes are considered phased
+// ./. => not phased, .|. => phased
+int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line)
+{
+ bcf_unpack(line, BCF_UN_FMT);
+ bcf_fmt_t *fmt_ptr = bcf_get_fmt(header, line, "GT");
+ int all_phased = 1;
+ if ( fmt_ptr )
+ {
+ int i, isample;
+ for (isample=0; isample<line->n_sample; isample++)
+ {
+ int sample_phased = 0;
+ #define BRANCH_INT(type_t,vector_end) { \
+ type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
+ for (i=0; i<fmt_ptr->n; i++) \
+ { \
+ if (fmt_ptr->n == 1 || (p[i] == vector_end && i == 1)) { sample_phased = 1; break; } /* haploid phased by definition */ \
+ if ( p[i] == vector_end ) { break; }; /* smaller ploidy */ \
+ if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \
+ if ((p[i])&1) { \
+ sample_phased = 1; \
+ break; \
+ } \
+ } \
+ }
+ switch (fmt_ptr->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
+ default: fprintf(pysamerr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
+ }
+ #undef BRANCH_INT
+ if (!sample_phased) {
+ all_phased = 0;
+ break;
+ }
+ }
+ }
+ return all_phased;
+}
+
+int subset_vcf(args_t *args, bcf1_t *line)
+{
+ if ( args->min_alleles && line->n_allele < args->min_alleles ) return 0; // min alleles
+ if ( args->max_alleles && line->n_allele > args->max_alleles ) return 0; // max alleles
+ if (args->novel || args->known)
+ {
+ if ( args->novel && (line->d.id[0]!='.' || line->d.id[1]!=0) ) return 0; // skip sites which are known, ID != '.'
+ if ( args->known && line->d.id[0]=='.' && line->d.id[1]==0 ) return 0; // skip sites which are novel, ID == '.'
+ }
+
+ if (args->include || args->exclude)
+ {
+ int line_type = bcf_get_variant_types(line);
+ if ( args->include && !(line_type&args->include) ) return 0; // include only given variant types
+ if ( args->exclude && line_type&args->exclude ) return 0; // exclude given variant types
+ }
+
+ if ( args->filter )
+ {
+ int ret = filter_test(args->filter, line, NULL);
+ if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return 0; }
+ else if ( ret ) return 0;
+ }
+
+ hts_expand(int, line->n_allele, args->mac, args->ac);
+ int i, an = 0, non_ref_ac = 0;
+ if (args->calc_ac) {
+ bcf_calc_ac(args->hdr, line, args->ac, BCF_UN_INFO|BCF_UN_FMT); // get original AC and AN values from INFO field if available, otherwise calculate
+ for (i=1; i<line->n_allele; i++)
+ non_ref_ac += args->ac[i];
+ for (i=0; i<line->n_allele; i++)
+ an += args->ac[i];
+ }
+
+ if (args->n_samples)
+ {
+ int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
+ bcf_subset(args->hdr, line, args->n_samples, args->imap);
+ if (args->calc_ac) {
+ bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
+ an = 0;
+ for (i=0; i<line->n_allele; i++) {
+ args->ac[i] = ac_sub[i];
+ an += ac_sub[i];
+ }
+ for (i=1; i<line->n_allele; i++)
+ non_ref_ac_sub += ac_sub[i];
+ if (args->private_vars) {
+ if (args->private_vars == FLT_INCLUDE && !(non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub)) { free(ac_sub); return 0; } // select private sites
+ if (args->private_vars == FLT_EXCLUDE && non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub) { free(ac_sub); return 0; } // exclude private sites
+ }
+ non_ref_ac = non_ref_ac_sub;
+ }
+ free(ac_sub);
+ }
+
+ bcf_fmt_t *gt_fmt;
+ if ( args->gt_type && (gt_fmt=bcf_get_fmt(args->hdr,line,"GT")) )
+ {
+ int nhet = 0, nhom = 0, nmiss = 0;
+ for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
+ {
+ int type = bcf_gt_type(gt_fmt,i,NULL,NULL);
+ if ( type==GT_HET_RA || type==GT_HET_AA )
+ {
+ if ( args->gt_type==GT_NO_HET ) return 0;
+ nhet = 1;
+ }
+ else if ( type==GT_UNKN )
+ {
+ if ( args->gt_type==GT_NO_MISSING ) return 0;
+ nmiss = 1;
+ }
+ else
+ {
+ if ( args->gt_type==GT_NO_HOM ) return 0;
+ nhom = 1;
+ }
+ }
+ if ( args->gt_type==GT_NEED_HOM && !nhom ) return 0;
+ else if ( args->gt_type==GT_NEED_HET && !nhet ) return 0;
+ else if ( args->gt_type==GT_NEED_MISSING && !nmiss ) return 0;
+ }
+
+ int minor_ac = 0;
+ int major_ac = 0;
+ if ( args->calc_ac )
+ {
+ minor_ac = args->ac[0];
+ major_ac = args->ac[0];
+ for (i=1; i<line->n_allele; i++){
+ if (args->ac[i] < minor_ac) { minor_ac = args->ac[i]; }
+ if (args->ac[i] > major_ac) { major_ac = args->ac[i]; }
+ }
+ }
+
+ if (args->min_ac)
+ {
+ if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
+ else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
+ else if (args->min_ac_type == ALLELE_ALT1 && args->min_ac>args->ac[1]) return 0; // min 1st alternate AC
+ else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
+ else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
+ }
+ if (args->max_ac)
+ {
+ if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
+ else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
+ else if (args->max_ac_type == ALLELE_ALT1 && args->max_ac<args->ac[1]) return 0; // max 1st alternate AC
+ else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
+ else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
+ }
+ if (args->min_af)
+ {
+ if (an == 0) return 0; // freq not defined, skip site
+ if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
+ else if (args->min_af_type == ALLELE_MINOR && args->min_af>minor_ac/(double)an) return 0; // min minor AF
+ else if (args->min_af_type == ALLELE_ALT1 && args->min_af>args->ac[1]/(double)an) return 0; // min 1st alternate AF
+ else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
+ else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
+ }
+ if (args->max_af)
+ {
+ if (an == 0) return 0; // freq not defined, skip site
+ if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
+ else if (args->max_af_type == ALLELE_MINOR && args->max_af<minor_ac/(double)an) return 0; // max minor AF
+ else if (args->max_af_type == ALLELE_ALT1 && args->max_af<args->ac[1]/(double)an) return 0; // max 1st alternate AF
+ else if (args->max_af_type == ALLELE_MAJOR && args->max_af < major_ac/(double)an) return 0; // max major AF
+ else if (args->max_af_type == ALLELE_NONMAJOR && args->max_af < (an-major_ac)/(double)an) return 0; // max non-major AF
+ }
+ if (args->uncalled) {
+ if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
+ if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
+ }
+ if (args->calc_ac && args->update_info) {
+ bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
+ bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
+ }
+ if (args->trim_alts)
+ {
+ int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
+ if ( ret==-1 ) error("Error: some GT index is out of bounds at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
+ }
+ if (args->phased) {
+ int phased = bcf_all_phased(args->hdr, line);
+ if (args->phased == FLT_INCLUDE && !phased) { return 0; } // skip unphased
+ if (args->phased == FLT_EXCLUDE && phased) { return 0; } // skip phased
+ }
+ if (args->sites_only) bcf_subset(args->hsub ? args->hsub : args->hdr, line, 0, 0);
+ return 1;
+}
+
+void set_allele_type (int *atype, char *atype_string)
+{
+ *atype = ALLELE_NONREF;
+ if (strcmp(atype_string, "minor") == 0) {
+ *atype = ALLELE_MINOR;
+ }
+ else if (strcmp(atype_string, "alt1") == 0) {
+ *atype = ALLELE_ALT1;
+ }
+ else if (strcmp(atype_string, "nref") == 0) {
+ *atype = ALLELE_NONREF;
+ }
+ else if (strcmp(atype_string, "major") == 0) {
+ *atype = ALLELE_MAJOR;
+ }
+ else if (strcmp(atype_string, "nonmajor") == 0) {
+ *atype = ALLELE_NONMAJOR;
+ }
+ else {
+ error("Error: allele type not recognised. Expected one of nref|alt1|minor|major|nonmajor, got \"%s\".\n", atype_string);
+ }
+}
+
+static void usage(args_t *args)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: VCF/BCF conversion, view, subset and filter VCF/BCF files.\n");
+ fprintf(pysamerr, "Usage: bcftools view [options] <in.vcf.gz> [region1 [...]]\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Output options:\n");
+ fprintf(pysamerr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n");
+ fprintf(pysamerr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n");
+ fprintf(pysamerr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
+ fprintf(pysamerr, " -o, --output-file <file> output file name [stdout]\n");
+ fprintf(pysamerr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysamerr, " -t, --targets [^]<region> similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
+ fprintf(pysamerr, " -T, --targets-file [^]<file> similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
+ fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Subset options:\n");
+ fprintf(pysamerr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n");
+ fprintf(pysamerr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
+ fprintf(pysamerr, " -s, --samples [^]<list> comma separated list of samples to include (or exclude with \"^\" prefix)\n");
+ fprintf(pysamerr, " -S, --samples-file [^]<file> file of samples to include (or exclude with \"^\" prefix)\n");
+ fprintf(pysamerr, " --force-samples only warn about unknown subset samples\n");
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Filter options:\n");
+ fprintf(pysamerr, " -c/C, --min-ac/--max-ac <int>[:<type>] minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n");
+ fprintf(pysamerr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
+ fprintf(pysamerr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(pysamerr, " -g, --genotype [^]<hom|het|miss> require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n");
+ fprintf(pysamerr, " -i/e, --include/--exclude <expr> select/exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(pysamerr, " -k/n, --known/--novel select known/novel sites only (ID is not/is '.')\n");
+ fprintf(pysamerr, " -m/M, --min-alleles/--max-alleles <int> minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n");
+ fprintf(pysamerr, " -p/P, --phased/--exclude-phased select/exclude sites where all samples are phased\n");
+ fprintf(pysamerr, " -q/Q, --min-af/--max-af <float>[:<type>] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
+ fprintf(pysamerr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
+ fprintf(pysamerr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n");
+ fprintf(pysamerr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,other [null]\n");
+ fprintf(pysamerr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
+ fprintf(pysamerr, "\n");
+ exit(1);
+}
+
+int main_vcfview(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->files = bcf_sr_init();
+ args->clevel = -1;
+ args->print_header = 1;
+ args->update_info = 1;
+ args->output_type = FT_VCF;
+ args->n_threads = 0;
+ int targets_is_file = 0, regions_is_file = 0;
+
+ static struct option loptions[] =
+ {
+ {"genotype",required_argument,NULL,'g'},
+ {"compression-level",required_argument,NULL,'l'},
+ {"threads",required_argument,NULL,9},
+ {"header-only",no_argument,NULL,'h'},
+ {"no-header",no_argument,NULL,'H'},
+ {"exclude",required_argument,NULL,'e'},
+ {"include",required_argument,NULL,'i'},
+ {"trim-alt-alleles",no_argument,NULL,'a'},
+ {"no-update",no_argument,NULL,'I'},
+ {"drop-genotypes",no_argument,NULL,'G'},
+ {"private",no_argument,NULL,'x'},
+ {"exclude-private",no_argument,NULL,'X'},
+ {"uncalled",no_argument,NULL,'u'},
+ {"exclude-uncalled",no_argument,NULL,'U'},
+ {"apply-filters",required_argument,NULL,'f'},
+ {"known",no_argument,NULL,'k'},
+ {"novel",no_argument,NULL,'n'},
+ {"min-alleles",required_argument,NULL,'m'},
+ {"max-alleles",required_argument,NULL,'M'},
+ {"samples",required_argument,NULL,'s'},
+ {"samples-file",required_argument,NULL,'S'},
+ {"force-samples",no_argument,NULL,1},
+ {"output-type",required_argument,NULL,'O'},
+ {"output-file",required_argument,NULL,'o'},
+ {"types",required_argument,NULL,'v'},
+ {"exclude-types",required_argument,NULL,'V'},
+ {"targets",required_argument,NULL,'t'},
+ {"targets-file",required_argument,NULL,'T'},
+ {"regions",required_argument,NULL,'r'},
+ {"regions-file",required_argument,NULL,'R'},
+ {"min-ac",required_argument,NULL,'c'},
+ {"max-ac",required_argument,NULL,'C'},
+ {"min-af",required_argument,NULL,'q'},
+ {"max-af",required_argument,NULL,'Q'},
+ {"phased",no_argument,NULL,'p'},
+ {"exclude-phased",no_argument,NULL,'P'},
+ {NULL,0,NULL,0}
+ };
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:auUhHc:C:Ii:e:xXpPq:Q:g:",loptions,NULL)) >= 0)
+ {
+ char allele_type[8] = "nref";
+ switch (c)
+ {
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ };
+ break;
+ case 'l':
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg);
+ args->output_type |= FT_GZ;
+ break;
+ case 'o': args->fn_out = optarg; break;
+ case 'H': args->print_header = 0; break;
+ case 'h': args->header_only = 1; break;
+
+ case 't': args->targets_list = optarg; break;
+ case 'T': args->targets_list = optarg; targets_is_file = 1; break;
+ case 'r': args->regions_list = optarg; break;
+ case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+
+ case 's': args->sample_names = optarg; break;
+ case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
+ case 1 : args->force_samples = 1; break;
+ case 'a': args->trim_alts = 1; args->calc_ac = 1; break;
+ case 'I': args->update_info = 0; break;
+ case 'G': args->sites_only = 1; break;
+
+ case 'f': args->files->apply_filters = optarg; break;
+ case 'k': args->known = 1; break;
+ case 'n': args->novel = 1; break;
+ case 'm':
+ args->min_alleles = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg);
+ break;
+ case 'M':
+ args->max_alleles = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg);
+ break;
+ case 'v': args->include_types = optarg; break;
+ case 'V': args->exclude_types = optarg; break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+
+ case 'c':
+ {
+ args->min_ac_type = ALLELE_NONREF;
+ if ( sscanf(optarg,"%d:%s",&args->min_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->min_ac)!=1 )
+ error("Error: Could not parse --min-ac %s\n", optarg);
+ set_allele_type(&args->min_ac_type, allele_type);
+ args->calc_ac = 1;
+ break;
+ }
+ case 'C':
+ {
+ args->max_ac_type = ALLELE_NONREF;
+ if ( sscanf(optarg,"%d:%s",&args->max_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->max_ac)!=1 )
+ error("Error: Could not parse --max-ac %s\n", optarg);
+ set_allele_type(&args->max_ac_type, allele_type);
+ args->calc_ac = 1;
+ break;
+ }
+ case 'q':
+ {
+ args->min_af_type = ALLELE_NONREF;
+ if ( sscanf(optarg,"%f:%s",&args->min_af, allele_type)!=2 && sscanf(optarg,"%f",&args->min_af)!=1 )
+ error("Error: Could not parse --min_af %s\n", optarg);
+ set_allele_type(&args->min_af_type, allele_type);
+ args->calc_ac = 1;
+ break;
+ }
+ case 'Q':
+ {
+ args->max_af_type = ALLELE_NONREF;
+ if ( sscanf(optarg,"%f:%s",&args->max_af, allele_type)!=2 && sscanf(optarg,"%f",&args->max_af)!=1 )
+ error("Error: Could not parse --min_af %s\n", optarg);
+ set_allele_type(&args->max_af_type, allele_type);
+ args->calc_ac = 1;
+ break;
+ }
+
+ case 'x': args->private_vars |= FLT_INCLUDE; args->calc_ac = 1; break;
+ case 'X': args->private_vars |= FLT_EXCLUDE; args->calc_ac = 1; break;
+ case 'u': args->uncalled |= FLT_INCLUDE; args->calc_ac = 1; break;
+ case 'U': args->uncalled |= FLT_EXCLUDE; args->calc_ac = 1; break;
+ case 'p': args->phased |= FLT_INCLUDE; break; // phased
+ case 'P': args->phased |= FLT_EXCLUDE; break; // exclude-phased
+ case 'g':
+ {
+ if ( !strcasecmp(optarg,"hom") ) args->gt_type = GT_NEED_HOM;
+ else if ( !strcasecmp(optarg,"het") ) args->gt_type = GT_NEED_HET;
+ else if ( !strcasecmp(optarg,"miss") ) args->gt_type = GT_NEED_MISSING;
+ else if ( !strcasecmp(optarg,"^hom") ) args->gt_type = GT_NO_HOM;
+ else if ( !strcasecmp(optarg,"^het") ) args->gt_type = GT_NO_HET;
+ else if ( !strcasecmp(optarg,"^miss") ) args->gt_type = GT_NO_MISSING;
+ else error("The argument to -g not recognised. Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg);
+ break;
+ }
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case '?': usage(args);
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n");
+ if ( args->private_vars > FLT_EXCLUDE ) error("Only one of -x or -X can be given.\n");
+ if ( args->uncalled > FLT_EXCLUDE ) error("Only one of -u or -U can be given.\n");
+ if ( args->phased > FLT_EXCLUDE ) error("Only one of -p or -P can be given.\n");
+
+ if ( args->sample_names && args->update_info) args->calc_ac = 1;
+
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(args);
+ }
+ else fname = argv[optind];
+
+ // read in the regions from the command line
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
+ else if ( optind+1 < argc )
+ {
+ int i;
+ kstring_t tmp = {0,0,0};
+ kputs(argv[optind+1],&tmp);
+ for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); }
+ if ( bcf_sr_set_regions(args->files, tmp.s, 0)<0 )
+ error("Failed to read the regions: %s\n", tmp.s);
+ free(tmp.s);
+ }
+ if ( args->targets_list )
+ {
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
+
+ init_data(args);
+ bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? args->hsub : args->hdr);
+ if (args->print_header)
+ bcf_hdr_write(args->out, out_hdr);
+ else if ( args->output_type & FT_BCF )
+ error("BCF output requires header, cannot proceed with -H\n");
+ if (!args->header_only)
+ {
+ while ( bcf_sr_next_line(args->files) )
+ {
+ bcf1_t *line = args->files->readers[0].buffer[0];
+ if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n");
+ if ( subset_vcf(args, line) )
+ bcf_write1(args->out, out_hdr, line);
+ }
+ }
+ hts_close(args->out);
+ destroy_data(args);
+ bcf_sr_destroy(args->files);
+ free(args);
+ return 0;
+}
diff --git a/bcftools/vcmp.c b/bcftools/vcmp.c
new file mode 100644
index 0000000..8d04b89
--- /dev/null
+++ b/bcftools/vcmp.c
@@ -0,0 +1,132 @@
+/* vcmp.c -- reference allele utility functions.
+
+ Copyright (C) 2013 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <htslib/hts.h>
+#include <ctype.h>
+#include "vcmp.h"
+
+struct _vcmp_t
+{
+ char *dref;
+ int ndref, mdref; // ndref: positive when ref1 longer, negative when ref2 is longer
+ int nmatch;
+ int *map, mmap;
+};
+
+vcmp_t *vcmp_init()
+{
+ return (vcmp_t*)calloc(1,sizeof(vcmp_t));
+}
+
+void vcmp_destroy(vcmp_t *vcmp)
+{
+ free(vcmp->map);
+ free(vcmp->dref);
+ free(vcmp);
+}
+
+int vcmp_set_ref(vcmp_t *vcmp, char *ref1, char *ref2)
+{
+ vcmp->ndref = 0;
+
+ char *a = ref1, *b = ref2;
+ while ( *a && *b && toupper(*a)==toupper(*b) ) { a++; b++; }
+ if ( !*a && !*b ) return 0;
+ if ( *a && *b ) return -1; // refs not compatible
+
+ int i;
+ if ( *a ) // ref1 is longer
+ {
+ vcmp->nmatch = b-ref2;
+ while ( *a ) a++;
+ vcmp->ndref = (a-ref1) - vcmp->nmatch;
+ hts_expand(char,vcmp->ndref+1,vcmp->mdref,vcmp->dref);
+ for (i=0; i<vcmp->ndref; i++) vcmp->dref[i] = toupper(ref1[vcmp->nmatch+i]);
+ vcmp->dref[vcmp->ndref] = 0;
+ return 0;
+ }
+
+ // ref2 is longer
+ vcmp->nmatch = a-ref1;
+ while ( *b ) b++;
+ vcmp->ndref = (b-ref2) - vcmp->nmatch;
+ hts_expand(char,vcmp->ndref+1,vcmp->mdref,vcmp->dref);
+ for (i=0; i<vcmp->ndref; i++) vcmp->dref[i] = toupper(ref2[vcmp->nmatch+i]);
+ vcmp->dref[vcmp->ndref] = 0;
+ vcmp->ndref *= -1;
+ return 0;
+}
+
+int vcmp_find_allele(vcmp_t *vcmp, char **als1, int nals1, char *al2)
+{
+ int i, j;
+ for (i=0; i<nals1; i++)
+ {
+ char *a = als1[i], *b = al2;
+ while ( *a && *b && toupper(*a)==toupper(*b) ) { a++; b++; }
+ if ( *a && *b ) continue; // mismatch
+ if ( !vcmp->ndref )
+ {
+ if ( !*a && !*b ) break; // found
+ continue;
+ }
+
+ // the prefixes match
+ if ( *a )
+ {
+ if ( vcmp->ndref<0 ) continue;
+ for (j=0; j<vcmp->ndref; j++)
+ if ( !a[j] || toupper(a[j])!=vcmp->dref[j] ) break;
+ if ( j!=vcmp->ndref || a[j] ) continue;
+ break; // found
+ }
+
+ if ( vcmp->ndref>0 ) continue;
+ for (j=0; j<-vcmp->ndref; j++)
+ if ( !b[j] || toupper(b[j])!=vcmp->dref[j] ) break;
+ if ( j!=-vcmp->ndref || b[j] ) continue;
+ break; // found
+ }
+ if (i==nals1) return -1;
+ return i;
+}
+
+
+int *vcmp_map_ARvalues(vcmp_t *vcmp, int n, int nals1, char **als1, int nals2, char **als2)
+{
+ if ( vcmp_set_ref(vcmp,als1[0],als2[0]) < 0 ) return NULL;
+
+ vcmp->map = (int*) realloc(vcmp->map,sizeof(int)*n);
+
+ int i, ifrom = n==nals2 ? 0 : 1;
+ for (i=ifrom; i<nals2; i++)
+ {
+ vcmp->map[i-ifrom] = vcmp_find_allele(vcmp, als1+ifrom, nals1-ifrom, als2[i]);
+ }
+ return vcmp->map;
+}
+
diff --git a/bcftools/vcmp.c.pysam.c b/bcftools/vcmp.c.pysam.c
new file mode 100644
index 0000000..f1345e2
--- /dev/null
+++ b/bcftools/vcmp.c.pysam.c
@@ -0,0 +1,134 @@
+#include "pysam.h"
+
+/* vcmp.c -- reference allele utility functions.
+
+ Copyright (C) 2013 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <htslib/hts.h>
+#include <ctype.h>
+#include "vcmp.h"
+
+struct _vcmp_t
+{
+ char *dref;
+ int ndref, mdref; // ndref: positive when ref1 longer, negative when ref2 is longer
+ int nmatch;
+ int *map, mmap;
+};
+
+vcmp_t *vcmp_init()
+{
+ return (vcmp_t*)calloc(1,sizeof(vcmp_t));
+}
+
+void vcmp_destroy(vcmp_t *vcmp)
+{
+ free(vcmp->map);
+ free(vcmp->dref);
+ free(vcmp);
+}
+
+int vcmp_set_ref(vcmp_t *vcmp, char *ref1, char *ref2)
+{
+ vcmp->ndref = 0;
+
+ char *a = ref1, *b = ref2;
+ while ( *a && *b && toupper(*a)==toupper(*b) ) { a++; b++; }
+ if ( !*a && !*b ) return 0;
+ if ( *a && *b ) return -1; // refs not compatible
+
+ int i;
+ if ( *a ) // ref1 is longer
+ {
+ vcmp->nmatch = b-ref2;
+ while ( *a ) a++;
+ vcmp->ndref = (a-ref1) - vcmp->nmatch;
+ hts_expand(char,vcmp->ndref+1,vcmp->mdref,vcmp->dref);
+ for (i=0; i<vcmp->ndref; i++) vcmp->dref[i] = toupper(ref1[vcmp->nmatch+i]);
+ vcmp->dref[vcmp->ndref] = 0;
+ return 0;
+ }
+
+ // ref2 is longer
+ vcmp->nmatch = a-ref1;
+ while ( *b ) b++;
+ vcmp->ndref = (b-ref2) - vcmp->nmatch;
+ hts_expand(char,vcmp->ndref+1,vcmp->mdref,vcmp->dref);
+ for (i=0; i<vcmp->ndref; i++) vcmp->dref[i] = toupper(ref2[vcmp->nmatch+i]);
+ vcmp->dref[vcmp->ndref] = 0;
+ vcmp->ndref *= -1;
+ return 0;
+}
+
+int vcmp_find_allele(vcmp_t *vcmp, char **als1, int nals1, char *al2)
+{
+ int i, j;
+ for (i=0; i<nals1; i++)
+ {
+ char *a = als1[i], *b = al2;
+ while ( *a && *b && toupper(*a)==toupper(*b) ) { a++; b++; }
+ if ( *a && *b ) continue; // mismatch
+ if ( !vcmp->ndref )
+ {
+ if ( !*a && !*b ) break; // found
+ continue;
+ }
+
+ // the prefixes match
+ if ( *a )
+ {
+ if ( vcmp->ndref<0 ) continue;
+ for (j=0; j<vcmp->ndref; j++)
+ if ( !a[j] || toupper(a[j])!=vcmp->dref[j] ) break;
+ if ( j!=vcmp->ndref || a[j] ) continue;
+ break; // found
+ }
+
+ if ( vcmp->ndref>0 ) continue;
+ for (j=0; j<-vcmp->ndref; j++)
+ if ( !b[j] || toupper(b[j])!=vcmp->dref[j] ) break;
+ if ( j!=-vcmp->ndref || b[j] ) continue;
+ break; // found
+ }
+ if (i==nals1) return -1;
+ return i;
+}
+
+
+int *vcmp_map_ARvalues(vcmp_t *vcmp, int n, int nals1, char **als1, int nals2, char **als2)
+{
+ if ( vcmp_set_ref(vcmp,als1[0],als2[0]) < 0 ) return NULL;
+
+ vcmp->map = (int*) realloc(vcmp->map,sizeof(int)*n);
+
+ int i, ifrom = n==nals2 ? 0 : 1;
+ for (i=ifrom; i<nals2; i++)
+ {
+ vcmp->map[i-ifrom] = vcmp_find_allele(vcmp, als1+ifrom, nals1-ifrom, als2[i]);
+ }
+ return vcmp->map;
+}
+
diff --git a/bcftools/vcmp.h b/bcftools/vcmp.h
new file mode 100644
index 0000000..0317704
--- /dev/null
+++ b/bcftools/vcmp.h
@@ -0,0 +1,62 @@
+/* vcmp.h -- reference allele utility functions.
+
+ Copyright (C) 2013-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE. */
+
+#ifndef __VCMP_H__
+#define __VCMP_H__
+
+typedef struct _vcmp_t vcmp_t;
+
+vcmp_t *vcmp_init(void);
+void vcmp_destroy(vcmp_t *vcmp);
+
+/*
+ * vcmp_set_ref() - sets and compares reference alleles
+ * Returns 0 on success or -1 if alleles not compatible
+ */
+int vcmp_set_ref(vcmp_t *vcmp, char *ref1, char *ref2);
+
+/*
+ * vcmp_find_allele()
+ * @param als1: alternate alleles to ref1 above
+ * @param al2: alternate allele to ref2 above
+ * Returns -1 if not found or 0-based index to als1 of matching allele
+ */
+int vcmp_find_allele(vcmp_t *vcmp, char **als1, int nals1, char *al2);
+
+/*
+ * vcmp_map_ARvalues() - Create mapping for Number=A,R tag values
+ * @param number: nals2 for Number=R, nals2-1 for Number=A
+ * @param nals1: number of alleles
+ * @param als1: alleles
+ *
+ * Returns pointer to an array of size nals2 with mapping from als2
+ * to als1 or NULL if REFs (als1[0] and als2[0]) are not compatible.
+ * If i is the index of an allele in als2, ret[i] is the index of matching
+ * allele in als1 or -1 if als2 does not have a matching allele.
+ * The caller must not free the array.
+ */
+int *vcmp_map_ARvalues(vcmp_t *vcmp, int number, int nals1, char **als1, int nals2, char **als2);
+
+
+#endif
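
The vcmp.h comments above describe the API but give no usage example. The following minimal sketch shows how the calls fit together for two hypothetical records whose REF alleles differ only by a trailing suffix; the main() driver and the allele strings are illustrative only and are not part of the imported sources:

/* Usage sketch for the vcmp API (illustrative only; allele strings are
 * hypothetical).  Record 1 has REF "A" with ALTs "C" and "T"; record 2
 * has REF "AGG" with ALT "CGG".  The REFs are compatible because they
 * differ only by the trailing "GG". */
#include <stdio.h>
#include "vcmp.h"

int main(void)
{
    char *als1[] = { "A",   "C", "T" };    /* REF + ALT alleles of record 1 */
    char *als2[] = { "AGG", "CGG" };       /* REF + ALT alleles of record 2 */

    vcmp_t *vcmp = vcmp_init();

    /* compare the REF alleles; returns -1 if they are not compatible */
    if ( vcmp_set_ref(vcmp, als1[0], als2[0]) < 0 )
    {
        fprintf(stderr, "REF alleles not compatible\n");
        return 1;
    }

    /* "CGG" against REF "AGG" corresponds to "C" against REF "A",
     * so this prints 0, the index of "C" among the ALTs of record 1 */
    printf("matching ALT index: %d\n", vcmp_find_allele(vcmp, als1+1, 2, als2[1]));

    /* map Number=R values of record 2 onto record 1 (number = nals2 = 2);
     * map[i] is the index in als1 matching als2[i], or -1 if there is none */
    int *map = vcmp_map_ARvalues(vcmp, 2, 3, als1, 2, als2);
    if ( map ) printf("REF -> %d, ALT -> %d\n", map[0], map[1]);

    vcmp_destroy(vcmp);
    return 0;
}

As the header comment notes, this mapping is what allows Number=A and Number=R tag values to be carried between records that spell the same site with REF alleles of different lengths.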
diff --git a/samtools/samtools.h b/bcftools/version.c
similarity index 56%
copy from samtools/samtools.h
copy to bcftools/version.c
index 3161822..00eeb5a 100644
--- a/samtools/samtools.h
+++ b/bcftools/version.c
@@ -1,6 +1,6 @@
-/* samtools.h -- utility routines.
+/* version.c -- report version numbers for plugins.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2014 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -22,12 +22,34 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
-#ifndef SAMTOOLS_H
-#define SAMTOOLS_H
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <htslib/hts.h>
+#include "bcftools.h"
+#include "version.h"
+
+void version(const char **bcftools_version, const char **htslib_version)
+{
+ *bcftools_version = BCFTOOLS_VERSION;
+ *htslib_version = hts_version();
+}
+
+void error(const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ vfprintf(stderr, format, ap);
+ va_end(ap);
+ exit(-1);
+}
+
+const char *hts_bcf_wmode(int file_type)
+{
+ if ( file_type == FT_BCF ) return "wbu"; // uncompressed BCF
+ if ( file_type & FT_BCF ) return "wb"; // compressed BCF
+ if ( file_type & FT_GZ ) return "wz"; // compressed VCF
+ return "w"; // uncompressed VCF
+}
-const char *samtools_version(void);
-void print_error(const char *format, ...);
-void print_error_errno(const char *format, ...);
-
-#endif
diff --git a/samtools/samtools.h b/bcftools/version.c.pysam.c
similarity index 55%
copy from samtools/samtools.h
copy to bcftools/version.c.pysam.c
index 3161822..1fd0d4e 100644
--- a/samtools/samtools.h
+++ b/bcftools/version.c.pysam.c
@@ -1,6 +1,8 @@
-/* samtools.h -- utility routines.
+#include "pysam.h"
- Copyright (C) 2013-2014 Genome Research Ltd.
+/* version.c -- report version numbers for plugins.
+
+ Copyright (C) 2014 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -22,12 +24,34 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
-#ifndef SAMTOOLS_H
-#define SAMTOOLS_H
-
-const char *samtools_version(void);
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <htslib/hts.h>
+#include "bcftools.h"
+#include "version.h"
+
+void version(const char **bcftools_version, const char **htslib_version)
+{
+ *bcftools_version = BCFTOOLS_VERSION;
+ *htslib_version = hts_version();
+}
+
+void error(const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ vfprintf(pysamerr, format, ap);
+ va_end(ap);
+ exit(-1);
+}
+
+const char *hts_bcf_wmode(int file_type)
+{
+ if ( file_type == FT_BCF ) return "wbu"; // uncompressed BCF
+ if ( file_type & FT_BCF ) return "wb"; // compressed BCF
+ if ( file_type & FT_GZ ) return "wz"; // compressed VCF
+ return "w"; // uncompressed VCF
+}
-void print_error(const char *format, ...);
-void print_error_errno(const char *format, ...);
-#endif
diff --git a/bcftools/version.h b/bcftools/version.h
new file mode 100644
index 0000000..70d4f93
--- /dev/null
+++ b/bcftools/version.h
@@ -0,0 +1 @@
+#define BCFTOOLS_VERSION "1.3"
diff --git a/ci/conda-recipe/build.sh b/ci/conda-recipe/build.sh
new file mode 100644
index 0000000..32b67db
--- /dev/null
+++ b/ci/conda-recipe/build.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# Use internal htslib
+chmod a+x ./htslib/configure
+export CFLAGS="-I${PREFIX}/include/curl/ -I${PREFIX}/include -L${PREFIX}/lib"
+export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
+
+$PYTHON setup.py install
diff --git a/ci/conda-recipe/meta.yaml b/ci/conda-recipe/meta.yaml
new file mode 100644
index 0000000..4e57895
--- /dev/null
+++ b/ci/conda-recipe/meta.yaml
@@ -0,0 +1,29 @@
+package:
+ name: pysam
+ version: 0.8.5
+
+source:
+ path: ../../
+
+build:
+ number: 0
+
+requirements:
+ build:
+ - python
+ - setuptools
+ - zlib
+ - cython
+
+ run:
+ - python
+ - zlib
+
+test:
+ imports:
+ - pysam
+
+about:
+ home: https://github.com/pysam-developers/pysam
+ license: MIT
+ summary: Pysam is a python module for reading and manipulating SAM files. It's a lightweight wrapper of the samtools C-API. Pysam also includes an interface for tabix.
diff --git a/install-CGAT-tools.sh b/ci/install-CGAT-tools.sh
similarity index 84%
rename from install-CGAT-tools.sh
rename to ci/install-CGAT-tools.sh
index e8055f9..27eb481 100755
--- a/install-CGAT-tools.sh
+++ b/ci/install-CGAT-tools.sh
@@ -63,7 +63,7 @@ if [ "$OS" == "ubuntu" -o "$OS" == "travis" ] ; then
echo " Installing packages for Ubuntu "
echo
- apt-get install -y gcc g++ zlib1g-dev libssl-dev libbz2-dev libfreetype6-dev libpng12-dev libblas-dev libatlas-dev liblapack-dev gfortran libpq-dev r-base-dev libreadline-dev libmysqlclient-dev libboost-dev libsqlite3-dev mercurial;
+ apt-get install -y gcc g++
elif [ "$OS" == "sl" ] ; then
@@ -71,7 +71,7 @@ elif [ "$OS" == "sl" ] ; then
echo " Installing packages for Scientific Linux "
echo
- yum -y install gcc zlib-devel gcc-c++ freetype-devel libpng-devel blas atlas lapack gcc-gfortran postgresql-devel R-core-devel readline-devel mysql-devel boost-devel sqlite-devel mercurial openssl-devel bzip2-devel
+ yum -y install gcc zlib-devel gcc-c++
else
@@ -120,28 +120,6 @@ else
fi # if-OS
} # install_python_deps
-install_nosetests_deps() {
-
-return
-
-if [ "$OS" == "ubuntu" -o "$OS" == "travis" ] ; then
-
- # GCProfile
- apt-get install -y libc6-i386 libstdc++5:i386
-
-elif [ "$OS" == "sl" ] ; then
-
- # GCProfile
- yum install -y glibc.i686 compat-libstdc++-33.i686
-
-else
-
- sanity_check_os
-
-fi # if-OS
-
-} # install_nosetests_deps
-
# common set of tasks to prepare external dependencies
nosetests_external_deps() {
echo
@@ -152,14 +130,36 @@ pushd .
# create a new folder to store external tools
mkdir -p $HOME/CGAT/external-tools
-cd $HOME/CGAT/external-tools
# install samtools
-curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.2/samtools-1.2.tar.bz2 > samtools-1.2.tar.bz2
-tar xjvf samtools-1.2.tar.bz2
-cd samtools-1.2
+cd $HOME/CGAT/external-tools
+curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.3/samtools-1.3.tar.bz2 > samtools-1.3.tar.bz2
+tar xjf samtools-1.3.tar.bz2
+cd samtools-1.3
make
-PATH=$PATH:$HOME/CGAT/external-tools/samtools-1.2
+PATH=$PATH:$HOME/CGAT/external-tools/samtools-1.3
+
+echo "installed samtools"
+samtools --version
+
+if [ $? != 0 ]; then
+ exit 1
+fi
+
+# install bcftools
+cd $HOME/CGAT/external-tools
+curl -L https://github.com/samtools/bcftools/releases/download/1.3/bcftools-1.3.tar.bz2 > bcftools-1.3.tar.bz2
+tar xjf bcftools-1.3.tar.bz2
+cd bcftools-1.3
+make
+PATH=$PATH:$HOME/CGAT/external-tools/bcftools-1.3
+
+echo "installed bcftools"
+bcftools --version
+
+if [ $? != 0 ]; then
+ exit 1
+fi
popd
@@ -189,7 +189,8 @@ cd tests
echo
echo 'building test data'
echo
-make -C pysam_data
+make -C pysam_data all
+make -C cbcf_data all
# run nosetests
# -s: do not capture stdout, conflicts with pysam.dispatch
@@ -246,7 +247,6 @@ else
OS="travis"
install_os_packages
install_python_deps
- install_nosetests_deps
run_nosetests
elif [ "$1" == "--install-os-packages" ] ; then
diff --git a/cy_build.py b/cy_build.py
new file mode 100644
index 0000000..e052ddb
--- /dev/null
+++ b/cy_build.py
@@ -0,0 +1,87 @@
+import os
+import re
+import sys
+
+from Cython.Distutils import build_ext
+from distutils.extension import Extension
+from distutils.sysconfig import get_config_vars, get_python_lib, get_python_version
+from pkg_resources import Distribution
+
+
+if sys.platform == 'darwin':
+ config_vars = get_config_vars()
+ config_vars['LDSHARED'] = config_vars['LDSHARED'].replace('-bundle', '')
+ config_vars['SHLIB_EXT'] = '.so'
+ config_vars['SO'] = '.so'
+
+
+def is_pip_install():
+ if "_" in os.environ and os.environ["_"].endswith("pip"):
+ return True
+ if "pip-egg-info" in sys.argv:
+ return True
+ if re.search("/pip-.*-build/", __file__):
+ return True
+ return False
+
+
+class CyExtension(Extension):
+ def __init__(self, *args, **kwargs):
+ self._init_func = kwargs.pop("init_func", None)
+ Extension.__init__(self, *args, **kwargs)
+
+ def extend_includes(self, includes):
+ self.include_dirs.extend(includes)
+
+ def extend_macros(self, macros):
+ self.define_macros.extend(macros)
+
+ def extend_extra_objects(self, objs):
+ self.extra_objects.extend(objs)
+
+
+class cy_build_ext(build_ext):
+
+ def _get_egg_name(self):
+ ei_cmd = self.get_finalized_command("egg_info")
+ return Distribution(
+ None, None, ei_cmd.egg_name, ei_cmd.egg_version, get_python_version(),
+ self.distribution.has_ext_modules() and self.plat_name).egg_name()
+
+ def build_extension(self, ext):
+ if isinstance(ext, CyExtension) and ext._init_func:
+ ext._init_func(ext)
+
+ if not self.inplace:
+ ext.library_dirs.append(os.path.join(self.build_lib, "pysam"))
+
+ if sys.platform == 'darwin':
+
+ relative_module_path = ext.name.replace(".", os.sep) + get_config_vars()["SO"]
+
+ if "develop" in sys.argv or "test" in sys.argv:
+ # develop-mode and tests use local directory
+ pkg_root = os.path.dirname(__file__)
+ linker_path = os.path.join(pkg_root, relative_module_path)
+ elif "bdist_wheel" in sys.argv or is_pip_install():
+ # making a wheel, or pip is secretly involved
+ linker_path = os.path.join("@rpath", relative_module_path)
+ else:
+ # making an egg: `python setup.py install` default behavior
+ egg_name = '%s.egg' % self._get_egg_name()
+ linker_path = os.path.join("@rpath", egg_name, relative_module_path)
+
+ if not ext.extra_link_args:
+ ext.extra_link_args = []
+ ext.extra_link_args += ['-dynamiclib',
+ '-rpath', get_python_lib(),
+ '-Wl,-headerpad_max_install_names',
+ '-Wl,-install_name,%s' % linker_path,
+ '-Wl,-x']
+ else:
+ if not ext.extra_link_args:
+ ext.extra_link_args = []
+
+ ext.extra_link_args += ['-Wl,-rpath,$ORIGIN']
+
+ build_ext.build_extension(self, ext)
diff --git a/doc/api.rst b/doc/api.rst
index c756959..671fe4e 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -58,14 +58,14 @@ reads are represented as :class:`~pysam.PileupRead` objects in the
import pysam
samfile = pysam.AlignmentFile("ex1.bam", "rb" )
for pileupcolumn in samfile.pileup("chr1", 100, 120):
- print ("\ncoverage at base %s = %s" %
- (pileupcolumn.pos, pileupcolumn.n))
- for pileupread in pileupcolumn.pileups:
- if not pileupread.is_del and not pileupread.is_refskip:
- # query position is None if is_del or is_refskip is set.
- print ('\tbase in read %s = %s' %
- (pileupread.alignment.query_name,
- pileupread.alignment.query_sequence[pileupread.query_position]))
+ print ("\ncoverage at base %s = %s" %
+ (pileupcolumn.pos, pileupcolumn.n))
+ for pileupread in pileupcolumn.pileups:
+ if not pileupread.is_del and not pileupread.is_refskip:
+ # query position is None if is_del or is_refskip is set.
+ print ('\tbase in read %s = %s' %
+ (pileupread.alignment.query_name,
+ pileupread.alignment.query_sequence[pileupread.query_position]))
samfile.close()
@@ -100,7 +100,7 @@ tabix indexed tab-separated file formats with genomic data::
import pysam
tabixfile = pysam.TabixFile("example.gtf.gz")
-
+
for gtf in tabixfile.fetch("chr1", 1000, 2000):
print (gtf.contig, gtf.start, gtf.end, gtf.gene_id)
@@ -197,9 +197,14 @@ Fasta files
Fastq files
-----------
-.. autoclass:: pysam.FastqFile
+.. autoclass:: pysam.FastxFile
+ :members:
+
+
+.. autoclass:: pysam.cfaidx.FastqProxy
:members:
+
VCF files
---------
@@ -209,3 +214,8 @@ VCF files
.. autoclass:: pysam.VariantHeader
:members:
+.. autoclass:: pysam.VariantRecord
+ :members:
+
+.. autoclass:: pysam.VariantHeaderRecord
+ :members:
diff --git a/doc/conf.py b/doc/conf.py
index ede1809..5b92efd 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -30,10 +30,9 @@ extensions = ['sphinx.ext.autodoc',
'sphinx.ext.todo',
'sphinx.ext.ifconfig',
'sphinx.ext.intersphinx',
-# 'numpydoc']
- 'sphinx.ext.napoleon']
+ 'sphinx.ext.napoleon']
-intersphinx_mapping = {'python': ('http://docs.python.org/3.2', None)}
+intersphinx_mapping = {'python': ('http://docs.python.org/3.5', None)}
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
@@ -49,13 +48,14 @@ master_doc = 'index'
# General information about the project.
project = u'pysam'
-copyright = u'2009, Andreas Heger, Tildon Grant Belgrad, Martin Goodson, Kevin Jacobs'
+copyright = u'2009, Andreas Heger, Kevin Jacobs et al.'
# Included at the end of each rst file
rst_epilog = '''
.. _CGAT Training Programme: http://www.cgat.org
.. _pysam: https://github.com/pysam-developers/pysam
.. _samtools: http://samtools.sourceforge.net/
+.. _bcftools: https://samtools.github.io/bcftools/bcftools.html
.. _htslib: http://www.htslib.org/
.. _tabix: http://samtools.sourceforge.net/tabix.shtml/
.. _Galaxy: https://main.g2.bx.psu.edu/
@@ -79,34 +79,34 @@ release = version
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
-#language = None
+# language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
-#today = ''
+# today = ''
# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
# List of documents that shouldn't be included in the build.
-#unused_docs = []
+# unused_docs = []
# List of directories, relative to source directory, that shouldn't be searched
# for source files.
exclude_trees = ['_build']
# The reST default role (used for this markup: `text`) to use for all documents.
-#default_role = None
+# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
-#show_authors = False
+# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
@@ -119,31 +119,31 @@ pygments_style = 'sphinx'
# The theme to use for HTML and HTML Help pages. Major themes that come with
# Sphinx are currently 'default' and 'sphinxdoc'.
-html_theme = 'classic'
+html_theme = 'default'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
-#html_theme_options = {}
+# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+# html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
-#html_title = None
+# html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
-#html_logo = None
+# html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
-#html_favicon = None
+# html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
@@ -152,38 +152,38 @@ html_static_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
# If false, no module index is generated.
-#html_use_modindex = True
+# html_use_modindex = True
# If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
# If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
# If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = ''
+# html_file_suffix = ''
# Output file base name for HTML help builder.
htmlhelp_basename = 'samtoolsdoc'
@@ -192,31 +192,31 @@ htmlhelp_basename = 'samtoolsdoc'
# -- Options for LaTeX output --------------------------------------------------
# The paper size ('letter' or 'a4').
-#latex_paper_size = 'letter'
+# latex_paper_size = 'letter'
# The font size ('10pt', '11pt' or '12pt').
-#latex_font_size = '10pt'
+# latex_font_size = '10pt'
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
- ('index', 'pysam.tex', ur'pysam documentation',
- ur'Andreas Heger, Tildon Grant Belgrad, Martin Goodson', 'manual'),
+ ('index', 'pysam.tex', ur'pysam documentation',
+ ur'Andreas Heger, Kevin Jacobs et al.', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
-#latex_logo = None
+# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
# Additional stuff for the LaTeX preamble.
-#latex_preamble = ''
+# latex_preamble = ''
# Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
# If false, no module index is generated.
-#latex_use_modindex = True
+# latex_use_modindex = True
diff --git a/doc/faq.rst b/doc/faq.rst
index b414305..1f45981 100644
--- a/doc/faq.rst
+++ b/doc/faq.rst
@@ -10,6 +10,40 @@ use the github URL: https://github.com/pysam-developers/pysam.
As pysam is a wrapper around htslib and the samtools package, I
suggest cite `Li et al (2009) <http://www.ncbi.nlm.nih.gov/pubmed/19505943>`.
+Is pysam thread-safe?
+=====================
+
+Pysam is a mix of python and C code. Instructions within python are
+generally made thread-safe through python's `global interpreter lock`_
+(GIL_). This ensures that python data structures will always be in a
+consistent state.
+
+If an external function outside python is called, the programmer has a
+choice to keep the GIL in place or to release it. Keeping the GIL in
+place will make sure that all python threads wait until the external
+function has completed. This is a safe option and ensures
+thread-safety.
+
+Alternatively, the GIL can be released while the external function is
+called. This will allow other threads to run concurrently. This can be
+beneficial if the external function is expected to block, for example
+when waiting for data to read or write. However, to achieve
+thread-safety, the external function needs to be implemented with
+thread-safety in mind. This means that there can be no shared state
+between threads, or, if there is shared state, access to it needs to be
+controlled to prevent conflicts.
+
+Pysam generally uses the latter option and aims to release the GIL for
+I/O intensive tasks. This is generally fine, but thread-safety of all
+parts has not been fully tested.
+
+A related issue arises when different threads read from the same file
+object, or the same thread uses two iterators over a file. There is
+only a single file-position for each opened file. To prevent such
+conflicts, use the option ``multiple_iterators=True`` when calling
+a fetch() method. This will return an iterator on a newly opened
+file.
+
pysam coordinates are wrong
===========================
@@ -114,8 +148,8 @@ index, use the ``until_eof=True`::
print (r)
-BAM files with a large number of reference sequences is slow
-============================================================
+BAM files with a large number of reference sequences are slow
+=============================================================
If you have many reference sequences in a bam file, the following
might be slow::
@@ -194,7 +228,7 @@ Again, the iteration finishes as the temporary iterator created
by pileup goes out of scope. The solution is to keep a handle
to the iterator that remains alive::
- i = AlignmentFile('ex1.bam').pileup( 'chr1', 1000, 1010)
+ i = AlignmentFile('ex1.bam').pileup('chr1', 1000, 1010)
p = next(i)
for pp in p.pileups:
print pp
@@ -225,6 +259,6 @@ cython_ when building pysam. There are some known incompatibilities:
* Python 3.4 requires cython 0.20.2 or later (see `here
<https://github.com/pysam-developers/pysam/issues/37>`_)
-
+.. _global interpreter lock: https://en.wikipedia.org/wiki/Global_interpreter_lock
diff --git a/doc/index.rst b/doc/index.rst
index 7b032cb..da36028 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -1,21 +1,31 @@
pysam: htslib interface for python
==================================
-:Author: Andreas Heger and contributors
+:Author: Andreas Heger, Kevin Jacobs and contributors
:Date: |today|
:Version: |version|
-The *SAM/BAM* format is a way to store efficiently large numbers of
-alignments [Li2009]_, such as those routinely are created by
-next-generation sequencing methods.
+Pysam is a python module for reading, manipulating and writing
+genomic data sets.
+
+Pysam is a wrapper of the htslib_ C-API and provides facilities to
+read and write SAM/BAM/VCF/BCF/BED/GFF/GTF/FASTA/FASTQ files as well
+as access to the command line functionality of the samtools_ and
+bcftools_ packages. The module supports compression and random access
+through indexing.
This module provides a low-level wrapper around the htslib_ C-API as
-using `cython`_ and a high-level API for convenient access to the data
-in *SAM/BAM* formatted files. Also included is an interface to the
-samtools_ command line utilities and the tabix_ C-API for reading
-compressed and indexed tabular data.
+using cython and a high-level, pythonic API for convenient access to
+the data within genomic file formats.
+
+The current version wraps *htslib-1.3*, *samtools-1.3* and
+*bcftools-1.3*.
+
+To install the latest release, type::
+
+ pip install pysam
-The current version wraps *htslib-1.2.1* and *samtools-1.2*.
+See the :ref:`Installation notes <installation>` for details.
Contents
--------
@@ -25,6 +35,7 @@ Contents
api.rst
usage.rst
+ installation.rst
faq.rst
developer.rst
release.rst
diff --git a/doc/installation.rst b/doc/installation.rst
new file mode 100644
index 0000000..a3fa2a2
--- /dev/null
+++ b/doc/installation.rst
@@ -0,0 +1,65 @@
+.. _installation:
+
+================
+Installing pysam
+================
+
+Pysam provides a python interface to the functionality contained
+within the htslib_ C library. There are two ways that these two
+can be combined, ``builtin`` and ``external``.
+
+Builtin
+=======
+
+The typical installation will be through pypi_::
+
+ pip install pysam
+
+This will compile the ``builtin`` htslib source code within pysam.
+
+htslib_ can be configured at compilation to turn on additional
+features such as support for encrypted configurations, plugins,
+and more. See the htslib_ project for more information on these.
+
+Pysam will attempt to configure htslib_ to turn on some advanced
+features. If these fail, for example due to missing library
+dependencies (`libcurl`, `libcrypto`), it will fall back to
+conservative defaults.
+
+Options can be passed to the configure script explicitly by
+setting the environment variable `HTSLIB_CONFIGURE_OPTIONS`.
+For example::
+
+ export HTSLIB_CONFIGURE_OPTIONS=--enable-plugins
+ pip install pysam
+
+External
+========
+
+pysam can be combined with an externally installed htslib_
+library. This is a good way to avoid duplication of libraries. To link
+against an externally installed library, set the environment variables
+`HTSLIB_LIBRARY_DIR` and `HTSLIB_INCLUDE_DIR` before installing::
+
+ export HTSLIB_LIBRARY_DIR=/usr/local/lib
+ export HTSLIB_INCLUDE_DIR=/usr/local/include
+ pip install pysam
+
+Note that the location of the file :file:`libhts.so` needs to be known
+to the linker when you run pysam, for example by setting the
+environment variable `LD_LIBRARY_PATH`.
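+
+For example, matching the library directory used above::
+
+ export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH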
+
+cython
+======
+
+pysam depends on cython_ to provide the connectivity to the htslib_ C
+library. The source tarball (:file:`.tar.gz`) contains pre-built
+C files for python 2.7, so cython does not need to be present during
+installation. However, when installing the source tarball on
+python 3 or building from the repository, these pre-built C files are
+not present and cython needs to be installed beforehand.
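+
+For example, a typical sequence when building from the repository
+might be (a sketch; adapt to your environment)::
+
+ pip install cython
+ python setup.py install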
+
+
+
+
+
diff --git a/doc/release.rst b/doc/release.rst
index e02c818..802c6e5 100644
--- a/doc/release.rst
+++ b/doc/release.rst
@@ -2,6 +2,65 @@
Release notes
=============
+Release 0.9.0
+=============
+
+Overview
+--------
+
+The 0.9.0 release upgrades htslib to htslib 1.3 and adds numerous
+other enhancements and bugfixes. See below for a detailed list.
+
+`Htslib 1.3 <https://github.com/samtools/htslib/releases/tag/1.3>`_
+comes with additional capabilities for remote file access which depend
+on the presence of optional system libraries. As a consequence, the
+installation script :file:`setup.py` has become more complex. For an
+overview, see :ref:`installation`. We have tested installation on
+linux and OS X, but could not cover all possible configurations. A
+0.9.1 release addressing installation issues might follow soon.
+
+The :py:class:`~.pysam.VariantFile` class provides access to
+:term:`vcf` and :term:`bcf` formatted files. The class is certainly
+usable and its interface is approaching completion, but the API and
+the functionality are still subject to change.
+
+Detailed release notes
+----------------------
+
+* upgrade to htslib 1.3
+* python 3 compatibility tested throughout.
+* added a first set of bcftools commands in the pysam.bcftools
+ submodule (see the brief example after this list).
+* samtools commands are now in the pysam.samtools module. For
+ backwards compatibility they are still imported into the pysam
+ namespace.
+* samtools/bcftools return stdout as a single (byte) string. As output
+ can be binary (VCF.gz, BAM) this is necessary to ensure py2/py3
+ compatibility. To replicate the previous behaviour in py2.7, use::
+
+ pysam.samtools.view(self.filename).splitlines(True)
+
+* get_tags() returns the tag type as a character, not an integer (#214)
+* TabixFile now raises ValueError on indices created by tabix <1.0 (#206)
+* improve OSX installation and develop mode
+* FastxIterator now handles empty sequences (#204)
+* TabixFile.isremote is now TabixFile.is_remote in line with AlignmentFile
+* AlignmentFile.count() has extra optional argument read_callback
+* setup.py has been changed to:
+ * install a single builtin htslib library. Previously, each pysam
+ module contained its own version. This reduces compilation time
+ and code bloat.
+ * run configure for the builtin htslib library in order to detect
+ optional libraries such as libcurl. Configure behaviour can be
+ controlled by setting the environment variable
+ HTSLIB_CONFIGURE_OPTIONS.
+* get_reference_sequence() now returns the reference sequence and not
+ something looking like it. This bug had effects on
+ get_aligned_pairs(with_seq=True), see #225. If you have relied on
+ get_aligned_pairs(with_seq=True) in pysam-0.8.4, please check your
+ results.
+* improved autodetection of file formats in AlignmentFile and VariantFile.
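+
+As a brief example of the new bcftools submodule (a sketch; the
+filename is a placeholder)::
+
+ import pysam.bcftools
+
+ # output is returned as a single (byte) string, as noted above
+ out = pysam.bcftools.view("example.vcf.gz")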
+
Release 0.8.4
=============
@@ -14,12 +73,20 @@ writing capability.
Potential issues when upgrading from v0.8.3:
* binary tags are now returned as python arrays
+
* renamed several methods for pep8 compatibility, old names still retained for
backwards compatibility, but should be considered deprecated.
* gettid() is now get_tid()
* getrname() is now get_reference_name()
* parseRegion() is now parse_region()
+
+* some methods have been renamed for pep8 compatibility without the
+ old names being retained:
+ * fromQualityString() is now qualitystring_to_array()
+ * toQualityString() is now qualities_to_qualitystring()
+
* faidx now returns strings and not binary strings in py3.
+
* The cython components have been broken up into smaller files with
more specific content. This will affect users using the cython
interfaces.
diff --git a/doc/usage.rst b/doc/usage.rst
index e005893..90e7688 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -1,11 +1,11 @@
.. _Usage:
-====================================
-Working with BAM/SAM-formatted files
-====================================
+=========================================
+Working with BAM/CRAM/SAM-formatted files
+=========================================
-Opening a samfile
-=================
+Opening a file
+==============
To begin with, import the pysam module and open a
:class:`pysam.AlignmentFile`::
@@ -20,6 +20,11 @@ To open a :term:`SAM` file, type::
import pysam
samfile = pysam.AlignmentFile("ex1.sam", "r")
+:term:`CRAM` files are identified by a ``c`` qualifier::
+
+ import pysam
+ samfile = pysam.AlignmentFile("ex1.cram", "rc")
+
Fetching reads mapped to a :term:`region`
=========================================
@@ -38,30 +43,6 @@ sequence. Note that it will also return reads that are only partially
overlapping with the :term:`region`. Thus the reads returned might
span a region that is larger than the one queried.
-..
- The
- first method follows the :term:`csamtools` API and works
- via a callback function. The callback will be executed for each
- alignment in a :term:`region`::
-
- def my_fetch_callback(alignment):
- print str(alignment)
-
- samfile.fetch('seq1', 10, 20, callback = my_fetch_callback)
-
- Using a function object, work can be done on the alignments. The
- code below simply counts aligned reads::
-
- class Counter:
- def __init__(self):
- self.counts = 0
- def __call__(self, alignment):
- self.counts += 1
-
- c = Counter()
- samfile.fetch( 'seq1', 10, 20, callback = c )
- print "counts=", c.counts
-
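+As a minimal sketch (contig name and coordinates are illustrative),
+the reads overlapping a region can be counted by iterating directly
+over the result of :meth:`~pysam.AlignmentFile.fetch`::
+
+ nreads = 0
+ for alignment in samfile.fetch('chr1', 100, 120):
+     nreads += 1
+ print("counts=", nreads)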
Using the pileup-engine
=======================
@@ -72,14 +53,6 @@ on top of the reference sequence similar to a multiple alignment,
:term:`fetching` iterates over the rows of this implied multiple
alignment while a :term:`pileup` iterates over the :term:`columns`.
-..
- Again, there are two principal methods to iterate.
- The first works via a callback function::
-
- def my_pileup_callback( pileups ):
- print str(pileups)
- samfile.pileup( 'seq1', 10, 20, callback = my_pileup_callback )
-
Calling :meth:`~pysam.AlignmentFile.pileup` will return an iterator
over each :term:`column` (reference base) of a specified
:term:`region`. Each call to the iterator returns an object of the
@@ -92,11 +65,11 @@ some additional information::
print (str(x))
-Creating SAM/BAM files from scratch
-===================================
+Creating BAM/CRAM/SAM files from scratch
+========================================
-The following example shows how a new BAM file is constructed from
-scratch. The important part here is that the
+The following example shows how a new :term:`BAM` file is constructed
+from scratch. The important part here is that the
:class:`pysam.AlignmentFile` class needs to receive the sequence
identifiers. These can be given either as a dictionary in a header
structure, as lists of names and sizes, or from a template file.
@@ -106,23 +79,22 @@ Here, we use a header dictionary::
'SQ': [{'LN': 1575, 'SN': 'chr1'},
{'LN': 1584, 'SN': 'chr2'}] }
- outfile = pysam.AlignmentFile(tmpfilename, "wh", header=header)
- a = pysam.AlignedSegment()
- a.query_name = "read_28833_29006_6945"
- a.query_sequence="AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
- a.flag = 99
- a.reference_id = 0
- a.reference_start = 32
- a.mapping_quality = 20
- a.cigar = ((0,10), (2,1), (0,25))
- a.next_reference_id = 0
- a.next_reference_start=199
- a.template_length=167
- a.query_qualities = pysam.fromQualityString("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
- a.tags = (("NM", 1),
- ("RG", "L1"))
- outfile.write(a)
- outfile.close()
+ with pysam.AlignmentFile(tmpfilename, "wb", header=header) as outf:
+ a = pysam.AlignedSegment()
+ a.query_name = "read_28833_29006_6945"
+ a.query_sequence="AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
+ a.flag = 99
+ a.reference_id = 0
+ a.reference_start = 32
+ a.mapping_quality = 20
+ a.cigar = ((0,10), (2,1), (0,25))
+ a.next_reference_id = 0
+ a.next_reference_start=199
+ a.template_length=167
+ a.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
+ a.tags = (("NM", 1),
+ ("RG", "L1"))
+ outf.write(a)
Using streams
=============
@@ -145,7 +117,7 @@ formatted file on stdout::
for s in infile:
outfile.write(s)
-Note, only the file open mode needs to changed from ``r`` to ``rb``.
+Note that the file open mode needs to be changed from ``r`` to ``rb``.
=====================================
Using samtools commands within python
@@ -188,8 +160,8 @@ available using the :meth:`getMessages` method::
pysam.sort.getMessage()
-Note that only the output from the last invocation of a command
-is stored.
+Note that only the output from the last invocation of a command is
+stored.
In order for pysam to make the output of samtools commands accessible
the stdout stream needs to be redirected. This is the default
@@ -270,7 +242,7 @@ form:
Working with VCF/BCF formatted files
====================================
-To iterate through a VCF/BCF formatted file tabular file use
+To iterate through a VCF/BCF formatted file use
:class:`~pysam.VariantFile`::
from pysam import VariantFile
@@ -281,9 +253,103 @@ To iterate through a VCF/BCF formatted file tabular file use
for rec in bcf_in.fetch('chr1', 100000, 200000):
bcf_out.write(rec)
-.. note::
-
- The VCF/BCF API is preliminary and incomplete.
+:meth:`~pysam.VariantFile.fetch()` iterates over
+:class:`~pysam.VariantRecord` objects which provide access to
+simple variant attributes such as :class:`~pysam.VariantRecord.contig`,
+:class:`~pysam.VariantRecord.pos` and :class:`~pysam.VariantRecord.ref`::
+
+ for rec in bcf_in.fetch():
+ print (rec.pos)
+
+but also to complex attributes such as the contents of the
+:term:`info`, :term:`format` and :term:`genotype` columns. These
+complex attributes are views on the underlying htslib data structures
+and provide dictionary-like access to the data::
+
+ for rec in bcf_in.fetch():
+ print (rec.info)
+ print (rec.info.keys())
+ print (rec.info["DP"])
+
+The :py:attr:`~pysam.VariantFile.header` attribute
+(:class:`~pysam.VariantHeader`) provides access to information
+stored in the :term:`vcf` header. The complete header can be printed::
+
+ >>> print (bcf_in.header)
+ ##fileformat=VCFv4.2
+ ##FILTER=<ID=PASS,Description="All filters passed">
+ ##fileDate=20090805
+ ##source=myImputationProgramV3.1
+ ##reference=1000GenomesPilot-NCBI36
+ ##phasing=partial
+ ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples
+ With Data">
+ ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+ ##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
+ ##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+ ##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build
+ 129">
+ ##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+ ##FILTER=<ID=q10,Description="Quality below 10">
+ ##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+ ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+ ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+ ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+ ##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+ ##contig=<ID=M>
+ ##contig=<ID=17>
+ ##contig=<ID=20>
+ ##bcftools_viewVersion=1.3+htslib-1.3
+ ##bcftools_viewCommand=view -O b -o example_vcf42.bcf
+ example_vcf42.vcf.gz
+ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA0000
+
+Individual contents such as contigs, info fields, samples, formats can
+be retrieved as attributes from :py:attr:`~pysam.VariantFile.header`::
+
+ >>> print (bcf_in.header.contigs)
+ <pysam.cbcf.VariantHeaderContigs object at 0xf250f8>
+
+To convert these views to native python types, iterate through the views::
+
+ >>> print list((bcf_in.header.contigs))
+ ['M', '17', '20']
+ >>> print list((bcf_in.header.filters))
+ ['PASS', 'q10', 's50']
+ >>> print list((bcf_in.header.info))
+ ['NS', 'DP', 'AF', 'AA', 'DB', 'H2']
+ >>> print list((bcf_in.header.samples))
+ ['NA00001', 'NA00002', 'NA00003']
+
+Alternatively, it is possible to iterate through all records in the
+header returning objects of type :py:class:`~pysam.VariantHeaderRecord`::
+
+ >>> for x in bcf_in.header.records:
+ >>> print (x)
+ >>> print (x.type, x.key)
+ GENERIC fileformat
+ FILTER FILTER
+ GENERIC fileDate
+ GENERIC source
+ GENERIC reference
+ GENERIC phasing
+ INFO INFO
+ INFO INFO
+ INFO INFO
+ INFO INFO
+ INFO INFO
+ INFO INFO
+ FILTER FILTER
+ FILTER FILTER
+ FORMAT FORMAT
+ FORMAT FORMAT
+ FORMAT FORMAT
+ FORMAT FORMAT
+ CONTIG contig
+ CONTIG contig
+ CONTIG contig
+ GENERIC bcftools_viewVersion
+ GENERIC bcftools_viewCommand
===============
Extending pysam
diff --git a/import.py b/import.py
new file mode 100644
index 0000000..4018698
--- /dev/null
+++ b/import.py
@@ -0,0 +1,140 @@
+#################################################################
+# Importing samtools and htslib
+#
+# For htslib, simply copy the whole release tar-ball
+# into the directory "htslib" and recreate the file version.h
+#
+# rm -rf htslib
+# mv download/htslib htslib
+# git checkout -- htslib/version.h
+# Edit the file htslib/version.h to set the right version number.
+#
+# For samtools, type:
+# rm -rf samtools
+# python import.py samtools download/samtools
+# Manually, then:
+# modify config.h to set compatibility flags
+# change bamtk.c.pysam.c/main to bamtk.c.pysam.c/samtools_main
+#
+# For bcftools, type:
+# rm -rf bcftools
+# python import.py bcftools download/bcftools
+import os
+import sys
+import re
+import fnmatch
+import hashlib
+import shutil
+
+
+def locate(pattern, root=os.curdir):
+ '''Locate all files matching supplied filename pattern in and below
+ supplied root directory.
+ '''
+ for path, dirs, files in os.walk(os.path.abspath(root)):
+ for filename in fnmatch.filter(files, pattern):
+ yield os.path.join(path, filename)
+
+
+def _update_pysam_files(cf, destdir):
+ '''update pysam files applying redirection of output'''
+ for filename in cf:
+ if not filename:
+ continue
+ dest = filename + ".pysam.c"
+ with open(filename) as infile:
+ with open(dest, "w") as outfile:
+ outfile.write('#include "pysam.h"\n\n')
+ outfile.write(
+ re.sub("stderr", "pysamerr", "".join(infile.readlines())))
+ with open(os.path.join(destdir, "pysam.h"), "w") as outfile:
+ outfile.write("""#ifndef PYSAM_H
+#define PYSAM_H
+#include "stdio.h"
+extern FILE * pysamerr;
+#endif
+""")
+
+
+if len(sys.argv) > 1:
+ if len(sys.argv) != 3:
+ raise ValueError("import requires dest src")
+
+ dest, srcdir = sys.argv[1:3]
+ if dest not in EXCLUDE:
+ raise ValueError("import expected one of %s" %
+ ",".join(EXCLUDE.keys()))
+ exclude = EXCLUDE[dest]
+ destdir = os.path.abspath(dest)
+ srcdir = os.path.abspath(srcdir)
+ if not os.path.exists(srcdir):
+ raise IOError(
+ "source directory `%s` does not exist." % srcdir)
+
+ cfiles = locate("*.c", srcdir)
+ hfiles = locate("*.h", srcdir)
+
+ # remove unwanted files and htslib subdirectory.
+ cfiles = [x for x in cfiles if os.path.basename(x) not in exclude
+ and not re.search("htslib-", x)]
+
+ hfiles = [x for x in hfiles if os.path.basename(x) not in exclude
+ and not re.search("htslib-", x)]
+
+ ncopied = 0
+
+ def _compareAndCopy(src, srcdir, destdir, exclude):
+
+ d, f = os.path.split(src)
+ common_prefix = os.path.commonprefix((d, srcdir))
+ subdir = re.sub(common_prefix, "", d)[1:]
+ targetdir = os.path.join(destdir, subdir)
+ if not os.path.exists(targetdir):
+ os.makedirs(targetdir)
+ old_file = os.path.join(targetdir, f)
+ if os.path.exists(old_file):
+ md5_old = hashlib.md5(
+ "".join(open(old_file, "r").readlines())).digest()
+ md5_new = hashlib.md5(
+ "".join(open(src, "r").readlines())).digest()
+ if md5_old != md5_new:
+ raise ValueError(
+ "incompatible files for %s and %s" %
+ (old_file, src))
+
+ shutil.copy(src, targetdir)
+ return old_file
+
+ for src_file in hfiles:
+ _compareAndCopy(src_file, srcdir, destdir, exclude)
+ ncopied += 1
+
+ cf = []
+ for src_file in cfiles:
+ cf.append(_compareAndCopy(src_file,
+ srcdir,
+ destdir,
+ exclude))
+ ncopied += 1
+
+ sys.stdout.write(
+ "installed latest source code from %s: "
+ "%i files copied\n" % (srcdir, ncopied))
+ # redirect stderr to pysamerr and replace bam.h with a stub.
+ sys.stdout.write("applying stderr redirection\n")
+
+ _update_pysam_files(cf, destdir)
+
+ sys.exit(0)
+
+
+# if len(sys.argv) >= 2 and sys.argv[1] == "refresh":
+# sys.stdout.write("refreshing latest source code from .c to .pysam.c")
+# # redirect stderr to pysamerr and replace bam.h with a stub.
+# sys.stdout.write("applying stderr redirection")
+# for destdir in ('samtools', ):
+# pysamcfiles = locate("*.pysam.c", destdir)
+# for f in pysamcfiles:
+# os.remove(f)
+# cfiles = locate("*.c", destdir)
+# _update_pysam_files(cfiles, destdir)
+
+# sys.exit(0)
+
diff --git a/pysam/__init__.py b/pysam/__init__.py
index 32f8cfd..cd32bf5 100644
--- a/pysam/__init__.py
+++ b/pysam/__init__.py
@@ -1,8 +1,9 @@
-from pysam.libchtslib import *
+import os
+import sys
+import sysconfig
+from pysam.libchtslib import *
from pysam.cutils import *
import pysam.cutils as cutils
-
import pysam.cfaidx as cfaidx
from pysam.cfaidx import *
import pysam.ctabix as ctabix
@@ -17,144 +18,12 @@ import pysam.cvcf as cvcf
from pysam.cvcf import *
import pysam.cbcf as cbcf
from pysam.cbcf import *
-import pysam.csamtools as csamtools
-
+from pysam.utils import SamtoolsError
import pysam.Pileup as Pileup
-import os
-
-
-class SamtoolsError(Exception):
- '''exception raised in case of an error incurred in the samtools
- library.'''
-
- def __init__(self, value):
- self.value = value
-
- def __str__(self):
- return repr(self.value)
-
-
-class SamtoolsDispatcher(object):
- '''The samtools dispatcher emulates the samtools command line as
- module calls.
-
- Captures stdout and stderr.
-
- Raises a :class:`pysam.SamtoolsError` exception in case samtools
- exits with an error code other than 0.
-
- Some command line options are associated with parsers. For
- example, the samtools command "pileup -c" creates a tab-separated
- table on standard output. In order to associate parsers with
- options, an optional list of parsers can be supplied. The list
- will be processed in order checking for the presence of each
- option.
-
- If no parser is given or no appropriate parser is found, the stdout
- output of samtools commands will be returned.
-
- '''
+from pysam.samtools import *
+import pysam.config
- dispatch = None
- parsers = None
-
- def __init__(self, dispatch, parsers):
- self.dispatch = dispatch
- self.parsers = parsers
- self.stderr = []
-
- def __call__(self, *args, **kwargs):
- '''execute a samtools command.
-
- Keyword arguments:
- catch_stdout -- redirect stdout from the samtools command and return as variable (default True)
- raw -- ignore any parsers associated with this samtools command.
- '''
- retval, stderr, stdout = csamtools._samtools_dispatch(
- self.dispatch, args, catch_stdout=kwargs.get("catch_stdout", True))
-
- if retval:
- raise SamtoolsError(
- 'csamtools returned with error %i: %s' %
- (retval, "\n".join(stderr)))
-
- self.stderr = stderr
-
- # Uncommented for samtools 1.2
- # # samtools commands do not propagate the return code correctly.
- # # I have thus added this patch to throw if there is output on stderr.
- # # Note that there is sometimes output on stderr that is not an error,
- # # for example: [sam_header_read2] 2 sequences loaded.
- # # Ignore messages like these
- # stderr = [x for x in stderr
- # if not (x.startswith("[sam_header_read2]") or
- # x.startswith("[bam_index_load]") or
- # x.startswith("[bam_sort_core]") or
- # x.startswith("[samopen] SAM header is present"))]
-
- # if stderr:
- # raise SamtoolsError("\n".join(stderr))
-
- # call parser for stdout:
- if not kwargs.get("raw") and stdout and self.parsers:
- for options, parser in self.parsers:
- for option in options:
- if option not in args:
- break
- else:
- return parser(stdout)
-
- return stdout
-
- def get_messages(self):
- return self.stderr
-
- def usage(self):
- '''return the samtools usage information for this command'''
- retval, stderr, stdout = csamtools._samtools_dispatch(
- self.dispatch)
- return "".join(stderr)
-
-#
-# samtools command line options to export in python
-#
-# import is a python reserved word.
-SAMTOOLS_DISPATCH = {
- # samtools 'documented' commands
- "view": ("view", None),
- "sort": ("sort", None),
- "mpileup": ("mpileup", None),
- "depth": ("depth", None),
- "faidx": ("faidx", None),
- "tview": ("tview", None),
- "index": ("index", None),
- "idxstats": ("idxstats", None),
- "fixmate": ("fixmate", None),
- "flagstat": ("flagstat", None),
- "calmd": ("calmd", None),
- "merge": ("merge", None),
- "rmdup": ("rmdup", None),
- "reheader": ("reheader", None),
- "cat": ("cat", None),
- "targetcut": ("targetcut", None),
- "phase": ("phase", None),
- # others
- "samimport": ("import", None),
- "bam2fq": ("bam2fq", None),
- "pad2unpad": ("pad2unpad", None),
- "depad": ("pad2unpad", None),
- "bedcov": ("bedcov", None),
- "bamshuf": ("bamshuf", None),
- # obsolete
- # "pileup": "pileup", ( (("-c",), Pileup.iterate),),),
-}
-
-# instantiate samtools commands as python functions
-for key, options in SAMTOOLS_DISPATCH.items():
- cmd, parser = options
- globals()[key] = SamtoolsDispatcher(cmd, parser)
-
-# hack to export all the symbols from separate modules
+# export all the symbols from separate modules
__all__ = \
libchtslib.__all__ +\
cutils.__all__ +\
@@ -165,8 +34,7 @@ __all__ = \
calignmentfile.__all__ +\
calignedsegment.__all__ +\
csamfile.__all__ +\
- ["SamtoolsError", "SamtoolsDispatcher"] +\
- list(SAMTOOLS_DISPATCH) +\
+ ["SamtoolsError"] +\
["Pileup"]
from pysam.version import __version__, __samtools_version__
@@ -175,15 +43,32 @@ from pysam.version import __version__, __samtools_version__
def get_include():
'''return a list of include directories.'''
dirname = os.path.abspath(os.path.join(os.path.dirname(__file__)))
- return [dirname,
- os.path.join(dirname, 'include', 'htslib'),
- os.path.join(dirname, 'include', 'samtools')]
+
+ #
+ # Header files may be stored in different relative locations
+ # depending on installation mode (e.g., `python setup.py install`,
+ # `python setup.py develop`. The first entry in each list is
+ # where develop-mode headers can be found.
+ #
+ htslib_possibilities = [os.path.join(dirname, '..', 'htslib'),
+ os.path.join(dirname, 'include', 'htslib')]
+ samtool_possibilities = [os.path.join(dirname, '..', 'samtools'),
+ os.path.join(dirname, 'include', 'samtools')]
+
+ includes = [dirname]
+ for header_locations in [htslib_possibilities, samtool_possibilities]:
+ for header_location in header_locations:
+ if os.path.exists(header_location):
+ includes.append(os.path.abspath(header_location))
+ break
+
+ return includes
def get_defines():
'''return a list of defined compilation parameters.'''
- return [('_FILE_OFFSET_BITS', '64'),
- ('_USE_KNETFILE', '')]
+ return [] #('_FILE_OFFSET_BITS', '64'),
+ # ('_USE_KNETFILE', '')]
def get_libraries():
@@ -191,11 +76,22 @@ def get_libraries():
# Note that this list does not include csamtools.so as there are
# numerous name conflicts with libchtslib.so.
dirname = os.path.abspath(os.path.join(os.path.dirname(__file__)))
- return [os.path.join(dirname, x) for x in (
- 'libchtslib.so',
- 'ctabixproxies.so',
- 'cfaidx.so',
- 'csamfile.so',
- 'cvcf.so',
- 'cbcf.so',
- 'ctabix.so')]
+ pysam_libs = ['ctabixproxies',
+ 'cfaidx',
+ 'csamfile',
+ 'cvcf',
+ 'cbcf',
+ 'ctabix']
+ if pysam.config.HTSLIB == "builtin":
+ pysam_libs.append('libchtslib')
+
+ if sys.version_info.major >= 3:
+ if sys.version_info.minor >= 5:
+ return [os.path.join(dirname, x + ".{}.so".format(
+ sysconfig.get_config_var('SOABI'))) for x in pysam_libs]
+ else:
+ return [os.path.join(dirname, x + ".{}{}.so".format(
+ sys.implementation.cache_tag,
+ sys.abiflags)) for x in pysam_libs]
+ else:
+ return [os.path.join(dirname, x + ".so") for x in pysam_libs]
diff --git a/pysam/bcftools.py b/pysam/bcftools.py
new file mode 100644
index 0000000..ab891d4
--- /dev/null
+++ b/pysam/bcftools.py
@@ -0,0 +1,24 @@
+from pysam.utils import PysamDispatcher
+
+BCFTOOLS_DISPATCH = [
+ "index",
+ "annotate",
+ "concat",
+ "isec",
+ "merge",
+ "norm",
+ "plugin",
+ "query",
+ "reheader",
+ "view",
+ "call",
+ "consensus",
+ "cnv",
+ "filter",
+ "gtcheck",
+ "roh",
+ "stats"]
+
+# instantiate bcftools commands as python functions
+for cmd in BCFTOOLS_DISPATCH:
+ globals()[cmd] = PysamDispatcher("bcftools", cmd, None)
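+
+# Example usage (a sketch; the filename is a placeholder):
+#
+#   import pysam.bcftools
+#   stats_text = pysam.bcftools.stats("example.vcf.gz")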
diff --git a/pysam/calignedsegment.pxd b/pysam/calignedsegment.pxd
index ce82d88..0880bef 100644
--- a/pysam/calignedsegment.pxd
+++ b/pysam/calignedsegment.pxd
@@ -35,7 +35,6 @@ cdef extern from "htslib_util.h":
from pysam.calignmentfile cimport AlignmentFile
ctypedef AlignmentFile AlignmentFile_t
-cdef bytes TagToString(tuple tagtup)
# Note: need to declare all C fields and methods here
cdef class AlignedSegment:
@@ -64,10 +63,7 @@ cdef class AlignedSegment:
cpdef has_tag(self, tag)
# returns a valid sam alignment string
- cpdef bytes tostring(self, AlignmentFile_t handle)
-
- # returns the aux tag fields as a string.
- cdef bytes get_tag_string(self)
+ cpdef tostring(self, AlignmentFile_t handle)
cdef class PileupColumn:
@@ -92,3 +88,4 @@ cdef class PileupRead:
cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file)
cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, int n_pu, AlignmentFile alignment_file)
cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file)
+cdef inline uint32_t get_alignment_length(bam1_t * src)
diff --git a/pysam/calignedsegment.pyx b/pysam/calignedsegment.pyx
index f2b07a1..0a2b94f 100644
--- a/pysam/calignedsegment.pyx
+++ b/pysam/calignedsegment.pyx
@@ -10,17 +10,17 @@
#
# class PileupColumn a collection of segments (PileupRead) aligned to
# a particular genomic position.
-#
+#
# class PileupRead an AlignedSegment aligned to a particular genomic
# position. Contains additional attributes with respect
# to this.
#
# Additionally this module defines numerous additional classes that are part
# of the internal API. These are:
-#
+#
# Various iterator classes to iterate over alignments in sequential (IteratorRow)
# or in a stacked fashion (IteratorColumn):
-#
+#
# class IteratorRow
# class IteratorRowRegion
# class IteratorRowHead
@@ -64,7 +64,8 @@ from cpython.version cimport PY_MAJOR_VERSION
from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize
from libc.string cimport strchr
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
+from pysam.cutils cimport force_bytes, force_str, \
+ charptr_to_str, charptr_to_bytes
from pysam.cutils cimport qualities_to_qualitystring, qualitystring_to_array, \
array_to_qualitystring
@@ -118,7 +119,7 @@ cdef convert_binary_tag(uint8_t * tag):
# get number of values in array
nvalues = (<int32_t*>tag)[0]
tag += 4
-
+
# define python array
cdef c_array.array c_values = array.array(
chr(map_typecode_htslib_to_python(auxtype)))
@@ -133,75 +134,11 @@ cdef convert_binary_tag(uint8_t * tag):
return byte_size, nvalues, c_values
-cdef bytes TagToString(tuple tagtup):
- cdef c_array.array b_aux_arr
- cdef char value_type = tagtup[2]
- cdef char* tag = tagtup[0]
- cdef double value_double
- cdef long value_int
- cdef bytes value_bytes
- cdef long i, min_value
- cdef double f
- cdef cython.str ret
- cdef size_t size
- if(value_type in ['c', 'C', 'i', 'I', 's', 'S']):
- value_int = tagtup[1]
- ret = tag + ":i:%s" % value_int
- elif(value_type in ['f', 'F', 'd', 'D']):
- value_float = tagtup[1]
- ret = tag + ":f:%s" % (value_float)
- elif(value_type == "Z"):
- value_bytes = tagtup[1]
- ret = tag + ":Z:" + value_bytes
- elif(value_type == "B"):
- if(isinstance(tagtup[1], array.array)):
- b_aux_arr = tagtup[1]
- else:
- if(isinstance(tagtup[1][0], float)):
- if(len(tagtup[1]) == 1):
- return <bytes> (tag + ":B:f%s," % tagtup[1][0])
- else:
- return <bytes> (tag + ":B:f" +
- ",".join([str(f) for f in tagtup[1]]))
- else:
- b_aux_arr = array('l', tagtup[1])
- # Choose long to accommodate any size integers.
- size = sizeof(b_aux_arr)
- min_value = min(b_aux_arr)
- length = len(b_aux_arr)
- if(size == 1):
- if(min_value < 0):
- ret = tag + ":B:c," + ",".join([str(i) for i in b_aux_arr])
- else:
- ret = tag + ":B:C," + ",".join([str(i) for i in b_aux_arr])
- elif(size == 2):
- if(min_value < 0):
- ret = tag + ":B:i," + ",".join([str(i) for i in b_aux_arr])
- else:
- ret = tag + ":B:I," + ",".join([str(i) for i in b_aux_arr])
- else: # size == 4. Removed check to compile to switch statement.
- if(min_value < 0):
- ret = tag + ":B:s," + ",".join([str(i) for i in b_aux_arr])
- else:
- ret = tag + ":B:S," + ",".join([str(i) for i in b_aux_arr])
- elif(value_type == "H"):
- ret = tag + ":H:" + "".join([hex(i)[2:] for i in tagtup[1]])
- elif(value_type == "A"):
- ret = tag + ":A:" + tagtup[1]
- else:
- # Unrecognized character - returning the string as it was provided.
- # An exception is not being raised because that prevents cython
- # from being able to compile this into a switch statement for
- # performance.
- ret = "%s:%s:%s" % (tag, tagtup[2], tagtup[1])
- return <bytes> ret
-
-
cdef inline uint8_t get_value_code(value, value_type=None):
'''guess type code for a *value*. If *value_type* is None,
the type code will be inferred based on the Python type of
*value*'''
- cdef uint8_t typecode
+ cdef uint8_t typecode
cdef char * _char_type
if value_type is None:
@@ -229,7 +166,7 @@ cdef inline uint8_t get_value_code(value, value_type=None):
return typecode
-cdef inline getTypecode(value, maximum_value=None):
+cdef inline bytes getTypecode(value, maximum_value=None):
'''returns the value typecode of a value.
If max is specified, the approprite type is
@@ -239,13 +176,15 @@ cdef inline getTypecode(value, maximum_value=None):
if maximum_value is None:
maximum_value = value
+ cdef bytes valuetype
+
t = type(value)
if t is float:
valuetype = b'f'
elif t is int:
# signed ints
- if value < 0:
+ if value < 0:
if value >= -128 and maximum_value < 128:
valuetype = b'c'
elif value >= -32768 and maximum_value < 32768:
@@ -272,7 +211,7 @@ cdef inline getTypecode(value, maximum_value=None):
if t is not bytes:
value = value.encode('ascii')
if len(value) == 1:
- valuetype = b"A"
+ valuetype = b'A'
else:
valuetype = b'Z'
@@ -289,17 +228,17 @@ cdef inline packTags(tags):
to be used in a call to struct.pack_into.
"""
fmts, args = ["<"], []
-
- datatype2format = {
- 'c': ('b', 1),
- 'C': ('B', 1),
- 's': ('h', 2),
- 'S': ('H', 2),
- 'i': ('i', 4),
- 'I': ('I', 4),
- 'f': ('f', 4),
- 'A': ('c', 1)}
+ datatype2format = {
+ b'c': ('b', 1),
+ b'C': ('B', 1),
+ b's': ('h', 2),
+ b'S': ('H', 2),
+ b'i': ('i', 4),
+ b'I': ('I', 4),
+ b'f': ('f', 4),
+ b'A': ('c', 1)}
+
for tag in tags:
if len(tag) == 2:
@@ -310,9 +249,8 @@ cdef inline packTags(tags):
else:
raise ValueError("malformatted tag: %s" % str(tag))
- if not type(pytag) is bytes:
- pytag = pytag.encode('ascii')
-
+ pytag = force_bytes(pytag)
+ valuetype = force_bytes(valuetype)
t = type(value)
if t is tuple or t is list:
@@ -327,7 +265,7 @@ cdef inline packTags(tags):
raise ValueError("invalid value type '%s'" % valuetype)
datafmt = "2sccI%i%s" % (len(value), datatype2format[valuetype][0])
- args.extend([pytag[:2],
+ args.extend([pytag[:2],
b"B",
valuetype,
len(value)] + list(value))
@@ -335,24 +273,29 @@ cdef inline packTags(tags):
elif isinstance(value, array.array):
# binary tags from arrays
if valuetype is None:
- valuetype = chr(map_typecode_python_to_htslib(ord(value.typecode)))
-
+ valuetype = force_bytes(chr(
+ map_typecode_python_to_htslib(ord(value.typecode))))
+
if valuetype not in datatype2format:
- raise ValueError("invalid value type '%s'" % valuetype)
-
+ raise ValueError("invalid value type '%s' (%s)" %
+ (valuetype, type(valuetype)))
+
# use array.tostring() to retrieve byte representation and
# save as bytes
datafmt = "2sccI%is" % (len(value) * datatype2format[valuetype][1])
- args.extend([pytag[:2],
+ args.extend([pytag[:2],
b"B",
valuetype,
len(value),
- value.tostring()])
-
+ force_bytes(value.tostring())])
+
else:
if valuetype is None:
valuetype = getTypecode(value)
+ if valuetype in b"AZ":
+ value = force_bytes(value)
+
if valuetype == b"Z":
datafmt = "2sc%is" % (len(value)+1)
else:
@@ -372,24 +315,24 @@ cdef inline int32_t calculateQueryLength(bam1_t * src):
Return 0 if there is no CIGAR alignment.
"""
-
+
cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
if cigar_p == NULL:
return 0
-
+
cdef uint32_t k, qpos
cdef int op
qpos = 0
-
+
for k from 0 <= k < pysam_get_n_cigar(src):
op = cigar_p[k] & BAM_CIGAR_MASK
-
+
if op == BAM_CMATCH or op == BAM_CINS or \
op == BAM_CSOFT_CLIP or \
op == BAM_CEQUAL or op == BAM_CDIFF:
qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
-
+
return qpos
@@ -441,9 +384,9 @@ cdef inline int32_t getQueryEnd(bam1_t *src) except -1:
return end_offset
-cdef inline object getSequenceInRange(bam1_t *src,
- uint32_t start,
- uint32_t end):
+cdef inline bytes getSequenceInRange(bam1_t *src,
+ uint32_t start,
+ uint32_t end):
"""return python string of the sequence in a bam1_t object.
"""
@@ -463,12 +406,12 @@ cdef inline object getSequenceInRange(bam1_t *src,
# note: do not use string literal as it will be a python string
s[k-start] = seq_nt16_str[p[k/2] >> 4 * (1 - k%2) & 0xf]
- return charptr_to_str(seq)
+ return charptr_to_bytes(seq)
cdef inline object getQualitiesInRange(bam1_t *src,
- uint32_t start,
- uint32_t end):
+ uint32_t start,
+ uint32_t end):
"""return python array of quality values from a bam1_t object"""
cdef uint8_t * p
@@ -483,7 +426,7 @@ cdef inline object getQualitiesInRange(bam1_t *src,
c_array.resize(result, end - start)
# copy data
- memcpy(result.data.as_voidptr, <void*>&p[start], end - start)
+ memcpy(result.data.as_voidptr, <void*>&p[start], end - start)
return result
@@ -531,42 +474,103 @@ cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file):
return dest
+cdef inline uint32_t get_alignment_length(bam1_t * src):
+ cdef int k = 0
+ cdef uint32_t l = 0
+ if src == NULL:
+ return 0
+ cdef uint32_t * cigar_p = bam_get_cigar(src)
+ if cigar_p == NULL:
+ return 0
+ cdef int op
+ cdef int n = pysam_get_n_cigar(src)
+ for k from 0 <= k < n:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP:
+ continue
+ l += cigar_p[k] >> BAM_CIGAR_SHIFT
+ return l
+
+
# TODO: avoid string copying for getSequenceInRange, reconstituneSequenceFromMD, ...
-cdef inline object reconstituteSequenceFromMD(bam1_t * src):
- """return reference sequence from MD tag.
+cdef inline bytes build_alignment_sequence(bam1_t * src):
+ """return expanded sequence from MD tag.
+
+ The sequence includes substitutions as well as insertions into and
+ deletions from the reference sequence. Combine it
+ with the cigar string to reconstitute the query or the reference
+ sequence.
Returns
-------
None, if no MD tag is present.
+
"""
-
- cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
-
- if md_tag_ptr == NULL:
+ if src == NULL:
return None
-
- cdef uint32_t start, end
- start = getQueryStart(src)
- end = getQueryEnd(src)
-
+
+ cdef uint32_t start = getQueryStart(src)
+ cdef uint32_t end = getQueryEnd(src)
# get read sequence, taking into account soft-clipping
r = getSequenceInRange(src, start, end)
cdef char * read_sequence = r
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+ if cigar_p == NULL:
+ return None
- cdef char * md_tag = <char*>bam_aux2Z(md_tag_ptr)
- cdef int md_idx = 0
- cdef int r_idx = 0
+ cdef uint32_t r_idx = 0
+ cdef int op
+ cdef uint32_t k, i, l, x
cdef int nmatches = 0
- cdef int x = 0
cdef int s_idx = 0
- # maximum length of sequence is read length + inserts in MD tag + \0
- cdef uint32_t max_len = end - start + strlen(md_tag) + 1
- cdef char * s = <char*>calloc(max_len, sizeof(char))
+ cdef uint32_t max_len = get_alignment_length(src)
+ if max_len == 0:
+ raise ValueError("could not determine alignment length")
+
+ cdef char * s = <char*>calloc(max_len + 1, sizeof(char))
if s == NULL:
raise ValueError(
"could not allocated sequence of length %i" % max_len)
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+ for i from 0 <= i < l:
+ s[s_idx] = read_sequence[r_idx]
+ r_idx += 1
+ s_idx += 1
+ elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+ for i from 0 <= i < l:
+ s[s_idx] = '-'
+ s_idx += 1
+ elif op == BAM_CINS:
+ for i from 0 <= i < l:
+ # encode insertions into reference as lowercase
+ s[s_idx] = read_sequence[r_idx] + 32
+ r_idx += 1
+ s_idx += 1
+ elif op == BAM_CSOFT_CLIP:
+ pass
+ elif op == BAM_CHARD_CLIP:
+ pass # advances neither
+ elif op == BAM_CPAD:
+ raise NotImplementedError(
+ "Padding (BAM_CPAD, 6) is currently not supported. "
+ "Please implement. Sorry about that.")
+
+ cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
+ if md_tag_ptr == NULL:
+ seq = PyBytes_FromStringAndSize(s, s_idx)
+ free(s)
+ return seq
+
+ cdef char * md_tag = <char*>bam_aux2Z(md_tag_ptr)
+ cdef int md_idx = 0
+ s_idx = 0
+
while md_tag[md_idx] != 0:
# c is numerical
if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
@@ -575,33 +579,41 @@ cdef inline object reconstituteSequenceFromMD(bam1_t * src):
md_idx += 1
continue
else:
- # save matches up to this point
- for x from r_idx <= x < r_idx + nmatches:
- s[s_idx] = read_sequence[x]
+ # save matches up to this point, skipping insertions
+ for x from 0 <= x < nmatches:
+ while s[s_idx] >= 'a':
+ s_idx += 1
+ s_idx += 1
+ while s[s_idx] >= 'a':
s_idx += 1
+
r_idx += nmatches
nmatches = 0
-
if md_tag[md_idx] == '^':
md_idx += 1
while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
+ assert s[s_idx] == '-'
s[s_idx] = md_tag[md_idx]
s_idx += 1
md_idx += 1
else:
- # convert mismatch to lower case
+ # save mismatch and change to lower case
s[s_idx] = md_tag[md_idx] + 32
s_idx += 1
r_idx += 1
md_idx += 1
- # save matches up to this point
- for x from r_idx <= x < r_idx + nmatches:
- s[s_idx] = read_sequence[x]
+ # save matches up to this point, skipping insertions
+ for x from 0 <= x < nmatches:
+ while s[s_idx] >= 'a':
+ s_idx += 1
s_idx += 1
-
+ while s[s_idx] >= 'a':
+ s_idx += 1
+
seq = PyBytes_FromStringAndSize(s, s_idx)
free(s)
+
return seq
@@ -650,7 +662,7 @@ cdef class AlignedSegment:
The representation is an approximate :term:`SAM` format, because
an aligned read might not be associated with a :term:`AlignmentFile`.
- As a result :term:`tid` is shown instead of the reference name.
+ As a result :term:`tid` is shown instead of the reference name.
Similarly, the tags field is returned in its parsed state.
To get a valid SAM record, use :meth:`tostring`.
@@ -736,10 +748,10 @@ cdef class AlignedSegment:
return hash_value
- cpdef bytes tostring(self, AlignmentFile_t htsfile):
+ cpdef tostring(self, AlignmentFile_t htsfile):
"""returns a string representation of the aligned segment.
- The output format is valid SAM format if
+ The output format is valid SAM format.
Parameters
----------
@@ -748,37 +760,21 @@ cdef class AlignedSegment:
identifers to chromosome names.
"""
- cdef cython.str cigarstring, mate_ref, ref
- if self.reference_id < 0:
- ref = "*"
- else:
- ref = htsfile.getrname(self.reference_id)
+ cdef kstring_t line
+ line.l = line.m = 0
+ line.s = NULL
- if self.rnext < 0:
- mate_ref = "*"
- elif self.rnext == self.reference_id:
- mate_ref = "="
- else:
- mate_ref = htsfile.getrname(self.rnext)
-
- cigarstring = self.cigarstring if(
- self.cigarstring is not None) else "*"
- ret = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
- self.query_name, self.flag,
- ref, self.pos + 1, self.mapq,
- cigarstring,
- mate_ref, self.mpos + 1,
- self.template_length,
- self.seq, self.qual,
- self.get_tag_string())
- return <bytes> ret
-
- cdef bytes get_tag_string(self):
- cdef tuple tag
- cdef cython.str ret = "\t".join([
- TagToString(tag) for tag in
- self.get_tags(with_value_type=True)])
- return <bytes> ret
+ if sam_format1(htsfile.header, self._delegate, &line) < 0:
+ if line.m:
+ free(line.s)
+ raise ValueError('sam_format failed')
+
+ ret = force_str(line.s[:line.l])
+
+ if line.m:
+ free(line.s)
+
+ return ret
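+
+ # Example (a sketch): given an AlignmentFile ``samfile`` and a read
+ # obtained from it, read.tostring(samfile) yields the corresponding
+ # SAM line as a string.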
########################################################
## Basic attributes in order of appearance in SAM format
@@ -809,7 +805,7 @@ cdef class AlignedSegment:
l,
<uint8_t*>p)
-
+
pysam_set_l_qname(src, l)
# re-acquire pointer to location in memory
@@ -855,7 +851,7 @@ cdef class AlignedSegment:
src = self._delegate
src.core.pos = pos
if pysam_get_n_cigar(src):
- pysam_set_bin(src,
+ pysam_set_bin(src,
hts_reg2bin(
src.core.pos,
bam_endpos(src),
@@ -878,7 +874,7 @@ cdef class AlignedSegment:
property cigarstring:
'''the :term:`cigar` alignment as a string.
-
+
The cigar string is a string of alternating integers
and characters denoting the length and the type of
an operation.
@@ -900,7 +896,7 @@ cdef class AlignedSegment:
# reverse order
else:
return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c])
-
+
def __set__(self, cigar):
if cigar is None or len(cigar) == 0:
self.cigartuples = []
@@ -963,7 +959,7 @@ cdef class AlignedSegment:
self._delegate.core.isize = isize
property query_sequence:
- """read sequence bases, including :term:`soft clipped` bases
+ """read sequence bases, including :term:`soft clipped` bases
(None if not present).
Note that assigning to seq will invalidate any quality scores.
@@ -975,7 +971,7 @@ cdef class AlignedSegment:
read.query_qualities = q[5:10]
The sequence is returned as it is stored in the BAM file. Some mappers
- might have stored a reverse complement of the original read
+ might have stored a reverse complement of the original read
sequence.
"""
def __get__(self):
@@ -989,8 +985,8 @@ cdef class AlignedSegment:
if src.core.l_qseq == 0:
return None
- self.cache_query_sequence = getSequenceInRange(
- src, 0, src.core.l_qseq)
+ self.cache_query_sequence = force_str(getSequenceInRange(
+ src, 0, src.core.l_qseq))
return self.cache_query_sequence
def __set__(self, seq):
@@ -1005,7 +1001,7 @@ cdef class AlignedSegment:
if seq == None:
l = 0
else:
- l = len(seq)
+ l = len(seq)
seq = force_bytes(seq)
src = self._delegate
@@ -1040,7 +1036,7 @@ cdef class AlignedSegment:
p = pysam_bam_get_qual(src)
p[0] = 0xff
- self.cache_query_sequence = seq
+ self.cache_query_sequence = force_str(seq)
# clear cached values for quality values
self.cache_query_qualities = None
@@ -1059,7 +1055,7 @@ cdef class AlignedSegment:
beforehand as this will determine the expected length of the
quality score array.
- This method raises a ValueError if the length of the
+ This method raises a ValueError if the length of the
quality scores and the sequence are not the same.
"""
@@ -1111,7 +1107,7 @@ cdef class AlignedSegment:
# copy data
memcpy(p, result.data.as_voidptr, l)
-
+
# save in cache
self.cache_query_qualities = qual
@@ -1124,7 +1120,7 @@ cdef class AlignedSegment:
##########################################################
- # Derived simple attributes. These are simple attributes of
+ # Derived simple attributes. These are simple attributes of
# AlignedSegment getting and setting values.
##########################################################
# 1. Flags
@@ -1230,7 +1226,7 @@ cdef class AlignedSegment:
return None
return bam_endpos(src) - \
self._delegate.core.pos
-
+
property query_alignment_sequence:
"""aligned portion of the read.
@@ -1263,7 +1259,8 @@ cdef class AlignedSegment:
start = getQueryStart(src)
end = getQueryEnd(src)
- self.cache_query_alignment_sequence = getSequenceInRange(src, start, end)
+ self.cache_query_alignment_sequence = force_str(
+ getSequenceInRange(src, start, end))
return self.cache_query_alignment_sequence
property query_alignment_qualities:
@@ -1381,7 +1378,7 @@ cdef class AlignedSegment:
"""
cdef uint32_t * cigar_p
- cdef bam1_t * src
+ cdef bam1_t * src
src = self._delegate
@@ -1389,13 +1386,45 @@ cdef class AlignedSegment:
return src.core.l_qseq
return calculateQueryLength(src)
-
+
def get_reference_sequence(self):
"""return the reference sequence.
This method requires the MD tag to be set.
"""
- return reconstituteSequenceFromMD(self._delegate)
+ cdef uint32_t k, i
+ cdef int op
+ cdef bam1_t * src = self._delegate
+ ref_seq = force_str(build_alignment_sequence(src))
+ if ref_seq is None:
+ raise ValueError("MD tag not present")
+
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+ cdef uint32_t r_idx = 0
+ result = []
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+ for i from 0 <= i < l:
+ result.append(ref_seq[r_idx])
+ r_idx += 1
+ elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+ for i from 0 <= i < l:
+ result.append(ref_seq[r_idx])
+ r_idx += 1
+ elif op == BAM_CINS:
+ r_idx += l
+ elif op == BAM_CSOFT_CLIP:
+ pass
+ elif op == BAM_CHARD_CLIP:
+ pass # advances neither
+ elif op == BAM_CPAD:
+ raise NotImplementedError(
+ "Padding (BAM_CPAD, 6) is currently not supported. "
+ "Please implement. Sorry about that.")
+
+ return "".join(result)
def get_aligned_pairs(self, matches_only=False, with_seq=False):
@@ -1405,20 +1434,17 @@ cdef class AlignedSegment:
position may be None.
Padding is currently not supported and leads to an exception.
-
+
Parameters
----------
-
- matches_only : bool
-
- If True, only matched bases are returned - no None on either
- side.
+ matches_only : bool
+ If True, only matched bases are returned - no None on either
+ side.
with_seq : bool
-
- If True, return a third element in the tuple containing the
- reference sequence. Substitutions are lower-case. This option
- requires an MD tag to be present.
+ If True, return a third element in the tuple containing the
+ reference sequence. Substitutions are lower-case. This option
+ requires an MD tag to be present.
Returns
-------
@@ -1426,7 +1452,7 @@ cdef class AlignedSegment:
aligned_pairs : list of tuples
"""
- cdef uint32_t k, i, pos, qpos, r_idx
+ cdef uint32_t k, i, pos, qpos, r_idx, l
cdef int op
cdef uint32_t * cigar_p
cdef bam1_t * src = self._delegate
@@ -1437,7 +1463,7 @@ cdef class AlignedSegment:
# read sequence, cigar and MD tag are consistent.
if _with_seq:
- ref_seq = reconstituteSequenceFromMD(src)
+ ref_seq = force_str(self.get_reference_sequence())
if ref_seq is None:
raise ValueError("MD tag not present")
@@ -1445,12 +1471,11 @@ cdef class AlignedSegment:
if pysam_get_n_cigar(src) == 0:
return []
-
+
result = []
pos = src.core.pos
qpos = 0
cigar_p = pysam_bam_get_cigar(src)
-
for k from 0 <= k < pysam_get_n_cigar(src):
op = cigar_p[k] & BAM_CIGAR_MASK
l = cigar_p[k] >> BAM_CIGAR_SHIFT
@@ -1505,12 +1530,12 @@ cdef class AlignedSegment:
""" a list of start and end positions of
aligned gapless blocks.
- The start and end positions are in genomic
- coordinates.
-
- Blocks are not normalized, i.e. two blocks
+ The start and end positions are in genomic
+ coordinates.
+
+ Blocks are not normalized, i.e. two blocks
might be directly adjacent. This happens if
- the two blocks are separated by an insertion
+ the two blocks are separated by an insertion
in the read.
"""
@@ -1577,7 +1602,7 @@ cdef class AlignedSegment:
# TODO: capture in CIGAR object
property cigartuples:
"""the :term:`cigar` alignment. The alignment
- is returned as a list of tuples of (operation, length).
+ is returned as a list of tuples of (operation, length).
If the alignment is not present, None is returned.
@@ -1679,7 +1704,7 @@ cdef class AlignedSegment:
cpdef set_tag(self,
tag,
- value,
+ value,
value_type=None,
replace=True):
"""sets a particular field *tag* to *value* in the optional alignment
@@ -1721,7 +1746,7 @@ cdef class AlignedSegment:
# setting value to None deletes a tag
if value is None:
return
-
+
typecode = get_value_code(value, value_type)
if typecode == 0:
raise ValueError("can't guess type or invalid type code specified")
@@ -1763,7 +1788,7 @@ cdef class AlignedSegment:
# bam_aux_append copies data from value_ptr
bam_aux_append(src,
tag,
- typecode,
+ typecode,
value_size,
<uint8_t*>buffer.raw)
return
@@ -1772,7 +1797,7 @@ cdef class AlignedSegment:
bam_aux_append(src,
tag,
- typecode,
+ typecode,
value_size,
value_ptr)
@@ -1795,16 +1820,16 @@ cdef class AlignedSegment:
This method is the fastest way to access the optional
alignment section if only few tags need to be retrieved.
- Parameters
+ Parameters
----------
- tag :
+ tag :
data tag.
-
+
with_value_type : Optional[bool]
if set to True, the return value is a tuple of (tag value, type code).
(default False)
-
+
Returns
-------
@@ -1927,7 +1952,7 @@ cdef class AlignedSegment:
s += 1
if with_value_type:
- result.append((charptr_to_str(auxtag), value, auxtype))
+ result.append((charptr_to_str(auxtag), value, chr(auxtype)))
else:
result.append((charptr_to_str(auxtag), value))
@@ -1946,7 +1971,7 @@ cdef class AlignedSegment:
This method will not enforce the rule that the same tag may appear
only once in the optional alignment section.
"""
-
+
cdef bam1_t * src
cdef uint8_t * s
cdef char * temp
@@ -1961,7 +1986,7 @@ cdef class AlignedSegment:
buffer = ctypes.create_string_buffer(new_size)
struct.pack_into(fmt,
buffer,
- 0,
+ 0,
*args)
# delete the old data and allocate new space.
@@ -1981,11 +2006,11 @@ cdef class AlignedSegment:
# check if there is direct path from buffer.raw to tmp
p = buffer.raw
- # create handle to make sure buffer stays alive long
+ # create handle to make sure buffer stays alive long
# enough for memcpy, see issue 129
temp = p
memcpy(s, temp, new_size)
-
+
########################################################
# Compatibility Accessors
@@ -2001,26 +2026,33 @@ cdef class AlignedSegment:
# explicit declaration of getters/setters
########################################################
property qname:
+ """deprecated, use query_name instead"""
def __get__(self): return self.query_name
def __set__(self, v): self.query_name = v
property tid:
+ """deprecated, use reference_id instead"""
def __get__(self): return self.reference_id
def __set__(self, v): self.reference_id = v
property pos:
+ """deprecated, use reference_start instead"""
def __get__(self): return self.reference_start
def __set__(self, v): self.reference_start = v
property mapq:
+ """deprecated, use mapping_quality instead"""
def __get__(self): return self.mapping_quality
def __set__(self, v): self.mapping_quality = v
property rnext:
+ """deprecated, use next_reference_id instead"""
def __get__(self): return self.next_reference_id
def __set__(self, v): self.next_reference_id = v
property pnext:
+ """deprecated, use next_reference_start instead"""
def __get__(self):
return self.next_reference_start
def __set__(self, v):
self.next_reference_start = v
property cigar:
+ """deprecated, use cigartuples instead"""
def __get__(self):
r = self.cigartuples
if r is None:
@@ -2028,104 +2060,127 @@ cdef class AlignedSegment:
return r
def __set__(self, v): self.cigartuples = v
property tlen:
+ """deprecated, use template_length instead"""
def __get__(self):
return self.template_length
def __set__(self, v):
self.template_length = v
property seq:
+ """deprecated, use query_sequence instead"""
def __get__(self):
return self.query_sequence
def __set__(self, v):
self.query_sequence = v
property qual:
+ """deprecated, query_qualities instead"""
def __get__(self):
return array_to_qualitystring(self.query_qualities)
def __set__(self, v):
self.query_qualities = qualitystring_to_array(v)
property alen:
+ """deprecated, reference_length instead"""
def __get__(self):
return self.reference_length
def __set__(self, v):
self.reference_length = v
property aend:
+ """deprecated, reference_end instead"""
def __get__(self):
return self.reference_end
def __set__(self, v):
self.reference_end = v
property rlen:
+ """deprecated, query_length instead"""
def __get__(self):
return self.query_length
def __set__(self, v):
self.query_length = v
property query:
+ """deprecated, query_alignment_sequence instead"""
def __get__(self):
return self.query_alignment_sequence
def __set__(self, v):
self.query_alignment_sequence = v
property qqual:
+ """deprecated, query_alignment_qualities instead"""
def __get__(self):
return array_to_qualitystring(self.query_alignment_qualities)
def __set__(self, v):
self.query_alignment_qualities = qualitystring_to_array(v)
property qstart:
+ """deprecated, use query_alignment_start instead"""
def __get__(self):
return self.query_alignment_start
def __set__(self, v):
self.query_alignment_start = v
property qend:
+ """deprecated, use query_alignment_end instead"""
def __get__(self):
return self.query_alignment_end
def __set__(self, v):
self.query_alignment_end = v
property qlen:
+ """deprecated, use query_alignment_length instead"""
def __get__(self):
return self.query_alignment_length
def __set__(self, v):
self.query_alignment_length = v
property mrnm:
+ """deprecated, use next_reference_id instead"""
def __get__(self):
return self.next_reference_id
def __set__(self, v):
self.next_reference_id = v
property mpos:
+ """deprecated, use next_reference_start instead"""
def __get__(self):
return self.next_reference_start
def __set__(self, v):
self.next_reference_start = v
property rname:
+ """deprecated, use reference_id instead"""
def __get__(self):
return self.reference_id
def __set__(self, v):
self.reference_id = v
property isize:
+ """deprecated, use template_length instead"""
def __get__(self):
return self.template_length
def __set__(self, v):
self.template_length = v
property blocks:
+ """deprecated, use get_blocks() instead"""
def __get__(self):
return self.get_blocks()
property aligned_pairs:
+ """deprecated, use get_aligned_pairs() instead"""
def __get__(self):
return self.get_aligned_pairs()
property inferred_length:
+ """deprecated, use infer_query_length() instead"""
def __get__(self):
return self.infer_query_length()
property positions:
+ """deprecated, use get_reference_positions() instead"""
def __get__(self):
return self.get_reference_positions()
property tags:
+ """deprecated, use get_tags() instead"""
def __get__(self):
return self.get_tags()
def __set__(self, tags):
self.set_tags(tags)
def overlap(self):
+ """deprecated, use get_overlap() instead"""
return self.get_overlap()
def opt(self, tag):
+ """deprecated, use get_tag() instead"""
return self.get_tag(tag)
def setTag(self, tag, value, value_type=None, replace=True):
+ """deprecated, use set_tag() instead"""
return self.set_tag(tag, value, value_type, replace)
-
+
cdef class PileupColumn:
'''A pileup of reads at a particular reference sequence position
@@ -2141,9 +2196,9 @@ cdef class PileupColumn:
raise TypeError("this class cannot be instantiated from Python")
def __str__(self):
- return "\t".join(map(str,
+ return "\t".join(map(str,
(self.reference_id,
- self.reference_pos,
+ self.reference_pos,
self.nsegments))) +\
"\n" +\
"\n".join(map(str, self.pileups))
@@ -2184,7 +2239,8 @@ cdef class PileupColumn:
# warning: there could be problems if self.n and self.buf are
# out of sync.
for x from 0 <= x < self.n_pu:
- pileups.append(makePileupRead(&(self.plp[0][x]), self._alignment_file))
+ pileups.append(makePileupRead(&(self.plp[0][x]),
+ self._alignment_file))
return pileups
########################################################
@@ -2202,13 +2258,13 @@ cdef class PileupColumn:
return self.reference_id
def __set__(self, v):
self.reference_id = v
-
+
property n:
def __get__(self):
return self.nsegments
def __set__(self, v):
self.nsegments = v
-
+
cdef class PileupRead:
'''Representation of a read aligned to a particular position in the
@@ -2227,7 +2283,7 @@ cdef class PileupRead:
self.indel, self.level,
self.is_del, self.is_head,
self.is_tail, self.is_refskip)))
-
+
property alignment:
"""a :class:`pysam.AlignedSegment` object of the aligned read"""
def __get__(self):
@@ -2236,7 +2292,7 @@ cdef class PileupRead:
property query_position:
"""position of the read base at the pileup site, 0-based.
None if is_del or is_refskip is set.
-
+
"""
def __get__(self):
if self.is_del or self.is_refskip:
@@ -2244,8 +2300,25 @@ cdef class PileupRead:
else:
return self._qpos
+ property query_position_or_next:
+ """position of the read base at the pileup site, 0-based.
+
+ If the current position is a deletion, returns the position
+ of the next aligned base.
+
+ """
+ def __get__(self):
+ return self._qpos
+
property indel:
- """indel length; 0 for no indel, positive for ins and negative for del"""
+ """indel length for the position follwing the current pileup site.
+
+ This quantity peeks ahead to the next cigar operation in this
+ alignment. If the next operation is an insertion, indel will
+ be positive. If the next operation is a deletion, it will be
+ negative. It is 0 if the next operation is not an indel.
+
+ """
def __get__(self):
return self._indel
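
A sketch of how the PileupRead attributes documented above are typically
consumed (illustrative only; assumes an indexed BAM file ex1.bam with a
contig named chr1):

    import pysam

    bam = pysam.AlignmentFile("ex1.bam", "rb")
    for column in bam.pileup("chr1", 1000, 1010):
        for pu in column.pileups:
            if pu.is_del or pu.is_refskip:
                continue
            base = pu.alignment.query_sequence[pu.query_position]
            # pu.indel > 0: an insertion follows this position,
            # pu.indel < 0: a deletion follows, 0: no indel
            print(column.reference_pos, base, pu.indel)
    bam.close()
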
@@ -2260,10 +2333,12 @@ cdef class PileupRead:
return self._is_del
property is_head:
+ """1 iff the base on the padded read is the left-most base."""
def __get__(self):
return self._is_head
property is_tail:
+ """1 iff the base on the padded read is the right-most base."""
def __get__(self):
return self._is_tail
diff --git a/pysam/calignmentfile.pyx b/pysam/calignmentfile.pyx
index 57f2464..f258a66 100644
--- a/pysam/calignmentfile.pyx
+++ b/pysam/calignmentfile.pyx
@@ -11,11 +11,11 @@
# class IndexedReads index a SAM/BAM/CRAM file by query name while keeping
# the original sort order intact
#
-# Additionally this module defines numerous additional classes that are part
-# of the internal API. These are:
+# Additionally this module defines numerous additional classes that
+# are part of the internal API. These are:
#
-# Various iterator classes to iterate over alignments in sequential (IteratorRow)
-# or in a stacked fashion (IteratorColumn):
+# Various iterator classes to iterate over alignments in sequential
+# (IteratorRow) or in a stacked fashion (IteratorColumn):
#
# class IteratorRow
# class IteratorRowRegion
@@ -64,6 +64,12 @@ from cpython.version cimport PY_MAJOR_VERSION
from pysam.cutils cimport force_bytes, force_str, charptr_to_str
from pysam.cutils cimport encode_filename, from_string_and_size
from pysam.calignedsegment cimport makeAlignedSegment, makePileupColumn
+from pysam.chtslib cimport hisremote
+
+if PY_MAJOR_VERSION >= 3:
+ from io import StringIO
+else:
+ from StringIO import StringIO
cimport cython
@@ -209,10 +215,10 @@ cdef bam_hdr_t * build_header(new_header):
cdef class AlignmentFile:
- """
- AlignmentFile(filepath_or_object, mode=None, template=None,
+ """AlignmentFile(filepath_or_object, mode=None, template=None,
reference_names=None, reference_lengths=None, text=NULL,
- header=None, add_sq_text=False, check_header=True, check_sq=True)
+ header=None, add_sq_text=False, check_header=True, check_sq=True,
+ filename=None)
A :term:`SAM`/:term:`BAM` formatted file.
@@ -253,15 +259,16 @@ cdef class AlignmentFile:
----------
mode : string
`mode` should be ``r`` for reading or ``w`` for writing. The
- default is text mode (:term:`SAM`). For binary (:term:`BAM`) I/O
- you should append ``b`` for compressed or ``u`` for uncompressed
- :term:`BAM` output. Use ``h`` to output header information in
- text (:term:`TAM`) mode.
+ default is text mode (:term:`SAM`). For binary (:term:`BAM`)
+ I/O you should append ``b`` for compressed or ``u`` for
+ uncompressed :term:`BAM` output. Use ``h`` to output header
+ information in text (:term:`TAM`) mode. Use ``c`` for
+ :term:`CRAM` formatted files.
- If ``b`` is present, it must immediately follow ``r`` or ``w``.
- Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and
- ``wb0``. For instance, to open a :term:`BAM` formatted file for
- reading, type::
+ If ``b`` is present, it must immediately follow ``r`` or
+ ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
+ ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
+ :term:`BAM` formatted file for reading, type::
f = pysam.AlignmentFile('ex1.bam','rb')
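
The newly documented ``c`` modes open CRAM files; a minimal sketch (not part
of the commit; ex1.cram and the matching reference ex1.fa are hypothetical):

    import pysam

    cram = pysam.AlignmentFile("ex1.cram", "rc",
                               reference_filename="ex1.fa")
    for read in cram:
        print(read.query_name)
        break
    cram.close()
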
@@ -276,33 +283,37 @@ cdef class AlignmentFile:
header : dict
when writing, build header from a multi-level dictionary. The
- first level are the four types ('HD', 'SQ', ...). The
- second level are a list of lines, with each line being a
- list of tag-value pairs. The header is constructed first
- from all the defined fields, followed by user tags in
- alphabetical order.
+ first level are the four types ('HD', 'SQ', ...). The second
+ level are a list of lines, with each line being a list of
+ tag-value pairs. The header is constructed first from all the
+ defined fields, followed by user tags in alphabetical order.
text : string
when writing, use the string provided as the header
- reference_names : list
+ reference_names : list
see reference_lengths
- reference_lengths : list
- when writing, build header from list of chromosome names and lengths.
- By default, 'SQ' and 'LN' tags will be added to the header
- text. This option can be changed by unsetting the flag
+ reference_lengths : list
+ when writing, build header from list of chromosome names and
+ lengths. By default, 'SQ' and 'LN' tags will be added to the
+ header text. This option can be changed by unsetting the flag
`add_sq_text`.
- add_sq_text : bool
- do not add 'SQ' and 'LN' tags to header. This option permits construction
- :term:`SAM` formatted files without a header.
+ add_sq_text : bool
+ do not add 'SQ' and 'LN' tags to header. This option permits
+ construction of :term:`SAM` formatted files without a header.
- check_header : bool
+ check_header : bool
when reading, check if header is present (default=True)
- check_sq : bool
- when reading, check if SQ entries are present in header (default=True)
+ check_sq : bool
+ when reading, check if SQ entries are present in header
+ (default=True)
+
+ filename : string
+ Alternative to filepath_or_object. Filename of the file
+ to be opened.
"""
@@ -315,6 +326,10 @@ cdef class AlignmentFile:
self.is_cram = False
self.is_remote = False
+ if "filename" in kwargs:
+ args = [kwargs["filename"]]
+ del kwargs["filename"]
+
self._open(*args, **kwargs)
# allocate memory for iterator
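
With the new ``filename`` keyword the two calls below are equivalent (a
sketch; the path is hypothetical):

    import pysam

    a = pysam.AlignmentFile("ex1.bam", "rb")
    b = pysam.AlignmentFile(filename="ex1.bam", mode="rb")
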
@@ -366,6 +381,7 @@ cdef class AlignmentFile:
add_sq_text=True,
check_header=True,
check_sq=True,
+ filepath_index=None,
referencenames=None,
referencelengths=None):
'''open a sam, bam or cram formatted file.
@@ -374,6 +390,7 @@ cdef class AlignmentFile:
will be closed and a new file will be opened.
'''
cdef char *cfilename
+ cdef char *cindexname
cdef char *cmode
# for backwards compatibility:
@@ -382,36 +399,9 @@ cdef class AlignmentFile:
if referencelengths is not None:
reference_lengths = referencelengths
- # read mode autodetection
+ # autodetection for read
if mode is None:
- try:
- self._open(filepath_or_object,
- 'rb',
- template=template,
- reference_names=reference_names,
- reference_lengths=reference_lengths,
- reference_filename=reference_filename,
- text=text,
- header=header,
- port=port,
- check_header=check_header,
- check_sq=check_sq)
- return
- except ValueError, msg:
- pass
-
- self._open(filepath_or_object,
- 'r',
- template=template,
- reference_names=reference_names,
- reference_lengths=reference_lengths,
- reference_filename=reference_filename,
- text=text,
- header=header,
- port=port,
- check_header=check_header,
- check_sq=check_sq)
- return
+ mode = "r"
assert mode in ("r", "w", "rb", "wb", "wh",
"wbu", "rU", "wb0",
@@ -422,8 +412,15 @@ cdef class AlignmentFile:
if self.htsfile != NULL:
self.close()
+ # StringIO not supported
+ if isinstance(filepath_or_object, StringIO):
+ filename = "stringio"
+ raise NotImplementedError(
+ "access from StringIO objects not supported")
+ if filepath_or_object.closed:
+ raise ValueError('I/O operation on closed StringIO object')
# check if we are working with a File object
- if hasattr(filepath_or_object, "fileno"):
+ elif hasattr(filepath_or_object, "fileno"):
filename = filepath_or_object.name
if filepath_or_object.closed:
raise ValueError('I/O operation on closed file')
@@ -438,11 +435,8 @@ cdef class AlignmentFile:
self._filename = filename = encode_filename(filename)
# FIXME: Use htsFormat when it is available
- self.is_bam = len(mode) > 1 and mode[1] == 'b'
- self.is_cram = len(mode) > 1 and mode[1] == 'c'
self.is_stream = filename == b"-"
- self.is_remote = filename.startswith(b"http:") or \
- filename.startswith(b"ftp:")
+ self.is_remote = hisremote(filename)
cdef char * ctext
cdef hFILE * fp
@@ -512,6 +506,11 @@ cdef class AlignmentFile:
with nogil:
self.htsfile = hts_open(cfilename, cmode)
+ # htsfile.format does not get set until writing, so use
+ # the format specifier explicitly given by the user.
+ self.is_bam = "b" in mode
+ self.is_cram = "c" in mode
+
# set filename with reference sequences. If no filename
# is given, the CRAM reference arrays will be built from
# the @SQ header in the header
@@ -548,6 +547,9 @@ cdef class AlignmentFile:
"could not open file (mode='%s') - "
"is it SAM/BAM format?" % mode)
+ self.is_bam = self.htsfile.format.format == bam
+ self.is_cram = self.htsfile.format.format == cram
+
# bam files require a valid header
if self.is_bam or self.is_cram:
with nogil:
@@ -568,12 +570,11 @@ cdef class AlignmentFile:
"- is it SAM format?" % mode )
# self.header.ignore_sam_err = True
- # disabled for autodetection to work needs to be disabled
- # so that reading from sam-files without headers works
if check_sq and self.header.n_targets == 0:
raise ValueError(
- ("file header is empty (mode='%s') - "
- "is it SAM/BAM format?") % mode)
+ ("file has no sequences defined (mode='%s') - "
+ "is it SAM/BAM format? Consider opening with "
+ "check_seq=True") % mode)
if self.htsfile == NULL:
raise IOError("could not open file `%s`" % filename )
@@ -588,29 +589,49 @@ cdef class AlignmentFile:
if mode[0] == "r" and (self.is_bam or self.is_cram):
# open index for remote files
- if self.is_remote:
+ if self.is_remote and not filepath_index:
cfilename = filename
+
with nogil:
self.index = hts_idx_load(cfilename, format_index)
if self.index == NULL:
warnings.warn(
- "unable to open remote index for '%s'" % filename)
+ "unable to open remote index for '%s'" % cfilename)
else:
- if self.is_bam \
- and not os.path.exists(filename + b".bai") \
- and not os.path.exists(filename[:-4] + b".bai"):
- self.index = NULL
- elif self.is_cram \
- and not os.path.exists(filename + b".crai") \
- and not os.path.exists(filename[:-4] + b".crai"):
- self.index = NULL
+ has_index = True
+ cfilename = filename
+ if filepath_index:
+ if not os.path.exists(filepath_index):
+ warnings.warn(
+ "unable to open index at %s" % cfilename)
+ self.index = NULL
+ has_index = False
else:
+ if self.is_bam \
+ and not os.path.exists(filename + b".bai") \
+ and not os.path.exists(filename[:-4] + b".bai"):
+ self.index = NULL
+ has_index = False
+ elif self.is_cram \
+ and not os.path.exists(filename + b".crai") \
+ and not os.path.exists(filename[:-5] + b".crai"):
+ self.index = NULL
+ has_index = False
+
+ if has_index:
# returns NULL if there is no index or index could
# not be opened
- cfilename = filename
- with nogil:
- self.index = sam_index_load(self.htsfile,
- cfilename)
+ if filepath_index:
+ cindexname = filepath_index = encode_filename(filepath_index)
+ with nogil:
+ self.index = sam_index_load2(self.htsfile,
+ cfilename,
+ cindexname)
+
+ else:
+ with nogil:
+ self.index = sam_index_load(self.htsfile,
+ cfilename)
if self.index == NULL:
raise IOError(
"error while opening index for '%s'" %
@@ -813,8 +834,8 @@ cdef class AlignmentFile:
If only `reference` is set, all reads aligned to `reference`
will be fetched.
- Note that a :term:`SAM` file does not allow random access. If
- `region` or `reference` are given, an exception is raised.
+ A :term:`SAM` file does not allow random access. If `region`
+ or `reference` are given, an exception is raised.
:class:`~pysam.FastaFile`
:class:`~pysam.IteratorRow`
@@ -1083,15 +1104,15 @@ cdef class AlignmentFile:
start=None,
end=None,
region=None,
- until_eof=False):
- '''
- count the number of reads in :term:`region`
+ until_eof=False,
+ read_callback="nofilter"):
+ '''count the number of reads in :term:`region`
The region is specified by :term:`reference`, `start` and
`end`. Alternatively, a :term:`samtools` :term:`region` string
can be supplied.
- Note that a :term:`SAM` file does not allow random access and if
+ A :term:`SAM` file does not allow random access and if
`region` or `reference` are given, an exception is raised.
Parameters
@@ -1105,11 +1126,31 @@ cdef class AlignmentFile:
end : int
end of the genomic region
+
+ region : string
+ a region string in samtools format.
until_eof : bool
count until the end of the file, possibly including
unmapped reads as well.
+ read_callback: string or function
+
+ select a callback to ignore reads when counting. It can
+ be either a string with one of the following values:
+
+ ``all``
+ skip reads in which any of the following
+ flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
+ BAM_FDUP
+
+ ``nofilter``
+ uses every single read
+
+ Alternatively, `read_callback` can be a function
+ ``check_read(read)`` that should return True only for
+ those reads that shall be included in the counting.
+
Raises
------
@@ -1123,11 +1164,28 @@ cdef class AlignmentFile:
if not self.is_open():
raise ValueError( "I/O operation on closed file" )
+ cdef int filter_method = 0
+ if read_callback == "all":
+ filter_method = 1
+ elif read_callback == "nofilter":
+ filter_method = 2
+
for read in self.fetch(reference=reference,
start=start,
end=end,
region=region,
until_eof=until_eof):
+ # apply filter
+ if filter_method == 1:
+ # filter = "all"
+ if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
+ continue
+ elif filter_method == 2:
+ # filter = "nofilter"
+ pass
+ else:
+ if not read_callback(read):
+ continue
counter += 1
return counter
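
The new ``read_callback`` argument to count() can be exercised as follows
(a sketch; file and contig names are hypothetical):

    import pysam

    bam = pysam.AlignmentFile("ex1.bam", "rb")
    # skip unmapped, secondary, QC-fail and duplicate reads
    n_filtered = bam.count("chr1", 0, 10000, read_callback="all")
    # count every read (the default)
    n_total = bam.count("chr1", 0, 10000, read_callback="nofilter")
    # or supply an arbitrary predicate
    n_forward = bam.count("chr1", 0, 10000,
                          read_callback=lambda read: not read.is_reverse)
    bam.close()
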
@@ -1417,6 +1475,13 @@ cdef class AlignmentFile:
n = hts_idx_get_n_no_coor(self.index)
return n
+ property format:
+ '''string describing the file format'''
+ def __get__(self):
+ if not self.is_open():
+ raise ValueError( "I/O operation on closed file" )
+ return hts_format_description(&self.htsfile.format)
+
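
The new ``format`` property surfaces htslib's description of the detected
on-disk format; a sketch (path is hypothetical):

    import pysam

    bam = pysam.AlignmentFile("ex1.bam", "rb")
    print(bam.format)   # htslib's textual description of the file format
    bam.close()
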
property text:
'''string with the full contents of the :term:`sam file` header as a
string.
@@ -1565,9 +1630,11 @@ cdef class AlignmentFile:
# Compatibility functions for pysam < 0.8.3
def gettid(self, reference):
+ """deprecated, use get_tid() instead"""
return self.get_tid(reference)
def getrname(self, tid):
+ """deprecated, use get_reference_name() instead"""
return self.get_reference_name(tid)
@@ -1966,11 +2033,12 @@ cdef int __advance_snpcalls(void * data, bam1_t * b):
if d.seq != NULL:
free(d.seq)
d.tid = b.core.tid
- d.seq = faidx_fetch_seq(
- d.fastafile,
- d.header.target_name[d.tid],
- 0, MAX_POS,
- &d.seq_len)
+ with nogil:
+ d.seq = faidx_fetch_seq(
+ d.fastafile,
+ d.header.target_name[d.tid],
+ 0, MAX_POS,
+ &d.seq_len)
if d.seq == NULL:
raise ValueError(
@@ -2069,10 +2137,11 @@ cdef class IteratorColumn:
cdef int cnext(self):
'''perform next iteration.
'''
+ # do not release gil here because of call-backs
self.plp = bam_plp_auto(self.pileup_iter,
&self.tid,
&self.pos,
- &self.n_plp )
+ &self.n_plp)
cdef char * getSequence(self):
'''return current reference sequence underlying the iterator.
@@ -2081,13 +2150,15 @@ cdef class IteratorColumn:
property seq_len:
'''current sequence length.'''
- def __get__(self): return self.iterdata.seq_len
+ def __get__(self):
+ return self.iterdata.seq_len
def addReference(self, Fastafile fastafile):
'''
add reference sequences in `fastafile` to iterator.'''
self.fastafile = fastafile
- if self.iterdata.seq != NULL: free(self.iterdata.seq)
+ if self.iterdata.seq != NULL:
+ free(self.iterdata.seq)
self.iterdata.tid = -1
self.iterdata.fastafile = self.fastafile.fastafile
@@ -2129,23 +2200,27 @@ cdef class IteratorColumn:
self._free_pileup_iter()
if self.stepper is None or self.stepper == "all":
- self.pileup_iter = bam_plp_init(
- <bam_plp_auto_f>&__advance_all,
- &self.iterdata)
+ with nogil:
+ self.pileup_iter = bam_plp_init(
+ <bam_plp_auto_f>&__advance_all,
+ &self.iterdata)
elif self.stepper == "nofilter":
- self.pileup_iter = bam_plp_init(
- <bam_plp_auto_f>&__advance_nofilter,
- &self.iterdata)
+ with nogil:
+ self.pileup_iter = bam_plp_init(
+ <bam_plp_auto_f>&__advance_nofilter,
+ &self.iterdata)
elif self.stepper == "samtools":
- self.pileup_iter = bam_plp_init(
- <bam_plp_auto_f>&__advance_snpcalls,
- &self.iterdata)
+ with nogil:
+ self.pileup_iter = bam_plp_init(
+ <bam_plp_auto_f>&__advance_snpcalls,
+ &self.iterdata)
else:
raise ValueError(
"unknown stepper option `%s` in IteratorColumn" % self.stepper)
if self.max_depth:
- bam_plp_set_maxcnt(self.pileup_iter, self.max_depth)
+ with nogil:
+ bam_plp_set_maxcnt(self.pileup_iter, self.max_depth)
# bam_plp_set_mask( self.pileup_iter, self.mask )
@@ -2160,12 +2235,14 @@ cdef class IteratorColumn:
# invalidate sequence if different tid
if self.tid != tid:
- if self.iterdata.seq != NULL: free( self.iterdata.seq )
+ if self.iterdata.seq != NULL:
+ free(self.iterdata.seq)
self.iterdata.seq = NULL
self.iterdata.tid = -1
# self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata )
- bam_plp_reset(self.pileup_iter)
+ with nogil:
+ bam_plp_reset(self.pileup_iter)
cdef _free_pileup_iter(self):
'''free the memory alloc'd by bam_plp_init.
@@ -2174,9 +2251,10 @@ cdef class IteratorColumn:
another pileup_iter, or else memory will be lost.
'''
if self.pileup_iter != <bam_plp_t>NULL:
- bam_plp_reset(self.pileup_iter)
- bam_plp_destroy(self.pileup_iter)
- self.pileup_iter = <bam_plp_t>NULL
+ with nogil:
+ bam_plp_reset(self.pileup_iter)
+ bam_plp_destroy(self.pileup_iter)
+ self.pileup_iter = <bam_plp_t>NULL
def __dealloc__(self):
# reset in order to avoid memory leak messages for iterators
@@ -2200,7 +2278,7 @@ cdef class IteratorColumnRegion(IteratorColumn):
**kwargs ):
# initialize iterator
- self.setupIteratorData( tid, start, end, 1 )
+ self.setupIteratorData(tid, start, end, 1)
self.start = start
self.end = end
self.truncate = truncate
@@ -2252,10 +2330,10 @@ cdef class IteratorColumnAllRefs(IteratorColumn):
# return result, if within same reference
if self.plp != NULL:
return makePileupColumn(&self.plp,
- self.tid,
- self.pos,
- self.n_plp,
- self.samfile)
+ self.tid,
+ self.pos,
+ self.n_plp,
+ self.samfile)
# otherwise, proceed to next reference or stop
self.tid += 1
@@ -2265,8 +2343,6 @@ cdef class IteratorColumnAllRefs(IteratorColumn):
raise StopIteration
-
-
cdef class SNPCall:
'''the results of a SNP call.'''
cdef int _tid
diff --git a/pysam/cbcf.pxd b/pysam/cbcf.pxd
index b56f7ed..b6e210a 100644
--- a/pysam/cbcf.pxd
+++ b/pysam/cbcf.pxd
@@ -141,13 +141,14 @@ cdef class VariantFile(object):
cdef htsFile *htsfile # pointer to htsFile structure
cdef int64_t start_offset # BGZF offset of first record
- cdef readonly object filename # filename as supplied by user
- cdef readonly object mode # file opening mode
+ cdef readonly object filename # filename as supplied by user
+ cdef readonly object mode # file opening mode
+ cdef readonly object index_filename # filename of index, if supplied by user
cdef readonly VariantHeader header
- cdef readonly BaseIndex index
+ cdef readonly BaseIndex index
- cdef readonly bint drop_samples # true if sample information is to be ignored
+ cdef readonly bint drop_samples # true if sample information is to be ignored
# FIXME: Temporary, use htsFormat when it is available
cdef readonly bint is_bcf # true if file is a bcf file
diff --git a/pysam/cbcf.pyx b/pysam/cbcf.pyx
index 4882503..2a19850 100644
--- a/pysam/cbcf.pyx
+++ b/pysam/cbcf.pyx
@@ -5,13 +5,14 @@
## Cython wrapper for htslib VCF/BCF reader/writer
###############################################################################
#
-# NOTICE: This code is incomplete and preliminary. It does offer a nearly
-# complete immutable Pythonic interface to VCF/BCF metadata and data
-# with reading and writing capability, but has no capability (yet)
-# to mutate the resulting data (beyond dropping all samples).
-# Documentation still needs to be written and a unit test suite is
-# in the works. The code is also superficially specific to Python 2
-# and will require a bit of work to properly adapt to Python 3.
+# NOTICE: This code is incomplete and preliminary. It offers a nearly
+# complete Pythonic interface to VCF/BCF metadata and data with
+# reading and writing capability. It has limited capability to
+# mutate the resulting data. Documentation and a unit test suite
+# are in the works. The code is best tested under Python 2, but
+# should also work with Python 3. Please report any remaining
+# str/bytes issues on the github site when using Python 3 and I'll
+# fix them promptly.
#
# Here is a minimal example of how to use the API:
#
@@ -189,13 +190,28 @@ import os
import sys
from libc.string cimport strcmp, strpbrk
+from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX
cimport cython
-from cpython cimport PyBytes_Check, PyUnicode_Check
+from cpython.object cimport PyObject
+from cpython.ref cimport Py_INCREF
+from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
+from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
+from cpython.bytes cimport PyBytes_FromStringAndSize
+from cpython.unicode cimport PyUnicode_DecodeASCII
from cpython.version cimport PY_MAJOR_VERSION
-__all__ = ['VariantFile', 'VariantHeader']
+from pysam.chtslib cimport hisremote
+
+
+from warnings import warn
+
+
+__all__ = ['VariantFile',
+ 'VariantHeader',
+ 'VariantHeaderRecord',
+ 'VariantRecord']
########################################################################
########################################################################
@@ -217,17 +233,46 @@ cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM')
## Python 3 compatibility functions
########################################################################
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
+from pysam.cutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
from pysam.cutils cimport encode_filename, from_string_and_size
########################################################################
########################################################################
+## VCF/BCF string intern system
+########################################################################
+
+cdef dict bcf_str_cache = {}
+
+cdef inline bcf_str_cache_get_charptr(const char* s):
+ if s == NULL:
+ return None
+
+ cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
+ if pystr:
+ return <object>pystr
+
+ if PY_MAJOR_VERSION < 3:
+ val = s
+ else:
+ val = PyUnicode_DecodeASCII(s, strlen(s), NULL)
+
+ PyDict_SetItemString(bcf_str_cache, s, val)
+
+ return val
+
+
+########################################################################
+########################################################################
## Low level type conversion helpers
########################################################################
-cdef tuple char_array_to_tuple(const char **a, int n, int free_after=0):
+cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
+ return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), "GT") == 0
+
+
+cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
if not a:
return None
try:
@@ -237,7 +282,7 @@ cdef tuple char_array_to_tuple(const char **a, int n, int free_after=0):
free(a)
-cdef bcf_array_to_object(void *data, int type, int n, int scalar=0):
+cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
cdef char *datac
cdef int8_t *data8
cdef int16_t *data16
@@ -250,7 +295,13 @@ cdef bcf_array_to_object(void *data, int type, int n, int scalar=0):
if type == BCF_BT_CHAR:
datac = <char *>data
- value = datac[:n] if datac[0] != bcf_str_missing else None
+ while n and datac[n-1] == bcf_str_vector_end:
+ n -= 1
+ value = charptr_to_str_w_len(datac, n) if datac[0] != bcf_str_missing else None
+ # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
+
+ value = tuple(v or None for v in value.split(',')) if value else ()
+ # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
else:
value = []
if type == BCF_BT_INT8:
@@ -280,45 +331,802 @@ cdef bcf_array_to_object(void *data, int type, int n, int scalar=0):
else:
raise TypeError('unsupported info type code')
- if not value:
+ # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
+ if not value:
+ if scalar:
value = None
- elif scalar and len(value) == 1:
- value = value[0]
+ elif count <= 0:
+ value = ()
else:
- value = tuple(value)
+ value = (None,)*count
+ elif scalar and len(value) == 1:
+ value = value[0]
+ else:
+ value = tuple(value)
+
+ return value
+
+
+cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
+ cdef char *datac
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+ cdef float *dataf
+ cdef ssize_t i, value_count = len(values)
+
+ assert(value_count <= n)
+
+ if bt_type == BCF_BT_CHAR:
+ if not isinstance(values, (str, bytes)):
+ values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
+ value_count = len(values)
+ assert(value_count <= n)
+ datac = <char *>data
+ memcpy(datac, <char *>values, value_count)
+ for i in range(value_count, n):
+ datac[i] = 0
+ elif bt_type == BCF_BT_INT8:
+ datai8 = <int8_t *>data
+ for i in range(value_count):
+ val = values[i]
+ datai8[i] = val if val is not None else bcf_int8_missing
+ for i in range(value_count, n):
+ datai8[i] = bcf_int8_vector_end
+ elif bt_type == BCF_BT_INT16:
+ datai16 = <int16_t *>data
+ for i in range(value_count):
+ val = values[i]
+ datai16[i] = val if val is not None else bcf_int16_missing
+ for i in range(value_count, n):
+ datai16[i] = bcf_int16_vector_end
+ elif bt_type == BCF_BT_INT32:
+ datai32 = <int32_t *>data
+ for i in range(value_count):
+ val = values[i]
+ datai32[i] = val if val is not None else bcf_int32_missing
+ for i in range(value_count, n):
+ datai32[i] = bcf_int32_vector_end
+ elif bt_type == BCF_BT_FLOAT:
+ dataf = <float *>data
+ for i in range(value_count):
+ val = values[i]
+ if val is None:
+ bcf_float_set(dataf + i, bcf_float_missing)
+ else:
+ dataf[i] = val
+ for i in range(value_count, n):
+ bcf_float_set(dataf + i, bcf_float_vector_end)
+ else:
+ raise TypeError('unsupported type')
+
+
+cdef bcf_empty_array(int type, ssize_t n, int vlen):
+ cdef char *datac
+ cdef int32_t *data32
+ cdef float *dataf
+ cdef int i
+
+ if n <= 0:
+ raise ValueError('Cannot create empty array')
+
+ if type == BCF_HT_STR:
+ value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
+ datac = <char *>value
+ for i in range(n):
+ datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
+ elif type == BCF_HT_INT:
+ value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
+ data32 = <int32_t *><char *>value
+ for i in range(n):
+ data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+ elif type == BCF_HT_REAL:
+ value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
+ dataf = <float *><char *>value
+ for i in range(n):
+ bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
+ else:
+ raise TypeError('unsupported header type code')
return value
-cdef object bcf_info_value(const bcf_info_t *z):
+cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values,
+ void *dst_data, int dst_type, ssize_t dst_values,
+ int vlen):
+ cdef char *src_datac
+ cdef char *dst_datac
+ cdef int8_t *src_datai8
+ cdef int16_t *src_datai16
+ cdef int32_t *src_datai32
+ cdef int32_t *dst_datai
+ cdef float *src_dataf
+ cdef float *dst_dataf
+ cdef ssize_t src_size, dst_size, i, j
+ cdef int val
+
+ if src_values > dst_values:
+ raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))
+
+ if src_type == dst_type == BCF_BT_CHAR:
+ src_datac = <char *>src_data
+ dst_datac = <char *>dst_data
+ memcpy(dst_datac, src_datac, src_values)
+ for i in range(src_values, dst_values):
+ dst_datac[i] = 0
+ elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
+ src_datai8 = <int8_t *>src_data
+ dst_datai = <int32_t *>dst_data
+ for i in range(src_values):
+ val = src_datai8[i]
+ if val == bcf_int8_missing:
+ val = bcf_int32_missing
+ elif val == bcf_int8_vector_end:
+ val = bcf_int32_vector_end
+ dst_datai[i] = val
+ for i in range(src_values, dst_values):
+ dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+ elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
+ src_datai16 = <int16_t *>src_data
+ dst_datai = <int32_t *>dst_data
+ for i in range(src_values):
+ val = src_datai16[i]
+ if val == bcf_int16_missing:
+ val = bcf_int32_missing
+ elif val == bcf_int16_vector_end:
+ val = bcf_int32_vector_end
+ dst_datai[i] = val
+ for i in range(src_values, dst_values):
+ dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+ elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
+ src_datai32 = <int32_t *>src_data
+ dst_datai = <int32_t *>dst_data
+ for i in range(src_values):
+ dst_datai[i] = src_datai32[i]
+ for i in range(src_values, dst_values):
+ dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+ elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
+ src_dataf = <float *>src_data
+ dst_dataf = <float *>dst_data
+ for i in range(src_values):
+ dst_dataf[i] = src_dataf[i]
+ for i in range(src_values, dst_values):
+ bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
+ else:
+ raise TypeError('unsupported types')
+
+
+cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar):
+ cdef bcf_hdr_t *hdr = record.header.ptr
+ cdef bcf1_t *r = record.ptr
+ cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
+ cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
+
+ scalar[0] = 0
+
+ if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
+ count[0] = number
+ elif length == BCF_VL_FIXED:
+ if number == 1:
+ scalar[0] = 1
+ count[0] = number
+ elif length == BCF_VL_R:
+ count[0] = r.n_allele
+ elif length == BCF_VL_A:
+ count[0] = r.n_allele - 1
+ elif length == BCF_VL_G:
+ count[0] = r.n_allele * (r.n_allele + 1) // 2
+ elif length == BCF_VL_VAR:
+ count[0] = -1
+ else:
+ raise ValueError('Unknown format length')
+
+
+cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
+ cdef bcf_hdr_t *hdr = record.header.ptr
+
cdef char *s
+ cdef ssize_t count
+ cdef int scalar
- if not z:
- return None
- elif z.len == 0:
- value = True
+ bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar)
+
+ if z.len == 0:
+ if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
+ value = True
+ elif scalar:
+ value = None
+ else:
+ value = ()
elif z.len == 1:
if z.type == BCF_BT_INT8:
- value = z.v1.i if z.v1.i != bcf_int8_missing else None
+ if z.v1.i == bcf_int8_missing:
+ value = None
+ elif z.v1.i == bcf_int8_vector_end:
+ value = ()
+ else:
+ value = z.v1.i
elif z.type == BCF_BT_INT16:
- value = z.v1.i if z.v1.i != bcf_int16_missing else None
+ if z.v1.i == bcf_int16_missing:
+ value = None
+ elif z.v1.i == bcf_int16_vector_end:
+ value = ()
+ else:
+ value = z.v1.i
elif z.type == BCF_BT_INT32:
- value = z.v1.i if z.v1.i != bcf_int32_missing else None
+ if z.v1.i == bcf_int32_missing:
+ value = None
+ elif z.v1.i == bcf_int32_vector_end:
+ value = ()
+ else:
+ value = z.v1.i
elif z.type == BCF_BT_FLOAT:
- value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None
+ if bcf_float_is_missing(z.v1.f):
+ value = None
+ elif bcf_float_is_vector_end(z.v1.f):
+ value = ()
+ else:
+ value = z.v1.f
elif z.type == BCF_BT_CHAR:
- s = <char *>&z.v1.i
- value = s if not s or s[0] != bcf_str_missing else None
+ value = force_str(chr(z.v1.i))
else:
raise TypeError('unsupported info type code')
+
+ if not scalar and value != ():
+ value = (value,)
else:
- value = bcf_array_to_object(z.vptr, z.type, z.len)
+ value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)
return value
-cdef inline int is_gt_fmt(bcf_hdr_t *hdr, bcf_fmt_t *fmt):
- return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id), "GT") == 0
+cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_type,
+ int id, int bt_type, ssize_t bt_len, ssize_t *value_count,
+ int *scalar, int *realloc):
+
+ bcf_get_value_count(record, hl_type, id, value_count, scalar)
+
+ # Promote scalar values to a 1-tuple so the validation below is uniform
+ values = (value,) if not isinstance(value, tuple) else value
+
+ # Validate values now that we know the type and size
+ if ht_type == BCF_HT_FLAG:
+ value_count[0] = 1
+
+ if value_count[0] != -1 and value_count[0] != len(values):
+ if scalar[0]:
+ raise TypeError('value expected to be scalar')
+ else:
+ raise TypeError('values expected to be {:d}-tuple'.format(value_count[0]))
+
+ if ht_type == BCF_HT_REAL:
+ for v in values:
+ if not(v is None or isinstance(v, (float, int))):
+ raise TypeError('invalid value for Float format')
+ elif ht_type == BCF_HT_INT:
+ for v in values:
+ if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
+ raise TypeError('invalid value for Integer format')
+ for v in values:
+ if not(v is None or bcf_int32_missing < v <= INT32_MAX):
+ raise ValueError('Integer value too small/large to store in VCF/BCF')
+ elif ht_type == BCF_HT_STR:
+ values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
+ elif ht_type == BCF_HT_FLAG:
+ if values[0] not in (True, False, None, 1, 0):
+ raise ValueError('Flag values must be: True, False, None, 1, 0')
+ else:
+ raise TypeError('unsupported type')
+
+ realloc[0] = 0
+ if len(values) <= 1 and hl_type == BCF_HL_INFO:
+ realloc[0] = 0
+ elif len(values) > bt_len:
+ realloc[0] = 1
+ elif bt_type == BCF_BT_INT8:
+ for v in values:
+ if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
+ realloc[0] = 1
+ break
+ elif bt_type == BCF_BT_INT16:
+ for v in values:
+ if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
+ realloc[0] = 1
+ break
+
+ return values
+
+
+cdef bcf_encode_alleles(VariantRecord record, values):
+ cdef bcf1_t *r = record.ptr
+ cdef int32_t nalleles = r.n_allele
+ cdef list gt_values = []
+ cdef char *s
+ cdef int i
+
+ if not values:
+ return ()
+
+ if not isinstance(values, (list, tuple)):
+ values = (values,)
+
+ for value in values:
+ if value is None:
+ gt_values.append(None)
+ elif isinstance(value, (str, bytes)):
+ bvalue = force_bytes(value)
+ s = bvalue
+ for i in range(r.n_allele):
+ if strcmp(r.d.allele[i], s) == 0:
+ gt_values.append(bcf_gt_unphased(i))
+ break
+ else:
+ raise ValueError('Unknown allele')
+ else:
+ i = value
+ if not (0 <= i < nalleles):
+ raise ValueError('Invalid allele index')
+ gt_values.append(bcf_gt_unphased(i))
+
+ return gt_values
+
+
+cdef bcf_info_set_value(VariantRecord record, key, value):
+ cdef bcf_hdr_t *hdr = record.header.ptr
+ cdef bcf1_t *r = record.ptr
+ cdef vdict_t *d
+ cdef khiter_t k
+ cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
+ cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ if info:
+ info_id = info.key
+ else:
+ d = <vdict_t *>hdr.dict[BCF_DT_ID]
+ k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
+ raise KeyError('unknown INFO')
+
+ info_id = kh_val_vdict(d, k).id
+
+ info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
+ values = bcf_check_values(record, value, BCF_HL_INFO, info_type, info_id,
+ info.type if info else -1, info.len if info else -1,
+ &value_count, &scalar, &realloc)
+
+ if info_type == BCF_HT_FLAG:
+ if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
+ raise ValueError('Unable to update INFO values')
+ return
+
+ vlen = value_count < 0
+ value_count = len(values)
+
+ # If we can, write updated values to existing allocated storage
+ if info and not realloc:
+ r.d.shared_dirty |= BCF1_DIRTY_INF
+
+ if value_count == 0:
+ info.len = 0
+ # FIXME: Check if need to free vptr if info.len > 0?
+ elif value_count == 1:
+ # FIXME: Check if need to free vptr if info.len > 0?
+ if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
+ bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
+ elif info.type == BCF_BT_FLOAT:
+ bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
+ else:
+ raise TypeError('unsupported info type code')
+ info.len = 1
+ else:
+ bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
+ return
+
+ alloc_len = max(1, value_count)
+ if info and info.len > alloc_len:
+ alloc_len = info.len
+
+ new_values = bcf_empty_array(info_type, alloc_len, vlen)
+ cdef char *valp = <char *>new_values
+
+ if info_type == BCF_HT_INT:
+ dst_type = BCF_BT_INT32
+ elif info_type == BCF_HT_REAL:
+ dst_type = BCF_BT_FLOAT
+ elif info_type == BCF_HT_STR:
+ dst_type = BCF_BT_CHAR
+ else:
+ raise ValueError('Unsupported INFO type')
+
+ bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)
+
+ if bcf_update_info(hdr, r, bkey, valp, <int>alloc_len, info_type) < 0:
+ raise ValueError('Unable to update INFO values')
+
+
+cdef bcf_info_del_value(VariantRecord record, key):
+ cdef bcf_hdr_t *hdr = record.header.ptr
+ cdef bcf1_t *r = record.ptr
+ cdef ssize_t value_count
+ cdef int scalar
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ if not info:
+ raise KeyError(key)
+
+ bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar)
+
+ if value_count <= 0:
+ null_value = ()
+ elif scalar:
+ null_value = None
+ else:
+ null_value = (None,)*value_count
+
+ bcf_info_set_value(record, bkey, null_value)
+
+
+cdef bcf_format_get_value(VariantRecordSample sample, key):
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef ssize_t count
+ cdef int scalar
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if not fmt or not fmt.p:
+ raise KeyError('invalid FORMAT')
+
+ if is_gt_fmt(hdr, fmt.id):
+ return bcf_format_get_allele_indices(sample)
+
+ bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar)
+
+ if fmt.p and fmt.n and fmt.size:
+ return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
+ elif scalar:
+ return None
+ elif count <= 0:
+ return ()
+ else:
+ return (None,)*count
+
+
+cdef bcf_format_set_value(VariantRecordSample sample, key, value):
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int fmt_id
+ cdef vdict_t *d
+ cdef khiter_t k
+ cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
+ cdef ssize_t i, n, value_count, alloc_size, alloc_len, dst_size
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if fmt:
+ fmt_id = fmt.id
+ else:
+ d = <vdict_t *>hdr.dict[BCF_DT_ID]
+ k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
+ raise KeyError('unknown format')
+
+ fmt_id = kh_val_vdict(d, k).id
+
+ fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)
+
+ if fmt_type == BCF_HT_FLAG:
+ raise ValueError('Flag types are not allowed on FORMATs')
+
+ if is_gt_fmt(hdr, fmt_id):
+ value = bcf_encode_alleles(sample.record, value)
+
+ values = bcf_check_values(sample.record, value, BCF_HL_FMT, fmt_type, fmt_id,
+ fmt.type if fmt else -1, fmt.n if fmt else -1,
+ &value_count, &scalar, &realloc)
+
+ vlen = value_count < 0
+ value_count = len(values)
+
+ # If we can, write updated values to existing allocated storage
+ if fmt and not realloc:
+ r.d.indiv_dirty = 1
+ bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
+ return
+
+ alloc_len = max(1, value_count)
+ if fmt and fmt.n > alloc_len:
+ alloc_len = fmt.n
+
+ n = bcf_hdr_nsamples(hdr)
+ new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen)
+ cdef char *valp = <char *>new_values
+
+ if fmt_type == BCF_HT_INT:
+ dst_type = BCF_BT_INT32
+ dst_size = sizeof(int32_t) * alloc_len
+ elif fmt_type == BCF_HT_REAL:
+ dst_type = BCF_BT_FLOAT
+ dst_size = sizeof(float) * alloc_len
+ elif fmt_type == BCF_HT_STR:
+ dst_type = BCF_BT_CHAR
+ dst_size = sizeof(char) * alloc_len
+ else:
+ raise ValueError('Unsupported FORMAT type')
+
+ if fmt and n > 1:
+ for i in range(n):
+ bcf_copy_expand_array(fmt.p + i*fmt.size, fmt.type, fmt.n,
+ valp + i*dst_size, dst_type, alloc_len,
+ vlen)
+
+ bcf_object_to_array(values, valp + sample.index*dst_size, dst_type, alloc_len, vlen)
+
+ if bcf_update_format(hdr, r, bkey, valp, <int>(n*alloc_len), fmt_type) < 0:
+ raise ValueError('Unable to update format values')
+
+
+cdef bcf_format_del_value(VariantRecordSample sample, key):
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef ssize_t value_count
+ cdef int scalar
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if not fmt or not fmt.p:
+ raise KeyError(key)
+
+ bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar)
+
+ if value_count <= 0:
+ null_value = ()
+ elif scalar:
+ null_value = None
+ else:
+ null_value = (None,)*value_count
+
+ bcf_format_set_value(sample, bkey, null_value)
+
+
+cdef bcf_format_get_allele_indices(VariantRecordSample sample):
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ if sample.index < 0 or sample.index >= n or not r.n_fmt:
+ return ()
+
+ cdef bcf_fmt_t *fmt0 = r.d.fmt
+ cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
+
+ if not gt0 or not fmt0.n:
+ return ()
+
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+ cdef int32_t a, nalleles = r.n_allele
+ cdef list alleles = []
+
+ if fmt0.type == BCF_BT_INT8:
+ data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ elif data8[i] == bcf_int8_missing:
+ a = -1
+ else:
+ a = bcf_gt_allele(data8[i])
+ alleles.append(a if 0 <= a < nalleles else None)
+ elif fmt0.type == BCF_BT_INT16:
+ data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ elif data16[i] == bcf_int16_missing:
+ a = -1
+ else:
+ a = bcf_gt_allele(data16[i])
+ alleles.append(a if 0 <= a < nalleles else None)
+ elif fmt0.type == BCF_BT_INT32:
+ data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ elif data32[i] == bcf_int32_missing:
+ a = -1
+ else:
+ a = bcf_gt_allele(data32[i])
+ alleles.append(a if 0 <= a < nalleles else None)
+
+ return tuple(alleles)
+
+
+cdef bcf_format_get_alleles(VariantRecordSample sample):
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int32_t nsamples = bcf_hdr_nsamples(hdr)
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ cdef int32_t nalleles = r.n_allele
+
+ if sample.index < 0 or sample.index >= nsamples or not r.n_fmt:
+ return ()
+
+ cdef bcf_fmt_t *fmt0 = r.d.fmt
+ cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
+
+ if not gt0 or not fmt0.n:
+ return ()
+
+ cdef int32_t a
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+ alleles = []
+ if fmt0.type == BCF_BT_INT8:
+ data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ a = bcf_gt_allele(data8[i])
+ alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
+ elif fmt0.type == BCF_BT_INT16:
+ data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ a = bcf_gt_allele(data16[i])
+ alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
+ elif fmt0.type == BCF_BT_INT32:
+ data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ a = bcf_gt_allele(data32[i])
+ alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
+ return tuple(alleles)
+
+
+cdef bint bcf_sample_get_phased(VariantRecordSample sample):
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ if sample.index < 0 or sample.index >= n or not r.n_fmt:
+ return False
+
+ cdef bcf_fmt_t *fmt0 = r.d.fmt
+ cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
+
+ if not gt0 or not fmt0.n:
+ return False
+
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+
+ cdef bint phased = False
+
+ if fmt0.type == BCF_BT_INT8:
+ data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ elif data8[i] == bcf_int8_missing:
+ continue
+ elif i and not bcf_gt_is_phased(data8[i]):
+ return False
+ else:
+ phased = True
+ elif fmt0.type == BCF_BT_INT16:
+ data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ elif data16[i] == bcf_int16_missing:
+ continue
+ elif i and not bcf_gt_is_phased(data16[i]):
+ return False
+ else:
+ phased = True
+ elif fmt0.type == BCF_BT_INT32:
+ data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ elif data32[i] == bcf_int32_missing:
+ continue
+ elif i and not bcf_gt_is_phased(data32[i]):
+ return False
+ else:
+ phased = True
+
+ return phased
+
+
+cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ if sample.index < 0 or sample.index >= n or not r.n_fmt:
+ return
+
+ cdef bcf_fmt_t *fmt0 = r.d.fmt
+ cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
+
+ if not gt0 or not fmt0.n:
+ raise ValueError('Cannot set phased before genotype is set')
+
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+
+ if fmt0.type == BCF_BT_INT8:
+ data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ elif data8[i] == bcf_int8_missing:
+ continue
+ elif i:
+ data8[i] = (data8[i] & 0xFE) | phased
+ elif fmt0.type == BCF_BT_INT16:
+ data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ elif data16[i] == bcf_int16_missing:
+ continue
+ elif i:
+ data16[i] = (data16[i] & 0xFFFE) | phased
+ elif fmt0.type == BCF_BT_INT32:
+ data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ elif data32[i] == bcf_int32_missing:
+ continue
+ elif i:
+ data32[i] = (data32[i] & 0xFFFFFFFE) | phased
########################################################################
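
The helpers above back the Python-level INFO/FORMAT access on VariantRecord
objects. A sketch of typical reads and of the newly supported writes,
assuming the item-assignment plumbing routes through the setters above, that
example.vcf.gz defines a DP INFO field, has a sample named NA00001, and that
the record carries an ALT allele:

    import pysam

    vcf = pysam.VariantFile("example.vcf.gz")
    for rec in vcf:
        print(rec.info["DP"])                  # scalar or tuple, per header Number
        print(rec.samples["NA00001"]["GT"])    # genotype as allele indices
        rec.info["DP"] = 42                    # limited mutation is now possible
        rec.samples["NA00001"]["GT"] = (0, 1)
        break
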
@@ -342,21 +1150,22 @@ cdef class VariantHeaderRecord(object):
"""header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
def __get__(self):
cdef bcf_hrec_t *r = self.ptr
- return r.key if r.key else None
+ return bcf_str_cache_get_charptr(r.key) if r.key else None
property value:
"""header value. Set only for generic lines, None for FILTER/INFO, etc."""
def __get__(self):
cdef bcf_hrec_t *r = self.ptr
- return r.value if r.value else None
+ return charptr_to_str(r.value) if r.value else None
property attrs:
"""sequence of additional header attributes"""
def __get__(self):
cdef bcf_hrec_t *r = self.ptr
cdef int i
- return tuple( (r.keys[i] if r.keys[i] else None,
- r.vals[i] if r.vals[i] else None) for i in range(r.nkeys) )
+ return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
+ charptr_to_str(r.vals[i]) if r.vals[i] else None)
+ for i in range(r.nkeys))
def __len__(self):
cdef bcf_hrec_t *r = self.ptr
@@ -364,17 +1173,16 @@ cdef class VariantHeaderRecord(object):
def __bool__(self):
cdef bcf_hrec_t *r = self.ptr
- cdef int i
- for i in range(r.nkeys):
- yield r.keys[i]
+ return r.nkeys != 0
def __getitem__(self, key):
"""get attribute value"""
cdef bcf_hrec_t *r = self.ptr
cdef int i
+ bkey = force_bytes(key)
for i in range(r.nkeys):
- if r.keys[i] and r.keys[i] == key:
- return r.vals[i] if r.vals[i] else None
+ if r.keys[i] and r.keys[i] == bkey:
+ return charptr_to_str(r.vals[i]) if r.vals[i] else None
raise KeyError('cannot find metadata key')
def __iter__(self):
@@ -382,7 +1190,7 @@ cdef class VariantHeaderRecord(object):
cdef int i
for i in range(r.nkeys):
if r.keys[i]:
- yield r.keys[i]
+ yield bcf_str_cache_get_charptr(r.keys[i])
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
@@ -409,7 +1217,7 @@ cdef class VariantHeaderRecord(object):
cdef int i
for i in range(r.nkeys):
if r.keys[i]:
- yield r.vals[i] if r.vals[i] else None
+ yield charptr_to_str(r.vals[i]) if r.vals[i] else None
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
@@ -417,7 +1225,7 @@ cdef class VariantHeaderRecord(object):
cdef int i
for i in range(r.nkeys):
if r.keys[i]:
- yield r.keys[i], r.vals[i] if r.vals[i] else None
+ yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)
def keys(self):
"""D.keys() -> list of D's keys"""
@@ -492,12 +1300,14 @@ cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
cdef class VariantMetadata(object):
- """filter, info or format metadata record from a :class:`VariantHeader` object"""
+ """filter, info or format metadata record from a :class:`VariantHeader`
+ object"""
+
property name:
"""metadata name"""
def __get__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
- return hdr.id[BCF_DT_ID][self.id].key
+ return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
# Q: Should this be exposed?
property id:
@@ -523,7 +1333,8 @@ cdef class VariantMetadata(object):
"""metadata value type"""
def __get__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
- if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or self.type == BCF_HL_FLT:
+ if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or \
+ self.type == BCF_HL_FLT:
return None
return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
@@ -533,10 +1344,11 @@ cdef class VariantMetadata(object):
descr = self.record.get('Description')
if descr:
descr = descr.strip('"')
- return descr
+ return force_str(descr)
property record:
- """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
+ """:class:`VariantHeaderRecord` associated with this
+ :class:`VariantMetadata` object"""
def __get__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
if not bcf_hdr_idinfo_exists(hdr, self.type, self.id):
@@ -586,7 +1398,10 @@ cdef class VariantHeaderMetadata(object):
if number is None:
number = '.'
- items = [('ID', id), ('Number', number), ('Type', type), ('Description', description)]
+ items = [('ID', id),
+ ('Number', number),
+ ('Type', type),
+ ('Description', description)]
items += kwargs.items()
self.header.add_meta(METADATA_TYPES[self.type], items=items)
@@ -600,7 +1415,6 @@ cdef class VariantHeaderMetadata(object):
idpair = hdr.id[BCF_DT_ID] + i
if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
n += 1
-
return n
def __bool__(self):
@@ -612,13 +1426,14 @@ cdef class VariantHeaderMetadata(object):
idpair = hdr.id[BCF_DT_ID] + i
if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
return True
-
return False
def __getitem__(self, key):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
- cdef khiter_t k = kh_get_vdict(d, key)
+
+ bkey = force_bytes(key)
+ cdef khiter_t k = kh_get_vdict(d, bkey)
if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
raise KeyError('invalid filter')
@@ -633,7 +1448,7 @@ cdef class VariantHeaderMetadata(object):
for i in range(hdr.n[BCF_DT_ID]):
idpair = hdr.id[BCF_DT_ID] + i
if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
- yield idpair.key
+ yield bcf_str_cache_get_charptr(idpair.key)
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
@@ -700,7 +1515,7 @@ cdef class VariantContig(object):
"""contig name"""
def __get__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
- return hdr.id[BCF_DT_CTG][self.id].key
+ return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
property id:
"""contig internal id number"""
@@ -760,7 +1575,8 @@ cdef class VariantHeaderContigs(object):
return makeVariantContig(self.header, index)
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
- cdef khiter_t k = kh_get_vdict(d, key)
+ bkey = force_bytes(key)
+ cdef khiter_t k = kh_get_vdict(d, bkey)
if k == kh_end(d):
raise KeyError('invalid contig')
@@ -777,7 +1593,7 @@ cdef class VariantHeaderContigs(object):
assert n == hdr.n[BCF_DT_CTG]
for i in range(n):
- yield bcf_hdr_id2name(hdr, i)
+ yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i))
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
@@ -861,19 +1677,20 @@ cdef class VariantHeaderSamples(object):
if i < 0 or i >= n:
raise IndexError('invalid sample index')
- return hdr.samples[i]
+ return charptr_to_str(hdr.samples[i])
def __iter__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef int32_t i, n = bcf_hdr_nsamples(hdr)
for i in range(n):
- yield hdr.samples[i]
+ yield charptr_to_str(hdr.samples[i])
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
- cdef khiter_t k = kh_get_vdict(d, key)
+ bkey = force_bytes(key)
+ cdef khiter_t k = kh_get_vdict(d, bkey)
return k != kh_end(d)
@@ -929,7 +1746,7 @@ cdef class VariantHeader(object):
property version:
"""VCF version"""
def __get__(self):
- return bcf_hdr_get_version(self.ptr)
+ return force_str(bcf_hdr_get_version(self.ptr))
property samples:
"""samples (:class:`VariantHeaderSamples`)"""
@@ -962,15 +1779,19 @@ cdef class VariantHeader(object):
return makeVariantHeaderMetadata(self, BCF_HL_FMT)
property alts:
- """
- alt metadata (:class:`dict` ID->record). The data returned just a snapshot of alt records,
- is created every time the property is requested, and modifications will not be reflected
- in the header metadata and vice versa.
+ """alt metadata (:class:`dict` ID->record).
+
+ The data returned is just a snapshot of alt records; it is created
+ every time the property is requested, and modifications will
+ not be reflected in the header metadata and vice versa.
+
+ i.e. it is just a dict that reflects the state of alt records
+ at the time it is created.
- i.e. it is just a dict that reflects the state of alt records at the time it is created.
"""
def __get__(self):
- return { record['ID']:record for record in self.records if record.key.upper() == 'ALT' }
+ return {record['ID']:record for record in self.records
+ if record.key.upper() == 'ALT' }
# only safe to do when opening an htsfile
@@ -982,22 +1803,26 @@ cdef class VariantHeader(object):
if missing_samples:
# FIXME: add specialized exception with payload
- raise ValueError('missing {:d} requested samples'.format(len(missing_samples)))
+ raise ValueError(
+ 'missing {:d} requested samples'.format(
+ len(missing_samples)))
- keep_samples = ','.join(keep_samples)
+ keep_samples = force_bytes(b','.join(keep_samples))
cdef char *keep = <char *>keep_samples if keep_samples else NULL
cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0)
if ret != 0:
- raise ValueError('bcf_hdr_set_samples failed: ret = {}'.format(ret))
+ raise ValueError(
+ 'bcf_hdr_set_samples failed: ret = {}'.format(ret))
def __str__(self):
cdef int hlen
cdef char *hstr = bcf_hdr_fmt_text(self.ptr, 0, &hlen)
- ret = hstr[:hlen]
- free(hstr)
- return force_str(hstr)
+ try:
+ return charptr_to_str_w_len(hstr, hlen)
+ finally:
+ free(hstr)
def add_record(self, VariantHeaderRecord record):
"""Add an existing :class:`VariantHeaderRecord` to this header"""
@@ -1011,7 +1836,8 @@ cdef class VariantHeader(object):
def add_line(self, line):
"""Add a metadata line to this header"""
- if bcf_hdr_append(self.ptr, line) < 0:
+ bline = force_bytes(line)
+ if bcf_hdr_append(self.ptr, bline) < 0:
raise ValueError('invalid header line')
if self.ptr.dirty:
@@ -1026,17 +1852,19 @@ cdef class VariantHeader(object):
cdef int quoted
try:
+ key = force_bytes(key)
hrec.key = strdup(key)
if value is not None:
- hrec.value = strdup(value)
+ hrec.value = strdup(force_bytes(value))
else:
for key, value in items:
- bcf_hrec_add_key(hrec, key, len(key))
+ key = force_bytes(key)
+ bcf_hrec_add_key(hrec, key, <int>len(key))
- value = str(value)
+ value = force_bytes(str(value))
quoted = strpbrk(value, ' ;,"\t<>') != NULL
- bcf_hrec_set_val(hrec, hrec.nkeys-1, value, len(value), quoted)
+ bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted)
except:
bcf_hrec_destroy(hrec)
raise
@@ -1048,7 +1876,8 @@ cdef class VariantHeader(object):
def add_sample(self, name):
"""Add a new sample to this header"""
- if bcf_hdr_add_sample(self.ptr, name) < 0:
+ bname = force_bytes(name)
+ if bcf_hdr_add_sample(self.ptr, bname) < 0:
raise ValueError('Duplicated sample name: {}'.format(name))
if self.ptr.dirty:
bcf_hdr_sync(self.ptr)
@@ -1070,7 +1899,8 @@ cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
########################################################################
cdef class VariantRecordFilter(object):
- """mapping from filter index or name to :class:`VariantMetadata` object for filters set on a :class:`VariantRecord` object."""
+ """Filters set on a :class:`VariantRecord` object, presented as a mapping from
+ filter index or name to :class:`VariantMetadata` object"""
def __len__(self):
return self.record.ptr.d.n_flt
@@ -1095,20 +1925,54 @@ cdef class VariantRecordFilter(object):
if key == '.':
key = 'PASS'
- id = bcf_hdr_id2int(hdr, BCF_DT_ID, key)
+ bkey = force_bytes(key)
+ id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
- if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, self.record.ptr, key):
+ if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) \
+ or not bcf_has_filter(hdr, self.record.ptr, bkey):
raise KeyError('Invalid filter')
return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
+ def __delitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int index, id
+ cdef int n = r.d.n_flt
+
+ if isinstance(key, int):
+ index = key
+
+ if index < 0 or index >= n:
+ raise IndexError('invalid filter index')
+
+ id = r.d.flt[index]
+ else:
+ if key == '.':
+ key = 'PASS'
+
+ bkey = force_bytes(key)
+ id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
+
+ if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) \
+ or not bcf_has_filter(hdr, self.record.ptr, bkey):
+ raise KeyError('Invalid filter')
+
+ bcf_remove_filter(hdr, r, id, 0)
+
+ def clear(self):
+ """Clear all filters"""
+ cdef bcf1_t *r = self.record.ptr
+ r.d.shared_dirty |= BCF1_DIRTY_FLT
+ r.d.n_flt = 0
+
def __iter__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int i, n = r.d.n_flt
+ cdef int i
- for i in range(n):
- yield bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i])
+ for i in range(r.d.n_flt):
+ yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i]))
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
@@ -1120,7 +1984,8 @@ cdef class VariantRecordFilter(object):
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- return bcf_has_filter(hdr, r, key) == 1
+ bkey = force_bytes(key)
+ return bcf_has_filter(hdr, r, bkey) == 1
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
@@ -1165,40 +2030,80 @@ cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
cdef class VariantRecordFormat(object):
- """mapping from format name or index to :class:`VariantMetadata` object for formats present in a :class:`VariantRecord` object."""
+ """Format data present for each sample in a :class:`VariantRecord` object,
+ presented as mapping from format name to :class:`VariantMetadata` object."""
def __len__(self):
- return self.record.ptr.n_fmt
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i, n = 0
+
+ for i in range(r.n_fmt):
+ if r.d.fmt[i].p:
+ n += 1
+ return n
def __bool__(self):
- return self.record.ptr.n_fmt != 0
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i
+
+ for i in range(r.n_fmt):
+ if r.d.fmt[i].p:
+ return True
+ return False
def __getitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef bcf_fmt_t *fmt
- cdef int index
- cdef int n = r.n_fmt
- if isinstance(key, int):
- index = key
- if index < 0 or index >= n:
- raise IndexError('invalid format index')
- fmt = &r.d.fmt[index]
- else:
- fmt = bcf_get_fmt(hdr, r, key)
- if not fmt:
- raise KeyError('unknown format')
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if not fmt or not fmt.p:
+ raise KeyError('unknown format')
return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
+ def __delitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if not fmt or not fmt.p:
+ raise KeyError('unknown format')
+
+ if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
+ raise ValueError('Unable to delete FORMAT')
+
+ def clear(self):
+ """Clear all formats for all samples within the associated
+ :class:`VariantRecord` instance"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_fmt_t *fmt
+ cdef const char *key
+ cdef int i
+
+ for i in reversed(range(r.n_fmt)):
+ fmt = &r.d.fmt[i]
+ if fmt.p:
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)
+ if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0:
+ raise ValueError('Unable to delete FORMAT')
+
def __iter__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int i, n = r.n_fmt
+ cdef bcf_fmt_t *fmt
+ cdef int i
- for i in range(n):
- yield bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.fmt[i].id)
+ for i in range(r.n_fmt):
+ fmt = &r.d.fmt[i]
+ if fmt.p:
+ yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
@@ -1210,8 +2115,9 @@ cdef class VariantRecordFormat(object):
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, key)
- return fmt != NULL
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+ return fmt != NULL and fmt.p != NULL
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
@@ -1249,7 +2155,8 @@ cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
if not record:
raise ValueError('invalid VariantRecord')
- cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat)
+ cdef VariantRecordFormat format = VariantRecordFormat.__new__(
+ VariantRecordFormat)
format.record = record
return format
@@ -1257,7 +2164,8 @@ cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
#TODO: Add a getmeta method to return the corresponding VariantMetadata?
cdef class VariantRecordInfo(object):
- """mapping from info metadata name to value for info data present in a :class:`VariantRecord` object."""
+ """Info data stored in a :class:`VariantRecord` object, presented as a
+ mapping from info metadata name to value."""
def __len__(self):
return self.record.ptr.n_info
@@ -1268,20 +2176,82 @@ cdef class VariantRecordInfo(object):
def __getitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef bcf_info_t *info = bcf_get_info(hdr, r, key)
+ cdef vdict_t *d
+ cdef khiter_t k
+ cdef info_id
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
if not info:
+ d = <vdict_t *>hdr.dict[BCF_DT_ID]
+ k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
+ raise KeyError('Unknown INFO field: {}'.format(key))
+
+ info_id = kh_val_vdict(d, k).id
+ else:
+ info_id = info.key
+
+ if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
+ return info != NULL and info.vptr != NULL
+
+ if not info or not info.vptr:
+ raise KeyError('Invalid INFO field: {}'.format(key))
+
+ return bcf_info_get_value(self.record, info)
+
+ def __setitem__(self, key, value):
+ bcf_info_set_value(self.record, key, value)
+
+ def __delitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ if not info or not info.vptr:
raise KeyError('Unknown INFO field: {}'.format(key))
- return bcf_info_value(info)
+ if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
+ raise ValueError('Unable to delete INFO')
+
+ def clear(self):
+ """Clear all info data"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_info_t *info
+ cdef const char *key
+ cdef int i
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ if info and info.vptr:
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
+ raise ValueError('Unable to delete INFO')
def __iter__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int i, n = r.n_info
+ cdef bcf_info_t *info
+ cdef int i
- for i in range(n):
- yield bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.info[i].key)
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ if info and info.vptr:
+ yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, info.key))
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
@@ -1293,7 +2263,12 @@ cdef class VariantRecordInfo(object):
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef bcf_info_t *info = bcf_get_info(hdr, r, key)
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
return info != NULL
@@ -1305,24 +2280,26 @@ cdef class VariantRecordInfo(object):
"""D.itervalues() -> an iterator over the values of D"""
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
- cdef int i, n = r.n_info
+ cdef int i
- for i in range(n):
+ for i in range(r.n_info):
info = &r.d.info[i]
- yield bcf_info_value(info)
+ if info and info.vptr:
+ yield bcf_info_get_value(self.record, info)
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
- cdef int i, n = r.n_info
+ cdef int i
- for i in range(n):
+ for i in range(r.n_info):
info = &r.d.info[i]
- key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
- value = bcf_info_value(info)
- yield key, value
+ if info and info.vptr:
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ value = bcf_info_get_value(self.record, info)
+ yield bcf_str_cache_get_charptr(key), value
def keys(self):
"""D.keys() -> list of D's keys"""
@@ -1372,7 +2349,8 @@ cdef class VariantRecordSamples(object):
if isinstance(key, int):
sample_index = key
else:
- sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, key)
+ bkey = force_bytes(key)
+ sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
if sample_index < 0:
raise KeyError('invalid sample name')
@@ -1387,7 +2365,7 @@ cdef class VariantRecordSamples(object):
cdef int32_t i, n = bcf_hdr_nsamples(hdr)
for i in range(n):
- yield hdr.samples[i]
+ yield charptr_to_str(hdr.samples[i])
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
@@ -1407,7 +2385,8 @@ cdef class VariantRecordSamples(object):
if isinstance(key, int):
sample_index = key
else:
- sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, key)
+ bkey = force_bytes(key)
+ sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
if sample_index < 0:
raise KeyError('invalid sample name')
@@ -1433,7 +2412,7 @@ cdef class VariantRecordSamples(object):
cdef int32_t i, n = bcf_hdr_nsamples(hdr)
for i in range(n):
- yield hdr.samples[i], makeVariantRecordSample(self.record, i)
+ yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
def keys(self):
"""D.keys() -> list of D's keys"""
@@ -1457,7 +2436,8 @@ cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
if not record:
raise ValueError('invalid VariantRecord')
- cdef VariantRecordSamples samples = VariantRecordSamples.__new__(VariantRecordSamples)
+ cdef VariantRecordSamples samples = VariantRecordSamples.__new__(
+ VariantRecordSamples)
samples.record = record
return samples
@@ -1485,10 +2465,11 @@ cdef class VariantRecord(object):
property chrom:
"""chromosome/contig name"""
def __get__(self):
- return bcf_hdr_id2name(self.header.ptr, self.ptr.rid)
+ return bcf_str_cache_get_charptr(bcf_hdr_id2name(self.header.ptr, self.ptr.rid))
def __set__(self, chrom):
cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
- cdef khint_t k = kh_get_vdict(d, chrom)
+ bchrom = force_bytes(chrom)
+ cdef khint_t k = kh_get_vdict(d, bchrom)
if k == kh_end(d):
raise ValueError('Invalid chromosome/contig')
self.ptr.rid = kh_val_vdict(d, k).id
@@ -1496,10 +2477,11 @@ cdef class VariantRecord(object):
property contig:
"""chromosome/contig name"""
def __get__(self):
- return bcf_hdr_id2name(self.header.ptr, self.ptr.rid)
+ return bcf_str_cache_get_charptr(bcf_hdr_id2name(self.header.ptr, self.ptr.rid))
def __set__(self, chrom):
cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
- cdef khint_t k = kh_get_vdict(d, chrom)
+ bchrom = force_bytes(chrom)
+ cdef khint_t k = kh_get_vdict(d, bchrom)
if k == kh_end(d):
raise ValueError('Invalid chromosome/contig')
self.ptr.rid = kh_val_vdict(d, k).id
@@ -1512,6 +2494,11 @@ cdef class VariantRecord(object):
if pos < 1:
raise ValueError('Position must be positive')
# FIXME: check start <= stop?
+ # KBJ: Can't or else certain mutating operations will become
+ # difficult or impossible. e.g. having to delete
+ # info['END'] before being able to reset pos is going to
+ # create subtle bugs. Better to check this when writing
+ # records.
self.ptr.pos = pos - 1
property start:
@@ -1522,6 +2509,7 @@ cdef class VariantRecord(object):
if start < 0:
raise ValueError('Start coordinate must be non-negative')
# FIXME: check start <= stop?
+ # KBJ: See above.
self.ptr.pos = start
property stop:
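
The KBJ comments explain why start <= stop is deliberately left unchecked on assignment. For orientation: pos is the 1-based VCF coordinate and start the 0-based python coordinate, so the two setters differ by exactly one, as in this sketch:

    # rec is a pysam.VariantRecord (hypothetical data).
    rec.pos = 1001            # 1-based, as printed in the VCF POS column
    assert rec.start == 1000  # 0-based python coordinate
    rec.start = 0             # equivalent to rec.pos = 1
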
@@ -1550,92 +2538,105 @@ cdef class VariantRecord(object):
if qual is not None:
self.ptr.qual = qual
else:
- memcpy(&self.ptr.qual, &bcf_float_missing, 4)
-
-# property n_info:
-# def __get__(self):
-# if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
-# raise ValueError('Error unpacking BCFRecord')
-# return self.ptr.n_info
+ bcf_float_set(&self.ptr.qual, bcf_float_missing)
# property n_allele:
# def __get__(self):
# return self.ptr.n_allele
-# property n_fmt:
-# def __get__(self):
-# return self.ptr.n_fmt
-
# property n_sample:
# def __get__(self):
# return self.ptr.n_sample
-# property shared:
-# def __get__(self):
-# return self.ptr.shared.s
-
-# property indiv:
-# def __get__(self):
-# return self.ptr.indiv.s
-
-# property n_flt:
-# def __get__(self):
-# if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
-# raise ValueError('Error unpacking VariantRecord')
-# return self.ptr.d.n_flt
-
property id:
"""record identifier or None if not available"""
def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_STR) < 0:
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
- id = self.ptr.d.id
- return id if id != b'.' else None
+ return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
def __set__(self, id):
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
cdef char *idstr = NULL
if id is not None:
- idstr = id
+ bid = force_bytes(id)
+ idstr = bid
if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
raise ValueError('Error updating id')
property ref:
"""reference allele"""
def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_STR) < 0:
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
- return self.ptr.d.allele[0] if self.ptr.d.allele else None
+ return charptr_to_str(r.d.allele[0]) if r.d.allele else None
def __set__(self, ref):
- alleles = list(self.alleles)
- alleles[0] = ref
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ #FIXME: Set alleles directly -- this is stupid
+ if not ref:
+ raise ValueError('ref allele cannot be null')
+ ref = force_bytes(ref)
+ if r.d.allele and r.n_allele:
+ alleles = [r.d.allele[i] for i in range(r.n_allele)]
+ alleles[0] = ref
+ else:
+ alleles = [ref]
self.alleles = alleles
property alleles:
"""tuple of reference allele followed by alt alleles"""
def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_STR) < 0:
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
- if not self.ptr.d.allele:
+ if not r.d.allele:
return None
- return tuple(self.ptr.d.allele[i] for i in range(self.ptr.n_allele))
+ cdef tuple res = PyTuple_New(r.n_allele)
+ for i in range(r.n_allele):
+ a = charptr_to_str(r.d.allele[i])
+ PyTuple_SET_ITEM(res, i, a)
+ Py_INCREF(a)
+ return res
def __set__(self, values):
- if bcf_unpack(self.ptr, BCF_UN_STR) < 0:
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
- values = ','.join(values)
- if bcf_update_alleles_str(self.header.ptr, self.ptr, values) < 0:
+ values = [force_bytes(v) for v in values]
+ if b'' in values:
+ raise ValueError('cannot set null allele')
+ values = b','.join(values)
+ if bcf_update_alleles_str(self.header.ptr, r, values) < 0:
raise ValueError('Error updating alleles')
property alts:
"""tuple of alt alleles"""
def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_STR) < 0:
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
- if self.ptr.n_allele < 2 or not self.ptr.d.allele:
+ if r.n_allele < 2 or not r.d.allele:
return None
- return tuple(self.ptr.d.allele[i] for i in range(1,self.ptr.n_allele))
- def __set__(self, alts):
- alleles = [self.ref]
- alleles.extend(alts)
- self.alleles = alleles
+ cdef tuple res = PyTuple_New(r.n_allele - 1)
+ for i in range(1, r.n_allele):
+ a = charptr_to_str(r.d.allele[i])
+ PyTuple_SET_ITEM(res, i - 1, a)
+ Py_INCREF(a)
+ return res
+ def __set__(self, values):
+ #FIXME: Set alleles directly -- this is stupid
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ values = [force_bytes(v) for v in values]
+ if b'' in values:
+ raise ValueError('cannot set null alt allele')
+ ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
+ self.alleles = ref + values
property filter:
"""filter information (see :class:`VariantRecordFilter`)"""
@@ -1661,7 +2662,7 @@ cdef class VariantRecord(object):
property samples:
"""sample data (see :class:`VariantRecordSamples`)"""
def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_IND) < 0:
+ if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
return makeVariantRecordSamples(self)
@@ -1684,8 +2685,7 @@ cdef class VariantRecord(object):
# break
# line.l -= 1
- ret = line.s[:line.l]
- ret = force_str(ret)
+ ret = charptr_to_str_w_len(line.s, line.l)
if line.m:
free(line.s)
@@ -1715,7 +2715,8 @@ cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
cdef class VariantRecordSample(object):
"""Data for a single sample from a :class:`VariantRecord` object.
- Provides data accessors for genotypes and a mapping interface from format name to values.
+ Provides data accessors for genotypes and a mapping interface
+ from format name to values.
"""
property name:
@@ -1728,183 +2729,90 @@ cdef class VariantRecordSample(object):
if self.index < 0 or self.index >= n:
raise ValueError('invalid sample index')
- return hdr.samples[self.index]
+ return charptr_to_str(hdr.samples[self.index])
property allele_indices:
"""allele indices for called genotype, if present. Otherwise None"""
def __get__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
-
- if self.index < 0 or self.index >= n or not r.n_fmt:
- return None
-
- cdef bcf_fmt_t *fmt0 = r.d.fmt
- cdef int gt0 = is_gt_fmt(hdr, fmt0)
-
- if not gt0 or not fmt0.n:
- return None
-
- cdef int8_t *data8
- cdef int16_t *data16
- cdef int32_t *data32
- alleles = []
-
- if fmt0.type == BCF_BT_INT8:
- data8 = <int8_t *>(fmt0.p + self.index * fmt0.size)
- for i in range(fmt0.n):
- if data8[i] == bcf_int8_vector_end:
- break
- alleles.append( (data8[i] >> 1) - 1 )
- elif fmt0.type == BCF_BT_INT16:
- data16 = <int16_t *>(fmt0.p + self.index * fmt0.size)
- for i in range(fmt0.n):
- if data16[i] == bcf_int16_vector_end:
- break
- alleles.append( (data16[i] >> 1) - 1 )
- elif fmt0.type == BCF_BT_INT32:
- data32 = <int32_t *>(fmt0.p + self.index * fmt0.size)
- for i in range(fmt0.n):
- if data32[i] == bcf_int32_vector_end:
- break
- alleles.append( (data32[i] >> 1) - 1 )
-
- return tuple(alleles)
+ return bcf_format_get_allele_indices(self)
+ def __set__(self, values):
+ self['GT'] = values
+ def __del__(self):
+ self['GT'] = ()
property alleles:
"""alleles for called genotype, if present. Otherwise None"""
def __get__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int32_t nsamples = bcf_hdr_nsamples(hdr)
- cdef int32_t nalleles = r.n_allele
-
- if self.index < 0 or self.index >= nsamples or not r.n_fmt:
- return None
-
- cdef bcf_fmt_t *fmt0 = r.d.fmt
- cdef int gt0 = is_gt_fmt(hdr, fmt0)
-
- if not gt0 or not fmt0.n:
- return None
-
- cdef int32_t a
- cdef int8_t *data8
- cdef int16_t *data16
- cdef int32_t *data32
- alleles = []
-
- if fmt0.type == BCF_BT_INT8:
- data8 = <int8_t *>(fmt0.p + self.index * fmt0.size)
- for i in range(fmt0.n):
- if data8[i] == bcf_int8_vector_end:
- break
- a = (data8[i] >> 1) - 1
- alleles.append(r.d.allele[a] if 0 <= a < nalleles else None)
- elif fmt0.type == BCF_BT_INT16:
- data16 = <int16_t *>(fmt0.p + self.index * fmt0.size)
- for i in range(fmt0.n):
- if data16[i] == bcf_int16_vector_end:
- break
- a = (data16[i] >> 1) - 1
- alleles.append(r.d.allele[a] if 0 <= a < nalleles else None)
- elif fmt0.type == BCF_BT_INT32:
- data32 = <int32_t *>(fmt0.p + self.index * fmt0.size)
- for i in range(fmt0.n):
- if data32[i] == bcf_int32_vector_end:
- break
- a = (data32[i] >> 1) - 1
- alleles.append(r.d.allele[a] if 0 <= a < nalleles else None)
-
- return tuple(alleles)
+ return bcf_format_get_alleles(self)
+ def __set__(self, values):
+ self['GT'] = values
+ def __del__(self):
+ self['GT'] = ()
property phased:
"""False if genotype is missing or any allele is unphased. Otherwise True."""
def __get__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
+ return bcf_sample_get_phased(self)
+ def __set__(self, value):
+ bcf_sample_set_phased(self, value)
- if self.index < 0 or self.index >= n or not r.n_fmt:
- return False
+ def __len__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i, n = 0
- cdef bcf_fmt_t *fmt0 = r.d.fmt
- cdef int gt0 = is_gt_fmt(hdr, fmt0)
+ if bcf_unpack(r, BCF_UN_FMT) < 0:
+ raise ValueError('Error unpacking VariantRecord')
- if not gt0 or not fmt0.n:
- return False
+ for i in range(r.n_fmt):
+ if r.d.fmt[i].p:
+ n += 1
+ return n
- cdef int8_t *data8
- cdef int16_t *data16
- cdef int32_t *data32
-
- phased = False
-
- if fmt0.type == BCF_BT_INT8:
- data8 = <int8_t *>(fmt0.p + self.index * fmt0.size)
- for i in range(fmt0.n):
- if data8[i] == bcf_int8_vector_end:
- break
- if i and data8[i] & 1 == 0:
- return False
- phased = True
- elif fmt0.type == BCF_BT_INT16:
- data16 = <int16_t *>(fmt0.p + self.index * fmt0.size)
- for i in range(fmt0.n):
- if data16[i] == bcf_int16_vector_end:
- break
- if i and data16[i] & 1 == 0:
- return False
- phased = True
- elif fmt0.type == BCF_BT_INT32:
- data32 = <int32_t *>(fmt0.p + self.index * fmt0.size)
- for i in range(fmt0.n):
- if data32[i] == bcf_int32_vector_end:
- break
- if i and data32[i] & 1 == 0:
- return False
- phased = True
-
- return phased
+ def __bool__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i
- def __len__(self):
- return self.record.ptr.n_fmt
+ if bcf_unpack(r, BCF_UN_FMT) < 0:
+ raise ValueError('Error unpacking VariantRecord')
- def __bool__(self):
- return self.record.ptr.n_fmt != 0
+ for i in range(r.n_fmt):
+ if r.d.fmt[i].p:
+ return True
+ return False
def __getitem__(self, key):
+ return bcf_format_get_value(self, key)
+
+ def __setitem__(self, key, value):
+ bcf_format_set_value(self, key, value)
+
+ def __delitem__(self, key):
+ bcf_format_del_value(self, key)
+
+ def clear(self):
+ """Clear all format data (including genotype) for this sample"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_fmt_t *fmt
- cdef int index
-
- if isinstance(key, int):
- index = key
- if index < 0 or index >= r.n_fmt:
- raise IndexError('invalid format index')
- fmt = r.d.fmt + index
- else:
- fmt = bcf_get_fmt(hdr, r, key)
-
- if not fmt:
- raise KeyError('invalid format requested')
+ cdef int i
- if is_gt_fmt(hdr, fmt):
- return self.alleles
- elif fmt.p and fmt.n and fmt.size:
- return bcf_array_to_object(fmt.p + self.index * fmt.size, fmt.type, fmt.n, scalar=1)
- else:
- return None
+ for i in range(r.n_fmt):
+ fmt = &r.d.fmt[i]
+ if fmt.p:
+ bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
def __iter__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int i, n = r.n_fmt
+ cdef bcf_fmt_t *fmt
+ cdef int i
- for i in range(n):
- yield bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.fmt[i].id)
+ for i in range(r.n_fmt):
+ fmt = &r.d.fmt[i]
+ if r.d.fmt[i].p:
+ yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
@@ -1916,8 +2824,9 @@ cdef class VariantRecordSample(object):
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, key)
- return fmt != NULL
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+ return fmt != NULL and fmt.p != NULL
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
@@ -2155,14 +3064,18 @@ cdef class BCFIterator(BaseIterator):
if contig is not None or start is not None or stop is not None:
raise ValueError # FIXME
- cregion = region
+ bregion = force_bytes(region)
+ cregion = bregion
with nogil:
self.iter = bcf_itr_querys(index.ptr, bcf.header.ptr, cregion)
else:
if contig is None:
raise ValueError # FIXME
- rid = index.refmap.get(contig, -1)
+ try:
+ rid = index.refmap[contig]
+ except KeyError:
+ raise ValueError('Unknown contig specified')
if start is None:
start = 0
@@ -2314,7 +3227,7 @@ cdef class TabixIterator(BaseIterator):
cdef class VariantFile(object):
- """*(filename, mode=None, header=None, drop_samples=False)*
+ """*(filename, mode=None, index_filename=None, header=None, drop_samples=False)*
A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
opened.
@@ -2346,15 +3259,16 @@ cdef class VariantFile(object):
self.htsfile = NULL
def __init__(self, *args, **kwargs):
- self.header = None
- self.index = None
- self.filename = None
- self.mode = None
- self.is_stream = False
- self.is_remote = False
- self.is_reading = False
- self.drop_samples = False
- self.start_offset = -1
+ self.header = None
+ self.index = None
+ self.filename = None
+ self.mode = None
+ self.index_filename = None
+ self.is_stream = False
+ self.is_remote = False
+ self.is_reading = False
+ self.drop_samples = False
+ self.start_offset = -1
self.open(*args, **kwargs)
@@ -2371,7 +3285,8 @@ cdef class VariantFile(object):
return False
property category:
- """General file format category. One of UNKNOWN, ALIGNMENTS, VARIANTS, INDEX, REGIONS"""
+ """General file format category. One of UNKNOWN, ALIGNMENTS,
+ VARIANTS, INDEX, REGIONS"""
def __get__(self):
if not self.htsfile:
raise ValueError('metadata not available on closed file')
@@ -2379,7 +3294,9 @@ cdef class VariantFile(object):
property format:
"""File format.
- One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM, BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
+
+ One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM,
+ BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
"""
def __get__(self):
if not self.htsfile:
@@ -2391,10 +3308,13 @@ cdef class VariantFile(object):
def __get__(self):
if not self.htsfile:
raise ValueError('metadata not available on closed file')
- return self.htsfile.format.version.major, self.htsfile.format.version.minor
+ return (self.htsfile.format.version.major,
+ self.htsfile.format.version.minor)
property compression:
- """File compression. One of NONE, GZIP, BGZF, CUSTOM."""
+ """File compression.
+
+ One of NONE, GZIP, BGZF, CUSTOM."""
def __get__(self):
if not self.htsfile:
raise ValueError('metadata not available on closed file')
@@ -2407,7 +3327,7 @@ cdef class VariantFile(object):
raise ValueError('metadata not available on closed file')
cdef char *desc = hts_format_description(&self.htsfile.format)
try:
- return force_str(desc)
+ return charptr_to_str(desc)
finally:
free(desc)
@@ -2427,8 +3347,9 @@ cdef class VariantFile(object):
if not self.is_open:
raise ValueError('I/O operation on closed file')
- if self.mode[0] != b'r':
- raise ValueError('cannot iterate over Variantfile opened for writing')
+ if not self.mode.startswith(b'r'):
+ raise ValueError(
+ 'cannot iterate over Variantfile opened for writing')
self.is_reading = 1
return self
@@ -2461,7 +3382,8 @@ cdef class VariantFile(object):
cdef VariantFile vars = VariantFile.__new__(VariantFile)
cdef bcf_hdr_t *hdr
- cdef char *cfilename, *cmode
+ cdef char *cfilename
+ cdef char *cmode
# FIXME: re-open using fd or else header and index could be invalid
cfilename, cmode = self.filename, self.mode
@@ -2473,16 +3395,17 @@ cdef class VariantFile(object):
# minimize overhead by re-using header and index. This approach is
# currently risky, but see above for how this can be mitigated.
- vars.header = self.header
- vars.index = self.index
-
- vars.filename = self.filename
- vars.mode = self.mode
- vars.drop_samples = self.drop_samples
- vars.is_stream = self.is_stream
- vars.is_remote = self.is_remote
- vars.is_reading = self.is_reading
- vars.start_offset = self.start_offset
+ vars.header = self.header
+ vars.index = self.index
+
+ vars.filename = self.filename
+ vars.mode = self.mode
+ vars.index_filename = self.index_filename
+ vars.drop_samples = self.drop_samples
+ vars.is_stream = self.is_stream
+ vars.is_remote = self.is_remote
+ vars.is_reading = self.is_reading
+ vars.start_offset = self.start_offset
if self.htsfile.is_bin:
vars.seek(self.tell())
@@ -2493,51 +3416,50 @@ cdef class VariantFile(object):
return vars
- def open(self, filename, mode=None, VariantHeader header=None, drop_samples=False):
+ def open(self, filename, mode='rb',
+ index_filename=None,
+ VariantHeader header=None,
+ drop_samples=False):
"""open a vcf/bcf file.
If open is called on an existing VariantFile, the current file will be
closed and a new file will be opened.
"""
cdef bcf_hdr_t *hdr
+ cdef BGZF *bgzfp
cdef hts_idx_t *idx
cdef tbx_t *tidx
- cdef char *cfilename, *cmode
+ cdef char *cfilename
+ cdef char *cindex_filename = NULL
+ cdef char *cmode
# close a previously opened file
if self.is_open:
self.close()
- # read mode autodetection
- if mode is None:
- try:
- self.open(filename, 'rb', header=header)
- return
- except ValueError, msg:
- pass
-
- self.open(filename, 'r', header=header)
- return
-
if mode not in ('r','w','rb','wb', 'wh', 'wbu', 'rU', 'wb0'):
raise ValueError('invalid file opening mode `{}`'.format(mode))
- mode = mode.encode('ascii')
-
# for htslib, wbu seems to not work
- if mode == b'wbu':
- mode = b'wb0'
+ if mode == 'wbu':
+ mode = 'wb0'
- self.mode = mode
+ self.mode = mode = force_bytes(mode)
self.filename = filename = encode_filename(filename)
+ if index_filename is not None:
+ self.index_filename = index_filename = encode_filename(index_filename)
+ else:
+ self.index_filename = None
self.drop_samples = bool(drop_samples)
+ self.header = None
- # FIXME: Use htsFormat when it is available
- self.is_remote = filename.startswith(b'http:') or filename.startswith(b'ftp:')
+ self.is_remote = hisremote(filename)
self.is_stream = filename == b'-'
- if mode[0] == b'w':
+ if mode.startswith(b'w'):
# open file for writing
+ if index_filename is not None:
+ raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')
# header structure (used for writing)
if header:
@@ -2545,8 +3467,9 @@ cdef class VariantFile(object):
else:
raise ValueError('a VariantHeader must be specified')
- # open file. Header gets written to file at the same time for bam files
- # and sam files (in the latter case, the mode needs to be wh)
+ # open file. Header gets written to file at the same time
+ # for bam files and sam files (in the latter case, the
+ # mode needs to be wh)
cfilename, cmode = filename, mode
with nogil:
self.htsfile = hts_open(cfilename, cmode)
@@ -2557,7 +3480,7 @@ cdef class VariantFile(object):
with nogil:
bcf_hdr_write(self.htsfile, self.header.ptr)
- elif mode[0] == b'r':
+ elif mode.startswith(b'r'):
# open file for reading
if filename != b'-' and not self.is_remote and not os.path.exists(filename):
raise IOError('file `{}` not found'.format(filename))
@@ -2567,52 +3490,65 @@ cdef class VariantFile(object):
self.htsfile = hts_open(cfilename, cmode)
if not self.htsfile:
- raise ValueError("could not open file `{}` (mode='{}') - is it VCF/BCF format?".format((filename, mode)))
+ raise ValueError("could not open file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+
+ if self.htsfile.format.format not in (bcf, vcf):
+ raise ValueError("invalid file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+
+ if self.htsfile.format.compression == bgzf:
+ bgzfp = hts_get_bgzfp(self.htsfile)
+ if bgzfp and bgzf_check_EOF(bgzfp) == 0:
+ warn('[{}] Warning: no BGZF EOF marker; file may be truncated'.format(filename))
with nogil:
hdr = bcf_hdr_read(self.htsfile)
- self.header = makeVariantHeader(hdr)
- if not self.header:
- raise ValueError("file `{}` does not have valid header (mode='{}') - is it BCF format?".format((filename, mode)))
+ try:
+ self.header = makeVariantHeader(hdr)
+ except ValueError:
+ raise ValueError("file `{}` does not have valid header (mode='{}') - is it VCF/BCF format?".format(filename, mode))
# check for index and open if present
if self.htsfile.format.format == bcf:
- cfilename = filename
+ if index_filename is not None:
+ cindex_filename = index_filename
with nogil:
- idx = bcf_index_load(cfilename)
+ idx = bcf_index_load2(cfilename, cindex_filename)
self.index = makeBCFIndex(self.header, idx)
- else:
- tabix_filename = filename + '.tbi'
- cfilename = tabix_filename
+
+ elif self.htsfile.format.compression == bgzf:
+ if index_filename is not None:
+ cindex_filename = index_filename
with nogil:
- tidx = tbx_index_load(cfilename)
+ tidx = tbx_index_load2(cfilename, cindex_filename)
self.index = makeTabixIndex(tidx)
if not self.is_stream:
self.start_offset = self.tell()
+ else:
+ raise ValueError("unknown mode {}".format(mode))
def reset(self):
"""reset file position to beginning of file just after the header."""
return self.seek(self.start_offset, 0)
def seek(self, uint64_t offset):
- """move file pointer to position *offset*, see :meth:`pysam.VariantFile.tell`."""
+ """move file pointer to position *offset*, see
+ :meth:`pysam.VariantFile.tell`."""
if not self.is_open:
raise ValueError('I/O operation on closed file')
if self.is_stream:
raise OSError('seek not available in streams')
- cdef int ret
+ cdef int64_t ret
if self.htsfile.format.compression != no_compression:
with nogil:
ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
else:
with nogil:
- ret = hts_useek(self.htsfile, offset, SEEK_SET)
+ ret = hts_useek(self.htsfile, <int>offset, SEEK_SET)
return ret
-
def tell(self):
"""return current file position, see :meth:`pysam.VariantFile.seek`."""
if not self.is_open:
@@ -2620,7 +3556,7 @@ cdef class VariantFile(object):
if self.is_stream:
raise OSError('tell not available in streams')
- cdef int ret
+ cdef int64_t ret
if self.htsfile.format.compression != no_compression:
with nogil:
ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
@@ -2646,14 +3582,16 @@ cdef class VariantFile(object):
If only *contig* is set, all records on *contig* will be fetched.
If both *region* and *contig* are given, an exception is raised.
- Note that a :term:`VCF` file without a tabix index (.tbi) or a
- :term:`BCF` file without a CSI index can only be read sequentially.
+ Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
+ (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
+ read sequentially.
"""
if not self.is_open:
raise ValueError('I/O operation on closed file')
- if self.mode[0] != b'r':
- raise ValueError('cannot fetch from Variantfile opened for writing')
+ if not self.mode.startswith(b'r'):
+ raise ValueError('cannot fetch from Variantfile opened '
+ 'for writing')
if contig is None and region is None:
self.is_reading = 1
@@ -2674,7 +3612,13 @@ cdef class VariantFile(object):
returns the number of bytes written.
"""
if not self.is_open:
- return 0
+ raise ValueError('I/O operation on closed file')
+
+ if not self.mode.startswith(b'w'):
+ raise ValueError('cannot write to a Variantfile opened for reading')
+
+ #if record.header is not self.header:
+ # raise ValueError('Writing records from a different VariantFile is not yet supported')
cdef int ret
@@ -2694,8 +3638,9 @@ cdef class VariantFile(object):
if not self.is_open:
raise ValueError('I/O operation on closed file')
- if self.mode[0] != b'r':
- raise ValueError('cannot subset samples from Variantfile opened for writing')
+ if not self.mode.startswith(b'r'):
+ raise ValueError('cannot subset samples from Variantfile '
+ 'opened for writing')
if self.is_reading:
raise ValueError('cannot subset samples after fetching records')
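
Taken together, the rewritten open() means the index no longer has to sit next to the data file: a .tbi/.csi can be named explicitly via index_filename when reading, and fetch() then uses it for random access. A usage sketch with placeholder paths:

    import pysam

    # Paths are placeholders; index_filename is only valid when reading.
    vf = pysam.VariantFile("calls.vcf.gz", "r",
                           index_filename="/elsewhere/calls.vcf.gz.tbi")
    for rec in vf.fetch("chr1", 10000, 20000):   # random access via the index
        print(rec.contig, rec.pos, rec.ref, rec.alts)
    vf.close()
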
diff --git a/pysam/cfaidx.pxd b/pysam/cfaidx.pxd
index 34e825e..d3aff09 100644
--- a/pysam/cfaidx.pxd
+++ b/pysam/cfaidx.pxd
@@ -38,6 +38,7 @@ cdef extern from "pysam_stream.h" nogil:
int * dret)
cdef class FastaFile:
+ cdef bint is_remote
cdef object _filename, _references, _lengths, reference2length
cdef faidx_t* fastafile
cdef char* _fetch(self, char* reference,
diff --git a/pysam/cfaidx.pyx b/pysam/cfaidx.pyx
index a1dc488..4db754e 100644
--- a/pysam/cfaidx.pyx
+++ b/pysam/cfaidx.pyx
@@ -1,4 +1,4 @@
- # cython: embedsignature=True
+# cython: embedsignature=True
# cython: profile=True
###############################################################################
###############################################################################
@@ -47,6 +47,7 @@
###############################################################################
import sys
import os
+import re
from cpython cimport array
from cpython cimport PyErr_SetString, \
@@ -59,7 +60,7 @@ from cpython.version cimport PY_MAJOR_VERSION
from pysam.chtslib cimport \
faidx_nseq, fai_load, fai_destroy, fai_fetch, \
faidx_seq_len, \
- faidx_fetch_seq, gzopen, gzclose
+ faidx_fetch_seq, gzopen, gzclose, hisremote
from pysam.cutils cimport force_bytes, force_str, charptr_to_str
from pysam.cutils cimport encode_filename, from_string_and_size
@@ -88,9 +89,13 @@ cdef class FastaFile:
filename : string
Filename of fasta file to be opened.
+ filepath_index : string
+ Optional, filename of the index. By default this is
+ the filename + ".fai".
+
Raises
------
-
+
ValueError
if index file is missing
@@ -117,7 +122,7 @@ cdef class FastaFile:
return faidx_nseq(self.fastafile)
- def _open(self, filename):
+ def _open(self, filename, filepath_index=None):
'''open an indexed fasta file.
This method expects an indexed fasta file.
@@ -126,20 +131,35 @@ cdef class FastaFile:
# close a previously opened file
if self.fastafile != NULL:
self.close()
+
self._filename = encode_filename(filename)
cdef char *cfilename = self._filename
+ self.is_remote = hisremote(cfilename)
+
+ # open file for reading
+ if (self._filename != b"-"
+ and not self.is_remote
+ and not os.path.exists(filename)):
+ raise IOError("file `%s` not found" % filename)
+
with nogil:
self.fastafile = fai_load(cfilename)
if self.fastafile == NULL:
raise IOError("could not open file `%s`" % filename)
- # read index
- if not os.path.exists(self._filename + b".fai"):
- raise ValueError("could not locate index file")
+ if self.is_remote:
+ filepath_index = os.path.basename(
+ re.sub("[^:]+:[/]*", "", filename)) + ".fai"
+ elif filepath_index is None:
+ filepath_index = filename + ".fai"
+
+ if not os.path.exists(filepath_index):
+ raise ValueError("could not locate index file {}".format(
+ filepath_index))
- with open( self._filename + b".fai" ) as inf:
- data = [ x.split("\t") for x in inf ]
+ with open(filepath_index) as inf:
+ data = [x.split("\t") for x in inf]
self._references = tuple(x[0] for x in data)
self._lengths = tuple(int(x[1]) for x in data)
self.reference2length = dict(zip(self._references, self._lengths))
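
The reworked _open() accepts an explicit index path and derives one automatically for remote URLs. A sketch with placeholder filenames; whether the public FastaFile constructor forwards filepath_index depends on how it dispatches to _open(), so treat the keyword as illustrative:

    import pysam

    # Placeholder paths; 'ref.fa.fai' is a samtools-faidx index that may
    # live somewhere other than next to the fasta file.
    fa = pysam.FastaFile("ref.fa", filepath_index="ref.fa.fai")
    seq = fa.fetch("chr1", 0, 100)     # 0-based, half-open interval
    print(len(seq), fa.references[:3], fa.lengths[:3])
    fa.close()
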
@@ -162,8 +182,8 @@ cdef class FastaFile:
return False
property closed:
- """"bool indicating the current state of the file object.
- This is a read-only attribute; the close() method changes the value.
+ """"bool indicating the current state of the file object.
+ This is a read-only attribute; the close() method changes the value.
"""
def __get__(self):
return not self.is_open()
@@ -203,14 +223,14 @@ cdef class FastaFile:
Alternatively, a samtools :term:`region` string can be
supplied.
-
+
If any of the coordinates are missing they will be replaced by the
minimum (`start`) or maximum (`end`) coordinate.
Note that region strings are 1-based, while `start` and `end` denote
an interval in python coordinates.
The region is specified by :term:`reference`, `start` and `end`.
-
+
Returns
-------
@@ -221,7 +241,7 @@ cdef class FastaFile:
IndexError
if the coordinates are out of range
-
+
ValueError
if the region is invalid
@@ -244,7 +264,8 @@ cdef class FastaFile:
return ""
ref = reference
- length = faidx_seq_len(self.fastafile, ref)
+ with nogil:
+ length = faidx_seq_len(self.fastafile, ref)
if length == -1:
raise KeyError("sequence '%s' not present" % reference)
if rstart >= length:
@@ -290,13 +311,16 @@ cdef class FastaFile:
cdef class FastqProxy:
+ """A single entry in a fastq file."""
def __init__(self): pass
property name:
+ """The name of each entry in the fastq file."""
def __get__(self):
return charptr_to_str(self._delegate.name.s)
property sequence:
+ """The sequence of each entry in the fastq file."""
def __get__(self):
return charptr_to_str(self._delegate.seq.s)
@@ -308,6 +332,7 @@ cdef class FastqProxy:
return None
property quality:
+ """The quality score of each entry in the fastq file, represented as a string."""
def __get__(self):
if self._delegate.qual.l:
return charptr_to_str(self._delegate.qual.s)
@@ -330,7 +355,7 @@ cdef class FastqProxy:
return self.tostring()
cpdef array.array get_quality_array(self, int offset=33):
- '''return quality values as array after subtracting offset.'''
+ '''return quality values as integer array after subtracting offset.'''
if self.quality is None:
return None
return qualitystring_to_array(force_bytes(self.quality),
@@ -388,19 +413,33 @@ cdef class FastxFile:
filename : string
Filename of fasta/fastq file to be opened.
- persist : bool
+ persist : bool
If True (default) make a copy of the entry in the file during
iteration. If set to False, no copy will be made. This will
permit faster iteration, but an entry will not persist when
the iteration continues.
-
+
+ Notes
+ -----
+ Prior to version 0.8.2, this was called FastqFile.
+
Raises
------
-
+
IOError
if file could not be opened
+
+ Examples
+ --------
+ >>> with pysam.FastxFile(filename) as fh:
+ ... for entry in fh:
+ ... print(entry.name)
+ ... print(entry.sequence)
+ ... print(entry.comment)
+ ... print(entry.quality)
+
"""
def __cinit__(self, *args, **kwargs):
# self.fastqfile = <gzFile*>NULL
@@ -432,8 +471,8 @@ cdef class FastxFile:
self.persist = persist
- filename = encode_filename(filename)
- cdef char *cfilename = filename
+ self._filename = encode_filename(filename)
+ cdef char *cfilename = self._filename
with nogil:
self.fastqfile = gzopen(cfilename, "r")
self.entry = kseq_init(self.fastqfile)
@@ -446,13 +485,21 @@ cdef class FastxFile:
if self.entry:
kseq_destroy(self.entry)
self.entry = NULL
-
+
def __dealloc__(self):
self.close()
+ # context manager interface
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+ return False
+
property closed:
- """"bool indicating the current state of the file object.
- This is a read-only attribute; the close() method changes the value.
+ """"bool indicating the current state of the file object.
+ This is a read-only attribute; the close() method changes the value.
"""
def __get__(self):
return not self.is_open()
@@ -483,7 +530,7 @@ cdef class FastxFile:
cdef int l
with nogil:
l = kseq_read(self.entry)
- if (l > 0):
+ if (l >= 0):
if self.persist:
return PersistentFastqProxy(makeFastqProxy(self.entry))
return makeFastqProxy(self.entry)
@@ -492,15 +539,15 @@ cdef class FastxFile:
# Compatibility Layer for pysam 0.8.1
cdef class FastqFile(FastxFile):
+ """FastqFile is deprecated: use FastxFile instead"""
pass
# Compatibility Layer for pysam < 0.8
cdef class Fastafile(FastaFile):
+ """Fastafile is deprecated: use FastaFile instead"""
pass
__all__ = ["FastaFile",
"FastqFile",
"FastxFile",
"Fastafile"]
-
-
diff --git a/pysam/chtslib.pxd b/pysam/chtslib.pxd
index 299e84a..0cee075 100644
--- a/pysam/chtslib.pxd
+++ b/pysam/chtslib.pxd
@@ -28,7 +28,11 @@ cdef extern from "htslib/kstring.h" nogil:
size_t l, m
char *s
+
cdef extern from "htslib_util.h" nogil:
+ int hts_set_verbosity(int verbosity)
+ int hts_get_verbosity()
+
ctypedef uint32_t khint32_t
ctypedef uint32_t khint_t
ctypedef khint_t khiter_t
@@ -148,8 +152,8 @@ cdef extern from "htslib/bgzf.h" nogil:
ctypedef struct z_stream
ctypedef struct BGZF:
- int errcode
- int is_write
+ unsigned errcode
+ unsigned is_write
int is_be
int compress_level
int is_compressed
@@ -308,6 +312,18 @@ cdef extern from "htslib/bgzf.h" nogil:
int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
+ # Compress a single BGZF block.
+ #
+ # @param dst output buffer (must have size >= BGZF_MAX_BLOCK_SIZE)
+ # @param dlen size of output buffer; updated on return to the number
+ # of bytes actually written to dst
+ # @param src buffer to be compressed
+ # @param slen size of data to compress (must be <= BGZF_BLOCK_SIZE)
+ # @param level compression level
+ # @return 0 on success and negative on error
+ #
+ int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level)
+
#*******************
# bgzidx routines *
# BGZF at the uncompressed offset
@@ -357,13 +373,13 @@ cdef extern from "htslib/hts.h" nogil:
ctypedef struct cram_fd
- ctypedef union FilePointerUnion:
+ union FilePointerUnion:
BGZF *bgzf
cram_fd *cram
hFILE *hfile
void *voidp
- cdef enum htsFormatCategory:
+ enum htsFormatCategory:
unknown_category
sequence_data # Sequence data -- SAM, BAM, CRAM, etc
variant_data # Variant calling data -- VCF, BCF, etc
@@ -371,18 +387,18 @@ cdef extern from "htslib/hts.h" nogil:
region_list # Coordinate intervals or regions -- BED, etc
category_maximum
- cdef enum htsExactFormat:
+ enum htsExactFormat:
unknown_format
binary_format
text_format
sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed
format_maximum
- cdef enum htsCompression:
+ enum htsCompression:
no_compression, gzip, bgzf, custom
compression_maximum
- cdef struct htsVersion:
+ ctypedef struct htsVersion:
short major, minor
ctypedef struct htsFormat:
@@ -390,6 +406,8 @@ cdef extern from "htslib/hts.h" nogil:
htsExactFormat format
htsVersion version
htsCompression compression
+ short compression_level
+ void *specific
ctypedef struct htsFile:
uint8_t is_bin
@@ -436,7 +454,7 @@ cdef extern from "htslib/hts.h" nogil:
# @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
# @param fn The file name or "-" for stdin/stdout
- # @param mode Mode matching /[rwa][bcuz0-9]+/
+ # @param mode Mode matching / [rwa][bceguxz0-9]* /
# @discussion
# With 'r' opens for reading; any further format mode letters are ignored
# as the format is detected by checking the first few bytes or BGZF blocks
@@ -448,16 +466,33 @@ cdef extern from "htslib/hts.h" nogil:
# u uncompressed
# z bgzf compressed
# [0-9] zlib compression level
+ # and with non-format option letters (for any of 'r'/'w'/'a'):
+ # e close the file on exec(2) (opens with O_CLOEXEC, where supported)
+ # x create the file exclusively (opens with O_EXCL, where supported)
# Note that there is a distinction between 'u' and '0': the first yields
# plain uncompressed output whereas the latter outputs uncompressed data
# wrapped in the zlib format.
# @example
- # [rw]b .. compressed BCF, BAM, FAI
- # [rw]u .. uncompressed BCF
- # [rw]z .. compressed VCF
- # [rw] .. uncompressed VCF
+ # [rw]b .. compressed BCF, BAM, FAI
+ # [rw]bu .. uncompressed BCF
+ # [rw]z .. compressed VCF
+ # [rw] .. uncompressed VCF
htsFile *hts_open(const char *fn, const char *mode)
+ # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
+ # @param fn The file name or "-" for stdin/stdout
+ # @param mode Open mode, as per hts_open()
+ # @param fmt Optional format specific parameters
+ # @discussion
+ # See hts_open() for description of fn and mode.
+ # // TODO Update documentation for s/opts/fmt/
+ # Opts contains a format string (sam, bam, cram, vcf, bcf) which will,
+ # if defined, override mode. Opts also contains a linked list of hts_opt
+ # structures to apply to the open file handle. These can contain things
+ # like pointers to the reference or information on compression levels,
+ # block sizes, etc.
+ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
+
# @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
# @param fp The already-open file handle
# @param fn The file name or "-" for stdin/stdout
@@ -474,12 +509,17 @@ cdef extern from "htslib/hts.h" nogil:
# @return Read-only pointer to the file's htsFormat.
const htsFormat *hts_get_format(htsFile *fp)
+ # @ abstract Returns a string containing the file format extension.
+ # @ param format Format structure containing the file type.
+ # @ return A string ("sam", "bam", etc) or "?" for unknown formats.
+ const char *hts_format_file_extension(const htsFormat *format)
+
# @abstract Sets a specified CRAM option on the open file handle.
# @param fp The file handle open the open file.
# @param opt The CRAM_OPT_* option.
# @param ... Optional arguments, dependent on the option used.
# @return 0 for success, or negative if an error occurred.
- #int hts_set_opt(htsFile *fp, enum cram_option opt, ...)
+ #int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...)
int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
char **hts_readlines(const char *fn, int *_n)
@@ -546,9 +586,34 @@ cdef extern from "htslib/hts.h" nogil:
int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
- void hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
+ #### Save an index to a file
+ # @param idx Index to be written
+ # @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added
+ # @param fmt One of the HTS_FMT_* index formats
+ # @return 0 if successful, or negative if an error occurred.
+ int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
+
+ #### Save an index to a specific file
+ # @param idx Index to be written
+ # @param fn Input BAM/BCF/etc filename
+ # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
+ # @param fmt One of the HTS_FMT_* index formats
+ # @return 0 if successful, or negative if an error occurred.
+ int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
+
+ #### Load an index file
+ # @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or
+ # the extension substituted, to search for an existing index file
+ # @param fmt One of the HTS_FMT_* index formats
+ # @return The index, or NULL if an error occurred.
hts_idx_t *hts_idx_load(const char *fn, int fmt)
+ #### Load a specific index file
+ # @param fn Input BAM/BCF/etc filename
+ # @param fnidx The input index filename
+ # @return The index, or NULL if an error occurred.
+ hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
+
uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta)
void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
@@ -557,7 +622,29 @@ cdef extern from "htslib/hts.h" nogil:
uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
- const char *hts_parse_reg(const char *s, int *beg, int *end)
+ int HTS_PARSE_THOUSANDS_SEP # Ignore ',' separators within numbers
+
+ # Parse a numeric string
+ # The number may be expressed in scientific notation, and optionally may
+ # contain commas in the integer part (before any decimal point or E notation).
+ # @param str String to be parsed
+ # @param strend If non-NULL, set on return to point to the first character
+ # in @a str after those forming the parsed number
+ # @param flags Or'ed-together combination of HTS_PARSE_* flags
+ # @return Converted value of the parsed number.
+ #
+ # When @a strend is NULL, a warning will be printed (if hts_verbose is 2
+ # or more) if there are any trailing characters after the number.
+ long long hts_parse_decimal(const char *str, char **strend, int flags)
+
+ # Parse a "CHR:START-END"-style region string
+ # @param str String to be parsed
+ # @param beg Set on return to the 0-based start of the region
+ # @param end Set on return to the 1-based end of the region
+ # @return Pointer to the colon or '\0' after the reference sequence name,
+ # or NULL if @a str could not be parsed.
+ const char *hts_parse_reg(const char *str, int *beg, int *end)
+
hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
void hts_itr_destroy(hts_itr_t *iter)
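
The new comments pin down the coordinate convention: hts_parse_reg() yields a 0-based beg and a 1-based (exclusive) end for a "CHR:START-END" string, and hts_parse_decimal() may skip ',' thousands separators. A rough Python analogue of that convention, not the actual C parser (seq_len is a stand-in for the unbounded end):

    def parse_region(region, seq_len):
        """Rough analogue of the convention documented above:
        'chr1:100-200' -> ('chr1', 99, 200)."""
        name, _, span = region.partition(':')
        if not span:
            return name, 0, seq_len
        start_s, _, end_s = span.partition('-')
        beg = int(start_s.replace(',', '')) - 1   # 1-based start -> 0-based beg
        end = int(end_s.replace(',', '')) if end_s else seq_len
        return name, beg, end

    assert parse_region("chr1:100-200", 1000) == ("chr1", 99, 200)
    assert parse_region("chr1", 1000) == ("chr1", 0, 1000)
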
@@ -837,13 +924,38 @@ cdef extern from "htslib/sam.h" nogil:
hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
- # Load .csi or .bai BAM index file.
- hts_idx_t *bam_idx_load(const char *fn)
+ # Load/build .csi or .bai BAM index file. Does not work with CRAM.
+ # It is recommended to use the sam_index_* functions below instead.
+ hts_idx_t *bam_index_load(const char *fn)
int bam_index_build(const char *fn, int min_shift)
- # Load BAM (.csi or .bai) or CRAM (.crai) index file.
+ # Load a BAM (.csi or .bai) or CRAM (.crai) index file
+ # @param fp File handle of the data file whose index is being opened
+ # @param fn BAM/CRAM/etc filename to search alongside for the index file
+ # @return The index, or NULL if an error occurred.
hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
+ # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file
+ # @param fp File handle of the data file whose index is being opened
+ # @param fn BAM/CRAM/etc data file filename
+ # @param fnidx Index filename, or NULL to search alongside @a fn
+ # @return The index, or NULL if an error occurred.
+ hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
+
+ # Generate and save an index file
+ # @param fn Input BAM/etc filename, to which .csi/etc will be added
+ # @param min_shift Positive to generate CSI, or 0 to generate BAI
+ # @return 0 if successful, or negative if an error occurred (usually -1; or
+ # -2: opening fn failed; -3: format not indexable)
+ int sam_index_build(const char *fn, int min_shift)
+
+ # Generate and save an index to a specific file
+ # @param fn Input BAM/CRAM/etc filename
+ # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
+ # @param min_shift Positive to generate CSI, or 0 to generate BAI
+ # @return 0 if successful, or negative if an error occurred.
+ int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
+
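At the Python level the same indexing operation is reachable through the samtools dispatcher added later in this commit (pysam/samtools.py); a minimal sketch, assuming a file ex.bam exists:

    from pysam import samtools

    # equivalent to "samtools index ex.bam"; writes ex.bam.bai next to the input
    samtools.index("ex.bam")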
void sam_itr_destroy(hts_itr_t *iter)
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
@@ -854,8 +966,16 @@ cdef extern from "htslib/sam.h" nogil:
#***************
htsFile *sam_open(const char *fn, const char *mode)
+ htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt)
int sam_close(htsFile *fp)
+ int sam_open_mode(char *mode, const char *fn, const char *format)
+
+ # A version of sam_open_mode that can handle ,key=value options.
+ # The format string is allocated and returned, to be freed by the caller.
+ # Prefix should be "r" or "w".
+ char *sam_open_mode_opts(const char *fn, const char *mode, const char *format)
+
bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
bam_hdr_t *sam_hdr_read(htsFile *fp)
int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
@@ -1012,11 +1132,11 @@ cdef extern from "htslib/tbx.h" nogil:
int tbx_name2id(tbx_t *tbx, char *ss)
- int tbx_index_build(char *fn,
- int min_shift,
- tbx_conf_t *conf)
+ int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf)
+ int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
tbx_t * tbx_index_load(char *fn)
+ tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
# free the array but not the values
char **tbx_seqnames(tbx_t *tbx, int *n)
@@ -1081,7 +1201,7 @@ cdef extern from "htslib/vcf.h" nogil:
const bcf_idinfo_t *val
ctypedef struct bcf_hdr_t:
- int32_t n[3]
+ int32_t n[3] # n: the size of the dictionary block in use (the allocated size, m, is stored below to preserve ABI)
bcf_idpair_t *id[3]
void *dict[3] # ID dictionary, contig dict and sample dict
char **samples
@@ -1092,6 +1212,7 @@ cdef extern from "htslib/vcf.h" nogil:
int nsamples_ori # for bcf_hdr_set_samples()
uint8_t *keep_samples
kstring_t mem
+ int32_t m[3] # m: allocated size of the dictionary block in use (see n above)
uint8_t bcf_type_shift[]
@@ -1121,7 +1242,7 @@ cdef extern from "htslib/vcf.h" nogil:
uint32_t p_off
uint8_t p_free
- ctypedef union bcf_info_union_t:
+ union bcf_info_union_t:
int32_t i # integer value
float f # float value
@@ -1159,6 +1280,7 @@ cdef extern from "htslib/vcf.h" nogil:
uint8_t BCF_ERR_CTG_UNDEF
uint8_t BCF_ERR_TAG_UNDEF
uint8_t BCF_ERR_NCOLS
+ uint8_t BCF_ERR_LIMITS
# The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file
# is slower because the string is first to be parsed, packed into BCF line
@@ -1229,7 +1351,7 @@ cdef extern from "htslib/vcf.h" nogil:
# ^LIST|FILE .. exclude samples from list/file
# - .. include all samples
# NULL .. exclude all samples
- # @is_file: @samples is a file (1) or a comma-separated list (1)
+ # @is_file: @samples is a file (1) or a comma-separated list (0)
#
# The bottleneck of VCF reading is parsing of genotype fields. If the
# reader knows in advance that only subset of samples is needed (possibly
@@ -1288,7 +1410,7 @@ cdef extern from "htslib/vcf.h" nogil:
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
# bcf_write() - write one VCF or BCF record. The type is determined at the open() call.
- int bcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v)
# The following functions work only with VCFs and should rarely be called
# directly. Usually one wants to use their bcf_* alternatives, which work
@@ -1311,6 +1433,20 @@ cdef extern from "htslib/vcf.h" nogil:
# # todo
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
+ # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate()
+ # @param dst: the destination header to be merged into, NULL on the first pass
+ # @param src: the source header
+ #
+ # Notes:
+ # - use as:
+ # bcf_hdr_t *dst = NULL;
+ # for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]);
+ #
+ # - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when
+ # combining multiple BCF headers. The current bcf_hdr_combine()
+ # does not have this problem, but became slow when used for many files.
+ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
+
# bcf_hdr_add_sample() - add a new sample.
# @param sample: sample name to be added
int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample)
@@ -1333,7 +1469,7 @@ cdef extern from "htslib/vcf.h" nogil:
# bcf_hdr_remove() - remove VCF header tag
# @param type: one of BCF_HL_*
- # @param key: tag name
+ # @param key: tag name or NULL to remove all tags of the given type
void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key)
# bcf_hdr_subset() - creates a new copy of the header removing unwanted samples
@@ -1396,18 +1532,18 @@ cdef extern from "htslib/vcf.h" nogil:
int bcf_is_snp(bcf1_t *v)
# bcf_update_filter() - sets the FILTER column
- # @flt_ids: The filter IDs to set, numeric IDs returned by bcf_id2int(hdr, BCF_DT_ID, "PASS")
+ # @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
# @n: Number of filters. If n==0, all filters are removed
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
# bcf_add_filter() - adds to the FILTER column
- # @flt_id: filter ID to add, numeric ID returned by bcf_id2int(hdr, BCF_DT_ID, "PASS")
+ # @flt_id: filter ID to add, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
#
# If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed.
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
# bcf_remove_filter() - removes from the FILTER column
- # @flt_id: filter ID to remove, numeric ID returned by bcf_id2int(hdr, BCF_DT_ID, "PASS")
+ # @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
# @pass: when set to 1 and no filters are present, set to PASS
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int set_pass)
@@ -1418,15 +1554,13 @@ cdef extern from "htslib/vcf.h" nogil:
# @alleles: Array of alleles
# @nals: Number of alleles
# @alleles_string: Comma-separated alleles, starting with the REF allele
- #
- # Not that in order for indexing to work correctly in presence of INFO/END tag,
- # the length of reference allele (line->rlen) must be set explicitly by the caller,
- # or otherwise, if rlen is zero, strlen(line->d.allele[0]) is used to set the length
- # on bcf_write().
- #
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
+
+ # bcf_update_id() - sets new ID string
+ # bcf_add_id() - adds to the ID string checking for duplicates
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
+ int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
# bcf_update_info_*() - functions for updating INFO fields
# @hdr: the BCF header
@@ -1579,7 +1713,7 @@ cdef extern from "htslib/vcf.h" nogil:
#
# bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t
# @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT
- # @int_id: return value of bcf_id2int, must be >=0
+ # @int_id: return value of bcf_hdr_id2int, must be >=0
#
# The returned values are:
# bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_*
@@ -1612,7 +1746,9 @@ cdef extern from "htslib/vcf.h" nogil:
# both indexed BCFs and VCFs.
#************************************************************************
+ hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
int bcf_index_build(const char *fn, int min_shift)
+ int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
#*******************
# Typed value I/O *
@@ -1663,6 +1799,7 @@ cdef extern from "htslib/vcf.h" nogil:
int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
void bcf_destroy1(bcf1_t *v)
+ void bcf_empty1(bcf1_t *v)
int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
void bcf_clear1(bcf1_t *v)
int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
@@ -1675,7 +1812,78 @@ cdef extern from "htslib/vcf.h" nogil:
hts_idx_t *bcf_index_load(const char *fn)
const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr)
-cdef extern from "htslib_util.h":
- int hts_set_verbosity(int verbosity)
- int hts_get_verbosity()
+# VCF/BCF utility functions
+cdef extern from "htslib/vcfutils.h" nogil:
+ struct kbitset_t
+
+ # bcf_trim_alleles() - remove ALT alleles unused in genotype fields
+ # @header: for access to BCF_DT_ID dictionary
+ # @line: VCF line obtain from vcf_parse1
+ #
+ # Returns the number of removed alleles on success or negative
+ # on error:
+ # -1 .. some allele index is out of bounds
+ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
+
+ # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask
+ # @header: for access to BCF_DT_ID dictionary
+ # @line: VCF line obtained from vcf_parse1
+ # @mask: alleles to remove
+ #
+ # If you have more than 31 alleles, then the integer bit mask will
+ # overflow, so use bcf_remove_allele_set instead
+ void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask)
+
+ # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set
+ # @header: for access to BCF_DT_ID dictionary
+ # @line: VCF line obtained from vcf_parse1
+ # @rm_set: pointer to kbitset_t object with bits set for allele
+ # indexes to remove
+ #
+ # Number=A,R,G INFO and FORMAT fields will be updated accordingly.
+ void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set)
+
+ # bcf_calc_ac() - calculate the number of REF and ALT alleles
+ # @header: for access to BCF_DT_ID dictionary
+ # @line: VCF line obtained from vcf_parse1
+ # @ac: array of length line->n_allele
+ # @which: determine if INFO/AN,AC and indv fields be used
+ #
+ # Returns 1 if the call succeeded, or 0 if the value could not
+ # be determined.
+ #
+ # The value of @which determines if existing INFO/AC,AN can be
+ # used (BCF_UN_INFO) and if indv fields can be split
+ # (BCF_UN_FMT).
+ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
+
+ # bcf_gt_type() - determines type of the genotype
+ # @fmt_ptr: the GT format field as set for example by set_fmt_ptr
+ # @isample: sample index (starting from 0)
+ # @ial: index of the 1st non-reference allele (starting from 1)
+ # @jal: index of the 2nd non-reference allele (starting from 1)
+ #
+ # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA,
+ # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial
+ # is not NULL and the genotype has one or more non-reference
+ # alleles, $ial will be set. In case of GT_HET_AA, $ial is the
+ # position of the allele which appeared first in ALT. If $jal is
+ # not null and the genotype is GT_HET_AA, $jal will be set and is
+ # the position of the second allele in ALT.
+ uint8_t GT_HOM_RR # note: the actual value of GT_* matters, used in dosage r2 calculation
+ uint8_t GT_HOM_AA
+ uint8_t GT_HET_RA
+ uint8_t GT_HET_AA
+ uint8_t GT_HAPL_R
+ uint8_t GT_HAPL_A
+ uint8_t GT_UNKN
+ int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal)
+
+ int bcf_acgt2int(char c)
+ char bcf_int2acgt(int i)
+
+ # bcf_ij2G() - common task: allele indexes to Number=G index (diploid)
+ # @i,j: allele indexes, 0-based, i<=j
+ # Returns index to the Number=G diploid array
+ uint32_t bcf_ij2G(uint32_t i, uint32_t j)
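For reference, the diploid Number=G ordering used here places genotype (i,j), i<=j, at index j*(j+1)/2 + i; a quick Python check of that formula (illustrative only):

    def ij2G(i, j):
        # allele indexes i <= j (0-based) -> index into the Number=G array
        return j * (j + 1) // 2 + i

    assert [ij2G(i, j) for j in range(3) for i in range(j + 1)] == [0, 1, 2, 3, 4, 5]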
diff --git a/pysam/csamtools.pxd b/pysam/csamtools.pxd
deleted file mode 100644
index 53e04ea..0000000
--- a/pysam/csamtools.pxd
+++ /dev/null
@@ -1,8 +0,0 @@
-from libc.stdlib cimport calloc, free
-
-cdef extern from "pysam_util.h":
-
- int pysam_dispatch(int argc, char *argv[])
- void pysam_set_stderr(int fd)
- void pysam_unset_stderr()
-
diff --git a/pysam/csamtools.pyx b/pysam/csamtools.pyx
deleted file mode 100644
index c49f668..0000000
--- a/pysam/csamtools.pyx
+++ /dev/null
@@ -1,146 +0,0 @@
-# cython: embedsignature=True
-# cython: profile=True
-# adds doc-strings for sphinx
-import tempfile
-import os
-import sys
-
-from pysam.cutils cimport force_bytes, force_cmdline_bytes
-
-class Outs:
- '''http://mail.python.org/pipermail/python-list/2000-June/038406.html'''
- def __init__(self, id = 1):
- self.streams = []
- self.id = id
-
- def setdevice(self, filename):
- '''open an existing file, like "/dev/null"'''
- fd = os.open(filename, os.O_WRONLY)
- self.setfd(fd)
-
- def setfile(self, filename):
- '''open a new file.'''
- fd = os.open(filename, os.O_WRONLY|os.O_CREAT, 0660)
- self.setfd(fd)
-
- def setfd(self, fd):
- ofd = os.dup(self.id) # Save old stream on new unit.
- self.streams.append(ofd)
- sys.stdout.flush() # Buffered data goes to old stream.
- sys.stderr.flush() # Buffered data goes to old stream.
- os.dup2(fd, self.id) # Open unit 1 on new stream.
- os.close(fd) # Close other unit (look out, caller.)
-
- def restore(self):
- '''restore previous output stream'''
- if self.streams:
- # the following was not sufficient, hence flush both stderr and stdout
- # os.fsync( self.id )
- sys.stdout.flush()
- sys.stderr.flush()
- os.dup2(self.streams[-1], self.id)
- os.close(self.streams[-1])
- del self.streams[-1]
-
-
-def _samtools_dispatch(method,
- args=(),
- catch_stdout=True):
- '''call ``method`` in samtools providing arguments in args.
-
- .. note::
- This method redirects stdout to capture it
- from samtools. If for some reason stdout disappears
- the reason might be in this method.
-
- .. note::
- The current implementation might only work on linux.
-
- .. note::
- This method captures stdout and stderr using temporary files,
- which are then read into memory in their entirety. This method
- is slow and might cause large memory overhead.
-
- Catching of stdout can be turned of by setting *catch_stdout* to False.
-
- See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
- on the topic of redirecting stderr/stdout.
- '''
-
- # note that debugging this module can be a problem
- # as stdout/stderr will not appear on the terminal
-
- # some special cases
- if method == "index":
- if not os.path.exists(args[0]):
- raise IOError("No such file or directory: '%s'" % args[0])
-
- # redirect stderr and stdout to file
- stderr_h, stderr_f = tempfile.mkstemp()
- pysam_set_stderr(stderr_h)
-
- if catch_stdout:
- stdout_h, stdout_f = tempfile.mkstemp()
- try:
- stdout_save = Outs(sys.stdout.fileno())
- stdout_save.setfd(stdout_h)
- except AttributeError:
- # stdout has already been redirected
- catch_stdout = False
-
- # patch for `samtools view`
- # samtools `view` closes stdout, from which I can not
- # recover. Thus redirect output to file with -o option.
- if method == "view":
- if "-o" in args:
- raise ValueError("option -o is forbidden in samtools view")
- args = ("-o", stdout_f) + args
-
- # do the function call to samtools
- cdef char ** cargs
- cdef int i, n, retval
-
- n = len(args)
- method = force_cmdline_bytes(method)
- args = [force_cmdline_bytes(a) for a in args ]
-
- # allocate two more for first (dummy) argument (contains command)
- cargs = <char**>calloc(n + 2, sizeof(char *))
- cargs[0] = "samtools"
- cargs[1] = method
- for i from 0 <= i < n:
- cargs[i + 2] = args[i]
-
- retval = pysam_dispatch(n+2, cargs)
- free(cargs)
-
- # restore stdout/stderr. This will also flush, so
- # needs to be before reading back the file contents
- if catch_stdout:
- stdout_save.restore()
- try:
- with open(stdout_f, "r") as inf:
- out_stdout = inf.readlines()
- except UnicodeDecodeError:
- with open( stdout_f, "rb") as inf:
- # read binary output
- out_stdout = inf.read()
- os.remove(stdout_f)
- else:
- out_stdout = []
-
- # get error messages
- pysam_unset_stderr()
- out_stderr = []
- try:
- with open(stderr_f, "r") as inf:
- out_stderr = inf.readlines()
- except UnicodeDecodeError:
- with open( stderr_f, "rb") as inf:
- # read binary output
- out_stderr = inf.read()
- finally:
- os.remove(stderr_f)
-
- return retval, out_stderr, out_stdout
-
diff --git a/pysam/ctabix.pxd b/pysam/ctabix.pxd
index 2d7c546..39eed77 100644
--- a/pysam/ctabix.pxd
+++ b/pysam/ctabix.pxd
@@ -66,10 +66,10 @@ cdef class TabixFile:
cdef tbx_t * index
# flag indicating whether file is remote
- cdef int isremote
+ cdef int is_remote
- cdef _filename
- cdef _filename_index
+ cdef object _filename
+ cdef object _filename_index
cdef Parser parser
diff --git a/pysam/ctabix.pyx b/pysam/ctabix.pyx
index 58d0ffb..0bb1284 100644
--- a/pysam/ctabix.pyx
+++ b/pysam/ctabix.pyx
@@ -72,7 +72,7 @@ from pysam.chtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
BGZF, bgzf_open, bgzf_close, bgzf_write, gzFile, \
tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \
tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
- tbx_destroy, gzopen, gzclose, gzerror, gzdopen
+ tbx_destroy, gzopen, gzclose, gzerror, gzdopen, hisremote
from pysam.cutils cimport force_bytes, force_str, charptr_to_str
from pysam.cutils cimport encode_filename, from_string_and_size
@@ -317,31 +317,28 @@ cdef class TabixFile:
self.tabixfile = NULL
filename_index = index or (filename + ".tbi")
- self.isremote = filename.startswith("http:") or filename.startswith("ftp:")
+ # encode all the strings to pass to tabix
+ self._filename = encode_filename(filename)
+ self._filename_index = encode_filename(filename_index)
+
+ self.is_remote = hisremote(self._filename)
- if not self.isremote:
+ if not self.is_remote:
if not os.path.exists(filename):
raise IOError("file `%s` not found" % filename)
if not os.path.exists(filename_index):
raise IOError("index `%s` not found" % filename_index)
- self._filename = filename
- self._filename_index = filename_index
-
- # encode all the strings to pass to tabix
- _encoded_filename = encode_filename(filename)
- _encoded_index = encode_filename(filename_index)
-
# open file
- cdef char *cfilename = _encoded_filename
+ cdef char *cfilename = self._filename
with nogil:
self.tabixfile = hts_open(cfilename, 'r')
if self.tabixfile == NULL:
raise IOError("could not open file `%s`" % filename)
- cfilename = _encoded_index
+ cfilename = self._filename_index
with nogil:
self.index = tbx_index_load(cfilename)
@@ -394,22 +391,32 @@ cdef class TabixFile:
if not self.is_open():
raise ValueError("I/O operation on closed file")
- # convert coordinates to region string
+ # convert coordinates to region string, which is one-based
if reference:
if end is not None:
+ if end < 0:
+ raise ValueError("end out of range (%i)" % end)
if start is None:
start = 0
- region = '%s:%i-%i' % (reference, start + 1, end)
- if start > end:
+
+ if start < 0:
+ raise ValueError("start out of range (%i)" % start)
+ elif start > end:
raise ValueError(
- 'start (%i) > end (%i)' % (start, end))
+ 'start (%i) >= end (%i)' % (start, end))
+ elif start == end:
+ return EmptyIterator()
+ else:
+ region = '%s:%i-%i' % (reference, start + 1, end)
elif start is not None:
+ if start < 0:
+ raise ValueError("start out of range (%i)" % start)
region = '%s:%i' % (reference, start + 1)
else:
region = reference
# get iterator
- cdef hts_itr_t * iter
+ cdef hts_itr_t * itr
cdef char *cstr
cdef TabixFile fileobj
@@ -422,7 +429,7 @@ cdef class TabixFile:
if region is None:
# without region or reference - iterate from start
with nogil:
- iter = tbx_itr_queryi(fileobj.index,
+ itr = tbx_itr_queryi(fileobj.index,
HTS_IDX_START,
0,
0)
@@ -430,13 +437,20 @@ cdef class TabixFile:
s = force_bytes(region, encoding=fileobj.encoding)
cstr = s
with nogil:
- iter = tbx_itr_querys(fileobj.index, cstr)
+ itr = tbx_itr_querys(fileobj.index, cstr)
- if iter == NULL:
+ if itr == NULL:
if region is None:
- # possible reason is that the file is empty -
- # return an empty iterator
- return EmptyIterator()
+ if len(self.contigs) > 0:
+ # when accessing a tabix file created prior to tabix 1.0
+ # the full-file iterator is empty.
+ raise ValueError(
+ "could not create iterator, possible "
+ "tabix version mismatch")
+ else:
+ # possible reason is that the file is empty -
+ # return an empty iterator
+ return EmptyIterator()
else:
raise ValueError(
"could not create iterator for region '%s'" %
@@ -454,7 +468,7 @@ cdef class TabixFile:
a = TabixIteratorParsed(parser)
a.tabixfile = fileobj
- a.iterator = iter
+ a.iterator = itr
return a
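A short usage sketch of the fetch() coordinate handling changed above (file names are placeholders; assumes example.bed.gz and its .tbi index exist):

    import pysam

    tbx = pysam.TabixFile("example.bed.gz")
    # 0-based half-open coordinates are converted to the 1-based region
    # string "chr1:101-200"; start == end now returns an EmptyIterator
    for row in tbx.fetch("chr1", 100, 200):
        print(row)
    tbx.close()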
@@ -502,7 +516,7 @@ cdef class TabixFile:
'''
def __get__(self):
- if self.isremote:
+ if self.is_remote:
raise AttributeError(
"the header is not available for remote files")
return GZIteratorHead(self.filename)
@@ -513,7 +527,8 @@ cdef class TabixFile:
cdef char ** sequences
cdef int nsequences
- sequences = tbx_seqnames(self.index, &nsequences)
+ with nogil:
+ sequences = tbx_seqnames(self.index, &nsequences)
cdef int x
result = []
for x from 0 <= x < nsequences:
@@ -754,16 +769,17 @@ def tabix_compress(filename_in,
is set.
'''
- if not force and os.path.exists(filename_out ):
+ if not force and os.path.exists(filename_out):
raise IOError(
- "Filename '%s' already exists, use *force* to overwrite" % filename_out)
+ "Filename '%s' already exists, use *force* to "
+ "overwrite" % filename_out)
cdef int WINDOW_SIZE
cdef int c, r
cdef void * buffer
cdef BGZF * fp
cdef int fd_src
-
+ cdef bint is_empty = True
cdef int O_RDONLY
O_RDONLY = os.O_RDONLY
@@ -774,19 +790,21 @@ def tabix_compress(filename_in,
with nogil:
fp = bgzf_open(cfn, "w")
if fp == NULL:
- raise IOError("could not open '%s' for writing" % (filename_out, ))
+ raise IOError("could not open '%s' for writing" % filename_out)
fn = encode_filename(filename_in)
fd_src = open(fn, O_RDONLY)
if fd_src == 0:
- raise IOError("could not open '%s' for reading" % (filename_in, ))
+ raise IOError("could not open '%s' for reading" % filename_in)
buffer = malloc(WINDOW_SIZE)
c = 1
-
+
while c > 0:
with nogil:
c = read(fd_src, buffer, WINDOW_SIZE)
+ if c > 0:
+ is_empty = False
r = bgzf_write(fp, buffer, c)
if r < 0:
free(buffer)
@@ -795,11 +813,14 @@ def tabix_compress(filename_in,
free(buffer)
r = bgzf_close(fp)
if r < 0:
- raise OSError("writing to file %s failed" % filename_out)
+ raise OSError("error %i when writing to file %s" % (r, filename_out))
r = close(fd_src)
+ # closing an empty input file returns -1, so ignore that case.
if r < 0:
- raise OSError("error when closing file %s" % filename_in)
+ if not (r == -1 and is_empty):
+ raise OSError("error %i when closing file %s" % (r, filename_in))
+
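A usage sketch for the compression helper above (file names are placeholders):

    import pysam

    # BGZF-compress a plain-text file, then build a tabix index for it;
    # force=True overwrites existing output files
    pysam.tabix_compress("example.bed", "example.bed.gz", force=True)
    pysam.tabix_index("example.bed.gz", preset="bed", force=True)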
def tabix_index( filename,
force = False,
@@ -1163,6 +1184,7 @@ def tabix_iterator(infile, parser):
# return tabix_generic_iterator( infile, parser )
cdef class Tabixfile(TabixFile):
+ """Tabixfile is deprecated: use TabixFile instead"""
pass
diff --git a/pysam/cutils.pxd b/pysam/cutils.pxd
index c2a7c5f..36fe554 100644
--- a/pysam/cutils.pxd
+++ b/pysam/cutils.pxd
@@ -9,7 +9,7 @@ cpdef parse_region(reference=*, start=*, end=*, region=*)
#########################################################################
# Utility functions for quality string conversions
-cpdef c_array.array qualitystring_to_array(bytes input_str, int offset=*)
+cpdef c_array.array qualitystring_to_array(input_str, int offset=*)
cpdef array_to_qualitystring(c_array.array arr, int offset=*)
cpdef qualities_to_qualitystring(qualities, int offset=*)
@@ -18,10 +18,18 @@ cpdef qualities_to_qualitystring(qualities, int offset=*)
########################################################################
## Python 3 compatibility functions
########################################################################
-cdef charptr_to_str(char *s, encoding=*)
+cdef charptr_to_str(const char *s, encoding=*)
+cdef bytes charptr_to_bytes(const char *s, encoding=*)
+cdef charptr_to_str_w_len(const char* s, size_t n, encoding=*)
cdef force_str(object s, encoding=*)
cdef bytes force_bytes(object s, encoding=*)
-cdef bytes force_cmdline_bytes(object s, encoding=*)
cdef bytes encode_filename(object filename)
-cdef from_string_and_size(char *s, size_t length)
+cdef from_string_and_size(const char *s, size_t length)
+cdef extern from "pysam_util.h":
+
+ int samtools_main(int argc, char *argv[])
+ int bcftools_main(int argc, char *argv[])
+ void pysam_set_stderr(int fd)
+ void pysam_unset_stderr()
+ void set_optind(int)
diff --git a/pysam/cutils.pyx b/pysam/cutils.pyx
index afbd97d..482db89 100644
--- a/pysam/cutils.pyx
+++ b/pysam/cutils.pyx
@@ -2,12 +2,18 @@ import types
import sys
import string
import re
+import tempfile
+import os
+import io
+from contextlib import contextmanager
from cpython.version cimport PY_MAJOR_VERSION
from cpython cimport PyBytes_Check, PyUnicode_Check
-
from cpython cimport array as c_array
-cimport cython
+from libc.stdlib cimport calloc, free
+from libc.string cimport strncpy
+from libc.stdio cimport fprintf, stderr, fflush
+from libc.stdio cimport stdout as c_stdout
#####################################################################
# hard-coded constants
@@ -15,12 +21,13 @@ cdef int MAX_POS = 2 << 29
#################################################################
# Utility functions for quality string conversions
-cpdef c_array.array qualitystring_to_array(bytes input_str, int offset=33):
+cpdef c_array.array qualitystring_to_array(input_str, int offset=33):
"""convert a qualitystring to an array of quality values."""
if input_str is None:
return None
+ qs = force_bytes(input_str)
cdef char i
- return c_array.array('B', [i - offset for i in input_str])
+ return c_array.array('B', [i - offset for i in qs])
cpdef array_to_qualitystring(c_array.array qualities, int offset=33):
@@ -34,7 +41,7 @@ cpdef array_to_qualitystring(c_array.array qualities, int offset=33):
for x from 0 <= x < len(qualities):
result[x] = qualities[x] + offset
- return result.tostring()
+ return force_str(result.tostring())
cpdef qualities_to_qualitystring(qualities, int offset=33):
@@ -60,7 +67,7 @@ cpdef qualities_to_qualitystring(qualities, int offset=33):
return array_to_qualitystring(qualities, offset=offset)
else:
# tuples and lists
- return "".join([chr(x + offset) for x in qualities])
+ return force_str("".join([chr(x + offset) for x in qualities]))
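A quick round-trip of the conversion helpers above, using the default Phred+33 offset:

    from pysam.cutils import array_to_qualitystring, qualitystring_to_array

    arr = qualitystring_to_array("II5!")   # 'I' -> 40, '5' -> 20, '!' -> 0
    assert list(arr) == [40, 40, 20, 0]
    assert array_to_qualitystring(arr) == "II5!"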
########################################################################
@@ -68,13 +75,13 @@ cpdef qualities_to_qualitystring(qualities, int offset=33):
########################################################################
## Python 3 compatibility functions
########################################################################
-IS_PYTHON3 = PY_MAJOR_VERSION >= 3
+cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
-cdef from_string_and_size(char* s, size_t length):
- if PY_MAJOR_VERSION < 3:
- return s[:length]
- else:
+cdef from_string_and_size(const char* s, size_t length):
+ if IS_PYTHON3:
return s[:length].decode("ascii")
+ else:
+ return s[:length]
# filename encoding (copied from lxml.etree.pyx)
cdef str _FILENAME_ENCODING
@@ -102,7 +109,7 @@ cdef bytes force_bytes(object s, encoding="ascii"):
u"""convert string or unicode object to bytes, assuming
ascii encoding.
"""
- if PY_MAJOR_VERSION < 3:
+ if not IS_PYTHON3:
return s
elif s is None:
return None
@@ -113,10 +120,7 @@ cdef bytes force_bytes(object s, encoding="ascii"):
else:
raise TypeError(u"Argument must be string, bytes or unicode.")
-cdef bytes force_cmdline_bytes(object s, encoding="ascii"):
- return force_bytes(s)
-
-cdef charptr_to_str(char* s, encoding="ascii"):
+cdef charptr_to_str(const char* s, encoding="ascii"):
if s == NULL:
return None
if PY_MAJOR_VERSION < 3:
@@ -124,6 +128,20 @@ cdef charptr_to_str(char* s, encoding="ascii"):
else:
return s.decode(encoding)
+cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"):
+ if s == NULL:
+ return None
+ if PY_MAJOR_VERSION < 3:
+ return s[:n]
+ else:
+ return s[:n].decode(encoding)
+
+cdef bytes charptr_to_bytes(const char* s, encoding="ascii"):
+ if s == NULL:
+ return None
+ else:
+ return s
+
cdef force_str(object s, encoding="ascii"):
"""Return s converted to str type of current Python
(bytes in Py2, unicode in Py3)"""
@@ -209,6 +227,199 @@ cpdef parse_region(reference=None,
return force_bytes(reference), rstart, rend
+ at contextmanager
+def stdout_redirector(to=os.devnull):
+ '''
+ import os
+
+ with stdout_redirector(to=filename):
+ print("from Python")
+ os.system("echo non-Python applications are also supported")
+
+ see http://stackoverflow.com/questions/5081657/how-do-i-prevent-a-c-shared-library-to-print-on-stdout-in-python/17954769#17954769
+ '''
+ fd = sys.stdout.fileno()
+
+ def _redirect_stdout(to):
+ # flush C-level stdout
+ try:
+ fflush(c_stdout)
+ sys.stdout.close()
+ except (OSError, IOError):
+ # some tools close stdout
+ # Py3: OSError
+ # Py2: IOError
+ pass
+
+ # fd writes to 'to' file
+ os.dup2(to.fileno(), fd)
+ # Python writes to fd
+ if IS_PYTHON3:
+ sys.stdout = io.TextIOWrapper(
+ os.fdopen(fd, 'wb'))
+ else:
+ sys.stdout = os.fdopen(fd, 'w')
+
+ with os.fdopen(os.dup(fd), 'w') as old_stdout:
+ _redirect_stdout(to)
+ try:
+ yield # allow code to be run with the redirected stdout
+ finally:
+ _redirect_stdout(old_stdout)
+ # restore stdout.
+ # buffering and flags may be different
+
+# def stdout_redirector(stream):
+# """
+# See discussion in:
+
+# http://eli.thegreenplace.net/2015/redirecting-all-kinds-of-stdout-in-python/
+# """
+
+# # The original fd stdout points to. Usually 1 on POSIX systems.
+# original_stdout_fd = sys.stdout.fileno()
+# print ("original_fd=", original_stdout_fd)
+# def _redirect_stdout(to_fd):
+# """Redirect stdout to the given file descriptor."""
+# # Flush the C-level buffer stdout
+# fflush(c_stdout)
+# # Flush and close sys.stdout - also closes the file descriptor
+# # (fd)
+# sys.stdout.close()
+# # Make original_stdout_fd point to the same file as to_fd
+# os.dup2(to_fd, original_stdout_fd)
+# # Create a new sys.stdout that points to the redirected fd
+# if IS_PYTHON3:
+# sys.stdout = io.TextIOWrapper(
+# os.fdopen(original_stdout_fd, 'wb'))
+
+# # Save a copy of the original stdout fd in saved_stdout_fd
+# saved_stdout_fd = os.dup(original_stdout_fd)
+# try:
+# # Create a temporary file and redirect stdout to it
+# tfile = tempfile.TemporaryFile(mode='w+b')
+# _redirect_stdout(tfile.fileno())
+# # Yield to caller, then redirect stdout back to the saved fd
+# yield
+# _redirect_stdout(saved_stdout_fd)
+# # Copy contents of temporary file to the given stream
+# tfile.flush()
+# tfile.seek(0, io.SEEK_SET)
+# stream.write(tfile.read())
+# finally:
+# tfile.close()
+# os.close(saved_stdout_fd)
+
+
+def _pysam_dispatch(collection,
+ method,
+ args=(),
+ catch_stdout=True):
+ '''call ``method`` in samtools/bcftools providing arguments in args.
+
+ .. note::
+ This method redirects stdout to capture it
+ from samtools. If for some reason stdout disappears
+ the reason might be in this method.
+
+ .. note::
+ This method captures stdout and stderr using temporary files,
+ which are then read into memory in their entirety. This method
+ is slow and might cause large memory overhead.
+
+ Catching of stdout can be turned off by setting *catch_stdout* to
+ False.
+
+ See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
+ on the topic of redirecting stderr/stdout.
+
+ '''
+
+ # note that debugging this module can be a problem
+ # as stdout/stderr will not appear on the terminal
+ # some special cases
+ if method == "index":
+ if not os.path.exists(args[0]):
+ raise IOError("No such file or directory: '%s'" % args[0])
+
+ # redirect stderr and stdout to file
+ stderr_h, stderr_f = tempfile.mkstemp()
+ pysam_set_stderr(stderr_h)
+
+ # setup the function call to samtools/bcftools main
+ cdef char ** cargs
+ cdef int i, n, retval, l
+
+ n = len(args)
+ method = force_bytes(method)
+ collection = force_bytes(collection)
+ args = [force_bytes(a) for a in args]
+
+ # allocate two more for first (dummy) argument (contains command)
+ cdef int extra_args = 0
+ if method == b"index":
+ extra_args = 1
+ # add extra arguments for commands accepting optional arguments
+ # such as 'samtools index x.bam [out.index]'
+ cargs = <char**>calloc(n + 2 + extra_args, sizeof(char *))
+ cargs[0] = collection
+ cargs[1] = method
+
+ # create copies of strings - getopt for long options permutes
+ # arguments
+ for i from 0 <= i < n:
+ l = len(args[i])
+ cargs[i + 2] = <char *>calloc(l + 1, sizeof(char))
+ strncpy(cargs[i + 2], args[i], l)
+
+ # reset getopt. On OSX the getopt reset differs
+ # between getopt and getopt_long
+ if method in [b'index', b'cat', b'quickcheck',
+ b'faidx', b'kprobaln']:
+ set_optind(1)
+ else:
+ set_optind(0)
+
+ # call samtools/bcftools
+ if catch_stdout:
+ with tempfile.TemporaryFile(mode='w+b') as tfile:
+ with stdout_redirector(tfile):
+ if collection == b"samtools":
+ retval = samtools_main(n + 2, cargs)
+ elif collection == b"bcftools":
+ retval = bcftools_main(n + 2, cargs)
+ tfile.flush()
+ tfile.seek(0)
+ # do not force str, as output might be binary,
+ # for example BAM, VCF.gz, etc.
+ out_stdout = tfile.read()
+ else:
+ if collection == b"samtools":
+ retval = samtools_main(n + 2, cargs)
+ elif collection == b"bcftools":
+ retval = bcftools_main(n + 2, cargs)
+ out_stdout = None
+
+ for i from 0 <= i < n:
+ free(cargs[i + 2])
+ free(cargs)
+
+ # get error messages
+ pysam_unset_stderr()
+ out_stderr = []
+ try:
+ with open(stderr_f, "r") as inf:
+ out_stderr = inf.readlines()
+ except UnicodeDecodeError:
+ with open( stderr_f, "rb") as inf:
+ # read binary output
+ out_stderr = inf.read()
+ finally:
+ os.remove(stderr_f)
+
+ return retval, out_stderr, out_stdout
+
+
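The low-level entry point above returns a (returncode, stderr lines, stdout) tuple, with stdout captured as raw bytes because the output may be binary; an illustrative call (ex.bam is a placeholder):

    from pysam.cutils import _pysam_dispatch

    retval, err, out = _pysam_dispatch("samtools", "flagstat", ("ex.bam",),
                                       catch_stdout=True)
    print(retval, out.decode("ascii"))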
__all__ = ["qualitystring_to_array",
"array_to_qualitystring",
"qualities_to_qualitystring"]
diff --git a/pysam/cvcf.pyx b/pysam/cvcf.pyx
index e9fe3d0..83d3663 100644
--- a/pysam/cvcf.pyx
+++ b/pysam/cvcf.pyx
@@ -546,6 +546,9 @@ class VCF(object):
elif value == "VCFv4.1":
# AH - for testing
self._version = 40
+ elif value == "VCFv4.2":
+ # AH - for testing
+ self._version = 40
else:
self.error(line,self.UNKNOWN_FORMAT_STRING)
elif key == "INFO":
diff --git a/pysam/htslib_util.h b/pysam/htslib_util.h
index 46e44bc..f0d582c 100644
--- a/pysam/htslib_util.h
+++ b/pysam/htslib_util.h
@@ -11,6 +11,7 @@ long hts_utell(htsFile *fp);
int hts_set_verbosity(int verbosity);
int hts_get_verbosity();
+
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
typedef khash_t(vdict) vdict_t;
diff --git a/pysam/pysam_util.c b/pysam/pysam_util.c
index f8ccae7..e669e1d 100644
--- a/pysam/pysam_util.c
+++ b/pysam/pysam_util.c
@@ -1,5 +1,6 @@
#include <ctype.h>
#include <assert.h>
+#include <unistd.h>
#include "bam.h"
#include "bam_endian.h"
#include "htslib/khash.h"
@@ -26,106 +27,15 @@ void pysam_unset_stderr(void)
pysamerr = fopen("/dev/null", "w");
}
-
-// dummy function - required for samtools integration
-void print_error(const char *format, ...)
-{
-}
-
-// dummy function - required for samtools integration
-void print_error_errno(const char *format, ...)
-{
-}
-
-const char *samtools_version()
+void set_optind(int val)
{
+ // setting this in cython via
+ // "from posix.unistd cimport optind"
+ // did not work.
+ //
+ // setting to 0 forces a complete re-initialization
+ optind = val;
}
-// pysam dispatch function to emulate the samtools
-// command line within python.
-// taken from the main function in bamtk.c
-// added code to reset getopt
-int bam_taf2baf(int argc, char *argv[]);
-int bam_mpileup(int argc, char *argv[]);
-int bam_merge(int argc, char *argv[]);
-int bam_index(int argc, char *argv[]);
-int bam_sort(int argc, char *argv[]);
-int bam_tview_main(int argc, char *argv[]);
-int bam_mating(int argc, char *argv[]);
-int bam_rmdup(int argc, char *argv[]);
-int bam_flagstat(int argc, char *argv[]);
-int bam_fillmd(int argc, char *argv[]);
-int bam_idxstats(int argc, char *argv[]);
-int main_samview(int argc, char *argv[]);
-int main_import(int argc, char *argv[]);
-int main_reheader(int argc, char *argv[]);
-int main_cut_target(int argc, char *argv[]);
-int main_phase(int argc, char *argv[]);
-int main_cat(int argc, char *argv[]);
-int main_depth(int argc, char *argv[]);
-int main_bam2fq(int argc, char *argv[]);
-int main_pad2unpad(int argc, char *argv[]);
-int main_bedcov(int argc, char *argv[]);
-int main_bamshuf(int argc, char *argv[]);
-
-int faidx_main(int argc, char *argv[]);
-
-int pysam_dispatch(int argc, char *argv[] )
-{
- extern int optind;
-#ifdef _WIN32
- setmode(fileno(stdout), O_BINARY);
- setmode(fileno(stdin), O_BINARY);
-#ifdef _USE_KNETFILE
- knet_win32_init();
-#endif
-#endif
-
- // reset getopt
- optind = 1;
-
- if (argc < 2) return 1;
- int retval = 0;
-
- if (strcmp(argv[1], "view") == 0) retval = main_samview(argc-1, argv+1);
- else if (strcmp(argv[1], "import") == 0) retval = main_import(argc-1, argv+1);
- else if (strcmp(argv[1], "mpileup") == 0) retval = bam_mpileup(argc-1, argv+1);
- else if (strcmp(argv[1], "merge") == 0) retval = bam_merge(argc-1, argv+1);
- else if (strcmp(argv[1], "sort") == 0) retval = bam_sort(argc-1, argv+1);
- else if (strcmp(argv[1], "index") == 0) retval = bam_index(argc-1, argv+1);
- else if (strcmp(argv[1], "faidx") == 0) retval = faidx_main(argc-1, argv+1);
- else if (strcmp(argv[1], "idxstats") == 0) retval = bam_idxstats(argc-1, argv+1);
- else if (strcmp(argv[1], "fixmate") == 0) retval = bam_mating(argc-1, argv+1);
- else if (strcmp(argv[1], "rmdup") == 0) retval = bam_rmdup(argc-1, argv+1);
- else if (strcmp(argv[1], "flagstat") == 0) retval = bam_flagstat(argc-1, argv+1);
- else if (strcmp(argv[1], "calmd") == 0) retval = bam_fillmd(argc-1, argv+1);
- else if (strcmp(argv[1], "fillmd") == 0) retval = bam_fillmd(argc-1, argv+1);
- else if (strcmp(argv[1], "reheader") == 0) retval = main_reheader(argc-1, argv+1);
- else if (strcmp(argv[1], "cat") == 0) retval = main_cat(argc-1, argv+1);
- else if (strcmp(argv[1], "targetcut") == 0) retval = main_cut_target(argc-1, argv+1);
- else if (strcmp(argv[1], "phase") == 0) retval = main_phase(argc-1, argv+1);
- else if (strcmp(argv[1], "depth") == 0) retval = main_depth(argc-1, argv+1);
- else if (strcmp(argv[1], "bam2fq") == 0) retval = main_bam2fq(argc-1, argv+1);
- else if (strcmp(argv[1], "pad2unpad") == 0) retval = main_pad2unpad(argc-1, argv+1);
- else if (strcmp(argv[1], "depad") == 0) retval = main_pad2unpad(argc-1, argv+1);
- else if (strcmp(argv[1], "bedcov") == 0) retval = main_bedcov(argc-1, argv+1);
- else if (strcmp(argv[1], "bamshuf") == 0) retval = main_bamshuf(argc-1, argv+1);
-
-#if _CURSES_LIB != 0
- else if (strcmp(argv[1], "tview") == 0) retval = bam_tview_main(argc-1, argv+1);
-#endif
- else
- {
- fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
- return 1;
- }
- fflush(stdout);
-
- return retval;
-}
-
-
-
-
diff --git a/pysam/pysam_util.h b/pysam/pysam_util.h
index d6e5141..5f2359f 100644
--- a/pysam/pysam_util.h
+++ b/pysam/pysam_util.h
@@ -17,4 +17,6 @@ void pysam_unset_stderr(void);
int pysam_dispatch(int argc, char *argv[]);
+void set_optind(int);
+
#endif
diff --git a/pysam/samtools.py b/pysam/samtools.py
new file mode 100644
index 0000000..f81fe8f
--- /dev/null
+++ b/pysam/samtools.py
@@ -0,0 +1,46 @@
+from utils import PysamDispatcher
+
+# samtools command line options to export in python
+#
+# import is a python reserved word.
+SAMTOOLS_DISPATCH = {
+ # samtools 'documented' commands
+ "view": ("view", None),
+ "sort": ("sort", None),
+ "mpileup": ("mpileup", None),
+ "depth": ("depth", None),
+ "faidx": ("faidx", None),
+ "tview": ("tview", None),
+ "index": ("index", None),
+ "idxstats": ("idxstats", None),
+ "fixmate": ("fixmate", None),
+ "flagstat": ("flagstat", None),
+ "calmd": ("calmd", None),
+ "merge": ("merge", None),
+ "rmdup": ("rmdup", None),
+ "reheader": ("reheader", None),
+ "cat": ("cat", None),
+ "targetcut": ("targetcut", None),
+ "phase": ("phase", None),
+ "samimport": ("import", None),
+ "bam2fq": ("bam2fq", None),
+ "dict": ("dict", None),
+ "addreplacerg": ("addreplacerg", None),
+ "pad2unpad": ("pad2unpad", None),
+ "depad": ("pad2unpad", None),
+ "bedcov": ("bedcov", None),
+ "bamshuf": ("bamshuf", None),
+ "collate": ("collate", None),
+ "stats": ("stats", None),
+ "fasta": ("fasta", None),
+ "fastq": ("fastq", None),
+ "quickcheck": ("quickcheck", None),
+ "split": ("split", None),
+}
+
+# instantiate samtools commands as python functions
+for key, options in SAMTOOLS_DISPATCH.items():
+ cmd, parser = options
+ globals()[key] = PysamDispatcher("samtools", cmd, parser)
+
+__all__ = list(SAMTOOLS_DISPATCH)
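With these dispatcher instances in place each subcommand becomes a callable; a minimal sketch (file names are placeholders; earlier pysam releases also re-exported these names at the package top level, e.g. pysam.sort, but that is not shown here):

    from pysam import samtools

    # equivalent to "samtools sort -o sorted.bam ex.bam" followed by flagstat
    samtools.sort("-o", "sorted.bam", "ex.bam")
    report = samtools.flagstat("sorted.bam")   # captured stdout (raw bytes here)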
diff --git a/pysam/utils.py b/pysam/utils.py
new file mode 100644
index 0000000..0e49d54
--- /dev/null
+++ b/pysam/utils.py
@@ -0,0 +1,88 @@
+from pysam.cutils import _pysam_dispatch
+
+
+class SamtoolsError(Exception):
+ '''exception raised in case of an error incurred in the samtools
+ library.'''
+
+ def __init__(self, value):
+ self.value = value
+
+ def __str__(self):
+ return repr(self.value)
+
+
+class PysamDispatcher(object):
+ '''The dispatcher emulates the samtools/bcftools command line.
+
+ Captures stdout and stderr.
+
+ Raises a :class:`pysam.SamtoolsError` exception in case samtools
+ exits with an error code other than 0.
+
+ Some command line options are associated with parsers. For
+ example, the samtools command "pileup -c" creates a tab-separated
+ table on standard output. In order to associate parsers with
+ options, an optional list of parsers can be supplied. The list
+ will be processed in order checking for the presence of each
+ option.
+
+ If no parser is given or no appropriate parser is found, the
+ stdout output of samtools/bcftools commands will be returned.
+
+ '''
+
+ dispatch = None
+ parsers = None
+ collection = None
+
+ def __init__(self, collection, dispatch, parsers):
+ self.collection = collection
+ self.dispatch = dispatch
+ self.parsers = parsers
+ self.stderr = []
+
+ def __call__(self, *args, **kwargs):
+ '''execute a samtools command.
+
+ Keyword arguments:
+ catch_stdout -- redirect stdout from the samtools command and return as variable (default True)
+ raw -- ignore any parsers associated with this samtools command.
+ '''
+ retval, stderr, stdout = _pysam_dispatch(
+ self.collection,
+ self.dispatch,
+ args,
+ catch_stdout=kwargs.get("catch_stdout", True))
+
+ if retval:
+ raise SamtoolsError(
+ "%s returned with error %i: "
+ "stdout=%s, stderr=%s" %
+ (self.collection,
+ retval,
+ "\n".join(stdout),
+ "\n".join(stderr)))
+
+ self.stderr = stderr
+
+ # call parser for stdout:
+ if not kwargs.get("raw") and stdout and self.parsers:
+ for options, parser in self.parsers:
+ for option in options:
+ if option not in args:
+ break
+ else:
+ return parser(stdout)
+
+ return stdout
+
+ def get_messages(self):
+ return self.stderr
+
+ def usage(self):
+ '''return the samtools usage information for this command'''
+ retval, stderr, stdout = _pysam_dispatch(
+ self.collection, self.dispatch)
+ return "".join(stderr)
+
diff --git a/pysam/version.py b/pysam/version.py
index 9047c04..815e4b9 100644
--- a/pysam/version.py
+++ b/pysam/version.py
@@ -1,7 +1,7 @@
# pysam versioning information
-__version__ = "0.8.4"
+__version__ = "0.9.0"
-__samtools_version__ = "1.2"
+__samtools_version__ = "1.3"
-__htslib_version__ = "1.2.1"
+__htslib_version__ = "1.3"
diff --git a/run_tests_travis.sh b/run_tests_travis.sh
new file mode 100755
index 0000000..d2d9988
--- /dev/null
+++ b/run_tests_travis.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+
+pushd .
+
+WORKDIR=`pwd`
+
+#Install miniconda python
+if [ $TRAVIS_OS_NAME == "osx" ]; then
+ curl -O https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+ bash Miniconda3-latest-MacOSX-x86_64.sh -b
+else
+ curl -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+ bash Miniconda3-latest-Linux-x86_64.sh -b
+fi
+
+# Create a new conda environment with the target python version
+~/miniconda3/bin/conda install conda-build -y
+~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy nose
+
+# Add new conda environment to PATH
+export PATH=~/miniconda3/envs/testenv/bin/:$PATH
+
+# Hack to force linking to anaconda libraries rather than system libraries
+#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/miniconda3/envs/testenv/lib/
+#export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:~/miniconda3/envs/testenv/lib/
+
+# Need to make C compiler and linker use the anaconda includes and libraries:
+export PREFIX=~/miniconda3/
+export CFLAGS="-I${PREFIX}/include -L${PREFIX}/lib"
+export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
+
+# create a new folder to store external tools
+mkdir -p $WORKDIR/external-tools
+
+# install htslib
+cd $WORKDIR/external-tools
+curl -L https://github.com/samtools/htslib/releases/download/1.3/htslib-1.3.tar.bz2 > htslib-1.3.tar.bz2
+tar xjvf htslib-1.3.tar.bz2
+cd htslib-1.3
+make
+PATH=$PATH:$WORKDIR/external-tools/htslib-1.3
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$WORKDIR/external-tools/htslib-1.3
+
+# install samtools, compile against htslib
+cd $WORKDIR/external-tools
+curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.3/samtools-1.3.tar.bz2 > samtools-1.3.tar.bz2
+tar xjvf samtools-1.3.tar.bz2
+cd samtools-1.3
+./configure --with-htslib=../htslib-1.3
+make
+PATH=$PATH:$WORKDIR/external-tools/samtools-1.3
+
+echo "installed samtools"
+samtools --version
+
+if [ $? != 0 ]; then
+ exit 1
+fi
+
+# install bcftools
+cd $WORKDIR/external-tools
+curl -L https://github.com/samtools/bcftools/releases/download/1.3/bcftools-1.3.tar.bz2 > bcftools-1.3.tar.bz2
+tar xjf bcftools-1.3.tar.bz2
+cd bcftools-1.3
+./configure --with-htslib=../htslib-1.3
+make
+PATH=$PATH:$WORKDIR/external-tools/bcftools-1.3
+
+echo "installed bcftools"
+bcftools --version
+
+if [ $? != 0 ]; then
+ exit 1
+fi
+
+popd
+
+# Try building conda recipe first
+~/miniconda3/bin/conda-build ci/conda-recipe/ --python=$CONDA_PY
+
+# install code from the repository
+python setup.py install
+
+# find build/
+
+# change into tests directory. Otherwise,
+# 'import pysam' will import the repository,
+# not the installed version. This causes
+# problems in the compilation test.
+cd tests
+
+# create auxiliary data
+echo
+echo 'building test data'
+echo
+make -C pysam_data
+make -C cbcf_data
+
+# run nosetests
+# -s: do not capture stdout, conflicts with pysam.dispatch
+# -v: verbose output
+nosetests -s -v
+
+if [ $? != 0 ]; then
+ exit 1
+fi
+
+# build source tar-ball and test installation from tar-ball
+cd ..
+python setup.py sdist
+tar -xvzf dist/pysam-*.tar.gz
+cd pysam-*
+python setup.py install
diff --git a/samtools/bam.c b/samtools/bam.c
index f909b7e..afab668 100644
--- a/samtools/bam.c
+++ b/samtools/bam.c
@@ -1,6 +1,6 @@
/* bam.c -- BAM format.
- Copyright (C) 2008-13 Genome Research Ltd.
+ Copyright (C) 2008-2013, 2015 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -61,19 +61,64 @@ int bam_validate1(const bam_header_t *header, const bam1_t *b)
return 1;
}
+#ifndef MIN
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
// FIXME: we should also check the LB tag associated with each alignment
const char *bam_get_library(bam_header_t *h, const bam1_t *b)
{
-#if 0
- const uint8_t *rg;
- if (h->dict == 0) h->dict = sam_header_parse2(h->text);
- if (h->rg2lib == 0) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB");
- rg = bam_aux_get(b, "RG");
- return (rg == 0)? 0 : sam_tbl_get(h->rg2lib, (const char*)(rg + 1));
-#else
- fprintf(stderr, "Samtools-htslib-API: bam_get_library() not yet implemented\n");
- abort();
-#endif
+ // Slow and inefficient. Rewrite once we get a proper header API.
+ const char *rg;
+ char *cp = h->text;
+ rg = (char *)bam_aux_get(b, "RG");
+
+ if (!rg)
+ return NULL;
+ else
+ rg++;
+
+ // Header is guaranteed to be nul terminated, so this is valid.
+ while (*cp) {
+ char *ID, *LB;
+ char last = '\t';
+
+ // Find a @RG line
+ if (strncmp(cp, "@RG", 3) != 0) {
+ while (*cp && *cp != '\n') cp++; // skip line
+ if (*cp) cp++;
+ continue;
+ }
+
+ // Find ID: and LB: keys
+ cp += 4;
+ ID = LB = NULL;
+ while (*cp && *cp != '\n') {
+ if (last == '\t') {
+ if (strncmp(cp, "LB:", 3) == 0)
+ LB = cp+3;
+ else if (strncmp(cp, "ID:", 3) == 0)
+ ID = cp+3;
+ }
+ last = *cp++;
+ }
+
+ // Check it's the correct ID
+ if (strncmp(rg, ID, strlen(rg)) != 0 || ID[strlen(rg)] != '\t')
+ continue;
+
+ // Valid until next query
+ static char LB_text[1024];
+ for (cp = LB; *cp && *cp != '\t' && *cp != '\n'; cp++)
+ ;
+ strncpy(LB_text, LB, MIN(cp-LB, 1023));
+ LB_text[MIN(cp-LB, 1023)] = 0;
+
+ // Return it; valid until the next query.
+ return LB_text;
+ }
+
+ return NULL;
}
int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
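The library lookup above, restated as a short Python sketch (illustrative only, not part of pysam): find the @RG header line whose ID matches the read's RG tag and return its LB field.

    def get_library(header_text, read_group):
        # scan @RG lines; return the LB value for the matching ID, or None
        for line in header_text.splitlines():
            if not line.startswith("@RG"):
                continue
            fields = dict(f.split(":", 1) for f in line.split("\t")[1:] if ":" in f)
            if fields.get("ID") == read_group:
                return fields.get("LB")
        return None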
diff --git a/samtools/bam.c.pysam.c b/samtools/bam.c.pysam.c
index 751a183..a9da5b9 100644
--- a/samtools/bam.c.pysam.c
+++ b/samtools/bam.c.pysam.c
@@ -2,7 +2,7 @@
/* bam.c -- BAM format.
- Copyright (C) 2008-13 Genome Research Ltd.
+ Copyright (C) 2008-2013, 2015 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -63,19 +63,64 @@ int bam_validate1(const bam_header_t *header, const bam1_t *b)
return 1;
}
+#ifndef MIN
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
// FIXME: we should also check the LB tag associated with each alignment
const char *bam_get_library(bam_header_t *h, const bam1_t *b)
{
-#if 0
- const uint8_t *rg;
- if (h->dict == 0) h->dict = sam_header_parse2(h->text);
- if (h->rg2lib == 0) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB");
- rg = bam_aux_get(b, "RG");
- return (rg == 0)? 0 : sam_tbl_get(h->rg2lib, (const char*)(rg + 1));
-#else
- fprintf(pysamerr, "Samtools-htslib-API: bam_get_library() not yet implemented\n");
- abort();
-#endif
+ // Slow and inefficient. Rewrite once we get a proper header API.
+ const char *rg;
+ char *cp = h->text;
+ rg = (char *)bam_aux_get(b, "RG");
+
+ if (!rg)
+ return NULL;
+ else
+ rg++;
+
+ // Header is guaranteed to be nul terminated, so this is valid.
+ while (*cp) {
+ char *ID, *LB;
+ char last = '\t';
+
+ // Find a @RG line
+ if (strncmp(cp, "@RG", 3) != 0) {
+ while (*cp && *cp != '\n') cp++; // skip line
+ if (*cp) cp++;
+ continue;
+ }
+
+ // Find ID: and LB: keys
+ cp += 4;
+ ID = LB = NULL;
+ while (*cp && *cp != '\n') {
+ if (last == '\t') {
+ if (strncmp(cp, "LB:", 3) == 0)
+ LB = cp+3;
+ else if (strncmp(cp, "ID:", 3) == 0)
+ ID = cp+3;
+ }
+ last = *cp++;
+ }
+
+ // Check it's the correct ID
+ if (strncmp(rg, ID, strlen(rg)) != 0 || ID[strlen(rg)] != '\t')
+ continue;
+
+ // Valid until next query
+ static char LB_text[1024];
+ for (cp = LB; *cp && *cp != '\t' && *cp != '\n'; cp++)
+ ;
+ strncpy(LB_text, LB, MIN(cp-LB, 1023));
+ LB_text[MIN(cp-LB, 1023)] = 0;
+
+ // Return it; valid until the next query.
+ return LB_text;
+ }
+
+ return NULL;
}
int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
diff --git a/samtools/bam.h b/samtools/bam.h
index b8f7bc1..57aa044 100644
--- a/samtools/bam.h
+++ b/samtools/bam.h
@@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE. */
@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "1.2"
+#define BAM_VERSION "1.3"
#include <stdint.h>
#include <stdlib.h>
@@ -185,6 +185,9 @@ typedef hts_itr_t *bam_iter_t;
/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
#define bam_nt16_rev_table seq_nt16_str
+/*! @abstract Table for converting a 4-bit encoded nucleotide to ~2 bits. */
+#define bam_nt16_nt4_table seq_nt16_int
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -457,7 +460,7 @@ extern "C" {
/*!
@abstract Retrieve the alignments that are overlapped with the
- specified region.
+ specified region. (For BAM files only; see also samfetch() in sam.h.)
@discussion A user defined function will be called for each
retrieved alignment ordered by its start position.
diff --git a/samtools/bam2bcf.c b/samtools/bam2bcf.c
index ca2481e..ed433b1 100644
--- a/samtools/bam2bcf.c
+++ b/samtools/bam2bcf.c
@@ -34,7 +34,6 @@ DEALINGS IN THE SOFTWARE. */
#include "errmod.h"
extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
-extern const char bam_nt16_nt4_table[];
#define CALL_DEFTHETA 0.83
#define DEF_MAPQ 20
@@ -124,7 +123,8 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
memset(bca->rev_mqs,0,sizeof(int)*bca->nqual);
- if ( call->DPR ) memset(call->DPR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
+ if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
+ if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
}
/*
@@ -154,7 +154,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
memset(r->p,0,sizeof(float)*25);
if (ref_base >= 0) {
- ref4 = bam_nt16_nt4_table[ref_base];
+ ref4 = seq_nt16_int[ref_base];
is_indel = 0;
} else ref4 = 4, is_indel = 1;
if (_n == 0) return -1;
@@ -183,7 +183,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
if (q < 4) q = 4; // MQ=0 reads count as BQ=4
if (!is_indel) {
b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
- b = bam_nt16_nt4_table[b? b : ref_base]; // b is the 2-bit base
+ b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
} else {
b = p->aux>>16&0x3f;
@@ -194,7 +194,13 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
if (b < 4)
{
r->qsum[b] += q;
- if ( r->DPR ) r->DPR[b]++;
+ if ( r->ADF )
+ {
+ if ( bam_is_rev(p->b) )
+ r->ADR[b]++;
+ else
+ r->ADF[b]++;
+ }
}
++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)];
min_dist = p->b->core.l_qseq - 1 - p->qpos;
@@ -522,7 +528,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
int ref4, i, j;
float qsum[5] = {0,0,0,0,0};
if (ref_base >= 0) {
- call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
+ call->ori_ref = ref4 = seq_nt16_int[ref_base];
if (ref4 > 4) ref4 = 4;
} else call->ori_ref = -1, ref4 = 0;
@@ -619,23 +625,34 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
call->DP4[4*i+3] = calls[i].anno[3];
}
}
- if ( call->DPR )
+ if ( call->ADF )
{
assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well
+ // reorder ADR,ADF to match the allele ordering at this site
int32_t tmp[B2B_MAX_ALLELES];
- int32_t *dpr = call->DPR + B2B_MAX_ALLELES, *dpr_out = call->DPR + B2B_MAX_ALLELES;
- int32_t *dpr_tot = call->DPR;
+ int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES;
+ int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES;
+ int32_t *adr_tot = call->ADR; // the first bin stores total counts per site
+ int32_t *adf_tot = call->ADF;
for (i=0; i<n; i++)
{
for (j=0; j<call->n_alleles; j++)
{
- tmp[j] = dpr[ call->a[j] ];
- dpr_tot[j] += tmp[j];
+ tmp[j] = adr[ call->a[j] ];
+ adr_tot[j] += tmp[j];
+ }
+ for (j=0; j<call->n_alleles; j++) adr_out[j] = tmp[j];
+ for (j=0; j<call->n_alleles; j++)
+ {
+ tmp[j] = adf[ call->a[j] ];
+ adf_tot[j] += tmp[j];
}
- for (j=0; j<call->n_alleles; j++) dpr_out[j] = tmp[j];
- dpr_out += call->n_alleles;
- dpr += B2B_MAX_ALLELES;
+ for (j=0; j<call->n_alleles; j++) adf_out[j] = tmp[j];
+ adf_out += call->n_alleles;
+ adr_out += call->n_alleles;
+ adr += B2B_MAX_ALLELES;
+ adf += B2B_MAX_ALLELES;
}
}
@@ -719,7 +736,7 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
{
if (bc->a[i] < 0) break;
kputc(',', &bc->tmp);
- if ( bc->unseen==i ) kputs("<X>", &bc->tmp);
+ if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
else kputc("ACGT"[bc->a[i]], &bc->tmp);
nals++;
}
@@ -736,6 +753,18 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
}
bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
+ if ( fmt_flag&B2B_INFO_ADF )
+ bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele);
+ if ( fmt_flag&B2B_INFO_ADR )
+ bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele);
+ if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) )
+ {
+ for (i=0; i<rec->n_allele; i++) bc->ADF[i] += bc->ADR[i];
+ if ( fmt_flag&B2B_INFO_AD )
+ bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele);
+ if ( fmt_flag&B2B_INFO_DPR )
+ bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele);
+ }
float tmpf[16];
for (i=0; i<16; i++) tmpf[i] = bc->anno[i];
@@ -795,10 +824,18 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
}
if ( fmt_flag&B2B_FMT_DP4 )
bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4);
- if ( fmt_flag&B2B_FMT_DPR )
- bcf_update_format_int32(hdr, rec, "DPR", bc->DPR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
- if ( fmt_flag&B2B_INFO_DPR )
- bcf_update_info_int32(hdr, rec, "DPR", bc->DPR, rec->n_allele);
+ if ( fmt_flag&B2B_FMT_ADF )
+ bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ if ( fmt_flag&B2B_FMT_ADR )
+ bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) )
+ {
+ for (i=0; i<rec->n_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i];
+ if ( fmt_flag&B2B_FMT_AD )
+ bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ if ( fmt_flag&B2B_FMT_DPR )
+ bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ }
return 0;
}
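
The bam2bcf.c hunks above replace the single per-allele depth array (DPR) with
strand-specific ADF/ADR counters: forward-strand reads increment ADF, reverse-strand
reads increment ADR, and each array holds one leading bin of per-site totals followed
by B2B_MAX_ALLELES bins per sample, reordered to the site's allele order in
bcf_call_combine(). A minimal sketch of that accumulation pattern, with hypothetical
names rather than the samtools API:

    /* Sketch only: strand-aware per-allele depth counting in the ADF/ADR
     * layout used above (hypothetical names, not the samtools API). */
    #include <stdint.h>

    #define MAX_ALLELES 5

    typedef struct {
        int n_samples;
        int32_t *adf;   /* (n_samples + 1) * MAX_ALLELES counters */
        int32_t *adr;   /* bins 0..MAX_ALLELES-1 hold per-site totals */
    } ad_counts_t;

    /* one read from sample 's' supporting 2-bit base 'b' */
    void ad_count_read(ad_counts_t *c, int s, int b, int is_rev)
    {
        int32_t *arr = is_rev ? c->adr : c->adf;
        if (b < 4) arr[(s + 1) * MAX_ALLELES + b]++;
    }

    /* fold per-sample counts into the per-site totals, reordered to the
     * site's allele order a[0..n_alleles-1], as bcf_call_combine() does */
    void ad_totals(ad_counts_t *c, const int *a, int n_alleles)
    {
        int s, j;
        for (s = 0; s < c->n_samples; s++)
            for (j = 0; j < n_alleles; j++) {
                c->adf[j] += c->adf[(s + 1) * MAX_ALLELES + a[j]];
                c->adr[j] += c->adr[(s + 1) * MAX_ALLELES + a[j]];
            }
    }

AD then falls out as the element-wise sum ADF+ADR, which is how the bcf_call2bcf()
hunk fills the AD and legacy DPR outputs.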
diff --git a/samtools/bam2bcf.c.pysam.c b/samtools/bam2bcf.c.pysam.c
index 4e127ba..be3876d 100644
--- a/samtools/bam2bcf.c.pysam.c
+++ b/samtools/bam2bcf.c.pysam.c
@@ -36,7 +36,6 @@ DEALINGS IN THE SOFTWARE. */
#include "errmod.h"
extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
-extern const char bam_nt16_nt4_table[];
#define CALL_DEFTHETA 0.83
#define DEF_MAPQ 20
@@ -126,7 +125,8 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
memset(bca->rev_mqs,0,sizeof(int)*bca->nqual);
- if ( call->DPR ) memset(call->DPR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
+ if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
+ if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
}
/*
@@ -156,7 +156,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
memset(r->p,0,sizeof(float)*25);
if (ref_base >= 0) {
- ref4 = bam_nt16_nt4_table[ref_base];
+ ref4 = seq_nt16_int[ref_base];
is_indel = 0;
} else ref4 = 4, is_indel = 1;
if (_n == 0) return -1;
@@ -185,7 +185,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
if (q < 4) q = 4; // MQ=0 reads count as BQ=4
if (!is_indel) {
b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
- b = bam_nt16_nt4_table[b? b : ref_base]; // b is the 2-bit base
+ b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
} else {
b = p->aux>>16&0x3f;
@@ -196,7 +196,13 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
if (b < 4)
{
r->qsum[b] += q;
- if ( r->DPR ) r->DPR[b]++;
+ if ( r->ADF )
+ {
+ if ( bam_is_rev(p->b) )
+ r->ADR[b]++;
+ else
+ r->ADF[b]++;
+ }
}
++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)];
min_dist = p->b->core.l_qseq - 1 - p->qpos;
@@ -524,7 +530,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
int ref4, i, j;
float qsum[5] = {0,0,0,0,0};
if (ref_base >= 0) {
- call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
+ call->ori_ref = ref4 = seq_nt16_int[ref_base];
if (ref4 > 4) ref4 = 4;
} else call->ori_ref = -1, ref4 = 0;
@@ -621,23 +627,34 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
call->DP4[4*i+3] = calls[i].anno[3];
}
}
- if ( call->DPR )
+ if ( call->ADF )
{
assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well
+ // reorder ADR,ADF to match the allele ordering at this site
int32_t tmp[B2B_MAX_ALLELES];
- int32_t *dpr = call->DPR + B2B_MAX_ALLELES, *dpr_out = call->DPR + B2B_MAX_ALLELES;
- int32_t *dpr_tot = call->DPR;
+ int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES;
+ int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES;
+ int32_t *adr_tot = call->ADR; // the first bin stores total counts per site
+ int32_t *adf_tot = call->ADF;
for (i=0; i<n; i++)
{
for (j=0; j<call->n_alleles; j++)
{
- tmp[j] = dpr[ call->a[j] ];
- dpr_tot[j] += tmp[j];
+ tmp[j] = adr[ call->a[j] ];
+ adr_tot[j] += tmp[j];
+ }
+ for (j=0; j<call->n_alleles; j++) adr_out[j] = tmp[j];
+ for (j=0; j<call->n_alleles; j++)
+ {
+ tmp[j] = adf[ call->a[j] ];
+ adf_tot[j] += tmp[j];
}
- for (j=0; j<call->n_alleles; j++) dpr_out[j] = tmp[j];
- dpr_out += call->n_alleles;
- dpr += B2B_MAX_ALLELES;
+ for (j=0; j<call->n_alleles; j++) adf_out[j] = tmp[j];
+ adf_out += call->n_alleles;
+ adr_out += call->n_alleles;
+ adr += B2B_MAX_ALLELES;
+ adf += B2B_MAX_ALLELES;
}
}
@@ -721,7 +738,7 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
{
if (bc->a[i] < 0) break;
kputc(',', &bc->tmp);
- if ( bc->unseen==i ) kputs("<X>", &bc->tmp);
+ if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
else kputc("ACGT"[bc->a[i]], &bc->tmp);
nals++;
}
@@ -738,6 +755,18 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
}
bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
+ if ( fmt_flag&B2B_INFO_ADF )
+ bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele);
+ if ( fmt_flag&B2B_INFO_ADR )
+ bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele);
+ if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) )
+ {
+ for (i=0; i<rec->n_allele; i++) bc->ADF[i] += bc->ADR[i];
+ if ( fmt_flag&B2B_INFO_AD )
+ bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele);
+ if ( fmt_flag&B2B_INFO_DPR )
+ bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele);
+ }
float tmpf[16];
for (i=0; i<16; i++) tmpf[i] = bc->anno[i];
@@ -797,10 +826,18 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
}
if ( fmt_flag&B2B_FMT_DP4 )
bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4);
- if ( fmt_flag&B2B_FMT_DPR )
- bcf_update_format_int32(hdr, rec, "DPR", bc->DPR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
- if ( fmt_flag&B2B_INFO_DPR )
- bcf_update_info_int32(hdr, rec, "DPR", bc->DPR, rec->n_allele);
+ if ( fmt_flag&B2B_FMT_ADF )
+ bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ if ( fmt_flag&B2B_FMT_ADR )
+ bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) )
+ {
+ for (i=0; i<rec->n_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i];
+ if ( fmt_flag&B2B_FMT_AD )
+ bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ if ( fmt_flag&B2B_FMT_DPR )
+ bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ }
return 0;
}
diff --git a/samtools/bam2bcf.h b/samtools/bam2bcf.h
index 3d532aa..22c67cc 100644
--- a/samtools/bam2bcf.h
+++ b/samtools/bam2bcf.h
@@ -49,6 +49,12 @@ DEALINGS IN THE SOFTWARE. */
#define B2B_FMT_DP4 (1<<3)
#define B2B_FMT_DPR (1<<4)
#define B2B_INFO_DPR (1<<5)
+#define B2B_FMT_AD (1<<6)
+#define B2B_FMT_ADF (1<<7)
+#define B2B_FMT_ADR (1<<8)
+#define B2B_INFO_AD (1<<9)
+#define B2B_INFO_ADF (1<<10)
+#define B2B_INFO_ADR (1<<11)
#define B2B_MAX_ALLELES 5
@@ -56,7 +62,8 @@ typedef struct __bcf_callaux_t {
int capQ, min_baseQ;
int openQ, extQ, tandemQ; // for indels
uint32_t min_support, max_support; // for collecting indel candidates
- float min_frac, max_frac; // for collecting indel candidates
+ double min_frac; // for collecting indel candidates
+ float max_frac; // for collecting indel candidates
int per_sample_flt; // indel filtering strategy
int *ref_pos, *alt_pos, npos, *ref_mq, *alt_mq, *ref_bq, *alt_bq, *fwd_mqs, *rev_mqs, nqual; // for bias tests
// for internal uses
@@ -73,7 +80,7 @@ typedef struct __bcf_callaux_t {
typedef struct {
uint32_t ori_depth;
unsigned int mq0;
- int32_t *DPR;
+ int32_t *ADF, *ADR;
float qsum[4];
// The fields are:
// depth fwd .. ref (0) and non-ref (2)
@@ -100,7 +107,7 @@ typedef struct {
int n_supp; // number of supporting non-reference reads
double anno[16];
unsigned int depth, ori_depth, mq0;
- int32_t *PL, *DP4, *DPR;
+ int32_t *PL, *DP4, *ADR, *ADF;
uint8_t *fmt_arr;
float vdb; // variant distance bias
float mwu_pos, mwu_mq, mwu_bq, mwu_mqs;
diff --git a/samtools/bam2bcf_indel.c b/samtools/bam2bcf_indel.c
index e80e4c2..e1c45c4 100644
--- a/samtools/bam2bcf_indel.c
+++ b/samtools/bam2bcf_indel.c
@@ -38,8 +38,6 @@ KSORT_INIT_GENERIC(uint32_t)
#define MINUS_CONST 0x10000000
#define INDEL_WINDOW_SIZE 50
-extern const char bam_nt16_nt4_table[];
-
void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list)
{
const char *s, *p, *q, *r, *t;
@@ -199,7 +197,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
if (j > max_rd_len) max_rd_len = j;
}
- float frac = (float)na/nt;
+ double frac = (double)na/nt;
if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
indel_support_ok = 1;
if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
@@ -217,7 +215,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
if (aux[i] != aux[i-1]) ++n_types;
// Taking totals makes it hard to call rare indels
if ( !bca->per_sample_flt )
- indel_support_ok = ( (float)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
+ indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
if ( n_types == 1 || !indel_support_ok ) { // then skip
free(aux); return -1;
}
@@ -327,7 +325,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
if (p->indel == types[t]) {
uint8_t *seq = bam_get_seq(p->b);
for (k = 1; k <= p->indel; ++k) {
- int c = bam_nt16_nt4_table[bam_seqi(seq, p->qpos + k)];
+ int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
assert(c<5);
++inscns_aux[(t*max_ins+(k-1))*5 + c];
}
@@ -371,12 +369,12 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
for (s = K = 0; s < n; ++s) {
// write ref2
for (k = 0, j = left; j <= pos; ++j)
- ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]];
+ ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
if (types[t] <= 0) j += -types[t];
else for (l = 0; l < types[t]; ++l)
ref2[k++] = inscns[t*max_ins + l];
for (; j < right && ref[j]; ++j)
- ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]];
+ ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
for (; k < max_ref2; ++k) ref2[k] = 4;
if (j < right) right = j;
// align each read to ref2
@@ -400,7 +398,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
// write the query sequence
for (l = qbeg; l < qend; ++l)
- query[l - qbeg] = bam_nt16_nt4_table[bam_seqi(seq, l)];
+ query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
{ // do realignment; this is the bottleneck
const uint8_t *qual = bam_get_qual(p->b), *bq;
uint8_t *qq;
diff --git a/samtools/bam2bcf_indel.c.pysam.c b/samtools/bam2bcf_indel.c.pysam.c
index 8a469ee..45e1101 100644
--- a/samtools/bam2bcf_indel.c.pysam.c
+++ b/samtools/bam2bcf_indel.c.pysam.c
@@ -40,8 +40,6 @@ KSORT_INIT_GENERIC(uint32_t)
#define MINUS_CONST 0x10000000
#define INDEL_WINDOW_SIZE 50
-extern const char bam_nt16_nt4_table[];
-
void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list)
{
const char *s, *p, *q, *r, *t;
@@ -201,7 +199,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
if (j > max_rd_len) max_rd_len = j;
}
- float frac = (float)na/nt;
+ double frac = (double)na/nt;
if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
indel_support_ok = 1;
if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
@@ -219,7 +217,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
if (aux[i] != aux[i-1]) ++n_types;
// Taking totals makes it hard to call rare indels
if ( !bca->per_sample_flt )
- indel_support_ok = ( (float)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
+ indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
if ( n_types == 1 || !indel_support_ok ) { // then skip
free(aux); return -1;
}
@@ -329,7 +327,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
if (p->indel == types[t]) {
uint8_t *seq = bam_get_seq(p->b);
for (k = 1; k <= p->indel; ++k) {
- int c = bam_nt16_nt4_table[bam_seqi(seq, p->qpos + k)];
+ int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
assert(c<5);
++inscns_aux[(t*max_ins+(k-1))*5 + c];
}
@@ -373,12 +371,12 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
for (s = K = 0; s < n; ++s) {
// write ref2
for (k = 0, j = left; j <= pos; ++j)
- ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]];
+ ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
if (types[t] <= 0) j += -types[t];
else for (l = 0; l < types[t]; ++l)
ref2[k++] = inscns[t*max_ins + l];
for (; j < right && ref[j]; ++j)
- ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]];
+ ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
for (; k < max_ref2; ++k) ref2[k] = 4;
if (j < right) right = j;
// align each read to ref2
@@ -402,7 +400,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
// write the query sequence
for (l = qbeg; l < qend; ++l)
- query[l - qbeg] = bam_nt16_nt4_table[bam_seqi(seq, l)];
+ query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
{ // do realignment; this is the bottleneck
const uint8_t *qual = bam_get_qual(p->b), *bq;
uint8_t *qq;
diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c
index b749062..f109447 100644
--- a/samtools/bam2depth.c
+++ b/samtools/bam2depth.c
@@ -33,9 +33,11 @@ DEALINGS IN THE SOFTWARE. */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
+#include <limits.h>
#include <unistd.h>
#include "htslib/sam.h"
#include "samtools.h"
+#include "sam_opts.h"
typedef struct { // auxiliary data structure
samFile *fp; // the file handle
@@ -47,6 +49,7 @@ typedef struct { // auxiliary data structure
void *bed_read(const char *fn); // read a BED or position list file
void bed_destroy(void *_h); // destroy the BED data structure
int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps
+int bed_query(const void *_h, const char *chr, int pos, int *beg, int *end);
// This function reads a BAM alignment from one BAM file.
static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup
@@ -67,9 +70,35 @@ static int read_bam(void *data, bam1_t *b) // read level filters better go here
int read_file_list(const char *file_list,int *n,char **argv[]);
+static int usage() {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -a output all positions (including zero depth)\n");
+ fprintf(stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n");
+ fprintf(stderr, " -b <bed> list of positions or regions\n");
+ fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
+ fprintf(stderr, " -l <int> read length threshold (ignore reads shorter than <int>)\n");
+ fprintf(stderr, " -d/-m <int> maximum coverage depth [8000]\n"); // the htslib's default
+ fprintf(stderr, " -q <int> base quality threshold\n");
+ fprintf(stderr, " -Q <int> mapping quality threshold\n");
+ fprintf(stderr, " -r <chr:from-to> region\n");
+
+ sam_global_opt_help(stderr, "-.--.");
+
+ fprintf(stderr, "\n");
+ fprintf(stderr, "The output is a simple tab-separated table with three columns: reference name,\n");
+ fprintf(stderr, "position, and coverage depth. Note that positions with zero coverage may be\n");
+ fprintf(stderr, "omitted by default; see the -a option.\n");
+ fprintf(stderr, "\n");
+
+ return 1;
+}
+
int main_depth(int argc, char *argv[])
{
- int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, status = EXIT_SUCCESS, nfiles;
+ int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0;
+ int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1;
const bam_pileup1_t **plp;
char *reg = 0; // specified region
void *bed = 0; // BED data structure
@@ -77,34 +106,35 @@ int main_depth(int argc, char *argv[])
bam_hdr_t *h = NULL; // BAM header of the 1st input
aux_t **data;
bam_mplp_t mplp;
+ int last_pos = -1, last_tid = -1, ret;
+
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ { NULL, 0, NULL, 0 }
+ };
// parse the command line
- while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) {
+ while ((n = getopt_long(argc, argv, "r:b:q:Q:l:f:am:d:", lopts, NULL)) >= 0) {
switch (n) {
case 'l': min_len = atoi(optarg); break; // minimum query length
case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header
case 'b':
bed = bed_read(optarg); // BED or position list file can be parsed now
- if (!bed) { print_error_errno("Could not read file \"%s\"", optarg); return 1; }
+ if (!bed) { print_error_errno("depth", "Could not read file \"%s\"", optarg); return 1; }
break;
case 'q': baseQ = atoi(optarg); break; // base quality threshold
case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold
case 'f': file_list = optarg; break;
+ case 'a': all++; break;
+ case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth
+ default: if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': return usage();
}
}
- if (optind == argc && !file_list) {
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
- fprintf(stderr, "Options:\n");
- fprintf(stderr, " -b <bed> list of positions or regions\n");
- fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
- fprintf(stderr, " -l <int> read length threshold (ignore reads shorter than <int>)\n");
- fprintf(stderr, " -q <int> base quality threshold\n");
- fprintf(stderr, " -Q <int> mapping quality threshold\n");
- fprintf(stderr, " -r <chr:from-to> region\n");
- fprintf(stderr, "\n");
- return 1;
- }
+ if (optind == argc && !file_list)
+ return usage();
// initialize the auxiliary data structures
if (file_list)
@@ -117,18 +147,19 @@ int main_depth(int argc, char *argv[])
else
n = argc - optind; // the number of BAMs on the command line
data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input
- beg = 0; end = 1<<30; // set the default region
+ beg = 0; end = INT_MAX; // set the default region
for (i = 0; i < n; ++i) {
+ int rf;
data[i] = calloc(1, sizeof(aux_t));
- data[i]->fp = sam_open(argv[optind+i], "r"); // open BAM
+ data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM
if (data[i]->fp == NULL) {
- print_error_errno("Could not open \"%s\"", argv[optind+i]);
+ print_error_errno("depth", "Could not open \"%s\"", argv[optind+i]);
status = EXIT_FAILURE;
goto depth_end;
}
- if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS,
- SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR |
- SAM_SEQ)) {
+ rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ;
+ if (baseQ) rf |= SAM_QUAL;
+ if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
return 1;
}
@@ -139,17 +170,23 @@ int main_depth(int argc, char *argv[])
data[i]->min_mapQ = mapQ; // set the mapQ filter
data[i]->min_len = min_len; // set the qlen filter
data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header
+ if (data[i]->hdr == NULL) {
+ fprintf(stderr, "Couldn't read header for \"%s\"\n",
+ argv[optind+i]);
+ status = EXIT_FAILURE;
+ goto depth_end;
+ }
if (reg) { // if a region is specified
hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index
if (idx == NULL) {
- print_error("can't load index for \"%s\"", argv[optind+i]);
+ print_error("depth", "can't load index for \"%s\"", argv[optind+i]);
status = EXIT_FAILURE;
goto depth_end;
}
data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator
hts_idx_destroy(idx); // the index is not needed any more; free the memory
if (data[i]->iter == NULL) {
- print_error("can't parse region \"%s\"", reg);
+ print_error("depth", "can't parse region \"%s\"", reg);
status = EXIT_FAILURE;
goto depth_end;
}
@@ -164,11 +201,45 @@ int main_depth(int argc, char *argv[])
// the core multi-pileup loop
mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
+ if (0 < max_depth)
+ bam_mplp_set_maxcnt(mplp,max_depth); // set maximum coverage depth
n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
- while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
+ while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
if (pos < beg || pos >= end) continue; // out of range; skip
+ if (tid >= h->n_targets) continue; // diff number of @SQ lines per file?
if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
+ if (all) {
+ while (tid > last_tid) {
+ if (last_tid >= 0 && all > 1 && !reg) {
+ // Deal with remainder or entirety of last tid
+ while (++last_pos < h->target_len[last_tid]) {
+ if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
+ continue;
+ fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1);
+ for (i = 0; i < n; i++)
+ putchar('\t'), putchar('0');
+ putchar('\n');
+ }
+ }
+ last_tid++;
+ last_pos = -1;
+ }
+
+ // Deal with missing portion of current tid
+ while (++last_pos < pos) {
+ if (last_pos < beg) continue; // out of range; skip
+ if (bed && bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0)
+ continue;
+ fputs(h->target_name[tid], stdout); printf("\t%d", last_pos+1);
+ for (i = 0; i < n; i++)
+ putchar('\t'), putchar('0');
+ putchar('\n');
+ }
+
+ last_tid = tid;
+ last_pos = pos;
+ }
fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
for (i = 0; i < n; ++i) { // base level filters have to go here
int j, m = 0;
@@ -181,9 +252,29 @@ int main_depth(int argc, char *argv[])
}
putchar('\n');
}
+ if (ret < 0) status = EXIT_FAILURE;
free(n_plp); free(plp);
bam_mplp_destroy(mplp);
+ if (all) {
+ // Handle terminating region
+ while (last_tid < h->n_targets) {
+ while (++last_pos < h->target_len[last_tid]) {
+ if (last_pos >= end) break;
+ if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
+ continue;
+ fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1);
+ for (i = 0; i < n; i++)
+ putchar('\t'), putchar('0');
+ putchar('\n');
+ }
+ last_tid++;
+ last_pos = -1;
+ if (all < 2 || reg)
+ break;
+ }
+ }
+
depth_end:
for (i = 0; i < n && data[i]; ++i) {
bam_hdr_destroy(data[i]->hdr);
@@ -198,6 +289,7 @@ depth_end:
for (i=0; i<n; i++) free(fn[i]);
free(fn);
}
+ sam_global_args_free(&ga);
return status;
}
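
In the depth changes above, -a makes the loop remember last_tid/last_pos and print
zero-depth rows for every position the pileup iterator skipped, -aa additionally walks
out references that received no reads, and -d/-m caps coverage via bam_mplp_set_maxcnt().
A stripped-down sketch of the gap-filling step, using a hypothetical helper rather than
the samtools code itself:

    #include <stdio.h>

    /* Sketch only (hypothetical helper): print zero-depth rows for positions
     * the pileup skipped.  'tname' stands in for h->target_name[tid];
     * positions are 0-based internally and printed 1-based, as above. */
    void fill_zero_rows(const char *tname, int upto, int *last_pos, int nfiles)
    {
        int i;
        while (++*last_pos < upto) {
            printf("%s\t%d", tname, *last_pos + 1);
            for (i = 0; i < nfiles; i++) printf("\t0");
            putchar('\n');
        }
    }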
diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c
index 5c588f9..6549949 100644
--- a/samtools/bam2depth.c.pysam.c
+++ b/samtools/bam2depth.c.pysam.c
@@ -35,9 +35,11 @@ DEALINGS IN THE SOFTWARE. */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
+#include <limits.h>
#include <unistd.h>
#include "htslib/sam.h"
#include "samtools.h"
+#include "sam_opts.h"
typedef struct { // auxiliary data structure
samFile *fp; // the file handle
@@ -49,6 +51,7 @@ typedef struct { // auxiliary data structure
void *bed_read(const char *fn); // read a BED or position list file
void bed_destroy(void *_h); // destroy the BED data structure
int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps
+int bed_query(const void *_h, const char *chr, int pos, int *beg, int *end);
// This function reads a BAM alignment from one BAM file.
static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup
@@ -69,9 +72,35 @@ static int read_bam(void *data, bam1_t *b) // read level filters better go here
int read_file_list(const char *file_list,int *n,char **argv[]);
+static int usage() {
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
+ fprintf(pysamerr, "Options:\n");
+ fprintf(pysamerr, " -a output all positions (including zero depth)\n");
+ fprintf(pysamerr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n");
+ fprintf(pysamerr, " -b <bed> list of positions or regions\n");
+ fprintf(pysamerr, " -f <list> list of input BAM filenames, one per line [null]\n");
+ fprintf(pysamerr, " -l <int> read length threshold (ignore reads shorter than <int>)\n");
+ fprintf(pysamerr, " -d/-m <int> maximum coverage depth [8000]\n"); // the htslib's default
+ fprintf(pysamerr, " -q <int> base quality threshold\n");
+ fprintf(pysamerr, " -Q <int> mapping quality threshold\n");
+ fprintf(pysamerr, " -r <chr:from-to> region\n");
+
+ sam_global_opt_help(pysamerr, "-.--.");
+
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "The output is a simple tab-separated table with three columns: reference name,\n");
+ fprintf(pysamerr, "position, and coverage depth. Note that positions with zero coverage may be\n");
+ fprintf(pysamerr, "omitted by default; see the -a option.\n");
+ fprintf(pysamerr, "\n");
+
+ return 1;
+}
+
int main_depth(int argc, char *argv[])
{
- int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, status = EXIT_SUCCESS, nfiles;
+ int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0;
+ int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1;
const bam_pileup1_t **plp;
char *reg = 0; // specified region
void *bed = 0; // BED data structure
@@ -79,34 +108,35 @@ int main_depth(int argc, char *argv[])
bam_hdr_t *h = NULL; // BAM header of the 1st input
aux_t **data;
bam_mplp_t mplp;
+ int last_pos = -1, last_tid = -1, ret;
+
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ { NULL, 0, NULL, 0 }
+ };
// parse the command line
- while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) {
+ while ((n = getopt_long(argc, argv, "r:b:q:Q:l:f:am:d:", lopts, NULL)) >= 0) {
switch (n) {
case 'l': min_len = atoi(optarg); break; // minimum query length
case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header
case 'b':
bed = bed_read(optarg); // BED or position list file can be parsed now
- if (!bed) { print_error_errno("Could not read file \"%s\"", optarg); return 1; }
+ if (!bed) { print_error_errno("depth", "Could not read file \"%s\"", optarg); return 1; }
break;
case 'q': baseQ = atoi(optarg); break; // base quality threshold
case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold
case 'f': file_list = optarg; break;
+ case 'a': all++; break;
+ case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth
+ default: if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': return usage();
}
}
- if (optind == argc && !file_list) {
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
- fprintf(pysamerr, "Options:\n");
- fprintf(pysamerr, " -b <bed> list of positions or regions\n");
- fprintf(pysamerr, " -f <list> list of input BAM filenames, one per line [null]\n");
- fprintf(pysamerr, " -l <int> read length threshold (ignore reads shorter than <int>)\n");
- fprintf(pysamerr, " -q <int> base quality threshold\n");
- fprintf(pysamerr, " -Q <int> mapping quality threshold\n");
- fprintf(pysamerr, " -r <chr:from-to> region\n");
- fprintf(pysamerr, "\n");
- return 1;
- }
+ if (optind == argc && !file_list)
+ return usage();
// initialize the auxiliary data structures
if (file_list)
@@ -119,18 +149,19 @@ int main_depth(int argc, char *argv[])
else
n = argc - optind; // the number of BAMs on the command line
data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input
- beg = 0; end = 1<<30; // set the default region
+ beg = 0; end = INT_MAX; // set the default region
for (i = 0; i < n; ++i) {
+ int rf;
data[i] = calloc(1, sizeof(aux_t));
- data[i]->fp = sam_open(argv[optind+i], "r"); // open BAM
+ data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM
if (data[i]->fp == NULL) {
- print_error_errno("Could not open \"%s\"", argv[optind+i]);
+ print_error_errno("depth", "Could not open \"%s\"", argv[optind+i]);
status = EXIT_FAILURE;
goto depth_end;
}
- if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS,
- SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR |
- SAM_SEQ)) {
+ rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ;
+ if (baseQ) rf |= SAM_QUAL;
+ if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
return 1;
}
@@ -141,17 +172,23 @@ int main_depth(int argc, char *argv[])
data[i]->min_mapQ = mapQ; // set the mapQ filter
data[i]->min_len = min_len; // set the qlen filter
data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header
+ if (data[i]->hdr == NULL) {
+ fprintf(pysamerr, "Couldn't read header for \"%s\"\n",
+ argv[optind+i]);
+ status = EXIT_FAILURE;
+ goto depth_end;
+ }
if (reg) { // if a region is specified
hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index
if (idx == NULL) {
- print_error("can't load index for \"%s\"", argv[optind+i]);
+ print_error("depth", "can't load index for \"%s\"", argv[optind+i]);
status = EXIT_FAILURE;
goto depth_end;
}
data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator
hts_idx_destroy(idx); // the index is not needed any more; free the memory
if (data[i]->iter == NULL) {
- print_error("can't parse region \"%s\"", reg);
+ print_error("depth", "can't parse region \"%s\"", reg);
status = EXIT_FAILURE;
goto depth_end;
}
@@ -166,11 +203,45 @@ int main_depth(int argc, char *argv[])
// the core multi-pileup loop
mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
+ if (0 < max_depth)
+ bam_mplp_set_maxcnt(mplp,max_depth); // set maximum coverage depth
n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
- while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
+ while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
if (pos < beg || pos >= end) continue; // out of range; skip
+ if (tid >= h->n_targets) continue; // diff number of @SQ lines per file?
if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
+ if (all) {
+ while (tid > last_tid) {
+ if (last_tid >= 0 && all > 1 && !reg) {
+ // Deal with remainder or entirety of last tid
+ while (++last_pos < h->target_len[last_tid]) {
+ if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
+ continue;
+ fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1);
+ for (i = 0; i < n; i++)
+ putchar('\t'), putchar('0');
+ putchar('\n');
+ }
+ }
+ last_tid++;
+ last_pos = -1;
+ }
+
+ // Deal with missing portion of current tid
+ while (++last_pos < pos) {
+ if (last_pos < beg) continue; // out of range; skip
+ if (bed && bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0)
+ continue;
+ fputs(h->target_name[tid], stdout); printf("\t%d", last_pos+1);
+ for (i = 0; i < n; i++)
+ putchar('\t'), putchar('0');
+ putchar('\n');
+ }
+
+ last_tid = tid;
+ last_pos = pos;
+ }
fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
for (i = 0; i < n; ++i) { // base level filters have to go here
int j, m = 0;
@@ -183,9 +254,29 @@ int main_depth(int argc, char *argv[])
}
putchar('\n');
}
+ if (ret < 0) status = EXIT_FAILURE;
free(n_plp); free(plp);
bam_mplp_destroy(mplp);
+ if (all) {
+ // Handle terminating region
+ while (last_tid < h->n_targets) {
+ while (++last_pos < h->target_len[last_tid]) {
+ if (last_pos >= end) break;
+ if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
+ continue;
+ fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1);
+ for (i = 0; i < n; i++)
+ putchar('\t'), putchar('0');
+ putchar('\n');
+ }
+ last_tid++;
+ last_pos = -1;
+ if (all < 2 || reg)
+ break;
+ }
+ }
+
depth_end:
for (i = 0; i < n && data[i]; ++i) {
bam_hdr_destroy(data[i]->hdr);
@@ -200,6 +291,7 @@ depth_end:
for (i=0; i<n; i++) free(fn[i]);
free(fn);
}
+ sam_global_args_free(&ga);
return status;
}
diff --git a/samtools/bam_addrprg.c b/samtools/bam_addrprg.c
new file mode 100644
index 0000000..2b4939f
--- /dev/null
+++ b/samtools/bam_addrprg.c
@@ -0,0 +1,476 @@
+/* bam_addrprg.c -- samtools command to add or replace readgroups.
+
+ Copyright (c) 2013, 2015 Genome Research Limited.
+
+ Author: Martin O. Pollard <mp15@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <htslib/sam.h>
+#include <htslib/kstring.h>
+#include "samtools.h"
+#include "sam_opts.h"
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <assert.h>
+#include <unistd.h>
+
+typedef enum {
+ overwrite_all,
+ orphan_only,
+} rg_mode;
+
+struct parsed_opts {
+ char* input_name;
+ char* output_name;
+ char* rg_id;
+ char* rg_line;
+ rg_mode mode;
+ sam_global_args ga;
+};
+
+struct state;
+typedef struct parsed_opts parsed_opts_t;
+typedef struct state state_t;
+
+struct state {
+ samFile* input_file;
+ bam_hdr_t* input_header;
+ samFile* output_file;
+ bam_hdr_t* output_header;
+ char* rg_id;
+ void (*mode_func)(const state_t*, bam1_t*);
+};
+
+static void cleanup_opts(parsed_opts_t* opts)
+{
+ if (!opts) return;
+ free(opts->rg_id);
+ free(opts->output_name);
+ free(opts->input_name);
+ sam_global_args_free(&opts->ga);
+ free(opts);
+}
+
+static void cleanup_state(state_t* state)
+{
+ if (!state) return;
+ free(state->rg_id);
+ if (state->output_file) sam_close(state->output_file);
+ bam_hdr_destroy(state->output_header);
+ if (state->input_file) sam_close(state->input_file);
+ bam_hdr_destroy(state->input_header);
+ free(state);
+}
+
+// Converts \t and \n into real tabs and newlines
+static char* basic_unescape(const char* in)
+{
+ assert(in);
+ char *ptr, *out;
+ out = ptr = malloc(strlen(in)+1);
+ size_t size = 0;
+ while (*in) {
+ if (*in == '\\') {
+ ++in;
+ if (*in == '\0') {
+ fprintf(stderr, "[%s] Unterminated escape sequence.\n", __func__);
+ free(out);
+ return NULL;
+ }
+ switch (*in) {
+ case '\\':
+ *ptr = '\\';
+ break;
+ case 't':
+ *ptr = '\t';
+ break;
+ case 'n':
+ fprintf(stderr, "[%s] \\n in escape sequence is not supported.\n", __func__);
+ free(out);
+ return NULL;
+ default:
+ fprintf(stderr, "[%s] Unsupported escape sequence.\n", __func__);
+ free(out);
+ return NULL;
+ }
+ } else {
+ *ptr = *in;
+ }
+ ++in;
+ ++ptr;
+ ++size;
+ }
+ *ptr = '\0';
+ ++size;
+ char* tmp = (char*)realloc(out, size);
+ if (!tmp) {
+ free(out);
+ }
+ return tmp;
+}
+
+// These are to be replaced by samtools header parser
+// Extracts the first @RG line from a string.
+static char* get_rg_line(const char* text, size_t* last)
+{
+ const char* rg = text;
+ if (rg[0] != '@' || rg[1] != 'R' || rg[2] != 'G' ) {
+ if ((rg = (const char*)strstr(text,"\n@RG")) == NULL) {
+ return NULL;
+ }
+ rg++;//skip initial \n
+ }
+ // duplicate the line for return
+ char* line;
+ char* end = strchr(rg, '\n');
+ if (end) {
+ line = strndup(rg,(end-rg));
+ *last = end - rg;
+ } else {
+ line = strdup(rg);
+ *last = strlen(rg);
+ }
+ return line;
+}
+
+// Given a @RG line return the id
+static char* get_rg_id(const char* input)
+{
+ assert(input!=NULL);
+ char* line = strdup(input);
+ char *next = line;
+ char* token = strsep(&next, "\t");
+ token = strsep(&next,"\t"); // skip first token it should always be "@RG"
+ while (next != NULL) {
+ char* key = strsep(&token,":");
+ if (!strcmp(key,"ID")) {
+ char* retval = strdup(token);
+ free(line);
+ return retval;
+ }
+ token = strsep(&next,"\t");
+ }
+ free(line);
+ return NULL;
+}
+
+// Confirms the existence of an RG line with a given ID in a bam header
+static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid )
+{
+ assert( hdr != NULL && rgid != NULL );
+
+ char *ptr, *start;
+ bool found = false;
+ start = ptr = strndup(hdr->text, hdr->l_text);
+ while (ptr != NULL && *ptr != '\0' && found == false ) {
+ size_t end = 0;
+ char* line = get_rg_line(ptr, &end);
+ if (line == NULL) break; // No more @RG
+ char* id;
+ if (((id = get_rg_id(line)) != NULL) && !strcmp(id, rgid)) {
+ found = true;
+ }
+ free(id);
+ free(line);
+ ptr += end;
+ }
+ free(start);
+ return found;
+}
+
+static char* get_first_rgid( const bam_hdr_t *hdr )
+{
+ assert( hdr != NULL );
+ char *ptr, *start;
+ char* found = NULL;
+ start = ptr = strndup(hdr->text, hdr->l_text);
+ while (ptr != NULL && *ptr != '\0' && found == NULL ) {
+ size_t end = 0;
+ char* line = get_rg_line(ptr, &end);
+ if ( line ) {
+ found = get_rg_id(line);
+ } else break;
+ free(line);
+ ptr += end;
+ }
+ free(start);
+ return found;
+}
+
+static void usage(FILE *fp)
+{
+ fprintf(fp,
+ "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-o <output.bam>] <input.bam>\n"
+ "\n"
+ "Options:\n"
+ " -m MODE Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n"
+ " -o FILE Where to write output to [stdout]\n"
+ " -r STRING @RG line text\n"
+ " -R STRING ID of @RG line in existing header to use\n"
+ );
+ sam_global_opt_help(fp, "..O..");
+}
+
+static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
+{
+ *opts = NULL;
+ int n;
+
+ if (argc == 1) { usage(stdout); return true; }
+
+ parsed_opts_t* retval = calloc(1, sizeof(parsed_opts_t));
+ if (! retval ) {
+ fprintf(stderr, "[%s] Out of memory allocating parsed_opts_t\n", __func__);
+ return false;
+ }
+ // Set defaults
+ retval->mode = overwrite_all;
+ sam_global_args_init(&retval->ga);
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
+ kstring_t rg_line = {0,0,NULL};
+
+ while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h", lopts, NULL)) >= 0) {
+ switch (n) {
+ case 'r':
+ // Are we adding to existing rg line?
+ if (ks_len(&rg_line) == 0) {
+ if (strlen(optarg)<3 || (optarg[0] != '@' && optarg[1] != 'R' && optarg[2] != 'G')) {
+ kputs("@RG\t", &rg_line);
+ }
+ } else {
+ kputs("\t", &rg_line);
+ }
+ kputs(optarg, &rg_line);
+ break;
+ case 'R':
+ retval->rg_id = strdup(optarg);
+ break;
+ case 'm': {
+ if (strcmp(optarg, "overwrite_all") == 0) {
+ retval->mode = overwrite_all;
+ } else if (strcmp(optarg, "orphan_only") == 0) {
+ retval->mode = orphan_only;
+ } else {
+ usage(stderr);
+ return false;
+ }
+ break;
+ }
+ case 'o':
+ retval->output_name = strdup(optarg);
+ break;
+ case 'h':
+ usage(stdout);
+ free(retval);
+ return true;
+ case '?':
+ usage(stderr);
+ free(retval);
+ return false;
+ case 'O':
+ default:
+ if (parse_sam_global_opt(n, optarg, lopts, &retval->ga) == 0) break;
+ usage(stderr);
+ free(retval);
+ return false;
+ }
+ }
+ retval->rg_line = ks_release(&rg_line);
+
+ if (argc-optind < 1) {
+ fprintf(stderr, "You must specify an input file.\n");
+ usage(stderr);
+ cleanup_opts(retval);
+ return false;
+ }
+ if (retval->rg_id && retval->rg_line) {
+ fprintf(stderr, "The options -r and -R are mutually exclusive.\n");
+ cleanup_opts(retval);
+ return false;
+ }
+
+ if (retval->rg_line)
+ {
+ char* tmp = basic_unescape(retval->rg_line);
+
+ if ((retval->rg_id = get_rg_id(tmp)) == NULL) {
+ fprintf(stderr, "[%s] The supplied RG line lacks an ID tag.\n", __func__);
+ free(tmp);
+ cleanup_opts(retval);
+ return false;
+ }
+ retval->rg_line = tmp;
+ }
+ retval->input_name = strdup(argv[optind+0]);
+
+ *opts = retval;
+ return true;
+}
+
+static void overwrite_all_func(const state_t* state, bam1_t* file_read)
+{
+ uint8_t* data = (uint8_t*)strdup(state->rg_id);
+ int len = strlen(state->rg_id)+1;
+ // If the old exists delete it
+ uint8_t* old = bam_aux_get(file_read, "RG");
+ if (old != NULL) {
+ bam_aux_del(file_read, old);
+ }
+
+ bam_aux_append(file_read, "RG", 'Z', len, data);
+ free(data);
+}
+
+static void orphan_only_func(const state_t* state, bam1_t* file_read)
+{
+ uint8_t* data = (uint8_t*)strdup(state->rg_id);
+ int len = strlen(state->rg_id)+1;
+ // If the old exists don't do anything
+ uint8_t* old = bam_aux_get(file_read, "RG");
+ if (old == NULL) {
+ bam_aux_append(file_read, "RG",'Z',len,data);
+ }
+ free(data);
+}
+
+static bool init(const parsed_opts_t* opts, state_t** state_out) {
+ state_t* retval = (state_t*) calloc(1, sizeof(state_t));
+ if (retval == NULL) {
+ fprintf(stderr, "[init] Out of memory allocating state struct.\n");
+ return false;
+ }
+ *state_out = retval;
+
+ // Open files
+ retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
+ if (retval->input_file == NULL) {
+ fprintf(stderr, "[init] Could not open input file: %s\n", opts->input_name);
+ return false;
+ }
+ retval->input_header = sam_hdr_read(retval->input_file);
+
+ retval->output_header = bam_hdr_dup(retval->input_header);
+ retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, "w", &opts->ga.out);
+
+ if (retval->output_file == NULL) {
+ print_error_errno("addreplacerg", "Could not open output file: %s\n", opts->output_name);
+ return false;
+ }
+
+ if (opts->rg_line) {
+ // Append new RG line to header.
+ // Check does not already exist
+ if ( confirm_rg(retval->output_header, opts->rg_id) ) {
+ fprintf(stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n");
+ return false;
+ }
+ retval->rg_id = strdup(opts->rg_id);
+ size_t new_len = strlen( retval->output_header->text ) + strlen( opts->rg_line ) + 2;
+ char* new_header = malloc(new_len);
+ if (!new_header) {
+ fprintf(stderr, "[init] Out of memory whilst writing new header.\n");
+ return false;
+ }
+ sprintf(new_header,"%s%s\n", retval->output_header->text, opts->rg_line);
+ free(retval->output_header->text);
+ retval->output_header->text = new_header;
+ retval->output_header->l_text = (int)new_len - 1;
+ } else {
+ if (opts->rg_id) {
+ // Confirm what has been supplied exists
+ if ( !confirm_rg(retval->output_header, opts->rg_id) ) {
+ fprintf(stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n");
+ return false;
+ }
+ retval->rg_id = strdup(opts->rg_id);
+ } else {
+ if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) {
+ fprintf(stderr, "No RG specified on command line or in existing header.\n");
+ return false;
+ }
+ }
+ }
+
+ switch (opts->mode) {
+ case overwrite_all:
+ retval->mode_func = &overwrite_all_func;
+ break;
+ case orphan_only:
+ retval->mode_func = &orphan_only_func;
+ break;
+ }
+
+ return true;
+}
+
+static bool readgroupise(state_t* state)
+{
+ if (sam_hdr_write(state->output_file, state->output_header) != 0) {
+ print_error_errno("addreplacerg", "[%s] Could not write header to output file", __func__);
+ return false;
+ }
+
+ bam1_t* file_read = bam_init1();
+ int ret;
+ while ((ret = sam_read1(state->input_file, state->input_header, file_read)) >= 0) {
+ state->mode_func(state, file_read);
+
+ if (sam_write1(state->output_file, state->output_header, file_read) < 0) {
+ print_error_errno("addreplacerg", "[%s] Could not write read to output file", __func__);
+ bam_destroy1(file_read);
+ return false;
+ }
+ }
+ bam_destroy1(file_read);
+ if (ret != -1) {
+ print_error_errno("addreplacerg", "[%s] Error reading from input file", __func__);
+ return false;
+ } else {
+ return true;
+ }
+}
+
+int main_addreplacerg(int argc, char** argv)
+{
+ parsed_opts_t* opts = NULL;
+ state_t* state = NULL;
+
+ if (!parse_args(argc, argv, &opts)) goto error;
+ if (opts == NULL) return EXIT_SUCCESS; // Not an error but user doesn't want us to proceed
+ if (!opts || !init(opts, &state)) goto error;
+
+ if (!readgroupise(state)) goto error;
+
+ cleanup_opts(opts);
+ cleanup_state(state);
+
+ return EXIT_SUCCESS;
+error:
+ cleanup_opts(opts);
+ cleanup_state(state);
+
+ return EXIT_FAILURE;
+}
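
bam_addrprg.c adds the addreplacerg subcommand: it appends or reuses an @RG header
line and then rewrites every record's RG:Z aux tag (overwrite_all) or only tags records
that lack one (orphan_only). The per-record step reduces to the htslib aux calls already
used above; a rough sketch with a hypothetical helper name:

    #include <string.h>
    #include <htslib/sam.h>

    /* Sketch of the overwrite_all behaviour shown above (hypothetical helper):
     * drop any existing RG:Z tag, then append RG:Z:<rg_id>. */
    void set_read_group(bam1_t *b, const char *rg_id)
    {
        uint8_t *old = bam_aux_get(b, "RG");
        if (old) bam_aux_del(b, old);
        bam_aux_append(b, "RG", 'Z', strlen(rg_id) + 1, (uint8_t *)rg_id);
    }

A typical invocation would look something like
samtools addreplacerg -r 'ID:run1\tSM:sampleA' -o out.bam in.bam, where the escaped
\t form is expanded by basic_unescape() above.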
diff --git a/samtools/bam_addrprg.c.pysam.c b/samtools/bam_addrprg.c.pysam.c
new file mode 100644
index 0000000..91fa9cd
--- /dev/null
+++ b/samtools/bam_addrprg.c.pysam.c
@@ -0,0 +1,478 @@
+#include "pysam.h"
+
+/* bam_addrprg.c -- samtools command to add or replace readgroups.
+
+ Copyright (c) 2013, 2015 Genome Research Limited.
+
+ Author: Martin O. Pollard <mp15@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <htslib/sam.h>
+#include <htslib/kstring.h>
+#include "samtools.h"
+#include "sam_opts.h"
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <assert.h>
+#include <unistd.h>
+
+typedef enum {
+ overwrite_all,
+ orphan_only,
+} rg_mode;
+
+struct parsed_opts {
+ char* input_name;
+ char* output_name;
+ char* rg_id;
+ char* rg_line;
+ rg_mode mode;
+ sam_global_args ga;
+};
+
+struct state;
+typedef struct parsed_opts parsed_opts_t;
+typedef struct state state_t;
+
+struct state {
+ samFile* input_file;
+ bam_hdr_t* input_header;
+ samFile* output_file;
+ bam_hdr_t* output_header;
+ char* rg_id;
+ void (*mode_func)(const state_t*, bam1_t*);
+};
+
+static void cleanup_opts(parsed_opts_t* opts)
+{
+ if (!opts) return;
+ free(opts->rg_id);
+ free(opts->output_name);
+ free(opts->input_name);
+ sam_global_args_free(&opts->ga);
+ free(opts);
+}
+
+static void cleanup_state(state_t* state)
+{
+ if (!state) return;
+ free(state->rg_id);
+ if (state->output_file) sam_close(state->output_file);
+ bam_hdr_destroy(state->output_header);
+ if (state->input_file) sam_close(state->input_file);
+ bam_hdr_destroy(state->input_header);
+ free(state);
+}
+
+// Converts \t and \n into real tabs and newlines
+static char* basic_unescape(const char* in)
+{
+ assert(in);
+ char *ptr, *out;
+ out = ptr = malloc(strlen(in)+1);
+ size_t size = 0;
+ while (*in) {
+ if (*in == '\\') {
+ ++in;
+ if (*in == '\0') {
+ fprintf(pysamerr, "[%s] Unterminated escape sequence.\n", __func__);
+ free(out);
+ return NULL;
+ }
+ switch (*in) {
+ case '\\':
+ *ptr = '\\';
+ break;
+ case 't':
+ *ptr = '\t';
+ break;
+ case 'n':
+ fprintf(pysamerr, "[%s] \\n in escape sequence is not supported.\n", __func__);
+ free(out);
+ return NULL;
+ default:
+ fprintf(pysamerr, "[%s] Unsupported escape sequence.\n", __func__);
+ free(out);
+ return NULL;
+ }
+ } else {
+ *ptr = *in;
+ }
+ ++in;
+ ++ptr;
+ ++size;
+ }
+ *ptr = '\0';
+ ++size;
+ char* tmp = (char*)realloc(out, size);
+ if (!tmp) {
+ free(out);
+ }
+ return tmp;
+}
+
+// These are to be replaced by samtools header parser
+// Extracts the first @RG line from a string.
+static char* get_rg_line(const char* text, size_t* last)
+{
+ const char* rg = text;
+ if (rg[0] != '@' || rg[1] != 'R' || rg[2] != 'G' ) {
+ if ((rg = (const char*)strstr(text,"\n@RG")) == NULL) {
+ return NULL;
+ }
+ rg++;//skip initial \n
+ }
+ // duplicate the line for return
+ char* line;
+ char* end = strchr(rg, '\n');
+ if (end) {
+ line = strndup(rg,(end-rg));
+ *last = end - rg;
+ } else {
+ line = strdup(rg);
+ *last = strlen(rg);
+ }
+ return line;
+}
+
+// Given a @RG line return the id
+static char* get_rg_id(const char* input)
+{
+ assert(input!=NULL);
+ char* line = strdup(input);
+ char *next = line;
+ char* token = strsep(&next, "\t");
+ token = strsep(&next,"\t"); // skip first token it should always be "@RG"
+ while (next != NULL) {
+ char* key = strsep(&token,":");
+ if (!strcmp(key,"ID")) {
+ char* retval = strdup(token);
+ free(line);
+ return retval;
+ }
+ token = strsep(&next,"\t");
+ }
+ free(line);
+ return NULL;
+}
+
+// Confirms the existence of an RG line with a given ID in a bam header
+static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid )
+{
+ assert( hdr != NULL && rgid != NULL );
+
+ char *ptr, *start;
+ bool found = false;
+ start = ptr = strndup(hdr->text, hdr->l_text);
+ while (ptr != NULL && *ptr != '\0' && found == false ) {
+ size_t end = 0;
+ char* line = get_rg_line(ptr, &end);
+ if (line == NULL) break; // No more @RG
+ char* id;
+ if (((id = get_rg_id(line)) != NULL) && !strcmp(id, rgid)) {
+ found = true;
+ }
+ free(id);
+ free(line);
+ ptr += end;
+ }
+ free(start);
+ return found;
+}
+
+static char* get_first_rgid( const bam_hdr_t *hdr )
+{
+ assert( hdr != NULL );
+ char *ptr, *start;
+ char* found = NULL;
+ start = ptr = strndup(hdr->text, hdr->l_text);
+ while (ptr != NULL && *ptr != '\0' && found == NULL ) {
+ size_t end = 0;
+ char* line = get_rg_line(ptr, &end);
+ if ( line ) {
+ found = get_rg_id(line);
+ } else break;
+ free(line);
+ ptr += end;
+ }
+ free(start);
+ return found;
+}
+
+static void usage(FILE *fp)
+{
+ fprintf(fp,
+ "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-o <output.bam>] <input.bam>\n"
+ "\n"
+ "Options:\n"
+ " -m MODE Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n"
+ " -o FILE Where to write output to [stdout]\n"
+ " -r STRING @RG line text\n"
+ " -R STRING ID of @RG line in existing header to use\n"
+ );
+ sam_global_opt_help(fp, "..O..");
+}
+
+static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
+{
+ *opts = NULL;
+ int n;
+
+ if (argc == 1) { usage(stdout); return true; }
+
+ parsed_opts_t* retval = calloc(1, sizeof(parsed_opts_t));
+ if (! retval ) {
+ fprintf(pysamerr, "[%s] Out of memory allocating parsed_opts_t\n", __func__);
+ return false;
+ }
+ // Set defaults
+ retval->mode = overwrite_all;
+ sam_global_args_init(&retval->ga);
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
+ kstring_t rg_line = {0,0,NULL};
+
+ while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h", lopts, NULL)) >= 0) {
+ switch (n) {
+ case 'r':
+ // Are we adding to existing rg line?
+ if (ks_len(&rg_line) == 0) {
+ if (strlen(optarg)<3 || (optarg[0] != '@' && optarg[1] != 'R' && optarg[2] != 'G')) {
+ kputs("@RG\t", &rg_line);
+ }
+ } else {
+ kputs("\t", &rg_line);
+ }
+ kputs(optarg, &rg_line);
+ break;
+ case 'R':
+ retval->rg_id = strdup(optarg);
+ break;
+ case 'm': {
+ if (strcmp(optarg, "overwrite_all") == 0) {
+ retval->mode = overwrite_all;
+ } else if (strcmp(optarg, "orphan_only") == 0) {
+ retval->mode = orphan_only;
+ } else {
+ usage(pysamerr);
+ return false;
+ }
+ break;
+ }
+ case 'o':
+ retval->output_name = strdup(optarg);
+ break;
+ case 'h':
+ usage(stdout);
+ free(retval);
+ return true;
+ case '?':
+ usage(pysamerr);
+ free(retval);
+ return false;
+ case 'O':
+ default:
+ if (parse_sam_global_opt(n, optarg, lopts, &retval->ga) == 0) break;
+ usage(pysamerr);
+ free(retval);
+ return false;
+ }
+ }
+ retval->rg_line = ks_release(&rg_line);
+
+ if (argc-optind < 1) {
+ fprintf(pysamerr, "You must specify an input file.\n");
+ usage(pysamerr);
+ cleanup_opts(retval);
+ return false;
+ }
+ if (retval->rg_id && retval->rg_line) {
+ fprintf(pysamerr, "The options -r and -R are mutually exclusive.\n");
+ cleanup_opts(retval);
+ return false;
+ }
+
+ if (retval->rg_line)
+ {
+ char* tmp = basic_unescape(retval->rg_line);
+
+ if ((retval->rg_id = get_rg_id(tmp)) == NULL) {
+ fprintf(pysamerr, "[%s] The supplied RG line lacks an ID tag.\n", __func__);
+ free(tmp);
+ cleanup_opts(retval);
+ return false;
+ }
+ retval->rg_line = tmp;
+ }
+ retval->input_name = strdup(argv[optind+0]);
+
+ *opts = retval;
+ return true;
+}
+
+static void overwrite_all_func(const state_t* state, bam1_t* file_read)
+{
+ uint8_t* data = (uint8_t*)strdup(state->rg_id);
+ int len = strlen(state->rg_id)+1;
+ // If the old exists delete it
+ uint8_t* old = bam_aux_get(file_read, "RG");
+ if (old != NULL) {
+ bam_aux_del(file_read, old);
+ }
+
+ bam_aux_append(file_read, "RG", 'Z', len, data);
+ free(data);
+}
+
+static void orphan_only_func(const state_t* state, bam1_t* file_read)
+{
+ uint8_t* data = (uint8_t*)strdup(state->rg_id);
+ int len = strlen(state->rg_id)+1;
+ // If the old exists don't do anything
+ uint8_t* old = bam_aux_get(file_read, "RG");
+ if (old == NULL) {
+ bam_aux_append(file_read, "RG",'Z',len,data);
+ }
+ free(data);
+}
+
+static bool init(const parsed_opts_t* opts, state_t** state_out) {
+ state_t* retval = (state_t*) calloc(1, sizeof(state_t));
+ if (retval == NULL) {
+ fprintf(pysamerr, "[init] Out of memory allocating state struct.\n");
+ return false;
+ }
+ *state_out = retval;
+
+ // Open files
+ retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
+ if (retval->input_file == NULL) {
+ fprintf(pysamerr, "[init] Could not open input file: %s\n", opts->input_name);
+ return false;
+ }
+ retval->input_header = sam_hdr_read(retval->input_file);
+
+ retval->output_header = bam_hdr_dup(retval->input_header);
+ retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, "w", &opts->ga.out);
+
+ if (retval->output_file == NULL) {
+ print_error_errno("addreplacerg", "Could not open output file: %s\n", opts->output_name);
+ return false;
+ }
+
+ if (opts->rg_line) {
+ // Append new RG line to header.
+ // Check does not already exist
+ if ( confirm_rg(retval->output_header, opts->rg_id) ) {
+ fprintf(pysamerr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n");
+ return false;
+ }
+ retval->rg_id = strdup(opts->rg_id);
+ size_t new_len = strlen( retval->output_header->text ) + strlen( opts->rg_line ) + 2;
+ char* new_header = malloc(new_len);
+ if (!new_header) {
+ fprintf(pysamerr, "[init] Out of memory whilst writing new header.\n");
+ return false;
+ }
+ sprintf(new_header,"%s%s\n", retval->output_header->text, opts->rg_line);
+ free(retval->output_header->text);
+ retval->output_header->text = new_header;
+ retval->output_header->l_text = (int)new_len - 1;
+ } else {
+ if (opts->rg_id) {
+ // Confirm what has been supplied exists
+ if ( !confirm_rg(retval->output_header, opts->rg_id) ) {
+ fprintf(pysamerr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n");
+ return false;
+ }
+ retval->rg_id = strdup(opts->rg_id);
+ } else {
+ if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) {
+ fprintf(pysamerr, "No RG specified on command line or in existing header.\n");
+ return false;
+ }
+ }
+ }
+
+ switch (opts->mode) {
+ case overwrite_all:
+ retval->mode_func = &overwrite_all_func;
+ break;
+ case orphan_only:
+ retval->mode_func = &orphan_only_func;
+ break;
+ }
+
+ return true;
+}
+
+static bool readgroupise(state_t* state)
+{
+ if (sam_hdr_write(state->output_file, state->output_header) != 0) {
+ print_error_errno("addreplacerg", "[%s] Could not write header to output file", __func__);
+ return false;
+ }
+
+ bam1_t* file_read = bam_init1();
+ int ret;
+ while ((ret = sam_read1(state->input_file, state->input_header, file_read)) >= 0) {
+ state->mode_func(state, file_read);
+
+ if (sam_write1(state->output_file, state->output_header, file_read) < 0) {
+ print_error_errno("addreplacerg", "[%s] Could not write read to output file", __func__);
+ bam_destroy1(file_read);
+ return false;
+ }
+ }
+ bam_destroy1(file_read);
+ if (ret != -1) {
+ print_error_errno("addreplacerg", "[%s] Error reading from input file", __func__);
+ return false;
+ } else {
+ return true;
+ }
+}
+
+int main_addreplacerg(int argc, char** argv)
+{
+ parsed_opts_t* opts = NULL;
+ state_t* state = NULL;
+
+ if (!parse_args(argc, argv, &opts)) goto error;
+ if (opts == NULL) return EXIT_SUCCESS; // Not an error but user doesn't want us to proceed
+ if (!opts || !init(opts, &state)) goto error;
+
+ if (!readgroupise(state)) goto error;
+
+ cleanup_opts(opts);
+ cleanup_state(state);
+
+ return EXIT_SUCCESS;
+error:
+ cleanup_opts(opts);
+ cleanup_state(state);
+
+ return EXIT_FAILURE;
+}
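
The two mode functions above boil down to a small amount of htslib aux-tag
handling. A minimal, self-contained sketch of the overwrite_all behaviour,
using only calls that appear in this patch ("in.bam" and the read-group ID
"sample1" are placeholders, not values from the patch):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "htslib/sam.h"

    /* Replace (or add) the RG:Z tag on one record, as overwrite_all mode does. */
    static void set_rg(bam1_t *b, const char *rg_id)
    {
        uint8_t *data = (uint8_t*)strdup(rg_id);
        uint8_t *old = bam_aux_get(b, "RG");          /* existing tag, if any */
        if (old != NULL)
            bam_aux_del(b, old);                      /* remove before re-appending */
        bam_aux_append(b, "RG", 'Z', (int)strlen(rg_id)+1, data);
        free(data);
    }

    int main(void)
    {
        samFile *in = sam_open("in.bam", "r");        /* placeholder input */
        samFile *out = sam_open("-", "w");            /* SAM on stdout */
        if (in == NULL || out == NULL) return 1;

        bam_hdr_t *hdr = sam_hdr_read(in);
        if (hdr == NULL || sam_hdr_write(out, hdr) != 0) return 1;

        bam1_t *b = bam_init1();
        int ret;
        while ((ret = sam_read1(in, hdr, b)) >= 0) {
            set_rg(b, "sample1");                     /* placeholder read-group ID */
            if (sam_write1(out, hdr, b) < 0) return 1;
        }

        bam_destroy1(b);
        bam_hdr_destroy(hdr);
        sam_close(in);
        sam_close(out);
        return ret < -1 ? 1 : 0;                      /* -1 is normal end-of-file */
    }
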
diff --git a/samtools/bam_aux.c b/samtools/bam_aux.c
index 7296f41..7a67de8 100644
--- a/samtools/bam_aux.c
+++ b/samtools/bam_aux.c
@@ -24,6 +24,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
#include <ctype.h>
+#include <limits.h>
#include "bam.h"
static inline int bam_aux_type2size(int x)
@@ -61,11 +62,18 @@ int bam_aux_drop_other(bam1_t *b, uint8_t *s)
int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end)
{
const char *name_lim = hts_parse_reg(str, beg, end);
- char *name = malloc(name_lim - str + 1);
- memcpy(name, str, name_lim - str);
- name[name_lim - str] = '\0';
- *ref_id = bam_name2id(header, name);
- free(name);
+ if (name_lim) {
+ char *name = malloc(name_lim - str + 1);
+ memcpy(name, str, name_lim - str);
+ name[name_lim - str] = '\0';
+ *ref_id = bam_name2id(header, name);
+ free(name);
+ }
+ else {
+ // not parsable as a region, but possibly a sequence named "foo:a"
+ *ref_id = bam_name2id(header, str);
+ *beg = 0; *end = INT_MAX;
+ }
if (*ref_id == -1) return -1;
return *beg <= *end? 0 : -1;
}
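
The new branch above relies on hts_parse_reg() returning NULL when the string
cannot be split into name[:begin[-end]]; in that case the whole string is
treated as a sequence name covering everything. A small sketch of both
outcomes (the region string is an arbitrary example, not taken from the patch):

    #include <stdio.h>
    #include <limits.h>
    #include "htslib/hts.h"

    int main(void)
    {
        const char *str = "chr1:100-200";             /* example region string */
        int beg = 0, end = INT_MAX;

        const char *name_lim = hts_parse_reg(str, &beg, &end);
        if (name_lim != NULL) {
            /* Parsed as a region: the name is the prefix str..name_lim. */
            printf("name '%.*s', beg %d, end %d\n",
                   (int)(name_lim - str), str, beg, end);
        } else {
            /* Not parsable as a region: fall back to the whole string,
             * with beg 0 and end INT_MAX, as the patched code does. */
            printf("whole-string name '%s'\n", str);
        }
        return 0;
    }
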
diff --git a/samtools/bam_aux.c.pysam.c b/samtools/bam_aux.c.pysam.c
index 89e2dfa..475c772 100644
--- a/samtools/bam_aux.c.pysam.c
+++ b/samtools/bam_aux.c.pysam.c
@@ -26,6 +26,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
#include <ctype.h>
+#include <limits.h>
#include "bam.h"
static inline int bam_aux_type2size(int x)
@@ -63,11 +64,18 @@ int bam_aux_drop_other(bam1_t *b, uint8_t *s)
int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end)
{
const char *name_lim = hts_parse_reg(str, beg, end);
- char *name = malloc(name_lim - str + 1);
- memcpy(name, str, name_lim - str);
- name[name_lim - str] = '\0';
- *ref_id = bam_name2id(header, name);
- free(name);
+ if (name_lim) {
+ char *name = malloc(name_lim - str + 1);
+ memcpy(name, str, name_lim - str);
+ name[name_lim - str] = '\0';
+ *ref_id = bam_name2id(header, name);
+ free(name);
+ }
+ else {
+ // not parsable as a region, but possibly a sequence named "foo:a"
+ *ref_id = bam_name2id(header, str);
+ *beg = 0; *end = INT_MAX;
+ }
if (*ref_id == -1) return -1;
return *beg <= *end? 0 : -1;
}
diff --git a/samtools/bam_cat.c b/samtools/bam_cat.c
index 0c6f35f..83cc0fb 100644
--- a/samtools/bam_cat.c
+++ b/samtools/bam_cat.c
@@ -37,9 +37,373 @@ Illumina.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
+#include <string.h>
#include "htslib/bgzf.h"
-#include "bam.h"
+#include "htslib/sam.h"
+#include "htslib/cram.h"
+#include "htslib/khash.h"
+
+KHASH_MAP_INIT_STR(s2i, int)
+
+// Bi-directional lookup.
+// We can go from name to ID or ID to name.
+typedef struct khash_s2i {
+ khash_t(s2i) *h;
+ int n_id, a_id;
+ const char **id; // map Nth entry back to key
+ const char **line;
+} khash_s2i;
+
+static int hash_s2i_inc(khash_s2i *hash, const char *str, const char *line, int *added) {
+ // loosely based on khash_str2int_inc
+ khint_t k;
+ int n;
+
+ if ( !hash ) return -1;
+ // inefficient, but works
+ char *my_str = strdup(str);
+ k = kh_put(s2i, hash->h, my_str, added);
+ if (*added == 0) {
+ free(my_str);
+ return kh_val(hash->h, k);
+ }
+ n = hash->n_id++;
+ kh_val(hash->h, k) = n;
+ if (hash->a_id <= n) {
+ const char **id;
+ hash->a_id = (n+1)*2;
+ if (!(id = realloc(hash->id, hash->a_id*sizeof(*hash->id))))
+ return -1;
+ hash->id = id;
+ if (!(id = realloc(hash->line, hash->a_id*sizeof(*hash->line))))
+ return -1;
+ hash->line = id;
+ }
+ hash->id[n] = my_str; // reverse map
+ if (line)
+ hash->line[n] = line;
+
+ return n;
+}
+
+khash_s2i *hash_s2i_create(void) {
+ khash_s2i *h = calloc(1, sizeof(*h));
+ if (!h)
+ return NULL;
+
+ h->h = kh_init(s2i);
+ if (!h->h) {
+ free(h);
+ return NULL;
+ }
+ return h;
+}
+
+static void hash_s2i_free(khash_s2i *hash) {
+ // based on khash_str2int_destroy_free
+ khint_t k;
+ if (!hash) return;
+ if (hash->h) {
+ for (k = 0; k < kh_end(hash->h); ++k)
+ if (kh_exist(hash->h, k)) free((char*)kh_key(hash->h, k));
+ kh_destroy(s2i, hash->h);
+ }
+ if (hash->id)
+ free(hash->id);
+ if (hash->line)
+ free(hash->line);
+
+ free(hash);
+}
+
+static khash_s2i *hash_rg(const bam_hdr_t *h) {
+ khash_s2i *rg2id = hash_s2i_create();
+ char *cp, *line;
+ int j, l;
+
+ if (!h)
+ return rg2id;
+
+ if (!rg2id)
+ return NULL;
+
+ cp = h->text;
+
+ for (l = 0; l+3 < h->l_text; l++) {
+ line = &cp[l];
+ if (!(cp[l] == '@' && cp[l+1] == 'R' && cp[l+2] == 'G')) {
+ while (l < h->l_text && cp[l] != '\n')
+ l++;
+ continue;
+ }
+
+ // Found an @RG line; add to hash
+ while (cp[l] != '\n') {
+ while (l < h->l_text && cp[l] != '\n' && cp[l] != '\t')
+ l++;
+ if (l+4 < h->l_text && cp[l+1] == 'I' && cp[l+2] == 'D')
+ break;
+ }
+ if (cp[l] == '\n')
+ continue;
+ l = (j = l+4);
+ while (l < h->l_text && cp[l] != '\n' && cp[l] != '\t')
+ l++;
+
+ // To do: save id and keep realloc as needed, as hash_s2i_inc strdups.
+ char *id = malloc(l-j+1);
+ strncpy(id, &cp[j], l-j);
+ id[l-j] = 0;
+
+ int added;
+ hash_s2i_inc(rg2id, id, line, &added);
+ free(id);
+
+ while (l < h->l_text && cp[l] != '\n')
+ l++;
+ }
+
+ return rg2id;
+}
+
+/*
+ * Check the files are consistent and capable of being concatenated.
+ * Also fills out the rg2id read-group hash and the version numbers
+ * and produces a new bam_hdr_t structure with merged RG lines.
+ * Note it is only a simple merge, as we lack the niceties of a proper
+ * header API.
+ *
+ * Returns updated header on success;
+ * NULL on failure.
+ */
+static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t *h,
+ khash_s2i **rg2id, int *vers_maj_p, int *vers_min_p) {
+ int i, vers_maj = -1, vers_min = -1;
+ bam_hdr_t *new_h = NULL;
+
+ if (h) {
+ new_h = bam_hdr_dup(h);
+ *rg2id = hash_rg(new_h);
+ }
+
+ for (i = 0; i < nfn; ++i) {
+ samFile *in;
+ cram_fd *in_c;
+ khint_t ki;
+ int new_rg = -1;
+
+ in = sam_open(fn[i], "rc");
+ if (in == 0) {
+ fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
+ return NULL;
+ }
+ in_c = in->fp.cram;
+
+ int vmaj = cram_major_vers(in_c);
+ int vmin = cram_minor_vers(in_c);
+ if ((vers_maj != -1 && vers_maj != vmaj) ||
+ (vers_min != -1 && vers_min != vmin)) {
+ fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n",
+ __func__);
+ return NULL;
+ }
+ vers_maj = vmaj;
+ vers_min = vmin;
+
+ bam_hdr_t *old = sam_hdr_read(in);
+ khash_s2i *rg2id_in = hash_rg(old);
+
+ if (!new_h) {
+ new_h = bam_hdr_dup(old);
+ *rg2id = hash_rg(new_h);
+ }
+
+ // Add any existing @RG entries to our global @RG hash.
+ for (ki = 0; ki < rg2id_in->n_id; ki++) {
+ int added;
+
+ new_rg = hash_s2i_inc(*rg2id, rg2id_in->id[ki], rg2id_in->line[ki], &added);
+ //fprintf(stderr, "RG %s: #%d -> #%d\n",
+ // rg2id_in->id[ki], ki, new_rg);
+
+ if (added) {
+ // Also add to new_h
+ const char *line = rg2id_in->line[ki];
+ const char *line_end = line;
+ while (*line && *line_end++ != '\n')
+ ;
+ new_h->l_text += line_end - line;
+ new_h->text = realloc(new_h->text, new_h->l_text+1);
+ strncat(&new_h->text[new_h->l_text - (line_end - line)],
+ line, line_end - line);
+ }
+
+ if (new_rg != ki && rg2id_in->n_id > 1) {
+ fprintf(stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n",
+ __func__);
+ return NULL;
+ }
+ }
+
+ hash_s2i_free(rg2id_in);
+ bam_hdr_destroy(old);
+ sam_close(in);
+ }
+
+ *vers_maj_p = vers_maj;
+ *vers_min_p = vers_min;
+
+ return new_h;
+}
+
+
+/*
+ * CRAM files don't store the RG:Z:ID per read in the aux field.
+ * Instead they have a numerical data series (RG) to point each read
+ * back to the Nth @RG line in the file. This means that we may need
+ * to edit the RG data series (if the files were produced from
+ * "samtools split" for example).
+ *
+ * The encoding method is stored in the compression header. Typical
+ * examples:
+ *
+ * RG => EXTERNAL {18} # Block content-id 18 holds RG values
+ * # as a series of ITF8 encoded values
+ *
+ * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0}
+ * # One RG value #-1. (No RG)
+ *
+ * RG => HUFFMAN {1, 0, 1, 0} # One RG value #0 (always first RG)
+ *
+ * RG => HUFFMAN {2, 0, 1, 2, 1, 1}
+ * # Two RG values, #0 and #1, written
+ * # to the CORE block and possibly
+ * # mixed with other data series.
+ *
+ * A single value can be (but is not necessarily) implemented as a zero-bit
+ * huffman code. In this situation we can change the meta-data in the
+ * compression header to renumber an RG value.
+ */
+int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram)
+{
+ samFile *out;
+ cram_fd *out_c;
+ int i, vers_maj, vers_min;
+ khash_s2i *rg2id = NULL;
+ bam_hdr_t *new_h = NULL;
+
+ /* Check consistent versioning and compatible headers */
+ if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &rg2id, &vers_maj, &vers_min)))
+ return -1;
+
+ /* Open the output file, requesting the same CRAM version */
+ char vers[100];
+ sprintf(vers, "%d.%d", vers_maj, vers_min);
+ out = sam_open(outcram, "wc");
+ if (out == 0) {
+ fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outcram);
+ return 1;
+ }
+ out_c = out->fp.cram;
+ cram_set_option(out_c, CRAM_OPT_VERSION, vers);
+ //fprintf(stderr, "Creating cram vers %s\n", vers);
+
+ cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed?
+ sam_hdr_write(out, new_h);
+
+ for (i = 0; i < nfn; ++i) {
+ samFile *in;
+ cram_fd *in_c;
+ cram_container *c;
+ bam_hdr_t *old;
+ int new_rg = -1;
+
+ in = sam_open(fn[i], "rc");
+ if (in == 0) {
+ fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
+ return -1;
+ }
+ in_c = in->fp.cram;
+
+ old = sam_hdr_read(in);
+ khash_s2i *rg2id_in = hash_rg(old);
+
+ // Compute RG mapping if suitable for changing.
+ if (rg2id_in->n_id == 1) {
+ int _;
+ new_rg = hash_s2i_inc(rg2id, rg2id_in->id[0], NULL, &_);
+ } else {
+ new_rg = 0;
+ }
+
+ hash_s2i_free(rg2id_in);
+
+
+ // Copy containers and the blocks within them
+ while ((c = cram_read_container(in_c))) {
+ cram_block *blk;
+
+ if (cram_container_is_empty(in_c)) {
+ if (cram_write_container(out_c, c) != 0)
+ return -1;
+
+ // Container compression header
+ if (!(blk = cram_read_block(in_c)))
+ return -1;
+ if (cram_write_block(out_c, blk) != 0) {
+ cram_free_block(blk);
+ return -1;
+ }
+ cram_free_block(blk);
+ cram_free_container(c);
+
+ continue;
+ }
+
+ // If we have just one RG key and new_rg != 0 then
+ // we need to edit the compression header. IF WE CAN.
+ if (new_rg) {
+ int zero = 0;
+ //fprintf(stderr, "Transcode RG %d to %d\n", 0, new_rg);
+ cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg);
+ } else {
+ int32_t num_slices;
+
+ // Not switching rg so do the usual read/write loop
+ if (cram_write_container(out_c, c) != 0)
+ return -1;
+
+ // Container compression header
+ if (!(blk = cram_read_block(in_c)))
+ return -1;
+ if (cram_write_block(out_c, blk) != 0) {
+ cram_free_block(blk);
+ return -1;
+ }
+ cram_free_block(blk);
+
+
+ // Container num_blocks can be invalid, due to a bug.
+ // Instead we iterate in slice context.
+ (void)cram_container_get_landmarks(c, &num_slices);
+ cram_copy_slice(in_c, out_c, num_slices);
+ }
+
+ cram_free_container(c);
+ }
+
+ bam_hdr_destroy(old);
+ sam_close(in);
+ }
+ sam_close(out);
+
+ hash_s2i_free(rg2id);
+ bam_hdr_destroy(new_h);
+
+ return 0;
+}
+
#define BUF_SIZE 0x10000
@@ -48,8 +412,7 @@ Illumina.
#define BGZF_EMPTY_BLOCK_SIZE 28
-
-int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam)
+int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam)
{
BGZF *fp;
uint8_t *buf;
@@ -62,12 +425,12 @@ int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam
fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam);
return 1;
}
- if (h) bam_header_write(fp, h);
+ if (h) bam_hdr_write(fp, h);
buf = (uint8_t*) malloc(BUF_SIZE);
for(i = 0; i < nfn; ++i){
BGZF *in;
- bam_header_t *old;
+ bam_hdr_t *old;
int len,j;
in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r");
@@ -77,8 +440,14 @@ int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam
}
if (in->is_write) return -1;
- old = bam_header_read(in);
- if (h == 0 && i == 0) bam_header_write(fp, old);
+ old = bam_hdr_read(in);
+ if (old == NULL) {
+ fprintf(stderr, "[%s] ERROR: couldn't read header for '%s'.\n",
+ __func__, fn[i]);
+ bgzf_close(in);
+ return -1;
+ }
+ if (h == 0 && i == 0) bam_hdr_write(fp, old);
if (in->block_offset < in->block_length) {
bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
@@ -116,7 +485,7 @@ int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam
bgzf_raw_write(fp, ebuf, es);
}
}
- bam_header_destroy(old);
+ bam_hdr_destroy(old);
bgzf_close(in);
}
free(buf);
@@ -125,32 +494,65 @@ int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam
}
-
int main_cat(int argc, char *argv[])
{
- bam_header_t *h = 0;
+ bam_hdr_t *h = 0;
char *outfn = 0;
int c, ret;
+ samFile *in;
+
while ((c = getopt(argc, argv, "h:o:")) >= 0) {
switch (c) {
case 'h': {
- tamFile fph = sam_open(optarg);
+ samFile *fph = sam_open(optarg, "r");
if (fph == 0) {
fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]);
return 1;
}
- h = sam_header_read(fph);
+ h = sam_hdr_read(fph);
+ if (h == NULL) {
+ fprintf(stderr,
+ "[%s] ERROR: failed to read the header for '%s'.\n",
+ __func__, argv[1]);
+ return 1;
+ }
sam_close(fph);
break;
}
case 'o': outfn = strdup(optarg); break;
}
}
- if (argc - optind < 2) {
- fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n");
+ if (argc - optind < 1) {
+ fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> [...]\n");
+ return 1;
+ }
+
+ in = sam_open(argv[optind], "r");
+ if (!in) {
+ fprintf(stderr, "[%s] ERROR: failed to open file '%s'.\n", __func__, argv[optind]);
+ return 1;
+ }
+
+ switch (hts_get_format(in)->format) {
+ case bam:
+ sam_close(in);
+ ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
+ break;
+
+ case cram:
+ sam_close(in);
+ ret = cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
+ break;
+
+ default:
+ sam_close(in);
+ fprintf(stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__);
return 1;
}
- ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
free(outfn);
+
+ if (h)
+ bam_hdr_destroy(h);
+
return ret;
}
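
The khash_s2i structure added above pairs a string-to-index hash with an
array mapping each index back to its key, so @RG identifiers can be looked up
in either direction. A self-contained sketch of that pattern using htslib's
khash.h (the type name str2idx and the sample IDs are illustrative only, not
part of the patch):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "htslib/khash.h"

    KHASH_MAP_INIT_STR(str2idx, int)

    int main(void)
    {
        khash_t(str2idx) *h = kh_init(str2idx);
        const char *ids[] = { "rg1", "rg2", "rg1" };  /* "rg1" appears twice */
        const char *rev[3];                           /* index -> key (reverse map) */
        int n = 0, i;
        khint_t k;

        for (i = 0; i < 3; i++) {
            int added;
            char *key = strdup(ids[i]);               /* the hash owns its keys */
            k = kh_put(str2idx, h, key, &added);
            if (added == 0) {                         /* seen before: reuse its index */
                free(key);
                printf("%s -> %d (existing)\n", ids[i], kh_val(h, k));
                continue;
            }
            kh_val(h, k) = n;                         /* forward: name -> index */
            rev[n] = key;                             /* reverse: index -> name */
            printf("%s -> %d (new)\n", ids[i], n);
            n++;
        }

        for (i = 0; i < n; i++)
            printf("index %d is %s\n", i, rev[i]);

        for (k = 0; k < kh_end(h); ++k)               /* free the owned keys */
            if (kh_exist(h, k)) free((char*)kh_key(h, k));
        kh_destroy(str2idx, h);
        return 0;
    }
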
diff --git a/samtools/bam_cat.c.pysam.c b/samtools/bam_cat.c.pysam.c
index b9a40e3..004911a 100644
--- a/samtools/bam_cat.c.pysam.c
+++ b/samtools/bam_cat.c.pysam.c
@@ -39,9 +39,373 @@ Illumina.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
+#include <string.h>
#include "htslib/bgzf.h"
-#include "bam.h"
+#include "htslib/sam.h"
+#include "htslib/cram.h"
+#include "htslib/khash.h"
+
+KHASH_MAP_INIT_STR(s2i, int)
+
+// Bi-directional lookup.
+// We can go from name to ID or ID to name.
+typedef struct khash_s2i {
+ khash_t(s2i) *h;
+ int n_id, a_id;
+ const char **id; // map Nth entry back to key
+ const char **line;
+} khash_s2i;
+
+static int hash_s2i_inc(khash_s2i *hash, const char *str, const char *line, int *added) {
+ // loosely based on khash_str2int_inc
+ khint_t k;
+ int n;
+
+ if ( !hash ) return -1;
+ // inefficient, but works
+ char *my_str = strdup(str);
+ k = kh_put(s2i, hash->h, my_str, added);
+ if (*added == 0) {
+ free(my_str);
+ return kh_val(hash->h, k);
+ }
+ n = hash->n_id++;
+ kh_val(hash->h, k) = n;
+ if (hash->a_id <= n) {
+ const char **id;
+ hash->a_id = (n+1)*2;
+ if (!(id = realloc(hash->id, hash->a_id*sizeof(*hash->id))))
+ return -1;
+ hash->id = id;
+ if (!(id = realloc(hash->line, hash->a_id*sizeof(*hash->line))))
+ return -1;
+ hash->line = id;
+ }
+ hash->id[n] = my_str; // reverse map
+ if (line)
+ hash->line[n] = line;
+
+ return n;
+}
+
+khash_s2i *hash_s2i_create(void) {
+ khash_s2i *h = calloc(1, sizeof(*h));
+ if (!h)
+ return NULL;
+
+ h->h = kh_init(s2i);
+ if (!h->h) {
+ free(h);
+ return NULL;
+ }
+ return h;
+}
+
+static void hash_s2i_free(khash_s2i *hash) {
+ // based on khash_str2int_destroy_free
+ khint_t k;
+ if (!hash) return;
+ if (hash->h) {
+ for (k = 0; k < kh_end(hash->h); ++k)
+ if (kh_exist(hash->h, k)) free((char*)kh_key(hash->h, k));
+ kh_destroy(s2i, hash->h);
+ }
+ if (hash->id)
+ free(hash->id);
+ if (hash->line)
+ free(hash->line);
+
+ free(hash);
+}
+
+static khash_s2i *hash_rg(const bam_hdr_t *h) {
+ khash_s2i *rg2id = hash_s2i_create();
+ char *cp, *line;
+ int j, l;
+
+ if (!h)
+ return rg2id;
+
+ if (!rg2id)
+ return NULL;
+
+ cp = h->text;
+
+ for (l = 0; l+3 < h->l_text; l++) {
+ line = &cp[l];
+ if (!(cp[l] == '@' && cp[l+1] == 'R' && cp[l+2] == 'G')) {
+ while (l < h->l_text && cp[l] != '\n')
+ l++;
+ continue;
+ }
+
+ // Found an @RG line; add to hash
+ while (cp[l] != '\n') {
+ while (l < h->l_text && cp[l] != '\n' && cp[l] != '\t')
+ l++;
+ if (l+4 < h->l_text && cp[l+1] == 'I' && cp[l+2] == 'D')
+ break;
+ }
+ if (cp[l] == '\n')
+ continue;
+ l = (j = l+4);
+ while (l < h->l_text && cp[l] != '\n' && cp[l] != '\t')
+ l++;
+
+ // To do: save id and keep realloc as needed, as hash_s2i_inc strdups.
+ char *id = malloc(l-j+1);
+ strncpy(id, &cp[j], l-j);
+ id[l-j] = 0;
+
+ int added;
+ hash_s2i_inc(rg2id, id, line, &added);
+ free(id);
+
+ while (l < h->l_text && cp[l] != '\n')
+ l++;
+ }
+
+ return rg2id;
+}
+
+/*
+ * Check the files are consistent and capable of being concatenated.
+ * Also fills out the rg2id read-group hash and the version numbers
+ * and produces a new bam_hdr_t structure with merged RG lines.
+ * Note it is only a simple merge, as we lack the niceties of a proper
+ * header API.
+ *
+ * Returns updated header on success;
+ * NULL on failure.
+ */
+static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t *h,
+ khash_s2i **rg2id, int *vers_maj_p, int *vers_min_p) {
+ int i, vers_maj = -1, vers_min = -1;
+ bam_hdr_t *new_h = NULL;
+
+ if (h) {
+ new_h = bam_hdr_dup(h);
+ *rg2id = hash_rg(new_h);
+ }
+
+ for (i = 0; i < nfn; ++i) {
+ samFile *in;
+ cram_fd *in_c;
+ khint_t ki;
+ int new_rg = -1;
+
+ in = sam_open(fn[i], "rc");
+ if (in == 0) {
+ fprintf(pysamerr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
+ return NULL;
+ }
+ in_c = in->fp.cram;
+
+ int vmaj = cram_major_vers(in_c);
+ int vmin = cram_minor_vers(in_c);
+ if ((vers_maj != -1 && vers_maj != vmaj) ||
+ (vers_min != -1 && vers_min != vmin)) {
+ fprintf(pysamerr, "[%s] ERROR: input files have differing version numbers.\n",
+ __func__);
+ return NULL;
+ }
+ vers_maj = vmaj;
+ vers_min = vmin;
+
+ bam_hdr_t *old = sam_hdr_read(in);
+ khash_s2i *rg2id_in = hash_rg(old);
+
+ if (!new_h) {
+ new_h = bam_hdr_dup(old);
+ *rg2id = hash_rg(new_h);
+ }
+
+ // Add any existing @RG entries to our global @RG hash.
+ for (ki = 0; ki < rg2id_in->n_id; ki++) {
+ int added;
+
+ new_rg = hash_s2i_inc(*rg2id, rg2id_in->id[ki], rg2id_in->line[ki], &added);
+ //fprintf(pysamerr, "RG %s: #%d -> #%d\n",
+ // rg2id_in->id[ki], ki, new_rg);
+
+ if (added) {
+ // Also add to new_h
+ const char *line = rg2id_in->line[ki];
+ const char *line_end = line;
+ while (*line && *line_end++ != '\n')
+ ;
+ new_h->l_text += line_end - line;
+ new_h->text = realloc(new_h->text, new_h->l_text+1);
+ strncat(&new_h->text[new_h->l_text - (line_end - line)],
+ line, line_end - line);
+ }
+
+ if (new_rg != ki && rg2id_in->n_id > 1) {
+ fprintf(pysamerr, "[%s] ERROR: Same size @RG lists but differing order / contents\n",
+ __func__);
+ return NULL;
+ }
+ }
+
+ hash_s2i_free(rg2id_in);
+ bam_hdr_destroy(old);
+ sam_close(in);
+ }
+
+ *vers_maj_p = vers_maj;
+ *vers_min_p = vers_min;
+
+ return new_h;
+}
+
+
+/*
+ * CRAM files don't store the RG:Z:ID per read in the aux field.
+ * Instead they have a numerical data series (RG) to point each read
+ * back to the Nth @RG line in the file. This means that we may need
+ * to edit the RG data series (if the files were produced from
+ * "samtools split" for example).
+ *
+ * The encoding method is stored in the compression header. Typical
+ * examples:
+ *
+ * RG => EXTERNAL {18} # Block content-id 18 holds RG values
+ * # as a series of ITF8 encoded values
+ *
+ * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0}
+ * # One RG value #-1. (No RG)
+ *
+ * RG => HUFFMAN {1, 0, 1, 0} # One RG value #0 (always first RG)
+ *
+ * RG => HUFFMAN {2, 0, 1, 2, 1, 1}
+ * # Two RG values, #0 and #1, written
+ * # to the CORE block and possibly
+ * # mixed with other data series.
+ *
+ * A single value can be (but is not necessarily) implemented as a zero-bit
+ * huffman code. In this situation we can change the meta-data in the
+ * compression header to renumber an RG value.
+ */
+int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram)
+{
+ samFile *out;
+ cram_fd *out_c;
+ int i, vers_maj, vers_min;
+ khash_s2i *rg2id = NULL;
+ bam_hdr_t *new_h = NULL;
+
+ /* Check consistent versioning and compatible headers */
+ if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &rg2id, &vers_maj, &vers_min)))
+ return -1;
+
+ /* Open the output file, requesting the same CRAM version */
+ char vers[100];
+ sprintf(vers, "%d.%d", vers_maj, vers_min);
+ out = sam_open(outcram, "wc");
+ if (out == 0) {
+ fprintf(pysamerr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outcram);
+ return 1;
+ }
+ out_c = out->fp.cram;
+ cram_set_option(out_c, CRAM_OPT_VERSION, vers);
+ //fprintf(pysamerr, "Creating cram vers %s\n", vers);
+
+ cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed?
+ sam_hdr_write(out, new_h);
+
+ for (i = 0; i < nfn; ++i) {
+ samFile *in;
+ cram_fd *in_c;
+ cram_container *c;
+ bam_hdr_t *old;
+ int new_rg = -1;
+
+ in = sam_open(fn[i], "rc");
+ if (in == 0) {
+ fprintf(pysamerr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
+ return -1;
+ }
+ in_c = in->fp.cram;
+
+ old = sam_hdr_read(in);
+ khash_s2i *rg2id_in = hash_rg(old);
+
+ // Compute RG mapping if suitable for changing.
+ if (rg2id_in->n_id == 1) {
+ int _;
+ new_rg = hash_s2i_inc(rg2id, rg2id_in->id[0], NULL, &_);
+ } else {
+ new_rg = 0;
+ }
+
+ hash_s2i_free(rg2id_in);
+
+
+ // Copy containers and the blocks within them
+ while ((c = cram_read_container(in_c))) {
+ cram_block *blk;
+
+ if (cram_container_is_empty(in_c)) {
+ if (cram_write_container(out_c, c) != 0)
+ return -1;
+
+ // Container compression header
+ if (!(blk = cram_read_block(in_c)))
+ return -1;
+ if (cram_write_block(out_c, blk) != 0) {
+ cram_free_block(blk);
+ return -1;
+ }
+ cram_free_block(blk);
+ cram_free_container(c);
+
+ continue;
+ }
+
+ // If we have just one RG key and new_rg != 0 then
+ // we need to edit the compression header. IF WE CAN.
+ if (new_rg) {
+ int zero = 0;
+ //fprintf(pysamerr, "Transcode RG %d to %d\n", 0, new_rg);
+ cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg);
+ } else {
+ int32_t num_slices;
+
+ // Not switching rg so do the usual read/write loop
+ if (cram_write_container(out_c, c) != 0)
+ return -1;
+
+ // Container compression header
+ if (!(blk = cram_read_block(in_c)))
+ return -1;
+ if (cram_write_block(out_c, blk) != 0) {
+ cram_free_block(blk);
+ return -1;
+ }
+ cram_free_block(blk);
+
+
+ // Container num_blocks can be invalid, due to a bug.
+ // Instead we iterate in slice context.
+ (void)cram_container_get_landmarks(c, &num_slices);
+ cram_copy_slice(in_c, out_c, num_slices);
+ }
+
+ cram_free_container(c);
+ }
+
+ bam_hdr_destroy(old);
+ sam_close(in);
+ }
+ sam_close(out);
+
+ hash_s2i_free(rg2id);
+ bam_hdr_destroy(new_h);
+
+ return 0;
+}
+
#define BUF_SIZE 0x10000
@@ -50,8 +414,7 @@ Illumina.
#define BGZF_EMPTY_BLOCK_SIZE 28
-
-int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam)
+int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam)
{
BGZF *fp;
uint8_t *buf;
@@ -64,12 +427,12 @@ int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam
fprintf(pysamerr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam);
return 1;
}
- if (h) bam_header_write(fp, h);
+ if (h) bam_hdr_write(fp, h);
buf = (uint8_t*) malloc(BUF_SIZE);
for(i = 0; i < nfn; ++i){
BGZF *in;
- bam_header_t *old;
+ bam_hdr_t *old;
int len,j;
in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r");
@@ -79,8 +442,14 @@ int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam
}
if (in->is_write) return -1;
- old = bam_header_read(in);
- if (h == 0 && i == 0) bam_header_write(fp, old);
+ old = bam_hdr_read(in);
+ if (old == NULL) {
+ fprintf(pysamerr, "[%s] ERROR: couldn't read header for '%s'.\n",
+ __func__, fn[i]);
+ bgzf_close(in);
+ return -1;
+ }
+ if (h == 0 && i == 0) bam_hdr_write(fp, old);
if (in->block_offset < in->block_length) {
bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
@@ -118,7 +487,7 @@ int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam
bgzf_raw_write(fp, ebuf, es);
}
}
- bam_header_destroy(old);
+ bam_hdr_destroy(old);
bgzf_close(in);
}
free(buf);
@@ -127,32 +496,65 @@ int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam
}
-
int main_cat(int argc, char *argv[])
{
- bam_header_t *h = 0;
+ bam_hdr_t *h = 0;
char *outfn = 0;
int c, ret;
+ samFile *in;
+
while ((c = getopt(argc, argv, "h:o:")) >= 0) {
switch (c) {
case 'h': {
- tamFile fph = sam_open(optarg);
+ samFile *fph = sam_open(optarg, "r");
if (fph == 0) {
fprintf(pysamerr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]);
return 1;
}
- h = sam_header_read(fph);
+ h = sam_hdr_read(fph);
+ if (h == NULL) {
+ fprintf(pysamerr,
+ "[%s] ERROR: failed to read the header for '%s'.\n",
+ __func__, argv[1]);
+ return 1;
+ }
sam_close(fph);
break;
}
case 'o': outfn = strdup(optarg); break;
}
}
- if (argc - optind < 2) {
- fprintf(pysamerr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n");
+ if (argc - optind < 1) {
+ fprintf(pysamerr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> [...]\n");
+ return 1;
+ }
+
+ in = sam_open(argv[optind], "r");
+ if (!in) {
+ fprintf(pysamerr, "[%s] ERROR: failed to open file '%s'.\n", __func__, argv[optind]);
+ return 1;
+ }
+
+ switch (hts_get_format(in)->format) {
+ case bam:
+ sam_close(in);
+ ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
+ break;
+
+ case cram:
+ sam_close(in);
+ ret = cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
+ break;
+
+ default:
+ sam_close(in);
+ fprintf(pysamerr, "[%s] ERROR: input is not BAM or CRAM\n", __func__);
return 1;
}
- ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
free(outfn);
+
+ if (h)
+ bam_hdr_destroy(h);
+
return ret;
}
diff --git a/samtools/bam_index.c b/samtools/bam_index.c
index b0654bc..83a855d 100644
--- a/samtools/bam_index.c
+++ b/samtools/bam_index.c
@@ -33,13 +33,9 @@ DEALINGS IN THE SOFTWARE. */
#include <inttypes.h>
#include <unistd.h>
-#define BAM_LIDX_SHIFT 14
+#include "samtools.h"
-int bam_index_build2(const char *fn, const char *_fnidx)
-{
- fprintf(stderr, "Samtools-htslib-API: bam_index_build2() not yet implemented\n");
- abort();
-}
+#define BAM_LIDX_SHIFT 14
static void index_usage(FILE *fp)
{
@@ -55,7 +51,7 @@ int bam_index(int argc, char *argv[])
{
int csi = 0;
int min_shift = BAM_LIDX_SHIFT;
- int c;
+ int c, ret;
while ((c = getopt(argc, argv, "bcm:")) >= 0)
switch (c) {
@@ -71,8 +67,18 @@ int bam_index(int argc, char *argv[])
index_usage(stdout);
return 1;
}
- if (argc - optind > 1) bam_index_build2(argv[optind], argv[optind+1]);
- else bam_index_build(argv[optind], csi? min_shift : 0);
+
+ ret = sam_index_build2(argv[optind], argv[optind+1], csi? min_shift : 0);
+ if (ret != 0) {
+ if (ret == -2)
+ print_error_errno("index", "failed to open \"%s\"", argv[optind]);
+ else if (ret == -3)
+ print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]);
+ else
+ print_error("index", "\"%s\" is corrupted or unsorted", argv[optind]);
+ return EXIT_FAILURE;
+ }
+
return 0;
}
@@ -89,6 +95,11 @@ int bam_idxstats(int argc, char *argv[])
fp = sam_open(argv[1], "r");
if (fp == NULL) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
header = sam_hdr_read(fp);
+ if (header == NULL) {
+ fprintf(stderr, "[%s] failed to read header for '%s'.\n",
+ __func__, argv[1]);
+ return 1;
+ }
idx = sam_index_load(fp, argv[1]);
if (idx == NULL) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
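
The reworked index command above delegates to sam_index_build2() and maps its
return codes onto messages. A minimal sketch of the same mapping ("in.bam" is
a placeholder; NULL for the index name requests the default, as in the
single-argument case, and min_shift 0 requests a .bai rather than a .csi
index):

    #include <stdio.h>
    #include "htslib/sam.h"

    int main(void)
    {
        int ret = sam_index_build2("in.bam", NULL, 0);

        if (ret == 0)
            puts("index built");
        else if (ret == -2)
            puts("failed to open the input file");
        else if (ret == -3)
            puts("file is in a format that cannot be usefully indexed");
        else
            puts("input appears to be corrupted or unsorted");

        return ret == 0 ? 0 : 1;
    }
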
diff --git a/samtools/bam_index.c.pysam.c b/samtools/bam_index.c.pysam.c
index 7fd7a76..ed902c5 100644
--- a/samtools/bam_index.c.pysam.c
+++ b/samtools/bam_index.c.pysam.c
@@ -35,13 +35,9 @@ DEALINGS IN THE SOFTWARE. */
#include <inttypes.h>
#include <unistd.h>
-#define BAM_LIDX_SHIFT 14
+#include "samtools.h"
-int bam_index_build2(const char *fn, const char *_fnidx)
-{
- fprintf(pysamerr, "Samtools-htslib-API: bam_index_build2() not yet implemented\n");
- abort();
-}
+#define BAM_LIDX_SHIFT 14
static void index_usage(FILE *fp)
{
@@ -57,7 +53,7 @@ int bam_index(int argc, char *argv[])
{
int csi = 0;
int min_shift = BAM_LIDX_SHIFT;
- int c;
+ int c, ret;
while ((c = getopt(argc, argv, "bcm:")) >= 0)
switch (c) {
@@ -73,8 +69,18 @@ int bam_index(int argc, char *argv[])
index_usage(stdout);
return 1;
}
- if (argc - optind > 1) bam_index_build2(argv[optind], argv[optind+1]);
- else bam_index_build(argv[optind], csi? min_shift : 0);
+
+ ret = sam_index_build2(argv[optind], argv[optind+1], csi? min_shift : 0);
+ if (ret != 0) {
+ if (ret == -2)
+ print_error_errno("index", "failed to open \"%s\"", argv[optind]);
+ else if (ret == -3)
+ print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]);
+ else
+ print_error("index", "\"%s\" is corrupted or unsorted", argv[optind]);
+ return EXIT_FAILURE;
+ }
+
return 0;
}
@@ -91,6 +97,11 @@ int bam_idxstats(int argc, char *argv[])
fp = sam_open(argv[1], "r");
if (fp == NULL) { fprintf(pysamerr, "[%s] fail to open BAM.\n", __func__); return 1; }
header = sam_hdr_read(fp);
+ if (header == NULL) {
+ fprintf(pysamerr, "[%s] failed to read header for '%s'.\n",
+ __func__, argv[1]);
+ return 1;
+ }
idx = sam_index_load(fp, argv[1]);
if (idx == NULL) { fprintf(pysamerr, "[%s] fail to load the index.\n", __func__); return 1; }
diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c
index 017d5e1..54c3ed3 100644
--- a/samtools/bam_mate.c
+++ b/samtools/bam_mate.c
@@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include "sam_opts.h"
#include "htslib/kstring.h"
#include "htslib/sam.h"
@@ -185,6 +186,10 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro
str.l = str.m = 0; str.s = 0;
header = sam_hdr_read(in);
+ if (header == NULL) {
+ fprintf(stderr, "[bam_mating_core] ERROR: Couldn't read header\n");
+ exit(1);
+ }
// Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted.
if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) {
char *p, *q;
@@ -292,56 +297,65 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro
void usage(FILE* where)
{
- fprintf(where,"Usage: samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n\n");
- fprintf(where,"Options:\n");
- fprintf(stderr," -r Remove unmapped reads and secondary alignments\n");
- fprintf(stderr," -p Disable FR proper pair check\n");
- fprintf(stderr," -c Add template cigar ct tag\n");
- fprintf(stderr," -O FORMAT Write output as FORMAT ('sam'/'bam'/'cram')\n");
- fprintf(stderr,"As elsewhere in samtools, use '-' as the filename for stdin/stdout. The input\n");
- fprintf(stderr,"file must be grouped by read name (e.g. sorted by name). Coordinated sorted\n");
- fprintf(stderr,"input is not accepted.\n");
+ fprintf(where,
+"Usage: samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n"
+"Options:\n"
+" -r Remove unmapped reads and secondary alignments\n"
+" -p Disable FR proper pair check\n"
+" -c Add template cigar ct tag\n");
+
+ sam_global_opt_help(where, "-.O..");
+
+ fprintf(where,
+"\n"
+"As elsewhere in samtools, use '-' as the filename for stdin/stdout. The input\n"
+"file must be grouped by read name (e.g. sorted by name). Coordinated sorted\n"
+"input is not accepted.\n");
}
int bam_mating(int argc, char *argv[])
{
samFile *in, *out;
int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0;
- char* fmtout = NULL;
- char modeout[12];
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ char wmode[3] = {'w', 'b', 0};
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
+
// parse args
if (argc == 1) { usage(stdout); return 0; }
- while ((c = getopt(argc, argv, "rpcO:")) >= 0) {
+ while ((c = getopt_long(argc, argv, "rpcO:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': remove_reads = 1; break;
case 'p': proper_pair_check = 0; break;
case 'c': add_ct = 1; break;
- case 'O': fmtout = optarg; break;
- default: usage(stderr); return 1;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': usage(stderr); return 1;
}
}
if (optind+1 >= argc) { usage(stderr); return 1; }
- strcpy(modeout, "w");
- if (sam_open_mode(&modeout[1], argv[optind+1], fmtout) < 0) {
- if (fmtout) fprintf(stderr, "[bam_mating] cannot parse output format \"%s\"\n", fmtout);
- else fprintf(stderr, "[bam_mating] cannot determine output format\n");
- return 1;
- }
// init
- if ((in = sam_open(argv[optind], "r")) == NULL) {
+ if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) {
fprintf(stderr, "[bam_mating] cannot open input file\n");
return 1;
}
- if ((out = sam_open(argv[optind+1], modeout)) == NULL) {
+ sam_open_mode(wmode+1, argv[optind+1], NULL);
+ if ((out = sam_open_format(argv[optind+1], wmode, &ga.out)) == NULL) {
fprintf(stderr, "[bam_mating] cannot open output file\n");
return 1;
}
// run
bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct);
+
// cleanup
sam_close(in); sam_close(out);
+ sam_global_args_free(&ga);
+
return 0;
}
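
The fixmate rewrite above now derives the output mode with sam_open_mode(),
which appends a format letter to the mode string based on the output name (or
an explicit format). A small sketch of that call in isolation (the file name
"out.cram" is a placeholder):

    #include <stdio.h>
    #include "htslib/sam.h"

    int main(void)
    {
        char wmode[3] = {'w', 'b', 0};                /* default to BAM, as above */

        /* With a NULL format string the extension decides, so "out.cram"
         * should leave wmode holding "wc". */
        if (sam_open_mode(wmode + 1, "out.cram", NULL) < 0) {
            fprintf(stderr, "could not determine output format\n");
            return 1;
        }
        printf("open mode: %s\n", wmode);
        return 0;
    }
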
diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c
index be0dc37..c7900a1 100644
--- a/samtools/bam_mate.c.pysam.c
+++ b/samtools/bam_mate.c.pysam.c
@@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include "sam_opts.h"
#include "htslib/kstring.h"
#include "htslib/sam.h"
@@ -187,6 +188,10 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro
str.l = str.m = 0; str.s = 0;
header = sam_hdr_read(in);
+ if (header == NULL) {
+ fprintf(pysamerr, "[bam_mating_core] ERROR: Couldn't read header\n");
+ exit(1);
+ }
// Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted.
if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) {
char *p, *q;
@@ -294,56 +299,65 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro
void usage(FILE* where)
{
- fprintf(where,"Usage: samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n\n");
- fprintf(where,"Options:\n");
- fprintf(pysamerr," -r Remove unmapped reads and secondary alignments\n");
- fprintf(pysamerr," -p Disable FR proper pair check\n");
- fprintf(pysamerr," -c Add template cigar ct tag\n");
- fprintf(pysamerr," -O FORMAT Write output as FORMAT ('sam'/'bam'/'cram')\n");
- fprintf(pysamerr,"As elsewhere in samtools, use '-' as the filename for stdin/stdout. The input\n");
- fprintf(pysamerr,"file must be grouped by read name (e.g. sorted by name). Coordinated sorted\n");
- fprintf(pysamerr,"input is not accepted.\n");
+ fprintf(where,
+"Usage: samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n"
+"Options:\n"
+" -r Remove unmapped reads and secondary alignments\n"
+" -p Disable FR proper pair check\n"
+" -c Add template cigar ct tag\n");
+
+ sam_global_opt_help(where, "-.O..");
+
+ fprintf(where,
+"\n"
+"As elsewhere in samtools, use '-' as the filename for stdin/stdout. The input\n"
+"file must be grouped by read name (e.g. sorted by name). Coordinated sorted\n"
+"input is not accepted.\n");
}
int bam_mating(int argc, char *argv[])
{
samFile *in, *out;
int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0;
- char* fmtout = NULL;
- char modeout[12];
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ char wmode[3] = {'w', 'b', 0};
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
+
// parse args
if (argc == 1) { usage(stdout); return 0; }
- while ((c = getopt(argc, argv, "rpcO:")) >= 0) {
+ while ((c = getopt_long(argc, argv, "rpcO:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': remove_reads = 1; break;
case 'p': proper_pair_check = 0; break;
case 'c': add_ct = 1; break;
- case 'O': fmtout = optarg; break;
- default: usage(pysamerr); return 1;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': usage(pysamerr); return 1;
}
}
if (optind+1 >= argc) { usage(pysamerr); return 1; }
- strcpy(modeout, "w");
- if (sam_open_mode(&modeout[1], argv[optind+1], fmtout) < 0) {
- if (fmtout) fprintf(pysamerr, "[bam_mating] cannot parse output format \"%s\"\n", fmtout);
- else fprintf(pysamerr, "[bam_mating] cannot determine output format\n");
- return 1;
- }
// init
- if ((in = sam_open(argv[optind], "r")) == NULL) {
+ if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) {
fprintf(pysamerr, "[bam_mating] cannot open input file\n");
return 1;
}
- if ((out = sam_open(argv[optind+1], modeout)) == NULL) {
+ sam_open_mode(wmode+1, argv[optind+1], NULL);
+ if ((out = sam_open_format(argv[optind+1], wmode, &ga.out)) == NULL) {
fprintf(pysamerr, "[bam_mating] cannot open output file\n");
return 1;
}
// run
bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct);
+
// cleanup
sam_close(in); sam_close(out);
+ sam_global_args_free(&ga);
+
return 0;
}
diff --git a/samtools/bam_md.c b/samtools/bam_md.c
index 7d1c6a7..30f3243 100644
--- a/samtools/bam_md.c
+++ b/samtools/bam_md.c
@@ -1,6 +1,6 @@
/* bam_md.c -- calmd subcommand.
- Copyright (C) 2009-2011 Genome Research Ltd.
+ Copyright (C) 2009-2011, 2014-2015 Genome Research Ltd.
Portions copyright (C) 2009-2011 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -26,11 +26,13 @@ DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
#include <string.h>
#include <ctype.h>
+#include <limits.h>
#include <math.h>
#include "htslib/faidx.h"
-#include "sam.h"
+#include "htslib/sam.h"
#include "htslib/kstring.h"
#include "kprobaln.h"
+#include "sam_opts.h"
#define USE_EQUAL 1
#define DROP_TAG 2
@@ -39,14 +41,12 @@ DEALINGS IN THE SOFTWARE. */
#define UPDATE_MD 16
#define HASH_QNM 32
-const char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
-
int bam_aux_drop_other(bam1_t *b, uint8_t *s);
-void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
+void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
{
- uint8_t *seq = bam1_seq(b);
- uint32_t *cigar = bam1_cigar(b);
+ uint8_t *seq = bam_get_seq(b);
+ uint32_t *cigar = bam_get_cigar(b);
bam1_core_t *c = &b->core;
int i, x, y, u = 0;
kstring_t *str;
@@ -57,9 +57,9 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
int j, l = cigar[i]>>4, op = cigar[i]&0xf;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j) {
- int z = y + j;
- int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
- if (ref[x+j] == 0) break; // out of boundary
+ int c1, c2, z = y + j;
+ if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
+ c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
++u;
@@ -73,12 +73,12 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
} else if (op == BAM_CDEL) {
kputw(u, str); kputc('^', str);
for (j = 0; j < l; ++j) {
- if (ref[x+j] == 0) break;
+ if (x+j >= ref_len || ref[x+j] == '\0') break;
kputc(ref[x+j], str);
}
u = 0;
+ x += j; nm += j;
if (j < l) break;
- x += l; nm += l;
} else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
y += l;
if (op == BAM_CINS) nm += l;
@@ -93,12 +93,12 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
int j, l = cigar[i]>>4, op = cigar[i]&0xf;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j) {
- int z = y + j;
- int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
- if (ref[x+j] == 0) break; // out of boundary
+ int c1, c2, z = y + j;
+ if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
+ c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
seq[z/2] |= (z&1)? 0x0f : 0xf0;
- bam1_qual(b)[z] = 0;
+ bam_get_qual(b)[z] = 0;
}
}
if (j < l) break;
@@ -113,7 +113,7 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
if (old_nm) old_nm_i = bam_aux2i(old_nm);
if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
else if (nm != old_nm_i) {
- fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
+ fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
bam_aux_del(b, old_nm);
bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
}
@@ -131,7 +131,7 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
if (i < str->l) is_diff = 1;
} else is_diff = 1;
if (is_diff) {
- fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s);
+ fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s);
bam_aux_del(b, old_md);
bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
}
@@ -145,7 +145,7 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
}
// reduce the resolution of base quality
if (flag&BIN_QUAL) {
- uint8_t *qual = bam1_qual(b);
+ uint8_t *qual = bam_get_qual(b);
for (i = 0; i < b->core.l_qseq; ++i)
if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
}
@@ -155,13 +155,13 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
void bam_fillmd1(bam1_t *b, char *ref, int flag)
{
- bam_fillmd1_core(b, ref, flag, 0);
+ bam_fillmd1_core(b, ref, INT_MAX, flag, 0);
}
-int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
+int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres)
{
- uint8_t *seq = bam1_seq(b), *qual = bam1_qual(b);
- uint32_t *cigar = bam1_cigar(b);
+ uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b);
+ uint32_t *cigar = bam_get_cigar(b);
bam1_core_t *c = &b->core;
int i, x, y, mm, q, len, clip_l, clip_q;
double t;
@@ -171,9 +171,9 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
int j, l = cigar[i]>>4, op = cigar[i]&0xf;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j) {
- int z = y + j;
- int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
- if (ref[x+j] == 0) break; // out of boundary
+ int c1, c2, z = y + j;
+ if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
+ c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
++len;
if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
@@ -186,7 +186,7 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
x += l; y += l; len += l;
} else if (op == BAM_CDEL) {
for (j = 0; j < l; ++j)
- if (ref[x+j] == 0) break;
+ if (x+j >= ref_len || ref[x+j] == '\0') break;
if (j < l) break;
x += l;
} else if (op == BAM_CSOFT_CLIP) {
@@ -205,18 +205,20 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
if (t > thres) return -1;
if (t < 0) t = 0;
t = sqrt((thres - t) / thres) * thres;
-// fprintf(stderr, "%s %lf %d\n", bam1_qname(b), t, q);
+// fprintf(stderr, "%s %lf %d\n", bam_get_qname(b), t, q);
return (int)(t + .499);
}
-int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
+int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag)
{
int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
- uint32_t *cigar = bam1_cigar(b);
+ uint32_t *cigar = bam_get_cigar(b);
bam1_core_t *c = &b->core;
kpa_par_t conf = kpa_par_def;
- uint8_t *bq = 0, *zq = 0, *qual = bam1_qual(b);
- if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing
+ uint8_t *bq = 0, *zq = 0, *qual = bam_get_qual(b);
+ if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1)
+ return -1; // do nothing
+
// test if BQ or ZQ is present
if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
@@ -266,16 +268,16 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
if (xe - xb - c->l_qseq > bw)
xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
{ // glocal
- uint8_t *s, *r, *q, *seq = bam1_seq(b), *bq;
+ uint8_t *s, *r, *q, *seq = bam_get_seq(b), *bq;
int *state;
bq = calloc(c->l_qseq + 1, 1);
memcpy(bq, qual, c->l_qseq);
s = calloc(c->l_qseq, 1);
- for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)];
+ for (i = 0; i < c->l_qseq; ++i) s[i] = seq_nt16_int[bam_seqi(seq, i)];
r = calloc(xe - xb, 1);
for (i = xb; i < xe; ++i) {
- if (ref[i] == 0) { xe = i; break; }
- r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]];
+ if (i >= ref_len || ref[i] == '\0') { xe = i; break; }
+ r[i-xb] = seq_nt16_int[seq_nt16_table[(int)ref[i]]];
}
state = calloc(c->l_qseq, sizeof(int));
q = calloc(c->l_qseq, 1);
@@ -325,22 +327,44 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
int bam_prob_realn(bam1_t *b, const char *ref)
{
- return bam_prob_realn_core(b, ref, 1);
+ return bam_prob_realn_core(b, ref, INT_MAX, 1);
+}
+
+int calmd_usage() {
+ fprintf(stderr,
+"Usage: samtools calmd [-eubrAES] <aln.bam> <ref.fasta>\n"
+"Options:\n"
+" -e change identical bases to '='\n"
+" -u uncompressed BAM output (for piping)\n"
+" -b compressed BAM output\n"
+" -S ignored (input format is auto-detected)\n"
+" -A modify the quality string\n"
+" -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"
+" -E extended BAQ for better sensitivity but lower specificity\n");
+
+ sam_global_opt_help(stderr, "-....");
+ return 1;
}
int bam_fillmd(int argc, char *argv[])
{
- int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
- samfile_t *fp, *fpout = 0;
+ int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
+ samFile *fp, *fpout = 0;
+ bam_hdr_t *header;
faidx_t *fai;
- char *ref = 0, mode_w[8], mode_r[8];
+ char *ref = 0, mode_w[8], *ref_file;
bam1_t *b;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
flt_flag = UPDATE_NM | UPDATE_MD;
- is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
- mode_w[0] = mode_r[0] = 0;
- strcpy(mode_r, "r"); strcpy(mode_w, "w");
- while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) {
+ is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
+ strcpy(mode_w, "w");
+ while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) {
switch (c) {
case 'r': is_realn = 1; break;
case 'e': flt_flag |= USE_EQUAL; break;
@@ -350,63 +374,68 @@ int bam_fillmd(int argc, char *argv[])
case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break;
case 'b': is_bam_out = 1; break;
case 'u': is_uncompressed = is_bam_out = 1; break;
- case 'S': is_sam_in = 1; break;
+ case 'S': break;
case 'n': max_nm = atoi(optarg); break;
case 'C': capQ = atoi(optarg); break;
case 'A': baq_flag |= 1; break;
case 'E': baq_flag |= 2; break;
- default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c);
+ /* else fall-through */
+ case '?': return calmd_usage();
}
}
- if (!is_sam_in) strcat(mode_r, "b");
if (is_bam_out) strcat(mode_w, "b");
else strcat(mode_w, "h");
- if (is_uncompressed) strcat(mode_w, "u");
- if (optind + 1 >= argc) {
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools calmd [-eubrS] <aln.bam> <ref.fasta>\n\n");
- fprintf(stderr, "Options: -e change identical bases to '='\n");
- fprintf(stderr, " -u uncompressed BAM output (for piping)\n");
- fprintf(stderr, " -b compressed BAM output\n");
- fprintf(stderr, " -S the input is SAM with header\n");
- fprintf(stderr, " -A modify the quality string\n");
- fprintf(stderr, " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n");
- fprintf(stderr, " -E extended BAQ for better sensitivity but lower specificity\n\n");
- return 1;
- }
- fp = samopen(argv[optind], mode_r, 0);
+ if (is_uncompressed) strcat(mode_w, "0");
+ if (optind + (ga.reference == NULL) >= argc)
+ return calmd_usage();
+ fp = sam_open_format(argv[optind], "r", &ga.in);
if (fp == 0) return 1;
- if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) {
+
+ header = sam_hdr_read(fp);
+ if (header == NULL || header->n_targets == 0) {
fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
return 1;
}
- fpout = samopen("-", mode_w, fp->header);
- fai = fai_load(argv[optind+1]);
+
+ fpout = sam_open_format("-", mode_w, &ga.out);
+ sam_hdr_write(fpout, header);
+
+ ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference;
+ fai = fai_load(ref_file);
+
+ if (!fai) {
+ perror(ref_file);
+ return 1;
+ }
b = bam_init1();
- while ((ret = samread(fp, b)) >= 0) {
+ while ((ret = sam_read1(fp, header, b)) >= 0) {
if (b->core.tid >= 0) {
if (tid != b->core.tid) {
free(ref);
- ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len);
+ ref = fai_fetch(fai, header->target_name[b->core.tid], &len);
tid = b->core.tid;
if (ref == 0)
fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
- fp->header->target_name[tid]);
+ header->target_name[tid]);
}
- if (is_realn) bam_prob_realn_core(b, ref, baq_flag);
+ if (is_realn) bam_prob_realn_core(b, ref, len, baq_flag);
if (capQ > 10) {
- int q = bam_cap_mapQ(b, ref, capQ);
+ int q = bam_cap_mapQ(b, ref, len, capQ);
if (b->core.qual > q) b->core.qual = q;
}
- if (ref) bam_fillmd1_core(b, ref, flt_flag, max_nm);
+ if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm);
}
- samwrite(fpout, b);
+ sam_write1(fpout, header, b);
}
bam_destroy1(b);
+ bam_hdr_destroy(header);
free(ref);
fai_destroy(fai);
- samclose(fp); samclose(fpout);
+ sam_close(fp);
+ sam_close(fpout);
return 0;
}
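
The calmd changes above thread an explicit reference length through
bam_fillmd1_core(), bam_cap_mapQ() and bam_prob_realn_core(), so every walk
over the reference now stops at either the length bound or a NUL instead of
trusting NUL-termination alone. A tiny sketch of that guard with illustrative
values:

    #include <stdio.h>

    /* Count how many reference bases are actually available, stopping at
     * either the explicit length or a NUL, mirroring the new guard. */
    static int available_ref_bases(const char *ref, int ref_len, int x, int len)
    {
        int j;
        for (j = 0; j < len; ++j) {
            if (x + j >= ref_len || ref[x + j] == '\0')
                break;
        }
        return j;
    }

    int main(void)
    {
        char ref[] = { 'A', 'C', 'G', 'T' };          /* 4 bases, no terminating NUL */
        /* Starting at position 2 and asking for 10 bases, only 2 are in range. */
        printf("%d\n", available_ref_bases(ref, 4, 2, 10));
        return 0;
    }
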
diff --git a/samtools/bam_md.c.pysam.c b/samtools/bam_md.c.pysam.c
index 5f5bb8a..070f9cd 100644
--- a/samtools/bam_md.c.pysam.c
+++ b/samtools/bam_md.c.pysam.c
@@ -2,7 +2,7 @@
/* bam_md.c -- calmd subcommand.
- Copyright (C) 2009-2011 Genome Research Ltd.
+ Copyright (C) 2009-2011, 2014-2015 Genome Research Ltd.
Portions copyright (C) 2009-2011 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -28,11 +28,13 @@ DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
#include <string.h>
#include <ctype.h>
+#include <limits.h>
#include <math.h>
#include "htslib/faidx.h"
-#include "sam.h"
+#include "htslib/sam.h"
#include "htslib/kstring.h"
#include "kprobaln.h"
+#include "sam_opts.h"
#define USE_EQUAL 1
#define DROP_TAG 2
@@ -41,14 +43,12 @@ DEALINGS IN THE SOFTWARE. */
#define UPDATE_MD 16
#define HASH_QNM 32
-const char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
-
int bam_aux_drop_other(bam1_t *b, uint8_t *s);
-void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
+void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
{
- uint8_t *seq = bam1_seq(b);
- uint32_t *cigar = bam1_cigar(b);
+ uint8_t *seq = bam_get_seq(b);
+ uint32_t *cigar = bam_get_cigar(b);
bam1_core_t *c = &b->core;
int i, x, y, u = 0;
kstring_t *str;
@@ -59,9 +59,9 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
int j, l = cigar[i]>>4, op = cigar[i]&0xf;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j) {
- int z = y + j;
- int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
- if (ref[x+j] == 0) break; // out of boundary
+ int c1, c2, z = y + j;
+ if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
+ c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
++u;
@@ -75,12 +75,12 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
} else if (op == BAM_CDEL) {
kputw(u, str); kputc('^', str);
for (j = 0; j < l; ++j) {
- if (ref[x+j] == 0) break;
+ if (x+j >= ref_len || ref[x+j] == '\0') break;
kputc(ref[x+j], str);
}
u = 0;
+ x += j; nm += j;
if (j < l) break;
- x += l; nm += l;
} else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
y += l;
if (op == BAM_CINS) nm += l;
@@ -95,12 +95,12 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
int j, l = cigar[i]>>4, op = cigar[i]&0xf;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j) {
- int z = y + j;
- int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
- if (ref[x+j] == 0) break; // out of boundary
+ int c1, c2, z = y + j;
+ if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
+ c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
seq[z/2] |= (z&1)? 0x0f : 0xf0;
- bam1_qual(b)[z] = 0;
+ bam_get_qual(b)[z] = 0;
}
}
if (j < l) break;
@@ -115,7 +115,7 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
if (old_nm) old_nm_i = bam_aux2i(old_nm);
if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
else if (nm != old_nm_i) {
- fprintf(pysamerr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
+ fprintf(pysamerr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
bam_aux_del(b, old_nm);
bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
}
@@ -133,7 +133,7 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
if (i < str->l) is_diff = 1;
} else is_diff = 1;
if (is_diff) {
- fprintf(pysamerr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s);
+ fprintf(pysamerr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s);
bam_aux_del(b, old_md);
bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
}
@@ -147,7 +147,7 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
}
// reduce the resolution of base quality
if (flag&BIN_QUAL) {
- uint8_t *qual = bam1_qual(b);
+ uint8_t *qual = bam_get_qual(b);
for (i = 0; i < b->core.l_qseq; ++i)
if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
}
@@ -157,13 +157,13 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
void bam_fillmd1(bam1_t *b, char *ref, int flag)
{
- bam_fillmd1_core(b, ref, flag, 0);
+ bam_fillmd1_core(b, ref, INT_MAX, flag, 0);
}
-int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
+int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres)
{
- uint8_t *seq = bam1_seq(b), *qual = bam1_qual(b);
- uint32_t *cigar = bam1_cigar(b);
+ uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b);
+ uint32_t *cigar = bam_get_cigar(b);
bam1_core_t *c = &b->core;
int i, x, y, mm, q, len, clip_l, clip_q;
double t;
@@ -173,9 +173,9 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
int j, l = cigar[i]>>4, op = cigar[i]&0xf;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j) {
- int z = y + j;
- int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
- if (ref[x+j] == 0) break; // out of boundary
+ int c1, c2, z = y + j;
+ if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
+ c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
++len;
if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
@@ -188,7 +188,7 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
x += l; y += l; len += l;
} else if (op == BAM_CDEL) {
for (j = 0; j < l; ++j)
- if (ref[x+j] == 0) break;
+ if (x+j >= ref_len || ref[x+j] == '\0') break;
if (j < l) break;
x += l;
} else if (op == BAM_CSOFT_CLIP) {
@@ -207,18 +207,20 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
if (t > thres) return -1;
if (t < 0) t = 0;
t = sqrt((thres - t) / thres) * thres;
-// fprintf(pysamerr, "%s %lf %d\n", bam1_qname(b), t, q);
+// fprintf(pysamerr, "%s %lf %d\n", bam_get_qname(b), t, q);
return (int)(t + .499);
}
-int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
+int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag)
{
int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
- uint32_t *cigar = bam1_cigar(b);
+ uint32_t *cigar = bam_get_cigar(b);
bam1_core_t *c = &b->core;
kpa_par_t conf = kpa_par_def;
- uint8_t *bq = 0, *zq = 0, *qual = bam1_qual(b);
- if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing
+ uint8_t *bq = 0, *zq = 0, *qual = bam_get_qual(b);
+ if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1)
+ return -1; // do nothing
+
// test if BQ or ZQ is present
if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
@@ -268,16 +270,16 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
if (xe - xb - c->l_qseq > bw)
xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
{ // glocal
- uint8_t *s, *r, *q, *seq = bam1_seq(b), *bq;
+ uint8_t *s, *r, *q, *seq = bam_get_seq(b), *bq;
int *state;
bq = calloc(c->l_qseq + 1, 1);
memcpy(bq, qual, c->l_qseq);
s = calloc(c->l_qseq, 1);
- for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)];
+ for (i = 0; i < c->l_qseq; ++i) s[i] = seq_nt16_int[bam_seqi(seq, i)];
r = calloc(xe - xb, 1);
for (i = xb; i < xe; ++i) {
- if (ref[i] == 0) { xe = i; break; }
- r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]];
+ if (i >= ref_len || ref[i] == '\0') { xe = i; break; }
+ r[i-xb] = seq_nt16_int[seq_nt16_table[(int)ref[i]]];
}
state = calloc(c->l_qseq, sizeof(int));
q = calloc(c->l_qseq, 1);
@@ -327,22 +329,44 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
int bam_prob_realn(bam1_t *b, const char *ref)
{
- return bam_prob_realn_core(b, ref, 1);
+ return bam_prob_realn_core(b, ref, INT_MAX, 1);
+}
+
+int calmd_usage() {
+ fprintf(pysamerr,
+"Usage: samtools calmd [-eubrAES] <aln.bam> <ref.fasta>\n"
+"Options:\n"
+" -e change identical bases to '='\n"
+" -u uncompressed BAM output (for piping)\n"
+" -b compressed BAM output\n"
+" -S ignored (input format is auto-detected)\n"
+" -A modify the quality string\n"
+" -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"
+" -E extended BAQ for better sensitivity but lower specificity\n");
+
+ sam_global_opt_help(pysamerr, "-....");
+ return 1;
}
int bam_fillmd(int argc, char *argv[])
{
- int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
- samfile_t *fp, *fpout = 0;
+ int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
+ samFile *fp, *fpout = 0;
+ bam_hdr_t *header;
faidx_t *fai;
- char *ref = 0, mode_w[8], mode_r[8];
+ char *ref = 0, mode_w[8], *ref_file;
bam1_t *b;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
flt_flag = UPDATE_NM | UPDATE_MD;
- is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
- mode_w[0] = mode_r[0] = 0;
- strcpy(mode_r, "r"); strcpy(mode_w, "w");
- while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) {
+ is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
+ strcpy(mode_w, "w");
+ while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) {
switch (c) {
case 'r': is_realn = 1; break;
case 'e': flt_flag |= USE_EQUAL; break;
@@ -352,63 +376,68 @@ int bam_fillmd(int argc, char *argv[])
case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break;
case 'b': is_bam_out = 1; break;
case 'u': is_uncompressed = is_bam_out = 1; break;
- case 'S': is_sam_in = 1; break;
+ case 'S': break;
case 'n': max_nm = atoi(optarg); break;
case 'C': capQ = atoi(optarg); break;
case 'A': baq_flag |= 1; break;
case 'E': baq_flag |= 2; break;
- default: fprintf(pysamerr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ fprintf(pysamerr, "[bam_fillmd] unrecognized option '-%c'\n\n", c);
+ /* else fall-through */
+ case '?': return calmd_usage();
}
}
- if (!is_sam_in) strcat(mode_r, "b");
if (is_bam_out) strcat(mode_w, "b");
else strcat(mode_w, "h");
- if (is_uncompressed) strcat(mode_w, "u");
- if (optind + 1 >= argc) {
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Usage: samtools calmd [-eubrS] <aln.bam> <ref.fasta>\n\n");
- fprintf(pysamerr, "Options: -e change identical bases to '='\n");
- fprintf(pysamerr, " -u uncompressed BAM output (for piping)\n");
- fprintf(pysamerr, " -b compressed BAM output\n");
- fprintf(pysamerr, " -S the input is SAM with header\n");
- fprintf(pysamerr, " -A modify the quality string\n");
- fprintf(pysamerr, " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n");
- fprintf(pysamerr, " -E extended BAQ for better sensitivity but lower specificity\n\n");
- return 1;
- }
- fp = samopen(argv[optind], mode_r, 0);
+ if (is_uncompressed) strcat(mode_w, "0");
+ if (optind + (ga.reference == NULL) >= argc)
+ return calmd_usage();
+ fp = sam_open_format(argv[optind], "r", &ga.in);
if (fp == 0) return 1;
- if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) {
+
+ header = sam_hdr_read(fp);
+ if (header == NULL || header->n_targets == 0) {
fprintf(pysamerr, "[bam_fillmd] input SAM does not have header. Abort!\n");
return 1;
}
- fpout = samopen("-", mode_w, fp->header);
- fai = fai_load(argv[optind+1]);
+
+ fpout = sam_open_format("-", mode_w, &ga.out);
+ sam_hdr_write(fpout, header);
+
+ ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference;
+ fai = fai_load(ref_file);
+
+ if (!fai) {
+ perror(ref_file);
+ return 1;
+ }
b = bam_init1();
- while ((ret = samread(fp, b)) >= 0) {
+ while ((ret = sam_read1(fp, header, b)) >= 0) {
if (b->core.tid >= 0) {
if (tid != b->core.tid) {
free(ref);
- ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len);
+ ref = fai_fetch(fai, header->target_name[b->core.tid], &len);
tid = b->core.tid;
if (ref == 0)
fprintf(pysamerr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
- fp->header->target_name[tid]);
+ header->target_name[tid]);
}
- if (is_realn) bam_prob_realn_core(b, ref, baq_flag);
+ if (is_realn) bam_prob_realn_core(b, ref, len, baq_flag);
if (capQ > 10) {
- int q = bam_cap_mapQ(b, ref, capQ);
+ int q = bam_cap_mapQ(b, ref, len, capQ);
if (b->core.qual > q) b->core.qual = q;
}
- if (ref) bam_fillmd1_core(b, ref, flt_flag, max_nm);
+ if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm);
}
- samwrite(fpout, b);
+ sam_write1(fpout, header, b);
}
bam_destroy1(b);
+ bam_hdr_destroy(header);
free(ref);
fai_destroy(fai);
- samclose(fp); samclose(fpout);
+ sam_close(fp);
+ sam_close(fpout);
return 0;
}
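
[editor's note, not part of the patch] The calmd changes above thread an explicit reference length (ref_len) through bam_fillmd1_core(), bam_cap_mapQ() and bam_prob_realn_core(), so the CIGAR loops stop at the end of the fetched sequence instead of relying on a terminating NUL. The fragment below is only a minimal sketch of that bounds-check idiom; count_ref_bases() is a hypothetical helper standing in for the real per-operation walk.

    /* Illustrative only -- shows the guard the patch introduces:
     * stop at ref_len rather than trusting a trailing NUL. */
    static int count_ref_bases(const char *ref, int ref_len, int start, int len)
    {
        int j, n = 0;
        for (j = 0; j < len; ++j) {
            if (start + j >= ref_len || ref[start + j] == '\0')
                break;          /* out of bounds: reference ends inside this op */
            ++n;
        }
        return n;               /* reference bases actually covered */
    }
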
diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c
index d574cca..9e00836 100644
--- a/samtools/bam_plcmd.c
+++ b/samtools/bam_plcmd.c
@@ -1,6 +1,6 @@
/* bam_plcmd.c -- mpileup subcommand.
- Copyright (C) 2008-2014 Genome Research Ltd.
+ Copyright (C) 2008-2015 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -25,9 +25,11 @@ DEALINGS IN THE SOFTWARE. */
#include <math.h>
#include <stdio.h>
+#include <stdlib.h>
#include <unistd.h>
#include <ctype.h>
#include <string.h>
+#include <limits.h>
#include <errno.h>
#include <sys/stat.h>
#include <getopt.h>
@@ -37,6 +39,7 @@ DEALINGS IN THE SOFTWARE. */
#include <htslib/khash_str2int.h>
#include "sam_header.h"
#include "samtools.h"
+#include "sam_opts.h"
static inline int printw(int c, FILE *fp)
{
@@ -122,14 +125,22 @@ typedef struct {
void *bed, *rghash;
int argc;
char **argv;
+ sam_global_args ga;
} mplp_conf_t;
typedef struct {
+ char *ref[2];
+ int ref_id[2];
+ int ref_len[2];
+} mplp_ref_t;
+
+#define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}}
+
+typedef struct {
samFile *fp;
hts_itr_t *iter;
bam_hdr_t *h;
- int ref_id;
- char *ref;
+ mplp_ref_t *ref;
const mplp_conf_t *conf;
} mplp_aux_t;
@@ -139,13 +150,71 @@ typedef struct {
bam_pileup1_t **plp;
} mplp_pileup_t;
+static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) {
+ mplp_ref_t *r = ma->ref;
+
+ //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]);
+
+ if (!r || !ma->conf->fai) {
+ *ref = NULL;
+ return 0;
+ }
+
+ // Do we need to reference count this so multiple mplp_aux_t can
+ // track which references are in use?
+ // For now we just cache the last two. Sufficient?
+ if (tid == r->ref_id[0]) {
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+ }
+ if (tid == r->ref_id[1]) {
+ // Last, swap over
+ int tmp;
+ tmp = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp;
+ tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp;
+
+ char *tc;
+ tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc;
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+ }
+
+ // New, so migrate to old and load new
+ free(r->ref[1]);
+ r->ref[1] = r->ref[0];
+ r->ref_id[1] = r->ref_id[0];
+ r->ref_len[1] = r->ref_len[0];
+
+ r->ref_id[0] = tid;
+ r->ref[0] = faidx_fetch_seq(ma->conf->fai,
+ ma->h->target_name[r->ref_id[0]],
+ 0,
+ INT_MAX,
+ &r->ref_len[0]);
+
+ if (!r->ref[0]) {
+ r->ref[0] = NULL;
+ r->ref_id[0] = -1;
+ r->ref_len[0] = 0;
+ *ref = NULL;
+ return 0;
+ }
+
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+}
+
static int mplp_func(void *data, bam1_t *b)
{
extern int bam_realn(bam1_t *b, const char *ref);
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int);
- extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres);
+ extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag);
+ extern int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres);
+ char *ref;
mplp_aux_t *ma = (mplp_aux_t*)data;
- int ret, skip = 0;
+ int ret, skip = 0, ref_len;
do {
int has_ref;
ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b);
@@ -173,11 +242,23 @@ static int mplp_func(void *data, bam1_t *b)
for (i = 0; i < b->core.l_qseq; ++i)
qual[i] = qual[i] > 31? qual[i] - 31 : 0;
}
- has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0;
+
+ if (ma->conf->fai && b->core.tid >= 0) {
+ has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len);
+ if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence
+ fprintf(stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n",
+ __func__, b->core.pos, ref_len, b->core.tid);
+ skip = 1;
+ continue;
+ }
+ } else {
+ has_ref = 0;
+ }
+
skip = 0;
- if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+ if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
if (has_ref && ma->conf->capQ_thres > 10) {
- int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres);
+ int q = bam_cap_mapQ(b, ref, ref_len, ma->conf->capQ_thres);
if (q < 0) skip = 1;
else if (b->core.qual > q) b->core.qual = q;
}
@@ -197,13 +278,13 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf,
const bam_pileup1_t *p = plp[i] + j;
uint8_t *q;
int id = -1;
- q = ignore_rg? 0 : bam_aux_get(p->b, "RG");
+ q = ignore_rg? NULL : bam_aux_get(p->b, "RG");
if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf);
if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf);
if (id < 0 || id >= m->n) {
assert(q); // otherwise a bug
fprintf(stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]);
- exit(1);
+ exit(EXIT_FAILURE);
}
if (m->n_plp[id] == m->m_plp[id]) {
m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
@@ -225,8 +306,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
extern void bcf_call_del_rghash(void *rghash);
mplp_aux_t **data;
- int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
+ int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, ref_len, max_depth, max_indel_depth;
const bam_pileup1_t **plp;
+ mplp_ref_t mp_ref = MPLP_REF_INIT;
bam_mplp_t iter;
bam_hdr_t *h = NULL; /* header of first file in input list */
char *ref;
@@ -253,44 +335,49 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
if (n == 0) {
fprintf(stderr,"[%s] no input file/data given\n", __func__);
- exit(1);
+ exit(EXIT_FAILURE);
}
// read the header of each file in the list and initialize data
for (i = 0; i < n; ++i) {
bam_hdr_t *h_tmp;
data[i] = calloc(1, sizeof(mplp_aux_t));
- data[i]->fp = sam_open(fn[i], "rb");
+ data[i]->fp = sam_open_format(fn[i], "rb", &conf->ga.in);
if ( !data[i]->fp )
{
fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
- exit(1);
+ exit(EXIT_FAILURE);
}
if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
- return 1;
+ exit(EXIT_FAILURE);
+ }
+ if (conf->fai_fname && hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) {
+ fprintf(stderr, "[%s] failed to process %s: %s\n",
+ __func__, conf->fai_fname, strerror(errno));
+ exit(EXIT_FAILURE);
}
- hts_set_fai_filename(data[i]->fp, conf->fai_fname);
data[i]->conf = conf;
+ data[i]->ref = &mp_ref;
h_tmp = sam_hdr_read(data[i]->fp);
if ( !h_tmp ) {
fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
- exit(1);
+ exit(EXIT_FAILURE);
}
bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
// Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search)
rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
if (conf->reg) {
hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]);
- if (idx == 0) {
+ if (idx == NULL) {
fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]);
- exit(1);
+ exit(EXIT_FAILURE);
}
if ( (data[i]->iter=sam_itr_querys(idx, h_tmp, conf->reg)) == 0) {
fprintf(stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]);
- exit(1);
+ exit(EXIT_FAILURE);
}
- if (i == 0) tid0 = data[i]->iter->tid, beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
+ if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
hts_idx_destroy(idx);
}
else
@@ -325,12 +412,12 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode);
if (bcf_fp == NULL) {
fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
- exit(1);
+ exit(EXIT_FAILURE);
}
// BCF header creation
bcf_hdr = bcf_hdr_init("w");
- kstring_t str = {0,0,0};
+ kstring_t str = {0,0,NULL};
ksprintf(&str, "##samtoolsVersion=%s+htslib-%s\n",samtools_version(),hts_version());
bcf_hdr_append(bcf_hdr, str.s);
@@ -357,7 +444,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bcf_hdr_append(bcf_hdr, str.s);
}
free(str.s);
- bcf_hdr_append(bcf_hdr,"##ALT=<ID=X,Description=\"Represents allele(s) other than observed.\">");
+ bcf_hdr_append(bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">");
bcf_hdr_append(bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
bcf_hdr_append(bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">");
bcf_hdr_append(bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">");
@@ -390,6 +477,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
if ( conf->fmt_flag&B2B_FMT_SP )
bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");
+ if ( conf->fmt_flag&B2B_FMT_AD )
+ bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">");
+ if ( conf->fmt_flag&B2B_FMT_ADF )
+ bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">");
+ if ( conf->fmt_flag&B2B_FMT_ADR )
+ bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">");
+ if ( conf->fmt_flag&B2B_INFO_AD )
+ bcf_hdr_append(bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">");
+ if ( conf->fmt_flag&B2B_INFO_ADF )
+ bcf_hdr_append(bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">");
+ if ( conf->fmt_flag&B2B_INFO_ADR )
+ bcf_hdr_append(bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">");
for (i=0; i<sm->n; i++)
bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]);
@@ -414,12 +513,16 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
assert( sizeof(float)==sizeof(int32_t) );
bc.DP4 = malloc(sm->n * sizeof(int32_t) * 4);
bc.fmt_arr = malloc(sm->n * sizeof(float)); // all fmt_flag fields
- if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR) )
+ if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) )
{
// first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
- bc.DPR = malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t));
+ bc.ADR = (int32_t*) malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t));
+ bc.ADF = (int32_t*) malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t));
for (i=0; i<sm->n; i++)
- bcr[i].DPR = bc.DPR + (i+1)*B2B_MAX_ALLELES;
+ {
+ bcr[i].ADR = bc.ADR + (i+1)*B2B_MAX_ALLELES;
+ bcr[i].ADF = bc.ADF + (i+1)*B2B_MAX_ALLELES;
+ }
}
}
}
@@ -428,16 +531,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
if (pileup_fp == NULL) {
fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno));
- exit(1);
+ exit(EXIT_FAILURE);
}
}
- if (tid0 >= 0 && conf->fai) { // region is set
- ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
- ref_tid = tid0;
- for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
- } else ref_tid = -1, ref = 0;
-
// init pileup
iter = bam_mplp_init(n, mplp_func, (void**)data);
if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter);
@@ -456,12 +553,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
- if (tid != ref_tid) {
- free(ref); ref = 0;
- if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
- for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
- ref_tid = tid;
- }
+ mplp_get_ref(data[0], tid, &ref, &ref_len);
+ //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref);
if (conf->flag & MPLP_BCF) {
int total_depth, _ref0, ref16;
for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
@@ -537,8 +630,13 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
}
if (conf->flag & MPLP_PRINT_POS) {
putc('\t', pileup_fp);
+ int last = 0;
for (j = 0; j < n_plp[i]; ++j) {
- if (j > 0) putc(',', pileup_fp);
+ const bam_pileup1_t *p = plp[i] + j;
+ int c = bam_get_qual(p->b)[p->qpos];
+ if ( c < conf->min_baseQ ) continue;
+
+ if (last++) putc(',', pileup_fp);
fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
}
}
@@ -558,7 +656,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bcf_call_destroy(bca);
free(bc.PL);
free(bc.DP4);
- free(bc.DPR);
+ free(bc.ADR);
+ free(bc.ADF);
free(bc.fmt_arr);
free(bcr);
}
@@ -574,7 +673,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
if (data[i]->iter) hts_itr_destroy(data[i]->iter);
free(data[i]);
}
- free(data); free(plp); free(ref); free(n_plp);
+ free(data); free(plp); free(n_plp);
+ free(mp_ref.ref[0]);
+ free(mp_ref.ref[1]);
return ret;
}
@@ -643,15 +744,21 @@ int parse_format_flag(const char *str)
for(i=0; i<n_tags; i++)
{
if ( !strcasecmp(tags[i],"DP") ) flag |= B2B_FMT_DP;
- else if ( !strcasecmp(tags[i],"DV") ) flag |= B2B_FMT_DV;
+ else if ( !strcasecmp(tags[i],"DV") ) { flag |= B2B_FMT_DV; fprintf(stderr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); }
else if ( !strcasecmp(tags[i],"SP") ) flag |= B2B_FMT_SP;
- else if ( !strcasecmp(tags[i],"DP4") ) flag |= B2B_FMT_DP4;
- else if ( !strcasecmp(tags[i],"DPR") ) flag |= B2B_FMT_DPR;
- else if ( !strcasecmp(tags[i],"INFO/DPR") ) flag |= B2B_INFO_DPR;
+ else if ( !strcasecmp(tags[i],"DP4") ) { flag |= B2B_FMT_DP4; fprintf(stderr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"DPR") ) { flag |= B2B_FMT_DPR; fprintf(stderr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(stderr, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"AD") ) flag |= B2B_FMT_AD;
+ else if ( !strcasecmp(tags[i],"ADF") ) flag |= B2B_FMT_ADF;
+ else if ( !strcasecmp(tags[i],"ADR") ) flag |= B2B_FMT_ADR;
+ else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
+ else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
+ else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
else
{
fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
- exit(1);
+ exit(EXIT_FAILURE);
}
free(tags[i]);
}
@@ -707,7 +814,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
" -s, --output-MQ output mapping quality\n"
"\n"
"Output options for genotype likelihoods (when -g/-v is used):\n"
-" -t, --output-tags LIST optional tags to output: DP,DPR,DV,DP4,INFO/DPR,SP []\n"
+" -t, --output-tags LIST optional tags to output:\n"
+" DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR []\n"
" -u, --uncompressed generate uncompressed VCF/BCF output\n"
"\n"
"SNP/INDEL genotype likelihoods options (effective with -g/-v):\n"
@@ -725,7 +833,9 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
fprintf(fp,
" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
-" -P, --platforms STR comma separated list of platforms for indels [all]\n"
+" -P, --platforms STR comma separated list of platforms for indels [all]\n");
+ sam_global_opt_help(fp, "-.--.");
+ fprintf(fp,
"\n"
"Notes: Assuming diploid individuals.\n");
@@ -750,8 +860,11 @@ int bam_mpileup(int argc, char *argv[])
mplp.argc = argc; mplp.argv = argv;
mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
mplp.output_fname = NULL;
+ sam_global_args_init(&mplp.ga);
+
static const struct option lopts[] =
{
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
{"rf", required_argument, NULL, 1}, // require flag
{"ff", required_argument, NULL, 2}, // filter flag
{"incl-flags", required_argument, NULL, 1},
@@ -816,7 +929,7 @@ int bam_mpileup(int argc, char *argv[])
case 4 : mplp.openQ = atoi(optarg); break;
case 'f':
mplp.fai = fai_load(optarg);
- if (mplp.fai == 0) return 1;
+ if (mplp.fai == NULL) return 1;
mplp.fai_fname = optarg;
break;
case 'd': mplp.max_depth = atoi(optarg); break;
@@ -826,7 +939,7 @@ int bam_mpileup(int argc, char *argv[])
// with few BED intervals and big BAMs. Todo: devise a heuristic to determine
// best strategy, that is streaming or jumping.
mplp.bed = bed_read(optarg);
- if (!mplp.bed) { print_error_errno("Could not read file \"%s\"", optarg); return 1; }
+ if (!mplp.bed) { print_error_errno("mpileup", "Could not read file \"%s\"", optarg); return 1; }
break;
case 'P': mplp.pl_list = strdup(optarg); break;
case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
@@ -865,7 +978,7 @@ int bam_mpileup(int argc, char *argv[])
FILE *fp_rg;
char buf[1024];
mplp.rghash = khash_str2int_init();
- if ((fp_rg = fopen(optarg, "r")) == 0)
+ if ((fp_rg = fopen(optarg, "r")) == NULL)
fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg);
while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me...
khash_str2int_inc(mplp.rghash, strdup(buf));
@@ -874,10 +987,19 @@ int bam_mpileup(int argc, char *argv[])
break;
case 't': mplp.fmt_flag |= parse_format_flag(optarg); break;
default:
- fprintf(stderr,"Invalid option: '%c'\n", c);
+ if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break;
+ /* else fall-through */
+ case '?':
+ print_usage(stderr, &mplp);
return 1;
}
}
+ if (!mplp.fai && mplp.ga.reference) {
+ mplp.fai_fname = mplp.ga.reference;
+ mplp.fai = fai_load(mplp.fai_fname);
+ if (mplp.fai == NULL) return 1;
+ }
+
if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ )
{
fprintf(stderr,"Error: The -B option cannot be combined with -E\n");
diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c
index 9d2c987..bafbb92 100644
--- a/samtools/bam_plcmd.c.pysam.c
+++ b/samtools/bam_plcmd.c.pysam.c
@@ -2,7 +2,7 @@
/* bam_plcmd.c -- mpileup subcommand.
- Copyright (C) 2008-2014 Genome Research Ltd.
+ Copyright (C) 2008-2015 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -27,9 +27,11 @@ DEALINGS IN THE SOFTWARE. */
#include <math.h>
#include <stdio.h>
+#include <stdlib.h>
#include <unistd.h>
#include <ctype.h>
#include <string.h>
+#include <limits.h>
#include <errno.h>
#include <sys/stat.h>
#include <getopt.h>
@@ -39,6 +41,7 @@ DEALINGS IN THE SOFTWARE. */
#include <htslib/khash_str2int.h>
#include "sam_header.h"
#include "samtools.h"
+#include "sam_opts.h"
static inline int printw(int c, FILE *fp)
{
@@ -124,14 +127,22 @@ typedef struct {
void *bed, *rghash;
int argc;
char **argv;
+ sam_global_args ga;
} mplp_conf_t;
typedef struct {
+ char *ref[2];
+ int ref_id[2];
+ int ref_len[2];
+} mplp_ref_t;
+
+#define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}}
+
+typedef struct {
samFile *fp;
hts_itr_t *iter;
bam_hdr_t *h;
- int ref_id;
- char *ref;
+ mplp_ref_t *ref;
const mplp_conf_t *conf;
} mplp_aux_t;
@@ -141,13 +152,71 @@ typedef struct {
bam_pileup1_t **plp;
} mplp_pileup_t;
+static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) {
+ mplp_ref_t *r = ma->ref;
+
+ //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]);
+
+ if (!r || !ma->conf->fai) {
+ *ref = NULL;
+ return 0;
+ }
+
+ // Do we need to reference count this so multiple mplp_aux_t can
+ // track which references are in use?
+ // For now we just cache the last two. Sufficient?
+ if (tid == r->ref_id[0]) {
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+ }
+ if (tid == r->ref_id[1]) {
+ // Last, swap over
+ int tmp;
+ tmp = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp;
+ tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp;
+
+ char *tc;
+ tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc;
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+ }
+
+ // New, so migrate to old and load new
+ free(r->ref[1]);
+ r->ref[1] = r->ref[0];
+ r->ref_id[1] = r->ref_id[0];
+ r->ref_len[1] = r->ref_len[0];
+
+ r->ref_id[0] = tid;
+ r->ref[0] = faidx_fetch_seq(ma->conf->fai,
+ ma->h->target_name[r->ref_id[0]],
+ 0,
+ INT_MAX,
+ &r->ref_len[0]);
+
+ if (!r->ref[0]) {
+ r->ref[0] = NULL;
+ r->ref_id[0] = -1;
+ r->ref_len[0] = 0;
+ *ref = NULL;
+ return 0;
+ }
+
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+}
+
static int mplp_func(void *data, bam1_t *b)
{
extern int bam_realn(bam1_t *b, const char *ref);
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int);
- extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres);
+ extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag);
+ extern int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres);
+ char *ref;
mplp_aux_t *ma = (mplp_aux_t*)data;
- int ret, skip = 0;
+ int ret, skip = 0, ref_len;
do {
int has_ref;
ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b);
@@ -175,11 +244,23 @@ static int mplp_func(void *data, bam1_t *b)
for (i = 0; i < b->core.l_qseq; ++i)
qual[i] = qual[i] > 31? qual[i] - 31 : 0;
}
- has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0;
+
+ if (ma->conf->fai && b->core.tid >= 0) {
+ has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len);
+ if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence
+ fprintf(pysamerr,"[%s] Skipping because %d is outside of %d [ref:%d]\n",
+ __func__, b->core.pos, ref_len, b->core.tid);
+ skip = 1;
+ continue;
+ }
+ } else {
+ has_ref = 0;
+ }
+
skip = 0;
- if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+ if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
if (has_ref && ma->conf->capQ_thres > 10) {
- int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres);
+ int q = bam_cap_mapQ(b, ref, ref_len, ma->conf->capQ_thres);
if (q < 0) skip = 1;
else if (b->core.qual > q) b->core.qual = q;
}
@@ -199,13 +280,13 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf,
const bam_pileup1_t *p = plp[i] + j;
uint8_t *q;
int id = -1;
- q = ignore_rg? 0 : bam_aux_get(p->b, "RG");
+ q = ignore_rg? NULL : bam_aux_get(p->b, "RG");
if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf);
if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf);
if (id < 0 || id >= m->n) {
assert(q); // otherwise a bug
fprintf(pysamerr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]);
- exit(1);
+ exit(EXIT_FAILURE);
}
if (m->n_plp[id] == m->m_plp[id]) {
m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
@@ -227,8 +308,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
extern void bcf_call_del_rghash(void *rghash);
mplp_aux_t **data;
- int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
+ int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, ref_len, max_depth, max_indel_depth;
const bam_pileup1_t **plp;
+ mplp_ref_t mp_ref = MPLP_REF_INIT;
bam_mplp_t iter;
bam_hdr_t *h = NULL; /* header of first file in input list */
char *ref;
@@ -255,44 +337,49 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
if (n == 0) {
fprintf(pysamerr,"[%s] no input file/data given\n", __func__);
- exit(1);
+ exit(EXIT_FAILURE);
}
// read the header of each file in the list and initialize data
for (i = 0; i < n; ++i) {
bam_hdr_t *h_tmp;
data[i] = calloc(1, sizeof(mplp_aux_t));
- data[i]->fp = sam_open(fn[i], "rb");
+ data[i]->fp = sam_open_format(fn[i], "rb", &conf->ga.in);
if ( !data[i]->fp )
{
fprintf(pysamerr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
- exit(1);
+ exit(EXIT_FAILURE);
}
if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n");
- return 1;
+ exit(EXIT_FAILURE);
+ }
+ if (conf->fai_fname && hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) {
+ fprintf(pysamerr, "[%s] failed to process %s: %s\n",
+ __func__, conf->fai_fname, strerror(errno));
+ exit(EXIT_FAILURE);
}
- hts_set_fai_filename(data[i]->fp, conf->fai_fname);
data[i]->conf = conf;
+ data[i]->ref = &mp_ref;
h_tmp = sam_hdr_read(data[i]->fp);
if ( !h_tmp ) {
fprintf(pysamerr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
- exit(1);
+ exit(EXIT_FAILURE);
}
bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
// Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search)
rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
if (conf->reg) {
hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]);
- if (idx == 0) {
+ if (idx == NULL) {
fprintf(pysamerr, "[%s] fail to load index for %s\n", __func__, fn[i]);
- exit(1);
+ exit(EXIT_FAILURE);
}
if ( (data[i]->iter=sam_itr_querys(idx, h_tmp, conf->reg)) == 0) {
fprintf(pysamerr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]);
- exit(1);
+ exit(EXIT_FAILURE);
}
- if (i == 0) tid0 = data[i]->iter->tid, beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
+ if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
hts_idx_destroy(idx);
}
else
@@ -327,12 +414,12 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode);
if (bcf_fp == NULL) {
fprintf(pysamerr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
- exit(1);
+ exit(EXIT_FAILURE);
}
// BCF header creation
bcf_hdr = bcf_hdr_init("w");
- kstring_t str = {0,0,0};
+ kstring_t str = {0,0,NULL};
ksprintf(&str, "##samtoolsVersion=%s+htslib-%s\n",samtools_version(),hts_version());
bcf_hdr_append(bcf_hdr, str.s);
@@ -359,7 +446,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bcf_hdr_append(bcf_hdr, str.s);
}
free(str.s);
- bcf_hdr_append(bcf_hdr,"##ALT=<ID=X,Description=\"Represents allele(s) other than observed.\">");
+ bcf_hdr_append(bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">");
bcf_hdr_append(bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
bcf_hdr_append(bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">");
bcf_hdr_append(bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">");
@@ -392,6 +479,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
if ( conf->fmt_flag&B2B_FMT_SP )
bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");
+ if ( conf->fmt_flag&B2B_FMT_AD )
+ bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">");
+ if ( conf->fmt_flag&B2B_FMT_ADF )
+ bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">");
+ if ( conf->fmt_flag&B2B_FMT_ADR )
+ bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">");
+ if ( conf->fmt_flag&B2B_INFO_AD )
+ bcf_hdr_append(bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">");
+ if ( conf->fmt_flag&B2B_INFO_ADF )
+ bcf_hdr_append(bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">");
+ if ( conf->fmt_flag&B2B_INFO_ADR )
+ bcf_hdr_append(bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">");
for (i=0; i<sm->n; i++)
bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]);
@@ -416,12 +515,16 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
assert( sizeof(float)==sizeof(int32_t) );
bc.DP4 = malloc(sm->n * sizeof(int32_t) * 4);
bc.fmt_arr = malloc(sm->n * sizeof(float)); // all fmt_flag fields
- if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR) )
+ if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) )
{
// first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
- bc.DPR = malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t));
+ bc.ADR = (int32_t*) malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t));
+ bc.ADF = (int32_t*) malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t));
for (i=0; i<sm->n; i++)
- bcr[i].DPR = bc.DPR + (i+1)*B2B_MAX_ALLELES;
+ {
+ bcr[i].ADR = bc.ADR + (i+1)*B2B_MAX_ALLELES;
+ bcr[i].ADF = bc.ADF + (i+1)*B2B_MAX_ALLELES;
+ }
}
}
}
@@ -430,16 +533,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
if (pileup_fp == NULL) {
fprintf(pysamerr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno));
- exit(1);
+ exit(EXIT_FAILURE);
}
}
- if (tid0 >= 0 && conf->fai) { // region is set
- ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
- ref_tid = tid0;
- for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
- } else ref_tid = -1, ref = 0;
-
// init pileup
iter = bam_mplp_init(n, mplp_func, (void**)data);
if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter);
@@ -458,12 +555,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
- if (tid != ref_tid) {
- free(ref); ref = 0;
- if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
- for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
- ref_tid = tid;
- }
+ mplp_get_ref(data[0], tid, &ref, &ref_len);
+ //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref);
if (conf->flag & MPLP_BCF) {
int total_depth, _ref0, ref16;
for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
@@ -539,8 +632,13 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
}
if (conf->flag & MPLP_PRINT_POS) {
putc('\t', pileup_fp);
+ int last = 0;
for (j = 0; j < n_plp[i]; ++j) {
- if (j > 0) putc(',', pileup_fp);
+ const bam_pileup1_t *p = plp[i] + j;
+ int c = bam_get_qual(p->b)[p->qpos];
+ if ( c < conf->min_baseQ ) continue;
+
+ if (last++) putc(',', pileup_fp);
fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
}
}
@@ -560,7 +658,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bcf_call_destroy(bca);
free(bc.PL);
free(bc.DP4);
- free(bc.DPR);
+ free(bc.ADR);
+ free(bc.ADF);
free(bc.fmt_arr);
free(bcr);
}
@@ -576,7 +675,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
if (data[i]->iter) hts_itr_destroy(data[i]->iter);
free(data[i]);
}
- free(data); free(plp); free(ref); free(n_plp);
+ free(data); free(plp); free(n_plp);
+ free(mp_ref.ref[0]);
+ free(mp_ref.ref[1]);
return ret;
}
@@ -645,15 +746,21 @@ int parse_format_flag(const char *str)
for(i=0; i<n_tags; i++)
{
if ( !strcasecmp(tags[i],"DP") ) flag |= B2B_FMT_DP;
- else if ( !strcasecmp(tags[i],"DV") ) flag |= B2B_FMT_DV;
+ else if ( !strcasecmp(tags[i],"DV") ) { flag |= B2B_FMT_DV; fprintf(pysamerr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); }
else if ( !strcasecmp(tags[i],"SP") ) flag |= B2B_FMT_SP;
- else if ( !strcasecmp(tags[i],"DP4") ) flag |= B2B_FMT_DP4;
- else if ( !strcasecmp(tags[i],"DPR") ) flag |= B2B_FMT_DPR;
- else if ( !strcasecmp(tags[i],"INFO/DPR") ) flag |= B2B_INFO_DPR;
+ else if ( !strcasecmp(tags[i],"DP4") ) { flag |= B2B_FMT_DP4; fprintf(pysamerr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"DPR") ) { flag |= B2B_FMT_DPR; fprintf(pysamerr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(pysamerr, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"AD") ) flag |= B2B_FMT_AD;
+ else if ( !strcasecmp(tags[i],"ADF") ) flag |= B2B_FMT_ADF;
+ else if ( !strcasecmp(tags[i],"ADR") ) flag |= B2B_FMT_ADR;
+ else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
+ else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
+ else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
else
{
fprintf(pysamerr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
- exit(1);
+ exit(EXIT_FAILURE);
}
free(tags[i]);
}
@@ -709,7 +816,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
" -s, --output-MQ output mapping quality\n"
"\n"
"Output options for genotype likelihoods (when -g/-v is used):\n"
-" -t, --output-tags LIST optional tags to output: DP,DPR,DV,DP4,INFO/DPR,SP []\n"
+" -t, --output-tags LIST optional tags to output:\n"
+" DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR []\n"
" -u, --uncompressed generate uncompressed VCF/BCF output\n"
"\n"
"SNP/INDEL genotype likelihoods options (effective with -g/-v):\n"
@@ -727,7 +835,9 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
fprintf(fp,
" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
-" -P, --platforms STR comma separated list of platforms for indels [all]\n"
+" -P, --platforms STR comma separated list of platforms for indels [all]\n");
+ sam_global_opt_help(fp, "-.--.");
+ fprintf(fp,
"\n"
"Notes: Assuming diploid individuals.\n");
@@ -752,8 +862,11 @@ int bam_mpileup(int argc, char *argv[])
mplp.argc = argc; mplp.argv = argv;
mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
mplp.output_fname = NULL;
+ sam_global_args_init(&mplp.ga);
+
static const struct option lopts[] =
{
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
{"rf", required_argument, NULL, 1}, // require flag
{"ff", required_argument, NULL, 2}, // filter flag
{"incl-flags", required_argument, NULL, 1},
@@ -818,7 +931,7 @@ int bam_mpileup(int argc, char *argv[])
case 4 : mplp.openQ = atoi(optarg); break;
case 'f':
mplp.fai = fai_load(optarg);
- if (mplp.fai == 0) return 1;
+ if (mplp.fai == NULL) return 1;
mplp.fai_fname = optarg;
break;
case 'd': mplp.max_depth = atoi(optarg); break;
@@ -828,7 +941,7 @@ int bam_mpileup(int argc, char *argv[])
// with few BED intervals and big BAMs. Todo: devise a heuristic to determine
// best strategy, that is streaming or jumping.
mplp.bed = bed_read(optarg);
- if (!mplp.bed) { print_error_errno("Could not read file \"%s\"", optarg); return 1; }
+ if (!mplp.bed) { print_error_errno("mpileup", "Could not read file \"%s\"", optarg); return 1; }
break;
case 'P': mplp.pl_list = strdup(optarg); break;
case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
@@ -867,7 +980,7 @@ int bam_mpileup(int argc, char *argv[])
FILE *fp_rg;
char buf[1024];
mplp.rghash = khash_str2int_init();
- if ((fp_rg = fopen(optarg, "r")) == 0)
+ if ((fp_rg = fopen(optarg, "r")) == NULL)
fprintf(pysamerr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg);
while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me...
khash_str2int_inc(mplp.rghash, strdup(buf));
@@ -876,10 +989,19 @@ int bam_mpileup(int argc, char *argv[])
break;
case 't': mplp.fmt_flag |= parse_format_flag(optarg); break;
default:
- fprintf(pysamerr,"Invalid option: '%c'\n", c);
+ if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break;
+ /* else fall-through */
+ case '?':
+ print_usage(pysamerr, &mplp);
return 1;
}
}
+ if (!mplp.fai && mplp.ga.reference) {
+ mplp.fai_fname = mplp.ga.reference;
+ mplp.fai = fai_load(mplp.fai_fname);
+ if (mplp.fai == NULL) return 1;
+ }
+
if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ )
{
fprintf(pysamerr,"Error: The -B option cannot be combined with -E\n");
diff --git a/samtools/bam_quickcheck.c b/samtools/bam_quickcheck.c
new file mode 100644
index 0000000..8d1e7ef
--- /dev/null
+++ b/samtools/bam_quickcheck.c
@@ -0,0 +1,134 @@
+/* bam_quickcheck.c -- quickcheck subcommand.
+
+ Copyright (C) 2015 Genome Research Ltd.
+
+ Author: Joshua C. Randall <jcrandall at alum.mit.edu>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/bgzf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+static void usage_quickcheck(FILE *write_to)
+{
+ fprintf(write_to,
+"Usage: samtools quickcheck [options] <input> [...]\n"
+"Options:\n"
+" -v verbose output (repeat for more verbosity)\n"
+"\n"
+ );
+}
+
+int main_quickcheck(int argc, char** argv)
+{
+ int verbose = 0;
+ hts_verbose = 0;
+
+ const char* optstring = "v";
+ int opt;
+ while ((opt = getopt(argc, argv, optstring)) != -1) {
+ switch (opt) {
+ case 'v':
+ verbose++;
+ break;
+ default:
+ usage_quickcheck(stderr);
+ return 1;
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ usage_quickcheck(stdout);
+ return 1;
+ }
+
+ if (verbose >= 2) {
+ fprintf(stderr, "verbosity set to %d\n", verbose);
+ }
+
+ if (verbose >= 4) {
+ hts_verbose = 3;
+ }
+
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < argc; i++) {
+ char* fn = argv[i];
+ int file_state = 0;
+
+ if (verbose >= 3) fprintf(stderr, "checking %s\n", fn);
+
+ // attempt to open
+ htsFile *hts_fp = hts_open(fn, "r");
+ if (hts_fp == NULL) {
+ if (verbose >= 2) fprintf(stderr, "%s could not be opened for reading\n", fn);
+ file_state |= 2;
+ }
+ else {
+ if (verbose >= 3) fprintf(stderr, "opened %s\n", fn);
+ // make sure we have sequence data
+ const htsFormat *fmt = hts_get_format(hts_fp);
+ if (fmt->category != sequence_data ) {
+ if (verbose >= 2) fprintf(stderr, "%s was not identified as sequence data\n", fn);
+ file_state |= 4;
+ }
+ else {
+ if (verbose >= 3) fprintf(stderr, "%s is sequence data\n", fn);
+ // check header
+ bam_hdr_t *header = sam_hdr_read(hts_fp);
+ if (header->n_targets <= 0) {
+ if (verbose >= 2) fprintf(stderr, "%s had no targets in header\n", fn);
+ file_state |= 8;
+ }
+ else {
+ if (verbose >= 3) fprintf(stderr, "%s has %d targets in header\n", fn, header->n_targets);
+ }
+
+ // only check EOF on BAM for now
+ // TODO implement and use hts_check_EOF() to include CRAM support
+ if (fmt->format == bam) {
+ if (bgzf_check_EOF(hts_fp->fp.bgzf) <= 0) {
+ if (verbose >= 2) fprintf(stderr, "%s was missing EOF block\n", fn);
+ file_state |= 16;
+ }
+ else {
+ if (verbose >= 3) fprintf(stderr, "%s has good EOF block\n", fn);
+ }
+ }
+ }
+
+ hts_close(hts_fp);
+ }
+
+ if (file_state > 0 && verbose >= 1) {
+ fprintf(stdout, "%s\n", fn);
+ }
+ ret |= file_state;
+ }
+
+ return ret;
+}
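
[editor's note, not part of the patch] quickcheck records each per-file problem as a bit in file_state (2: could not be opened, 4: not sequence data, 8: no targets in the header, 16: missing BAM EOF block) and ORs these into the process return value, so any non-zero exit means at least one input failed at least one check. The helper below is a hypothetical caller-side decoder of those bits; only the bit values are taken from the diff above.

    #include <stdio.h>

    /* Hypothetical helper: report which quickcheck-style bits are set.
     * Bit meanings follow the file_state assignments in the diff. */
    static void report_state(const char *fn, int state)
    {
        if (state & 2)  fprintf(stderr, "%s: could not be opened\n", fn);
        if (state & 4)  fprintf(stderr, "%s: not sequence data\n", fn);
        if (state & 8)  fprintf(stderr, "%s: no targets in header\n", fn);
        if (state & 16) fprintf(stderr, "%s: missing EOF block\n", fn);
        if (state == 0) fprintf(stderr, "%s: ok\n", fn);
    }
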
diff --git a/samtools/bam_quickcheck.c.pysam.c b/samtools/bam_quickcheck.c.pysam.c
new file mode 100644
index 0000000..b589d46
--- /dev/null
+++ b/samtools/bam_quickcheck.c.pysam.c
@@ -0,0 +1,136 @@
+#include "pysam.h"
+
+/* bam_quickcheck.c -- quickcheck subcommand.
+
+ Copyright (C) 2015 Genome Research Ltd.
+
+ Author: Joshua C. Randall <jcrandall at alum.mit.edu>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/bgzf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+static void usage_quickcheck(FILE *write_to)
+{
+ fprintf(write_to,
+"Usage: samtools quickcheck [options] <input> [...]\n"
+"Options:\n"
+" -v verbose output (repeat for more verbosity)\n"
+"\n"
+ );
+}
+
+int main_quickcheck(int argc, char** argv)
+{
+ int verbose = 0;
+ hts_verbose = 0;
+
+ const char* optstring = "v";
+ int opt;
+ while ((opt = getopt(argc, argv, optstring)) != -1) {
+ switch (opt) {
+ case 'v':
+ verbose++;
+ break;
+ default:
+ usage_quickcheck(pysamerr);
+ return 1;
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ usage_quickcheck(stdout);
+ return 1;
+ }
+
+ if (verbose >= 2) {
+ fprintf(pysamerr, "verbosity set to %d\n", verbose);
+ }
+
+ if (verbose >= 4) {
+ hts_verbose = 3;
+ }
+
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < argc; i++) {
+ char* fn = argv[i];
+ int file_state = 0;
+
+ if (verbose >= 3) fprintf(pysamerr, "checking %s\n", fn);
+
+ // attempt to open
+ htsFile *hts_fp = hts_open(fn, "r");
+ if (hts_fp == NULL) {
+ if (verbose >= 2) fprintf(pysamerr, "%s could not be opened for reading\n", fn);
+ file_state |= 2;
+ }
+ else {
+ if (verbose >= 3) fprintf(pysamerr, "opened %s\n", fn);
+ // make sure we have sequence data
+ const htsFormat *fmt = hts_get_format(hts_fp);
+ if (fmt->category != sequence_data ) {
+ if (verbose >= 2) fprintf(pysamerr, "%s was not identified as sequence data\n", fn);
+ file_state |= 4;
+ }
+ else {
+ if (verbose >= 3) fprintf(pysamerr, "%s is sequence data\n", fn);
+ // check header
+ bam_hdr_t *header = sam_hdr_read(hts_fp);
+ if (header->n_targets <= 0) {
+ if (verbose >= 2) fprintf(pysamerr, "%s had no targets in header\n", fn);
+ file_state |= 8;
+ }
+ else {
+ if (verbose >= 3) fprintf(pysamerr, "%s has %d targets in header\n", fn, header->n_targets);
+ }
+
+ // only check EOF on BAM for now
+ // TODO implement and use hts_check_EOF() to include CRAM support
+ if (fmt->format == bam) {
+ if (bgzf_check_EOF(hts_fp->fp.bgzf) <= 0) {
+ if (verbose >= 2) fprintf(pysamerr, "%s was missing EOF block\n", fn);
+ file_state |= 16;
+ }
+ else {
+ if (verbose >= 3) fprintf(pysamerr, "%s has good EOF block\n", fn);
+ }
+ }
+ }
+
+ hts_close(hts_fp);
+ }
+
+ if (file_state > 0 && verbose >= 1) {
+ fprintf(stdout, "%s\n", fn);
+ }
+ ret |= file_state;
+ }
+
+ return ret;
+}
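
[editor's note, not part of the patch] The bam_quickcheck.c.pysam.c twin above differs from bam_quickcheck.c only by the added #include "pysam.h" and by writing diagnostics to pysamerr instead of stderr, so pysam can capture the messages. The header itself is not shown in this excerpt; a plausible minimal form of that indirection, stated purely as an assumption, would be:

    /* Hypothetical sketch of pysam.h -- the real header is not shown here. */
    #ifndef PYSAM_H
    #define PYSAM_H
    #include <stdio.h>
    extern FILE *pysamerr;      /* assumed: stream pysam substitutes for stderr */
    #endif
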
diff --git a/samtools/bam_reheader.c b/samtools/bam_reheader.c
index 47ceced..dc43807 100644
--- a/samtools/bam_reheader.c
+++ b/samtools/bam_reheader.c
@@ -25,22 +25,55 @@ DEALINGS IN THE SOFTWARE. */
#include <stdio.h>
#include <stdlib.h>
+#include <assert.h>
+#include <getopt.h>
+
#include "htslib/bgzf.h"
-#include "bam.h"
+#include "htslib/sam.h"
+#include "htslib/hfile.h"
+#include "htslib/cram.h"
+#include "samtools.h"
#define BUF_SIZE 0x10000
-int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
+/*
+ * Reads a file and outputs a new BAM file to fd with 'h' replaced as
+ * the header. No checks are made to the validity.
+ */
+int bam_reheader(BGZF *in, bam_hdr_t *h, int fd,
+ const char *arg_list, int add_PG)
{
BGZF *fp;
- bam_header_t *old;
ssize_t len;
uint8_t *buf;
if (in->is_write) return -1;
buf = malloc(BUF_SIZE);
- old = bam_header_read(in);
+ if (bam_hdr_read(in) == NULL) {
+ fprintf(stderr, "Couldn't read header\n");
+ free(buf);
+ return -1;
+ }
fp = bgzf_fdopen(fd, "w");
- bam_header_write(fp, h);
+
+ if (add_PG) {
+ // Around the houses, but it'll do until we can manipulate bam_hdr_t natively.
+ SAM_hdr *sh = sam_hdr_parse_(h->text, h->l_text);
+ if (sam_hdr_add_PG(sh, "samtools",
+ "VN", samtools_version(),
+ arg_list ? "CL": NULL,
+ arg_list ? arg_list : NULL,
+ NULL) != 0)
+ return -1;
+
+ free(h->text);
+ h->text = strdup(sam_hdr_str(sh));
+ h->l_text = sam_hdr_length(sh);
+ if (!h->text)
+ return -1;
+ sam_hdr_free(sh);
+ }
+
+ bam_hdr_write(fp, h);
if (in->block_offset < in->block_length) {
bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
bgzf_flush(fp);
@@ -53,29 +86,397 @@ int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
return 0;
}
+/*
+ * Reads a file and outputs a new CRAM file to stdout with 'h'
+ * replaced as the header. No checks are made to the validity.
+ *
+ * FIXME: error checking
+ */
+int cram_reheader(cram_fd *in, bam_hdr_t *h, const char *arg_list, int add_PG)
+{
+ htsFile *h_out = hts_open("-", "wc");
+ cram_fd *out = h_out->fp.cram;
+ cram_container *c = NULL;
+ int ret = -1;
+
+ // Attempt to fill out a cram->refs[] array from @SQ headers
+ cram_fd_set_header(out, sam_hdr_parse_(h->text, h->l_text));
+ if (add_PG) {
+ if (sam_hdr_add_PG(cram_fd_get_header(out), "samtools",
+ "VN", samtools_version(),
+ arg_list ? "CL": NULL,
+ arg_list ? arg_list : NULL,
+ NULL) != 0)
+ goto err;
+
+ // Convert back to bam_hdr_t struct
+ free(h->text);
+ h->text = strdup(sam_hdr_str(cram_fd_get_header(out)));
+ h->l_text = sam_hdr_length(cram_fd_get_header(out));
+ if (!h->text)
+ goto err;
+ }
+
+ if (sam_hdr_write(h_out, h) != 0)
+ goto err;
+ cram_set_option(out, CRAM_OPT_REFERENCE, NULL);
+
+ while ((c = cram_read_container(in))) {
+ int32_t i, num_blocks = cram_container_get_num_blocks(c);
+ if (cram_write_container(out, c) != 0)
+ goto err;
+
+ for (i = 0; i < num_blocks; i++) {
+ cram_block *blk = cram_read_block(in);
+ if (!blk || cram_write_block(out, blk) != 0) {
+ if (blk) cram_free_block(blk);
+ goto err;
+ }
+ cram_free_block(blk);
+ }
+ cram_free_container(c);
+ }
+
+ ret = 0;
+
+ err:
+ if (hts_close(h_out) != 0)
+ ret = -1;
+
+ return ret;
+}
+
+
+
+/*
+ * Reads a version 2 CRAM file and replaces the header in-place,
+ * provided the header is small enough to fit without growing the
+ * entire file.
+ *
+ * Version 2 format has an uncompressed SAM header with multiple nul
+ * termination bytes to permit inline header editing.
+ *
+ * Returns 0 on success;
+ * -1 on general failure;
+ * -2 on failure due to insufficient size
+ */
+int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
+ int add_PG)
+{
+ cram_container *c = NULL;
+ cram_block *b = NULL;
+ SAM_hdr *hdr = NULL;
+ off_t start;
+ int ret = -1;
+
+ if (cram_major_vers(fd) < 2 ||
+ cram_major_vers(fd) > 3) {
+ fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__,
+ cram_major_vers(fd));
+ goto err;
+ }
+
+ if (!(hdr = sam_hdr_parse_(h->text, h->l_text)))
+ goto err;
+
+ if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(),
+ arg_list ? "CL": NULL,
+ arg_list ? arg_list : NULL,
+ NULL))
+ goto err;
+
+ int header_len = sam_hdr_length(hdr);
+ /* Fix M5 strings? Maybe out of scope for this tool */
+
+ // Load the existing header
+ if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26)
+ goto err;
+
+ if (!(c = cram_read_container(fd)))
+ goto err;
+
+ // Version 2.1 has a single uncompressed block which is nul
+ // terminated with many nuls to permit growth.
+ //
+ // So load old block and keep all contents identical bar the
+ // header text itself
+ if (!(b = cram_read_block(fd)))
+ goto err;
+
+ if (cram_block_get_uncomp_size(b) < header_len+4) {
+ fprintf(stderr, "New header will not fit. Use non-inplace version (%d > %d)\n",
+ header_len+4, cram_block_get_uncomp_size(b));
+ ret = -2;
+ goto err;
+ }
+
+ cram_block_set_offset(b, 0); // rewind block
+ int32_put_blk(b, header_len);
+ cram_block_append(b, sam_hdr_str(hdr), header_len);
+ // Zero the remaining block
+ memset(cram_block_get_data(b)+cram_block_get_offset(b), 0,
+ cram_block_get_uncomp_size(b) - cram_block_get_offset(b));
+ // Make sure all sizes and byte-offsets are consistent after memset
+ cram_block_set_offset(b, cram_block_get_uncomp_size(b));
+ cram_block_set_comp_size(b, cram_block_get_uncomp_size(b));
+
+ if (hseek(cram_fd_get_fp(fd), start, SEEK_SET) != start)
+ goto err;
+
+ if (cram_write_container(fd, c) == -1)
+ goto err;
+
+ if (cram_write_block(fd, b) == -1)
+ goto err;
+
+ ret = 0;
+ err:
+ if (c) cram_free_container(c);
+ if (b) cram_free_block(b);
+ if (hdr) sam_hdr_free(hdr);
+
+ return ret;
+}
+
+
+/*
+ * Reads a version 3 CRAM file and replaces the header in-place,
+ * provided the header is small enough to fit without growing the
+ * entire file.
+ *
+ * Version 3 format has a SAM header held as an (optionally)
+ * compressed block within the header container. Additional
+ * uncompressed blocks or simply unallocated space (the difference
+ * between total block sizes and the container size) are used to
+ * provide room for growth or contraction of the compressed header.
+ *
+ * Returns 0 on success;
+ * -1 on general failure;
+ * -2 on failure due to insufficient size
+ */
+int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
+ int add_PG)
+{
+ cram_container *c = NULL;
+ cram_block *b = NULL;
+ SAM_hdr *hdr = NULL;
+ off_t start, sz, end;
+ int container_sz, max_container_sz;
+ char *buf = NULL;
+ int ret = -1;
+
+ if (cram_major_vers(fd) < 2 ||
+ cram_major_vers(fd) > 3) {
+ fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__,
+ cram_major_vers(fd));
+ goto err;
+ }
+
+ if (!(hdr = sam_hdr_parse_(h->text, h->l_text)))
+ goto err;
+
+ if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(),
+ arg_list ? "CL": NULL,
+ arg_list ? arg_list : NULL,
+ NULL))
+ goto err;
+
+ int header_len = sam_hdr_length(hdr);
+ /* Fix M5 strings? Maybe out of scope for this tool */
+
+ // Find current size of SAM header block
+ if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26)
+ goto err;
+
+ if (!(c = cram_read_container(fd)))
+ goto err;
+
+ // +5 allows num_landmarks to increase from 0 to 1 (Cramtools)
+ max_container_sz = cram_container_size(c)+5;
+
+ sz = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c) - start;
+ end = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c);
+
+ // We force 1 block instead of (optionally) 2. C CRAM
+ // implementations for v3 were writing 1 compressed block followed
+ // by 1 uncompressed block. However this is tricky to deal with
+ // as changing block sizes can mean the block header also changes
+ // size due to itf8 and variable size integers.
+ //
+ // If we had 1 block, this doesn't change anything.
+ // If we had 2 blocks, the new container header will be smaller by
+ // 1+ bytes, requiring the cram_container_get_length(c) to be larger in value.
+ // However this is an int32 instead of itf8 so the container
+ // header structure stays the same size. This means we can always
+ // reduce the number of blocks without running into size problems.
+ cram_container_set_num_blocks(c, 1);
+ int32_t *landmark;
+ int32_t num_landmarks;
+ landmark = cram_container_get_landmarks(c, &num_landmarks);
+ if (num_landmarks && landmark) {
+ num_landmarks = 1;
+ landmark[0] = 0;
+ } else {
+ num_landmarks = 0;
+ }
+ cram_container_set_landmarks(c, num_landmarks, landmark);
+
+ buf = malloc(max_container_sz);
+ container_sz = max_container_sz;
+ if (cram_store_container(fd, c, buf, &container_sz) != 0)
+ goto err;
+
+ if (!buf)
+ goto err;
+
+ // Proposed new length, but changing cram_container_get_length(c) may change the
+ // container_sz and thus the remainder (cram_container_get_length(c) itself).
+ cram_container_set_length(c, sz - container_sz);
+
+ int old_container_sz = container_sz;
+ container_sz = max_container_sz;
+ if (cram_store_container(fd, c, buf, &container_sz) != 0)
+ goto err;
+
+ if (old_container_sz != container_sz) {
+ fprintf(stderr, "Quirk of fate makes this troublesome! "
+ "Please use non-inplace version.\n");
+ goto err;
+ }
+
+
+
+ // Version 3.0 supports compressed header
+ b = cram_new_block(FILE_HEADER, 0);
+ int32_put_blk(b, header_len);
+ cram_block_append(b, sam_hdr_str(hdr), header_len);
+ cram_block_update_size(b);
+
+ cram_compress_block(fd, b, NULL, -1, -1);
+
+ if (hseek(cram_fd_get_fp(fd), 26, SEEK_SET) != 26)
+ goto err;
+
+ if (cram_block_size(b) > cram_container_get_length(c)) {
+ fprintf(stderr, "New header will not fit. Use non-inplace version"
+ " (%d > %d)\n",
+ (int)cram_block_size(b), cram_container_get_length(c));
+ ret = -2;
+ goto err;
+ }
+
+ if (cram_write_container(fd, c) == -1)
+ goto err;
+
+ if (cram_write_block(fd, b) == -1)
+ goto err;
+
+ // Blank out the remainder
+ int rsz = end - htell(cram_fd_get_fp(fd));
+ assert(rsz >= 0);
+ if (rsz) {
+ char *rem = calloc(1, rsz);
+ ret = hwrite(cram_fd_get_fp(fd), rem, rsz) == rsz ? 0 : -1;
+ free(rem);
+ }
+
+ err:
+ if (c) cram_free_container(c);
+ if (buf) free(buf);
+ if (b) cram_free_block(b);
+ if (hdr) sam_hdr_free(hdr);
+
+ return ret;
+}
+
+int cram_reheader_inplace(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
+ int add_PG)
+{
+ switch (cram_major_vers(fd)) {
+ case 2: return cram_reheader_inplace2(fd, h, arg_list, add_PG);
+ case 3: return cram_reheader_inplace3(fd, h, arg_list, add_PG);
+ default:
+ fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__,
+ cram_major_vers(fd));
+ return -1;
+ }
+}
+
+static void usage(FILE *fp, int ret) {
+ fprintf(fp,
+ "Usage: samtools reheader [-P] in.header.sam in.bam > out.bam\n"
+ " or samtools reheader [-P] -i in.header.sam file.bam\n"
+ "\n"
+ "Options:\n"
+ " -P, --no-PG Do not generate an @PG header line.\n"
+ " -i, --in-place Modify the bam/cram file directly.\n"
+ " (Defaults to outputting to stdout.)\n");
+ exit(ret);
+}
+
int main_reheader(int argc, char *argv[])
{
- bam_header_t *h;
- BGZF *in;
- if (argc != 3) {
- fprintf(stderr, "Usage: samtools reheader <in.header.sam> <in.bam>\n");
- return 1;
+ int inplace = 0, r, add_PG = 1, c;
+ bam_hdr_t *h;
+ samFile *in;
+ char *arg_list = stringify_argv(argc+1, argv-1);
+
+ static const struct option lopts[] = {
+ {"help", no_argument, NULL, 'h'},
+ {"in-place", no_argument, NULL, 'i'},
+ {"no-PG", no_argument, NULL, 'P'},
+ {NULL, 0, NULL, 0}
+ };
+
+ while ((c = getopt_long(argc, argv, "hiP", lopts, NULL)) >= 0) {
+ switch (c) {
+ case 'P': add_PG = 0; break;
+ case 'i': inplace = 1; break;
+ case 'h': usage(stdout, 0); break;
+ default:
+ fprintf(stderr, "Invalid option '%c'\n", c);
+ usage(stderr, 1);
+ }
}
+
+ if (argc - optind != 2)
+ usage(stderr, 1);
+
{ // read the header
- tamFile fph = sam_open(argv[1]);
+ samFile *fph = sam_open(argv[optind], "r");
if (fph == 0) {
- fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]);
+ fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[optind]);
return 1;
}
- h = sam_header_read(fph);
+ h = sam_hdr_read(fph);
sam_close(fph);
+ if (h == NULL) {
+ fprintf(stderr, "[%s] failed to read the header for '%s'.\n",
+ __func__, argv[optind]);
+ return 1;
+ }
}
- in = strcmp(argv[2], "-")? bgzf_open(argv[2], "r") : bgzf_fdopen(fileno(stdin), "r");
+ in = sam_open(argv[optind+1], inplace?"r+":"r");
if (in == 0) {
- fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]);
+ fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[optind+1]);
return 1;
}
- bam_reheader(in, h, fileno(stdout));
- bgzf_close(in);
- return 0;
+ if (hts_get_format(in)->format == bam) {
+ r = bam_reheader(in->fp.bgzf, h, fileno(stdout), arg_list, add_PG);
+ } else {
+ if (inplace)
+ r = cram_reheader_inplace(in->fp.cram, h, arg_list, add_PG);
+ else
+ r = cram_reheader(in->fp.cram, h, arg_list, add_PG);
+ }
+
+ if (sam_close(in) != 0)
+ r = -1;
+
+ bam_hdr_destroy(h);
+
+ if (arg_list)
+ free(arg_list);
+
+ return -r;
}
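
The rewritten main_reheader() above loads the replacement header through htslib's sam_open()/sam_hdr_read() and then branches on the container format reported by hts_get_format(). As a rough illustrative sketch (not code from the patch; file names are placeholders), the header-loading and format-dispatch steps look roughly like this:

    #include <stdio.h>
    #include "htslib/sam.h"

    int main(void)
    {
        /* Load the replacement header (placeholder path). */
        samFile *fph = sam_open("in.header.sam", "r");
        if (!fph) { perror("in.header.sam"); return 1; }
        bam_hdr_t *h = sam_hdr_read(fph);
        sam_close(fph);
        if (!h) { fprintf(stderr, "failed to read header\n"); return 1; }

        /* Open the target file and branch on its detected format,
         * as main_reheader() does (placeholder path). */
        samFile *in = sam_open("in.bam", "r");
        if (!in) { bam_hdr_destroy(h); return 1; }
        if (hts_get_format(in)->format == bam)
            fprintf(stderr, "BAM: copy compressed blocks after the new header\n");
        else
            fprintf(stderr, "CRAM: rewrite or stream the header container\n");

        sam_close(in);
        bam_hdr_destroy(h);
        return 0;
    }
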
diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c
index 3fcafbd..0519137 100644
--- a/samtools/bam_reheader.c.pysam.c
+++ b/samtools/bam_reheader.c.pysam.c
@@ -27,22 +27,55 @@ DEALINGS IN THE SOFTWARE. */
#include <stdio.h>
#include <stdlib.h>
+#include <assert.h>
+#include <getopt.h>
+
#include "htslib/bgzf.h"
-#include "bam.h"
+#include "htslib/sam.h"
+#include "htslib/hfile.h"
+#include "htslib/cram.h"
+#include "samtools.h"
#define BUF_SIZE 0x10000
-int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
+/*
+ * Reads a file and outputs a new BAM file to fd with 'h' replaced as
+ * the header. No validity checks are performed.
+ */
+int bam_reheader(BGZF *in, bam_hdr_t *h, int fd,
+ const char *arg_list, int add_PG)
{
BGZF *fp;
- bam_header_t *old;
ssize_t len;
uint8_t *buf;
if (in->is_write) return -1;
buf = malloc(BUF_SIZE);
- old = bam_header_read(in);
+ if (bam_hdr_read(in) == NULL) {
+ fprintf(pysamerr, "Couldn't read header\n");
+ free(buf);
+ return -1;
+ }
fp = bgzf_fdopen(fd, "w");
- bam_header_write(fp, h);
+
+ if (add_PG) {
+ // Around the houses, but it'll do until we can manipulate bam_hdr_t natively.
+ SAM_hdr *sh = sam_hdr_parse_(h->text, h->l_text);
+ if (sam_hdr_add_PG(sh, "samtools",
+ "VN", samtools_version(),
+ arg_list ? "CL": NULL,
+ arg_list ? arg_list : NULL,
+ NULL) != 0)
+ return -1;
+
+ free(h->text);
+ h->text = strdup(sam_hdr_str(sh));
+ h->l_text = sam_hdr_length(sh);
+ if (!h->text)
+ return -1;
+ sam_hdr_free(sh);
+ }
+
+ bam_hdr_write(fp, h);
if (in->block_offset < in->block_length) {
bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
bgzf_flush(fp);
@@ -55,29 +88,398 @@ int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
return 0;
}
+/*
+ * Reads a file and outputs a new CRAM file to stdout with 'h'
+ * replaced as the header. No validity checks are performed.
+ *
+ * FIXME: error checking
+ */
+int cram_reheader(cram_fd *in, bam_hdr_t *h, const char *arg_list, int add_PG)
+{
+ htsFile *h_out = hts_open("-", "wc");
+ cram_fd *out = h_out->fp.cram;
+ cram_container *c = NULL;
+ int ret = -1;
+
+ // Attempt to fill out a cram->refs[] array from @SQ headers
+ cram_fd_set_header(out, sam_hdr_parse_(h->text, h->l_text));
+ if (add_PG) {
+ if (sam_hdr_add_PG(cram_fd_get_header(out), "samtools",
+ "VN", samtools_version(),
+ arg_list ? "CL": NULL,
+ arg_list ? arg_list : NULL,
+ NULL) != 0)
+ goto err;
+
+ // Convert back to bam_hdr_t struct
+ free(h->text);
+ h->text = strdup(sam_hdr_str(cram_fd_get_header(out)));
+ h->l_text = sam_hdr_length(cram_fd_get_header(out));
+ if (!h->text)
+ goto err;
+ }
+
+ if (sam_hdr_write(h_out, h) != 0)
+ goto err;
+ cram_set_option(out, CRAM_OPT_REFERENCE, NULL);
+
+ while ((c = cram_read_container(in))) {
+ int32_t i, num_blocks = cram_container_get_num_blocks(c);
+ if (cram_write_container(out, c) != 0)
+ goto err;
+
+ for (i = 0; i < num_blocks; i++) {
+ cram_block *blk = cram_read_block(in);
+ if (!blk || cram_write_block(out, blk) != 0) {
+ if (blk) cram_free_block(blk);
+ goto err;
+ }
+ cram_free_block(blk);
+ }
+ cram_free_container(c);
+ }
+
+ ret = 0;
+
+ err:
+ if (hts_close(h_out) != 0)
+ ret = -1;
+
+ return ret;
+}
+
+
+
+/*
+ * Reads a version 2 CRAM file and replaces the header in-place,
+ * provided the header is small enough to fit without growing the
+ * entire file.
+ *
+ * Version 2 format has an uncompressed SAM header with multiple nul
+ * termination bytes to permit inline header editing.
+ *
+ * Returns 0 on success;
+ * -1 on general failure;
+ * -2 on failure due to insufficient size
+ */
+int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
+ int add_PG)
+{
+ cram_container *c = NULL;
+ cram_block *b = NULL;
+ SAM_hdr *hdr = NULL;
+ off_t start;
+ int ret = -1;
+
+ if (cram_major_vers(fd) < 2 ||
+ cram_major_vers(fd) > 3) {
+ fprintf(pysamerr, "[%s] unsupported CRAM version %d\n", __func__,
+ cram_major_vers(fd));
+ goto err;
+ }
+
+ if (!(hdr = sam_hdr_parse_(h->text, h->l_text)))
+ goto err;
+
+ if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(),
+ arg_list ? "CL": NULL,
+ arg_list ? arg_list : NULL,
+ NULL))
+ goto err;
+
+ int header_len = sam_hdr_length(hdr);
+ /* Fix M5 strings? Maybe out of scope for this tool */
+
+ // Load the existing header
+ if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26)
+ goto err;
+
+ if (!(c = cram_read_container(fd)))
+ goto err;
+
+ // Version 2.1 has a single uncompressed block which is nul
+ // terminated with many nuls to permit growth.
+ //
+ // So load old block and keep all contents identical bar the
+ // header text itself
+ if (!(b = cram_read_block(fd)))
+ goto err;
+
+ if (cram_block_get_uncomp_size(b) < header_len+4) {
+ fprintf(pysamerr, "New header will not fit. Use non-inplace version (%d > %d)\n",
+ header_len+4, cram_block_get_uncomp_size(b));
+ ret = -2;
+ goto err;
+ }
+
+ cram_block_set_offset(b, 0); // rewind block
+ int32_put_blk(b, header_len);
+ cram_block_append(b, sam_hdr_str(hdr), header_len);
+ // Zero the remaining block
+ memset(cram_block_get_data(b)+cram_block_get_offset(b), 0,
+ cram_block_get_uncomp_size(b) - cram_block_get_offset(b));
+ // Make sure all sizes and byte-offsets are consistent after memset
+ cram_block_set_offset(b, cram_block_get_uncomp_size(b));
+ cram_block_set_comp_size(b, cram_block_get_uncomp_size(b));
+
+ if (hseek(cram_fd_get_fp(fd), start, SEEK_SET) != start)
+ goto err;
+
+ if (cram_write_container(fd, c) == -1)
+ goto err;
+
+ if (cram_write_block(fd, b) == -1)
+ goto err;
+
+ ret = 0;
+ err:
+ if (c) cram_free_container(c);
+ if (b) cram_free_block(b);
+ if (hdr) sam_hdr_free(hdr);
+
+ return ret;
+}
+
+
+/*
+ * Reads a version 3 CRAM file and replaces the header in-place,
+ * provided the header is small enough to fit without growing the
+ * entire file.
+ *
+ * Version 3 format has a SAM header held as an (optionally)
+ * compressed block within the header container. Additional
+ * uncompressed blocks or simply unallocated space (the difference
+ * between total block sizes and the container size) are used to
+ * provide room for growth or contraction of the compressed header.
+ *
+ * Returns 0 on success;
+ * -1 on general failure;
+ * -2 on failure due to insufficient size
+ */
+int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
+ int add_PG)
+{
+ cram_container *c = NULL;
+ cram_block *b = NULL;
+ SAM_hdr *hdr = NULL;
+ off_t start, sz, end;
+ int container_sz, max_container_sz;
+ char *buf = NULL;
+ int ret = -1;
+
+ if (cram_major_vers(fd) < 2 ||
+ cram_major_vers(fd) > 3) {
+ fprintf(pysamerr, "[%s] unsupported CRAM version %d\n", __func__,
+ cram_major_vers(fd));
+ goto err;
+ }
+
+ if (!(hdr = sam_hdr_parse_(h->text, h->l_text)))
+ goto err;
+
+ if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(),
+ arg_list ? "CL": NULL,
+ arg_list ? arg_list : NULL,
+ NULL))
+ goto err;
+
+ int header_len = sam_hdr_length(hdr);
+ /* Fix M5 strings? Maybe out of scope for this tool */
+
+ // Find current size of SAM header block
+ if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26)
+ goto err;
+
+ if (!(c = cram_read_container(fd)))
+ goto err;
+
+ // +5 allows num_landmarks to increase from 0 to 1 (Cramtools)
+ max_container_sz = cram_container_size(c)+5;
+
+ sz = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c) - start;
+ end = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c);
+
+ // We force 1 block instead of (optionally) 2. C CRAM
+ // implementations for v3 were writing 1 compressed block followed
+ // by 1 uncompressed block. However this is tricky to deal with
+ // as changing block sizes can mean the block header also changes
+ // size due to itf8 and variable size integers.
+ //
+ // If we had 1 block, this doesn't change anything.
+ // If we had 2 blocks, the new container header will be smaller by
+ // 1+ bytes, requiring the cram_container_get_length(c) to be larger in value.
+ // However this is an int32 instead of itf8 so the container
+ // header structure stays the same size. This means we can always
+ // reduce the number of blocks without running into size problems.
+ cram_container_set_num_blocks(c, 1);
+ int32_t *landmark;
+ int32_t num_landmarks;
+ landmark = cram_container_get_landmarks(c, &num_landmarks);
+ if (num_landmarks && landmark) {
+ num_landmarks = 1;
+ landmark[0] = 0;
+ } else {
+ num_landmarks = 0;
+ }
+ cram_container_set_landmarks(c, num_landmarks, landmark);
+
+ buf = malloc(max_container_sz);
+ container_sz = max_container_sz;
+ if (cram_store_container(fd, c, buf, &container_sz) != 0)
+ goto err;
+
+ if (!buf)
+ goto err;
+
+ // Proposed new length, but changing cram_container_get_length(c) may change the
+ // container_sz and thus the remainder (cram_container_get_length(c) itself).
+ cram_container_set_length(c, sz - container_sz);
+
+ int old_container_sz = container_sz;
+ container_sz = max_container_sz;
+ if (cram_store_container(fd, c, buf, &container_sz) != 0)
+ goto err;
+
+ if (old_container_sz != container_sz) {
+ fprintf(pysamerr, "Quirk of fate makes this troublesome! "
+ "Please use non-inplace version.\n");
+ goto err;
+ }
+
+
+
+ // Version 3.0 supports compressed header
+ b = cram_new_block(FILE_HEADER, 0);
+ int32_put_blk(b, header_len);
+ cram_block_append(b, sam_hdr_str(hdr), header_len);
+ cram_block_update_size(b);
+
+ cram_compress_block(fd, b, NULL, -1, -1);
+
+ if (hseek(cram_fd_get_fp(fd), 26, SEEK_SET) != 26)
+ goto err;
+
+ if (cram_block_size(b) > cram_container_get_length(c)) {
+ fprintf(pysamerr, "New header will not fit. Use non-inplace version"
+ " (%d > %d)\n",
+ (int)cram_block_size(b), cram_container_get_length(c));
+ ret = -2;
+ goto err;
+ }
+
+ if (cram_write_container(fd, c) == -1)
+ goto err;
+
+ if (cram_write_block(fd, b) == -1)
+ goto err;
+
+ // Blank out the remainder
+ int rsz = end - htell(cram_fd_get_fp(fd));
+ assert(rsz >= 0);
+ if (rsz) {
+ char *rem = calloc(1, rsz);
+ ret = hwrite(cram_fd_get_fp(fd), rem, rsz) == rsz ? 0 : -1;
+ free(rem);
+ }
+
+ err:
+ if (c) cram_free_container(c);
+ if (buf) free(buf);
+ if (b) cram_free_block(b);
+ if (hdr) sam_hdr_free(hdr);
+
+ return ret;
+}
+
+int cram_reheader_inplace(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
+ int add_PG)
+{
+ switch (cram_major_vers(fd)) {
+ case 2: return cram_reheader_inplace2(fd, h, arg_list, add_PG);
+ case 3: return cram_reheader_inplace3(fd, h, arg_list, add_PG);
+ default:
+ fprintf(pysamerr, "[%s] unsupported CRAM version %d\n", __func__,
+ cram_major_vers(fd));
+ return -1;
+ }
+}
+
+static void usage(FILE *fp, int ret) {
+ fprintf(fp,
+ "Usage: samtools reheader [-P] in.header.sam in.bam > out.bam\n"
+ " or samtools reheader [-P] -i in.header.sam file.bam\n"
+ "\n"
+ "Options:\n"
+ " -P, --no-PG Do not generate an @PG header line.\n"
+ " -i, --in-place Modify the bam/cram file directly.\n"
+ " (Defaults to outputting to stdout.)\n");
+ exit(ret);
+}
+
int main_reheader(int argc, char *argv[])
{
- bam_header_t *h;
- BGZF *in;
- if (argc != 3) {
- fprintf(pysamerr, "Usage: samtools reheader <in.header.sam> <in.bam>\n");
- return 1;
+ int inplace = 0, r, add_PG = 1, c;
+ bam_hdr_t *h;
+ samFile *in;
+ char *arg_list = stringify_argv(argc+1, argv-1);
+
+ static const struct option lopts[] = {
+ {"help", no_argument, NULL, 'h'},
+ {"in-place", no_argument, NULL, 'i'},
+ {"no-PG", no_argument, NULL, 'P'},
+ {NULL, 0, NULL, 0}
+ };
+
+ while ((c = getopt_long(argc, argv, "hiP", lopts, NULL)) >= 0) {
+ fprintf(stderr, " %i %c %s\n", optind, c, argv[optind-1]);
+ switch (c) {
+ case 'P': add_PG = 0; break;
+ case 'i': inplace = 1; break;
+ case 'h': usage(stdout, 0); break;
+ default:
+ fprintf(pysamerr, "Invalid option '%c'\n", c);
+ usage(pysamerr, 1);
+ }
}
+
+ if (argc - optind != 2)
+ usage(pysamerr, 1);
+
{ // read the header
- tamFile fph = sam_open(argv[1]);
+ samFile *fph = sam_open(argv[optind], "r");
if (fph == 0) {
- fprintf(pysamerr, "[%s] fail to read the header from %s.\n", __func__, argv[1]);
+ fprintf(pysamerr, "[%s] fail to read the header from %s.\n", __func__, argv[optind]);
return 1;
}
- h = sam_header_read(fph);
+ h = sam_hdr_read(fph);
sam_close(fph);
+ if (h == NULL) {
+ fprintf(pysamerr, "[%s] failed to read the header for '%s'.\n",
+ __func__, argv[optind]);
+ return 1;
+ }
}
- in = strcmp(argv[2], "-")? bgzf_open(argv[2], "r") : bgzf_fdopen(fileno(stdin), "r");
+ in = sam_open(argv[optind+1], inplace?"r+":"r");
if (in == 0) {
- fprintf(pysamerr, "[%s] fail to open file %s.\n", __func__, argv[2]);
+ fprintf(pysamerr, "[%s] fail to open file %s.\n", __func__, argv[optind+1]);
return 1;
}
- bam_reheader(in, h, fileno(stdout));
- bgzf_close(in);
- return 0;
+ if (hts_get_format(in)->format == bam) {
+ r = bam_reheader(in->fp.bgzf, h, fileno(stdout), arg_list, add_PG);
+ } else {
+ if (inplace)
+ r = cram_reheader_inplace(in->fp.cram, h, arg_list, add_PG);
+ else
+ r = cram_reheader(in->fp.cram, h, arg_list, add_PG);
+ }
+
+ if (sam_close(in) != 0)
+ r = -1;
+
+ bam_hdr_destroy(h);
+
+ if (arg_list)
+ free(arg_list);
+
+ return -r;
}
diff --git a/samtools/bam_rmdup.c b/samtools/bam_rmdup.c
index a9c92d1..cdca878 100644
--- a/samtools/bam_rmdup.c
+++ b/samtools/bam_rmdup.c
@@ -1,6 +1,6 @@
/* bam_rmdup.c -- duplicate read detection.
- Copyright (C) 2009 Genome Research Ltd.
+ Copyright (C) 2009, 2015 Genome Research Ltd.
Portions copyright (C) 2009 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -28,7 +28,9 @@ DEALINGS IN THE SOFTWARE. */
#include <stdio.h>
#include <zlib.h>
#include <unistd.h>
-#include "sam.h"
+#include "htslib/sam.h"
+#include "sam_opts.h"
+#include "bam.h" // for bam_get_library
typedef bam1_t *bam1_p;
@@ -58,11 +60,11 @@ static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
stack->a[stack->n++] = b;
}
-static inline void dump_best(tmp_stack_t *stack, samfile_t *out)
+static inline void dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr)
{
int i;
for (i = 0; i != stack->n; ++i) {
- samwrite(out, stack->a[i]);
+ sam_write1(out, hdr, stack->a[i]);
bam_destroy1(stack->a[i]);
}
stack->n = 0;
@@ -107,12 +109,12 @@ static void clear_best(khash_t(lib) *aux, int max)
static inline int sum_qual(const bam1_t *b)
{
int i, q;
- uint8_t *qual = bam1_qual(b);
+ uint8_t *qual = bam_get_qual(b);
for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
return q;
}
-void bam_rmdup_core(samfile_t *in, samfile_t *out)
+void bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out)
{
bam1_t *b;
int last_tid = -1, last_pos = -1;
@@ -127,10 +129,10 @@ void bam_rmdup_core(samfile_t *in, samfile_t *out)
memset(&stack, 0, sizeof(tmp_stack_t));
kh_resize(name, del_set, 4 * BUFFER_SIZE);
- while (samread(in, b) >= 0) {
+ while (sam_read1(in, hdr, b) >= 0) {
bam1_core_t *c = &b->core;
if (c->tid != last_tid || last_pos != c->pos) {
- dump_best(&stack, out); // write the result
+ dump_best(&stack, out, hdr); // write the result
clear_best(aux, BUFFER_SIZE);
if (c->tid != last_tid) {
clear_best(aux, 0);
@@ -139,22 +141,22 @@ void bam_rmdup_core(samfile_t *in, samfile_t *out)
clear_del_set(del_set);
}
if ((int)c->tid == -1) { // append unmapped reads
- samwrite(out, b);
- while (samread(in, b) >= 0) samwrite(out, b);
+ sam_write1(out, hdr, b);
+ while (sam_read1(in, hdr, b) >= 0) sam_write1(out, hdr, b);
break;
}
last_tid = c->tid;
- fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]);
+ fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", hdr->target_name[c->tid]);
}
}
if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
- samwrite(out, b);
+ sam_write1(out, hdr, b);
} else if (c->isize > 0) { // paired, head
uint64_t key = (uint64_t)c->pos<<32 | c->isize;
const char *lib;
lib_aux_t *q;
int ret;
- lib = bam_get_library(in->header, b);
+ lib = bam_get_library(hdr, b);
q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
++q->n_checked;
k = kh_put(pos, q->best_hash, key, &ret);
@@ -162,21 +164,21 @@ void bam_rmdup_core(samfile_t *in, samfile_t *out)
bam1_t *p = kh_val(q->best_hash, k);
++q->n_removed;
if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle
- kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
+ kh_put(name, del_set, strdup(bam_get_qname(p)), &ret); // p will be removed
bam_copy1(p, b); // replaced as b
- } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
+ } else kh_put(name, del_set, strdup(bam_get_qname(b)), &ret); // b will be removed
if (ret == 0)
- fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
+ fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam_get_qname(b));
} else { // not found in best_hash
kh_val(q->best_hash, k) = bam_dup1(b);
stack_insert(&stack, kh_val(q->best_hash, k));
}
} else { // paired, tail
- k = kh_get(name, del_set, bam1_qname(b));
+ k = kh_get(name, del_set, bam_get_qname(b));
if (k != kh_end(del_set)) {
free((char*)kh_key(del_set, k));
kh_del(name, del_set, k);
- } else samwrite(out, b);
+ } else sam_write1(out, hdr, b);
}
last_pos = c->pos;
}
@@ -184,7 +186,7 @@ void bam_rmdup_core(samfile_t *in, samfile_t *out)
for (k = kh_begin(aux); k != kh_end(aux); ++k) {
if (kh_exist(aux, k)) {
lib_aux_t *q = &kh_val(aux, k);
- dump_best(&stack, out);
+ dump_best(&stack, out, hdr);
fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
(long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
kh_destroy(pos, q->best_hash);
@@ -199,33 +201,61 @@ void bam_rmdup_core(samfile_t *in, samfile_t *out)
bam_destroy1(b);
}
-void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se);
+void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se);
+
+static int rmdup_usage(void) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n");
+ fprintf(stderr, "Option: -s rmdup for SE reads\n");
+ fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n");
+
+ sam_global_opt_help(stderr, "-....");
+ return 1;
+}
int bam_rmdup(int argc, char *argv[])
{
int c, is_se = 0, force_se = 0;
- samfile_t *in, *out;
- while ((c = getopt(argc, argv, "sS")) >= 0) {
+ samFile *in, *out;
+ bam_hdr_t *header;
+ char wmode[3] = {'w', 'b', 0};
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ((c = getopt_long(argc, argv, "sS", lopts, NULL)) >= 0) {
switch (c) {
case 's': is_se = 1; break;
case 'S': force_se = is_se = 1; break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': return rmdup_usage();
}
}
- if (optind + 2 > argc) {
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n");
- fprintf(stderr, "Option: -s rmdup for SE reads\n");
- fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n\n");
+ if (optind + 2 > argc)
+ return rmdup_usage();
+
+ in = sam_open_format(argv[optind], "r", &ga.in);
+ header = sam_hdr_read(in);
+ if (header == NULL || header->n_targets == 0) {
+ fprintf(stderr, "[bam_rmdup] input SAM does not have header. Abort!\n");
return 1;
}
- in = samopen(argv[optind], "rb", 0);
- out = samopen(argv[optind+1], "wb", in->header);
+
+ sam_open_mode(wmode+1, argv[optind+1], NULL);
+ out = sam_open_format(argv[optind+1], wmode, &ga.out);
if (in == 0 || out == 0) {
fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
return 1;
}
- if (is_se) bam_rmdupse_core(in, out, force_se);
- else bam_rmdup_core(in, out);
- samclose(in); samclose(out);
+ sam_hdr_write(out, header);
+
+ if (is_se) bam_rmdupse_core(in, header, out, force_se);
+ else bam_rmdup_core(in, header, out);
+ bam_hdr_destroy(header);
+ sam_close(in); sam_close(out);
return 0;
}
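
In bam_rmdup_core() above, paired reads are grouped per library under a 64-bit key built from the leftmost coordinate and the insert size, and the pair with the larger summed base quality is kept. A small standalone illustration of that keying scheme follows (all values are invented; this is not code from the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* Same packing as bam_rmdup_core(): position in the high 32 bits,
     * insert size in the low 32 bits (only reached when isize > 0). */
    static uint64_t pair_key(int32_t pos, int32_t isize)
    {
        return (uint64_t)pos << 32 | (uint32_t)isize;
    }

    int main(void)
    {
        uint64_t a = pair_key(100000, 350);
        uint64_t b = pair_key(100000, 350); /* same start + insert size => duplicate */
        int qual_a = 1200, qual_b = 1350;   /* invented summed base qualities */

        if (a == b)
            printf("duplicate pair: keep the %s record\n",
                   qual_b > qual_a ? "second" : "first");
        return 0;
    }
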
diff --git a/samtools/bam_rmdup.c.pysam.c b/samtools/bam_rmdup.c.pysam.c
index e53b065..4ece6f2 100644
--- a/samtools/bam_rmdup.c.pysam.c
+++ b/samtools/bam_rmdup.c.pysam.c
@@ -2,7 +2,7 @@
/* bam_rmdup.c -- duplicate read detection.
- Copyright (C) 2009 Genome Research Ltd.
+ Copyright (C) 2009, 2015 Genome Research Ltd.
Portions copyright (C) 2009 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -30,7 +30,9 @@ DEALINGS IN THE SOFTWARE. */
#include <stdio.h>
#include <zlib.h>
#include <unistd.h>
-#include "sam.h"
+#include "htslib/sam.h"
+#include "sam_opts.h"
+#include "bam.h" // for bam_get_library
typedef bam1_t *bam1_p;
@@ -60,11 +62,11 @@ static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
stack->a[stack->n++] = b;
}
-static inline void dump_best(tmp_stack_t *stack, samfile_t *out)
+static inline void dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr)
{
int i;
for (i = 0; i != stack->n; ++i) {
- samwrite(out, stack->a[i]);
+ sam_write1(out, hdr, stack->a[i]);
bam_destroy1(stack->a[i]);
}
stack->n = 0;
@@ -109,12 +111,12 @@ static void clear_best(khash_t(lib) *aux, int max)
static inline int sum_qual(const bam1_t *b)
{
int i, q;
- uint8_t *qual = bam1_qual(b);
+ uint8_t *qual = bam_get_qual(b);
for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
return q;
}
-void bam_rmdup_core(samfile_t *in, samfile_t *out)
+void bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out)
{
bam1_t *b;
int last_tid = -1, last_pos = -1;
@@ -129,10 +131,10 @@ void bam_rmdup_core(samfile_t *in, samfile_t *out)
memset(&stack, 0, sizeof(tmp_stack_t));
kh_resize(name, del_set, 4 * BUFFER_SIZE);
- while (samread(in, b) >= 0) {
+ while (sam_read1(in, hdr, b) >= 0) {
bam1_core_t *c = &b->core;
if (c->tid != last_tid || last_pos != c->pos) {
- dump_best(&stack, out); // write the result
+ dump_best(&stack, out, hdr); // write the result
clear_best(aux, BUFFER_SIZE);
if (c->tid != last_tid) {
clear_best(aux, 0);
@@ -141,22 +143,22 @@ void bam_rmdup_core(samfile_t *in, samfile_t *out)
clear_del_set(del_set);
}
if ((int)c->tid == -1) { // append unmapped reads
- samwrite(out, b);
- while (samread(in, b) >= 0) samwrite(out, b);
+ sam_write1(out, hdr, b);
+ while (sam_read1(in, hdr, b) >= 0) sam_write1(out, hdr, b);
break;
}
last_tid = c->tid;
- fprintf(pysamerr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]);
+ fprintf(pysamerr, "[bam_rmdup_core] processing reference %s...\n", hdr->target_name[c->tid]);
}
}
if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
- samwrite(out, b);
+ sam_write1(out, hdr, b);
} else if (c->isize > 0) { // paired, head
uint64_t key = (uint64_t)c->pos<<32 | c->isize;
const char *lib;
lib_aux_t *q;
int ret;
- lib = bam_get_library(in->header, b);
+ lib = bam_get_library(hdr, b);
q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
++q->n_checked;
k = kh_put(pos, q->best_hash, key, &ret);
@@ -164,21 +166,21 @@ void bam_rmdup_core(samfile_t *in, samfile_t *out)
bam1_t *p = kh_val(q->best_hash, k);
++q->n_removed;
if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle
- kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
+ kh_put(name, del_set, strdup(bam_get_qname(p)), &ret); // p will be removed
bam_copy1(p, b); // replaced as b
- } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
+ } else kh_put(name, del_set, strdup(bam_get_qname(b)), &ret); // b will be removed
if (ret == 0)
- fprintf(pysamerr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
+ fprintf(pysamerr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam_get_qname(b));
} else { // not found in best_hash
kh_val(q->best_hash, k) = bam_dup1(b);
stack_insert(&stack, kh_val(q->best_hash, k));
}
} else { // paired, tail
- k = kh_get(name, del_set, bam1_qname(b));
+ k = kh_get(name, del_set, bam_get_qname(b));
if (k != kh_end(del_set)) {
free((char*)kh_key(del_set, k));
kh_del(name, del_set, k);
- } else samwrite(out, b);
+ } else sam_write1(out, hdr, b);
}
last_pos = c->pos;
}
@@ -186,7 +188,7 @@ void bam_rmdup_core(samfile_t *in, samfile_t *out)
for (k = kh_begin(aux); k != kh_end(aux); ++k) {
if (kh_exist(aux, k)) {
lib_aux_t *q = &kh_val(aux, k);
- dump_best(&stack, out);
+ dump_best(&stack, out, hdr);
fprintf(pysamerr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
(long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
kh_destroy(pos, q->best_hash);
@@ -201,33 +203,61 @@ void bam_rmdup_core(samfile_t *in, samfile_t *out)
bam_destroy1(b);
}
-void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se);
+void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se);
+
+static int rmdup_usage(void) {
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Usage: samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n");
+ fprintf(pysamerr, "Option: -s rmdup for SE reads\n");
+ fprintf(pysamerr, " -S treat PE reads as SE in rmdup (force -s)\n");
+
+ sam_global_opt_help(pysamerr, "-....");
+ return 1;
+}
int bam_rmdup(int argc, char *argv[])
{
int c, is_se = 0, force_se = 0;
- samfile_t *in, *out;
- while ((c = getopt(argc, argv, "sS")) >= 0) {
+ samFile *in, *out;
+ bam_hdr_t *header;
+ char wmode[3] = {'w', 'b', 0};
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ((c = getopt_long(argc, argv, "sS", lopts, NULL)) >= 0) {
switch (c) {
case 's': is_se = 1; break;
case 'S': force_se = is_se = 1; break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': return rmdup_usage();
}
}
- if (optind + 2 > argc) {
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Usage: samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n");
- fprintf(pysamerr, "Option: -s rmdup for SE reads\n");
- fprintf(pysamerr, " -S treat PE reads as SE in rmdup (force -s)\n\n");
+ if (optind + 2 > argc)
+ return rmdup_usage();
+
+ in = sam_open_format(argv[optind], "r", &ga.in);
+ header = sam_hdr_read(in);
+ if (header == NULL || header->n_targets == 0) {
+ fprintf(pysamerr, "[bam_rmdup] input SAM does not have header. Abort!\n");
return 1;
}
- in = samopen(argv[optind], "rb", 0);
- out = samopen(argv[optind+1], "wb", in->header);
+
+ sam_open_mode(wmode+1, argv[optind+1], NULL);
+ out = sam_open_format(argv[optind+1], wmode, &ga.out);
if (in == 0 || out == 0) {
fprintf(pysamerr, "[bam_rmdup] fail to read/write input files\n");
return 1;
}
- if (is_se) bam_rmdupse_core(in, out, force_se);
- else bam_rmdup_core(in, out);
- samclose(in); samclose(out);
+ sam_hdr_write(out, header);
+
+ if (is_se) bam_rmdupse_core(in, header, out, force_se);
+ else bam_rmdup_core(in, header, out);
+ bam_hdr_destroy(header);
+ sam_close(in); sam_close(out);
return 0;
}
diff --git a/samtools/bam_rmdupse.c b/samtools/bam_rmdupse.c
index 34cb9c3..d17f6f5 100644
--- a/samtools/bam_rmdupse.c
+++ b/samtools/bam_rmdupse.c
@@ -1,6 +1,6 @@
/* bam_rmdupse.c -- duplicate read detection for unpaired reads.
- Copyright (C) 2009 Genome Research Ltd.
+ Copyright (C) 2009, 2015 Genome Research Ltd.
Portions copyright (C) 2009 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -24,7 +24,9 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
#include <math.h>
-#include "sam.h"
+#include <stdio.h>
+#include "bam.h" // for bam_get_library
+#include "htslib/sam.h"
#include "htslib/khash.h"
#include "htslib/klist.h"
@@ -68,7 +70,7 @@ static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
static inline int sum_qual(const bam1_t *b)
{
int i, q;
- uint8_t *qual = bam1_qual(b);
+ uint8_t *qual = bam_get_qual(b);
for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
return q;
}
@@ -91,7 +93,8 @@ static void clear_besthash(besthash_t *h, int32_t pos)
kh_del(best, h, k);
}
-static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
+static void dump_alignment(samFile *out, bam_hdr_t *hdr,
+ queue_t *queue, int32_t pos, khash_t(lib) *h)
{
if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
khint_t k;
@@ -100,13 +103,13 @@ static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(
if (queue->head == queue->tail) break;
q = &kl_val(queue->head);
if (q->discarded) {
- q->b->data_len = 0;
+ q->b->l_data = 0;
kl_shift(q, queue, 0);
continue;
}
if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
- samwrite(out, q->b);
- q->b->data_len = 0;
+ sam_write1(out, hdr, q->b);
+ q->b->l_data = 0;
kl_shift(q, queue, 0);
}
for (k = kh_begin(h); k != kh_end(h); ++k) {
@@ -118,7 +121,7 @@ static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(
}
}
-void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
+void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se)
{
bam1_t *b;
queue_t *queue;
@@ -129,15 +132,15 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
aux = kh_init(lib);
b = bam_init1();
queue = kl_init(q);
- while (samread(in, b) >= 0) {
+ while (sam_read1(in, hdr, b) >= 0) {
bam1_core_t *c = &b->core;
- int endpos = bam_calend(c, bam1_cigar(b));
+ int endpos = bam_endpos(b);
int score = sum_qual(b);
if (last_tid != c->tid) {
- if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
+ if (last_tid >= 0) dump_alignment(out, hdr, queue, MAX_POS, aux);
last_tid = c->tid;
- } else dump_alignment(out, queue, c->pos, aux);
+ } else dump_alignment(out, hdr, queue, c->pos, aux);
if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
push_queue(queue, b, endpos, score);
} else {
@@ -146,7 +149,7 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
besthash_t *h;
uint32_t key;
int ret;
- lib = bam_get_library(in->header, b);
+ lib = bam_get_library(hdr, b);
q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
++q->n_checked;
h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
@@ -167,7 +170,7 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
} else kh_val(h, k) = push_queue(queue, b, endpos, score);
}
}
- dump_alignment(out, queue, MAX_POS, aux);
+ dump_alignment(out, hdr, queue, MAX_POS, aux);
for (k = kh_begin(aux); k != kh_end(aux); ++k) {
if (kh_exist(aux, k)) {
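
The single-end path above now uses htslib's bam_endpos() instead of bam_calend() to obtain the reference end of each alignment, which is what reverse-strand reads are keyed on. A simplified re-implementation of that end-position calculation, for illustration only (the CIGAR values are invented):

    #include <stdint.h>
    #include <stdio.h>

    /* Reference-consuming CIGAR ops are M, D, N, = and X
     * (op codes 0, 2, 3, 7 and 8 in BAM encoding). */
    static int32_t end_from_cigar(int32_t pos, const uint32_t *cigar, int n_cigar)
    {
        static const char consumes_ref[16] = { 1, 0, 1, 1, 0, 0, 0, 1, 1 };
        int32_t end = pos;
        int i;
        for (i = 0; i < n_cigar; i++)
            if (consumes_ref[cigar[i] & 0xf])
                end += cigar[i] >> 4;
        return end;
    }

    int main(void)
    {
        /* 30M2D20M at position 1000: length in the top 28 bits, op in the low 4. */
        uint32_t cigar[] = { 30u << 4 | 0, 2u << 4 | 2, 20u << 4 | 0 };
        printf("end = %d\n", end_from_cigar(1000, cigar, 3)); /* prints end = 1052 */
        return 0;
    }
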
diff --git a/samtools/bam_rmdupse.c.pysam.c b/samtools/bam_rmdupse.c.pysam.c
index 31adc67..06895a8 100644
--- a/samtools/bam_rmdupse.c.pysam.c
+++ b/samtools/bam_rmdupse.c.pysam.c
@@ -2,7 +2,7 @@
/* bam_rmdupse.c -- duplicate read detection for unpaired reads.
- Copyright (C) 2009 Genome Research Ltd.
+ Copyright (C) 2009, 2015 Genome Research Ltd.
Portions copyright (C) 2009 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -26,7 +26,9 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
#include <math.h>
-#include "sam.h"
+#include <stdio.h>
+#include "bam.h" // for bam_get_library
+#include "htslib/sam.h"
#include "htslib/khash.h"
#include "htslib/klist.h"
@@ -70,7 +72,7 @@ static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
static inline int sum_qual(const bam1_t *b)
{
int i, q;
- uint8_t *qual = bam1_qual(b);
+ uint8_t *qual = bam_get_qual(b);
for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
return q;
}
@@ -93,7 +95,8 @@ static void clear_besthash(besthash_t *h, int32_t pos)
kh_del(best, h, k);
}
-static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
+static void dump_alignment(samFile *out, bam_hdr_t *hdr,
+ queue_t *queue, int32_t pos, khash_t(lib) *h)
{
if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
khint_t k;
@@ -102,13 +105,13 @@ static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(
if (queue->head == queue->tail) break;
q = &kl_val(queue->head);
if (q->discarded) {
- q->b->data_len = 0;
+ q->b->l_data = 0;
kl_shift(q, queue, 0);
continue;
}
if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
- samwrite(out, q->b);
- q->b->data_len = 0;
+ sam_write1(out, hdr, q->b);
+ q->b->l_data = 0;
kl_shift(q, queue, 0);
}
for (k = kh_begin(h); k != kh_end(h); ++k) {
@@ -120,7 +123,7 @@ static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(
}
}
-void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
+void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se)
{
bam1_t *b;
queue_t *queue;
@@ -131,15 +134,15 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
aux = kh_init(lib);
b = bam_init1();
queue = kl_init(q);
- while (samread(in, b) >= 0) {
+ while (sam_read1(in, hdr, b) >= 0) {
bam1_core_t *c = &b->core;
- int endpos = bam_calend(c, bam1_cigar(b));
+ int endpos = bam_endpos(b);
int score = sum_qual(b);
if (last_tid != c->tid) {
- if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
+ if (last_tid >= 0) dump_alignment(out, hdr, queue, MAX_POS, aux);
last_tid = c->tid;
- } else dump_alignment(out, queue, c->pos, aux);
+ } else dump_alignment(out, hdr, queue, c->pos, aux);
if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
push_queue(queue, b, endpos, score);
} else {
@@ -148,7 +151,7 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
besthash_t *h;
uint32_t key;
int ret;
- lib = bam_get_library(in->header, b);
+ lib = bam_get_library(hdr, b);
q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
++q->n_checked;
h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
@@ -169,7 +172,7 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
} else kh_val(h, k) = push_queue(queue, b, endpos, score);
}
}
- dump_alignment(out, queue, MAX_POS, aux);
+ dump_alignment(out, hdr, queue, MAX_POS, aux);
for (k = kh_begin(aux); k != kh_end(aux); ++k) {
if (kh_exist(aux, k)) {
diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c
index e721c59..7a441ae 100644
--- a/samtools/bam_sort.c
+++ b/samtools/bam_sort.c
@@ -1,9 +1,10 @@
/* bam_sort.c -- sorting and merging.
- Copyright (C) 2008-2014 Genome Research Ltd.
+ Copyright (C) 2008-2015 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
+ Author: Martin Pollard <mp15 at sanger.ac.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -29,14 +30,16 @@ DEALINGS IN THE SOFTWARE. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
-#include <regex.h>
#include <time.h>
#include <unistd.h>
+#include <getopt.h>
+#include <assert.h>
#include "htslib/ksort.h"
#include "htslib/khash.h"
#include "htslib/klist.h"
#include "htslib/kstring.h"
#include "htslib/sam.h"
+#include "sam_opts.h"
#if !defined(__DARWIN_C_LEVEL) || __DARWIN_C_LEVEL < 900000L
#define NEED_MEMSET_PATTERN4
@@ -57,10 +60,11 @@ void memset_pattern4(void *target, const void *pattern, size_t size) {
#endif
KHASH_INIT(c2c, char*, char*, 1, kh_str_hash_func, kh_str_hash_equal)
+KHASH_INIT(cset, char*, char, 0, kh_str_hash_func, kh_str_hash_equal)
KHASH_MAP_INIT_STR(c2i, int)
-#define __free_char(p)
-KLIST_INIT(hdrln, char*, __free_char)
+#define hdrln_free_char(p)
+KLIST_INIT(hdrln, char*, hdrln_free_char)
static int g_is_by_qname = 0;
@@ -111,6 +115,22 @@ static inline int heap_lt(const heap1_t a, const heap1_t b)
KSORT_INIT(heap, heap1_t, heap_lt)
+typedef struct merged_header {
+ kstring_t out_hd;
+ kstring_t out_sq;
+ kstring_t out_rg;
+ kstring_t out_pg;
+ kstring_t out_co;
+ char **target_name;
+ uint32_t *target_len;
+ size_t n_targets;
+ size_t targets_sz;
+ khash_t(c2i) *sq_tids;
+ khash_t(cset) *rg_ids;
+ khash_t(cset) *pg_ids;
+ bool have_hd;
+} merged_header_t;
+
typedef struct trans_tbl {
int32_t n_targets;
int* tid_trans;
@@ -119,18 +139,100 @@ typedef struct trans_tbl {
bool lost_coord_sort;
} trans_tbl_t;
+/* Something to look like a regmatch_t */
+typedef struct hdr_match {
+ ptrdiff_t rm_so;
+ ptrdiff_t rm_eo;
+} hdr_match_t;
+
+/*
+ * Search for header lines of a particular record type.
+ *
+ * This replaces a regex search for something like /^@SQ.*\tSN:([^\t]+).*$/
+ * but is much quicker. The locations found are returned in *matches,
+ * which has a signature the same as that of a regmatch_t.
+ *
+ * rec is the record type to match (i.e. @HD, @SQ, @PG or @RG)
+ * tag is a tag type in the record to match (SN for @SQ, ID for @PG or @RG)
+ *
+ * The location of the record (if found) is returned in matches[0]
+ * If tag is not NULL, the record is searched for the presence of the
+ * given tag. If found, the location of the value is returned in matches[1].
+ * If the tag isn't found then the record is ignored and the search resumes
+ * on the next header line.
+ *
+ * For simplicity, some assumptions are made about rec and tag:
+ * rec should include the leading '@' sign and be three characters long.
+ * tag should be exactly two characters long.
+ * These are always string constants when this is called below, so we don't
+ * bother to check here.
+ *
+ * Returns 0 if a match was found, -1 if not.
+ */
+
+
+static int hdr_line_match(const char *text, const char *rec,
+ const char *tag, hdr_match_t *matches) {
+ const char *line_start, *line_end = text;
+ const char *tag_start, *tag_end;
+
+ for (;;) {
+ // Find record, ensure either at start of text or follows '\n'
+ line_start = strstr(line_end, rec);
+ while (line_start && line_start > text && *(line_start - 1) != '\n') {
+ line_start = strstr(line_start + 3, rec);
+ }
+ if (!line_start) return -1;
+
+ // Find end of header line
+ line_end = strchr(line_start, '\n');
+ if (!line_end) line_end = line_start + strlen(line_start);
+
+ matches[0].rm_so = line_start - text;
+ matches[0].rm_eo = line_end - text;
+ if (!tag) return 0; // Match found if not looking for tag.
+
+ for (tag_start = line_start + 3; tag_start < line_end; tag_start++) {
+ // Find possible tag start. Hacky but quick.
+ while (*tag_start > '\n') tag_start++;
+
+ // Check it
+ if (tag_start[0] == '\t'
+ && strncmp(tag_start + 1, tag, 2) == 0
+ && tag_start[3] == ':') {
+ // Found tag, record location and return.
+ tag_end = tag_start + 4;
+ while (*tag_end && *tag_end != '\t' && *tag_end != '\n')
+ ++tag_end;
+ matches[1].rm_so = tag_start - text + 4;
+ matches[1].rm_eo = tag_end - text;
+ return 0;
+ }
+ }
+ // Couldn't find tag, try again from end of current record.
+ }
+}
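/*
 * Illustrative example (not in the original source): given the header text
 *     "@HD\tVN:1.4\n@SQ\tSN:chr1\tLN:100\n"
 * and hdr_match_t m[2], a call such as hdr_line_match(text, "@SQ", "SN", m)
 * returns 0 with m[0] spanning the whole "@SQ\tSN:chr1\tLN:100" line and
 * m[1] spanning the value "chr1".
 */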
+
static void trans_tbl_destroy(trans_tbl_t *tbl) {
- free(tbl->tid_trans);
khiter_t iter;
+
+ free(tbl->tid_trans);
+
+ /*
+ * The values for the tbl->rg_trans and tbl->pg_trans hashes are pointers
+ * to keys in the rg_ids and pg_ids sets of the merged_header_t, so
+ * they should not be freed here.
+ *
+ * The keys are unique to each hash entry, so they do have to go.
+ */
+
for (iter = kh_begin(tbl->rg_trans); iter != kh_end(tbl->rg_trans); ++iter) {
if (kh_exist(tbl->rg_trans, iter)) {
- free(kh_value(tbl->rg_trans, iter));
free(kh_key(tbl->rg_trans, iter));
}
}
for (iter = kh_begin(tbl->pg_trans); iter != kh_end(tbl->pg_trans); ++iter) {
if (kh_exist(tbl->pg_trans, iter)) {
- free(kh_value(tbl->pg_trans, iter));
free(kh_key(tbl->pg_trans, iter));
}
}
@@ -139,347 +241,727 @@ static void trans_tbl_destroy(trans_tbl_t *tbl) {
kh_destroy(c2c,tbl->pg_trans);
}
-// Takes in existing header and rewrites it in the usual order HD, SQ, RG, PG CO, other
-static void pretty_header(char** text_in_out, int32_t text_len)
-{
- char* output, *output_pointer;
- output = output_pointer = (char*)calloc(1,text_len+1);
- output[text_len] = '\0';
-
- // Read @HD and write
- regex_t hd_regex, sq_regex, pg_regex, rg_regex, co_regex, other_regex;
- regmatch_t matches[1];
- if (regcomp( &hd_regex, "^@HD.*$", REG_EXTENDED|REG_NEWLINE ))
- abort();
- if (regexec( &hd_regex, *text_in_out, 1, &matches[0], 0 ) == 0) {
- size_t match_size = matches[0].rm_eo - matches[0].rm_so;
- memcpy(output_pointer, *text_in_out+matches[0].rm_so, match_size);
- output_pointer[match_size] = '\n';
- output_pointer += match_size + 1;
- }
- regfree(&hd_regex);
-
- // Read @SQ's and write
- if (regcomp( &sq_regex, "^@SQ.*$", REG_EXTENDED|REG_NEWLINE )) abort();
- char* sq_pointer = *text_in_out;
- while (*text_in_out+text_len > sq_pointer && regexec( &sq_regex, sq_pointer, 1, &matches[0], 0) == 0) {
- size_t match_size = matches[0].rm_eo - matches[0].rm_so;
- memcpy(output_pointer, sq_pointer+matches[0].rm_so, match_size);
- output_pointer[match_size] = '\n';
- output_pointer += match_size + 1;
- sq_pointer += matches[0].rm_eo + 1;
- }
- regfree(&sq_regex);
-
- // Read @RG's and write
- if (regcomp( &rg_regex, "^@RG.*$", REG_EXTENDED|REG_NEWLINE )) abort();
- char* rg_pointer = *text_in_out;
- while (*text_in_out+text_len > rg_pointer && regexec( &rg_regex, rg_pointer, 1, &matches[0], 0) == 0) {
- size_t match_size = matches[0].rm_eo - matches[0].rm_so;
- memcpy(output_pointer, rg_pointer+matches[0].rm_so, match_size);
- output_pointer[match_size] = '\n';
- output_pointer += match_size + 1;
- rg_pointer += matches[0].rm_eo + 1;
- }
- regfree(&rg_regex);
-
- // Read @PG's and write
- if (regcomp( &pg_regex, "^@PG.*$", REG_EXTENDED|REG_NEWLINE )) abort();
- char* pg_pointer = *text_in_out;
- while (*text_in_out+text_len > pg_pointer && regexec( &pg_regex, pg_pointer, 1, &matches[0], 0) == 0) {
- size_t match_size = matches[0].rm_eo - matches[0].rm_so;
- memcpy(output_pointer, pg_pointer+matches[0].rm_so, match_size);
- output_pointer[match_size] = '\n';
- output_pointer += match_size + 1;
- pg_pointer += matches[0].rm_eo + 1;
- }
- regfree(&pg_regex);
-
- // Read @CO's and write
- if (regcomp( &co_regex, "^@CO.*$", REG_EXTENDED|REG_NEWLINE )) abort();
- char* co_pointer = *text_in_out;
- while (*text_in_out+text_len > co_pointer && regexec( &co_regex, co_pointer, 1, &matches[0], 0) == 0) {
- size_t match_size = matches[0].rm_eo - matches[0].rm_so;
- memcpy(output_pointer, co_pointer+matches[0].rm_so, match_size);
- output_pointer[match_size] = '\n';
- output_pointer += match_size + 1;
- co_pointer += matches[0].rm_eo + 1;
- }
- regfree(&co_regex);
-
- // Read any other not HD,SQ,RG,PG,CO tags and write
- if (regcomp( &other_regex, "^@([^HSCPR]|H[^D]|S[^Q]|[PR][^G]|C[^O]).*$", REG_EXTENDED|REG_NEWLINE )) abort();
- char* other_pointer = *text_in_out;
- while (*text_in_out+text_len > other_pointer && regexec( &other_regex, other_pointer, 1, &matches[0], 0) == 0) {
- size_t match_size = matches[0].rm_eo - matches[0].rm_so;
- memcpy(output_pointer, other_pointer+matches[0].rm_so, match_size);
- output_pointer[match_size] = '\n';
- output_pointer += match_size + 1;
- other_pointer += matches[0].rm_eo + 1;
- }
- regfree(&other_regex);
+/*
+ * Create a merged_header_t struct.
+ */
- // Safety check, make sure we copied it all, if we didn't something is wrong with the header
- if ( output+text_len != output_pointer ) {
- fprintf(stderr, "[pretty_header] invalid header\n");
- exit(1);
+static merged_header_t * init_merged_header() {
+ merged_header_t *merged_hdr;
+
+ merged_hdr = calloc(1, sizeof(*merged_hdr));
+ if (merged_hdr == NULL) return NULL;
+
+ merged_hdr->targets_sz = 16;
+ merged_hdr->target_name = malloc(merged_hdr->targets_sz
+ * sizeof(*merged_hdr->target_name));
+ if (NULL == merged_hdr->target_name) goto fail;
+
+ merged_hdr->target_len = malloc(merged_hdr->targets_sz
+ * sizeof(*merged_hdr->target_len));
+ if (NULL == merged_hdr->target_len) goto fail;
+
+ merged_hdr->sq_tids = kh_init(c2i);
+ if (merged_hdr->sq_tids == NULL) goto fail;
+
+ merged_hdr->rg_ids = kh_init(cset);
+ if (merged_hdr->rg_ids == NULL) goto fail;
+
+ merged_hdr->pg_ids = kh_init(cset);
+ if (merged_hdr->pg_ids == NULL) goto fail;
+
+ return merged_hdr;
+
+ fail:
+ perror("[init_merged_header]");
+ kh_destroy(cset, merged_hdr->pg_ids);
+ kh_destroy(cset, merged_hdr->rg_ids);
+ kh_destroy(c2i, merged_hdr->sq_tids);
+ free(merged_hdr->target_name);
+ free(merged_hdr->target_len);
+ free(merged_hdr);
+ return NULL;
+}
+
+/* Some handy kstring manipulating functions */
+
+// Append char range to kstring
+static inline int range_to_ks(const char *src, int from, int to,
+ kstring_t *dest) {
+ return kputsn(src + from, to - from, dest) != to - from;
+}
+
+// Append a header line match to kstring
+static inline int match_to_ks(const char *src, const hdr_match_t *match,
+ kstring_t *dest) {
+ return range_to_ks(src, match->rm_so, match->rm_eo, dest);
+}
+
+// Append a kstring to a kstring
+static inline int ks_to_ks(kstring_t *src, kstring_t *dest) {
+ return kputsn(ks_str(src), ks_len(src), dest) != ks_len(src);
+}
+
+/*
+ * Generate a unique ID by appending a random suffix to a given prefix.
+ * existing_ids is the set of IDs that are already in use.
+ * If always_add_suffix is true, the suffix will always be included.
+ * If false, prefix will be returned unchanged if it isn't in existing_ids.
+ */
+
+static int gen_unique_id(char *prefix, khash_t(cset) *existing_ids,
+ bool always_add_suffix, kstring_t *dest) {
+ khiter_t iter;
+
+ if (!always_add_suffix) {
+ // Try prefix on its own first
+ iter = kh_get(cset, existing_ids, prefix);
+ if (iter == kh_end(existing_ids)) { // prefix isn't used yet
+ dest->l = 0;
+ if (kputs(prefix, dest) == EOF) return -1;
+ return 0;
+ }
}
- free(*text_in_out);
- *text_in_out = output;
+
+ do {
+ dest->l = 0;
+ ksprintf(dest, "%s-%0lX", prefix, lrand48());
+ iter = kh_get(cset, existing_ids, ks_str(dest));
+ } while (iter != kh_end(existing_ids));
+
+ return 0;
}
-static void trans_tbl_init(bam_hdr_t* out, bam_hdr_t* translate, trans_tbl_t* tbl, bool merge_rg, bool merge_pg)
-{
- tbl->n_targets = translate->n_targets;
- tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int));
- tbl->rg_trans = kh_init(c2c);
- tbl->pg_trans = kh_init(c2c);
- if (!tbl->tid_trans || !tbl->rg_trans || !tbl->pg_trans) { perror("out of memory"); exit(-1); }
+/*
+ * Add the @HD line to the new header
+ * In practice the @HD line will come from the first input header.
+ */
- int32_t out_len = out->l_text;
- while (out_len > 0 && out->text[out_len-1] == '\n') {--out_len; } // strip trailing \n's
- kstring_t out_text = { 0, 0, NULL };
- kputsn(out->text, out_len, &out_text);
+static int trans_tbl_add_hd(merged_header_t* merged_hdr,
+ bam_hdr_t *translate) {
+ hdr_match_t match = {0, 0};
- int i, min_tid = -1;
- tbl->lost_coord_sort = false;
+ // TODO: handle case when @HD needs merging.
+ if (merged_hdr->have_hd) return 0;
- khash_t(c2i) *out_tid = kh_init(c2i);
- for (i = 0; i < out->n_targets; ++i) {
- int ret;
- khiter_t iter = kh_put(c2i, out_tid, out->target_name[i], &ret);
- if (ret <= 0) abort();
- kh_value(out_tid, iter) = i;
+ if (hdr_line_match(translate->text, "@HD", NULL, &match) != 0) {
+ return 0;
}
+ if (match_to_ks(translate->text, &match, &merged_hdr->out_hd)) goto memfail;
+ if (kputc('\n', &merged_hdr->out_hd) == EOF) goto memfail;
+ merged_hdr->have_hd = true;
+
+ return 0;
+
+ memfail:
+ perror(__func__);
+ return -1;
+}
+
+static inline int grow_target_list(merged_header_t* merged_hdr) {
+ size_t new_size;
+ char **new_names;
+ uint32_t *new_len;
+
+ new_size = merged_hdr->targets_sz * 2;
+ new_names = realloc(merged_hdr->target_name, sizeof(*new_names) * new_size);
+ if (!new_names) goto fail;
+ merged_hdr->target_name = new_names;
+
+ new_len = realloc(merged_hdr->target_len, sizeof(*new_len) * new_size);
+ if (!new_len) goto fail;
+ merged_hdr->target_len = new_len;
+
+ merged_hdr->targets_sz = new_size;
+
+ return 0;
+
+ fail:
+ perror(__func__);
+ return -1;
+}
+
+/*
+ * Add @SQ records to the translation table.
+ *
+ * Go through the target list for the input header. Any new targets found
+ * are added to the output header target list. At the same time, a mapping
+ * from the input to output target ids is stored in tbl.
+ *
+ * If any new targets are found, the header text is scanned to find the
+ * corresponding @SQ records. They are then copied into the
+ * merged_hdr->out_text kstring (which will eventually become the
+ * output header text).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+
+static int trans_tbl_add_sq(merged_header_t* merged_hdr, bam_hdr_t *translate,
+ trans_tbl_t* tbl) {
+
+ kstring_t *out_text = &merged_hdr->out_sq;
+ khash_t(c2i)* sq_tids = merged_hdr->sq_tids;
+ hdr_match_t *new_sq_matches = NULL;
+ char *text;
+ hdr_match_t matches[2];
+ int32_t i, missing;
+ int32_t old_n_targets = merged_hdr->n_targets;
+ khiter_t iter;
+ int min_tid = -1;
+
+ // Fill in the tid part of the translation table, adding new targets
+ // to the merged header as we go.
+
for (i = 0; i < translate->n_targets; ++i) {
- khiter_t iter = kh_get(c2i, out_tid, translate->target_name[i]);
-
- if (iter == kh_end(out_tid)) { // Append missing entries to out
- tbl->tid_trans[i] = out->n_targets++;
- out->target_name = (char**)realloc(out->target_name, sizeof(char*)*out->n_targets);
- out->target_name[out->n_targets-1] = strdup(translate->target_name[i]);
- out->target_len = (uint32_t*)realloc(out->target_len, sizeof(uint32_t)*out->n_targets);
- out->target_len[out->n_targets-1] = translate->target_len[i];
- // grep line with regex '^@SQ.*\tSN:%s(\t.*$|$)', translate->target_name[i]
- // from translate->text
- regex_t sq_id;
- regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t));
- if (matches == NULL) { perror("out of memory"); exit(-1); }
- kstring_t seq_regex = { 0, 0, NULL };
- ksprintf(&seq_regex, "^@SQ.*\tSN:%s(\t.*$|$)", translate->target_name[i]);
- regcomp(&sq_id, seq_regex.s, REG_EXTENDED|REG_NEWLINE);
- free(seq_regex.s);
- if (regexec(&sq_id, translate->text, 1, matches, 0) != 0)
- {
- fprintf(stderr, "[trans_tbl_init] @SQ SN (%s) found in binary header but not text header.\n",translate->target_name[i]);
- exit(1);
+
+ // Check if it's a new target.
+ iter = kh_get(c2i, sq_tids, translate->target_name[i]);
+
+ if (iter == kh_end(sq_tids)) {
+ int ret;
+ // Append missing entries to out_hdr
+
+ if (merged_hdr->n_targets == merged_hdr->targets_sz) {
+ if (grow_target_list(merged_hdr)) goto fail;
}
- regfree(&sq_id);
- // Produce our output line and append it to out_text
- kputc('\n', &out_text);
- kputsn(translate->text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &out_text);
+ merged_hdr->target_name[merged_hdr->n_targets] = strdup(translate->target_name[i]);
+ if (merged_hdr->target_name[merged_hdr->n_targets] == NULL) goto memfail;
+ merged_hdr->target_len[merged_hdr->n_targets] = translate->target_len[i];
+
+ // Record the new identifier for reference below,
+ // and when building the ttable for other inputs.
+ iter = kh_put(c2i, sq_tids,
+ merged_hdr->target_name[merged_hdr->n_targets], &ret);
+ if (ret < 0) {
+ free(merged_hdr->target_name[merged_hdr->n_targets]);
+ goto memfail;
+ }
+ assert(ret > 0); // Should not be in hash already.
- free(matches);
+ kh_value(sq_tids, iter) = merged_hdr->n_targets;
+ tbl->tid_trans[i] = merged_hdr->n_targets++;
} else {
- tbl->tid_trans[i] = kh_value(out_tid, iter);
+ tbl->tid_trans[i] = kh_value(sq_tids, iter);
}
+
if (tbl->tid_trans[i] > min_tid) {
min_tid = tbl->tid_trans[i];
} else {
tbl->lost_coord_sort = true;
}
}
- kh_destroy(c2i, out_tid);
-
- // grep @RG id's
- regex_t rg_id;
- regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t));
- if (matches == NULL) { perror("out of memory"); exit(-1); }
- regcomp(&rg_id, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
- char* text = translate->text;
- klist_t(hdrln) *rg_list = kl_init(hdrln);
- while(1) { // foreach rg id in translate's header
- if (regexec(&rg_id, text, 2, matches, 0) != 0) break;
- // matches[0] is the whole @RG line; matches[1] is the ID field value
- kstring_t match_id = { 0, 0, NULL };
- kputsn(text+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &match_id);
-
- // is our matched ID in our output list already
- regex_t rg_id_search;
- kstring_t rg_regex = { 0, 0, NULL };
- ksprintf(&rg_regex, "^@RG.*\tID:%s(\t.*$|$)", match_id.s);
- regcomp(&rg_id_search, rg_regex.s, REG_EXTENDED|REG_NEWLINE|REG_NOSUB);
- free(rg_regex.s);
- kstring_t transformed_id = { 0, 0, NULL };
- bool transformed_equals_match;
- if (regexec(&rg_id_search, out->text, 0, NULL, 0) != 0 || merge_rg) {
- // Not in there so can add it as 1-1 mapping
- kputs(match_id.s, &transformed_id);
- transformed_equals_match = true;
- } else {
- // It's in there so we need to transform it by appending random number to id
- ksprintf(&transformed_id, "%s-%0lX", match_id.s, lrand48());
- transformed_equals_match = false;
+
+ if (merged_hdr->n_targets == old_n_targets)
+ return 0; // Everything done if no new targets.
+
+ // Otherwise, find @SQ lines in translate->text for all newly added targets.
+
+ new_sq_matches = malloc((merged_hdr->n_targets - old_n_targets)
+ * sizeof(*new_sq_matches));
+ if (new_sq_matches == NULL) goto memfail;
+
+ for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) {
+ new_sq_matches[i].rm_so = new_sq_matches[i].rm_eo = -1;
+ }
+
+ text = translate->text;
+ while (hdr_line_match(text, "@SQ", "SN", matches) == 0) {
+ // matches[0] is whole line, matches[1] is SN value.
+
+ // This is a bit disgusting, but avoids a copy...
+ char c = text[matches[1].rm_eo];
+ int idx;
+
+ text[matches[1].rm_eo] = '\0';
+
+ // Look up the SN value in the sq_tids hash.
+ iter = kh_get(c2i, sq_tids, text + matches[1].rm_so);
+ text[matches[1].rm_eo] = c; // restore text
+
+ if (iter == kh_end(sq_tids)) {
+ // Warn about this, but it's not really fatal.
+ fprintf(stderr, "[W::%s] @SQ SN (%.*s) found in text header but not binary header.\n",
+ __func__,
+ (int) (matches[1].rm_eo - matches[1].rm_so),
+ text + matches[1].rm_so);
+ text += matches[0].rm_eo;
+ continue; // Skip to next
}
- regfree(&rg_id_search);
- // Insert it into our translation map
- int in_there = 0;
- khiter_t iter = kh_put(c2c, tbl->rg_trans, ks_release(&match_id), &in_there);
- char *transformed_id_s = ks_release(&transformed_id);
- kh_value(tbl->rg_trans,iter) = transformed_id_s;
- // take matched line and replace ID with transformed_id
- kstring_t transformed_line = { 0, 0, NULL };
- if (transformed_equals_match) {
- kputsn(text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &transformed_line);
+ idx = kh_value(sq_tids, iter);
+ if (idx >= old_n_targets) {
+ // It's a new @SQ; record its position so we can add it to out_text.
+ assert(idx < merged_hdr->n_targets);
+ ptrdiff_t off = text - translate->text;
+ new_sq_matches[idx - old_n_targets].rm_so = matches[0].rm_so + off;
+ new_sq_matches[idx - old_n_targets].rm_eo = matches[0].rm_eo + off;
+ }
+
+ // Carry on searching from end of current match
+ text += matches[0].rm_eo;
+ }
+
+ // Check if any new targets have been missed
+ missing = 0;
+ for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) {
+ if (new_sq_matches[i].rm_so >= 0) {
+ if (match_to_ks(translate->text, &new_sq_matches[i], out_text))
+ goto memfail;
+ if (kputc('\n', out_text) == EOF) goto memfail;
} else {
- kputsn(text+matches[0].rm_so, matches[1].rm_so-matches[0].rm_so, &transformed_line);
- kputs(transformed_id_s, &transformed_line);
- kputsn(text+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
+ fprintf(stderr, "[E::%s] @SQ SN (%s) found in binary header but not text header.\n",
+ __func__, merged_hdr->target_name[i + old_n_targets]);
+ missing++;
}
+ }
+ if (missing) goto fail;
+
+ free(new_sq_matches);
+ return 0;
- if (!(transformed_equals_match && merge_rg)) {
- // append line to linked list for PG processing
- char** ln = kl_pushp(hdrln, rg_list);
- *ln = ks_release(&transformed_line); // Give away to linked list
+ memfail:
+ perror(__func__);
+ fail:
+ free(new_sq_matches);
+ return -1;
+}
+
+/*
+ * Common code for setting up RG and PG record ID tag translation.
+ *
+ * is_rg is true for RG translation, false for PG.
+ * translate is the input bam header
+ * merge is true if tags with the same ID are to be merged.
+ * known_ids is the set of IDs already in the output header.
+ * id_map is the translation map from input header IDs to output header IDs
+ * If override is set, it will be used to replace the existing ID (RG only)
+ *
+ * known_ids and id_map have entries for the new IDs added to them.
+ *
+ * Return value is a linked list of header lines with the translated IDs,
+ * or NULL if something went wrong (probably out of memory).
+ *
+ */
+
+static klist_t(hdrln) * trans_rg_pg(bool is_rg, bam_hdr_t *translate,
+ bool merge, khash_t(cset)* known_ids,
+ khash_t(c2c)* id_map, char *override) {
+ hdr_match_t matches[2];
+ khiter_t iter;
+ const char *text = translate->text;
+ const char *rec_type = is_rg ? "@RG" : "@PG";
+ klist_t(hdrln) *hdr_lines;
+
+ hdr_lines = kl_init(hdrln);
+
+ // Search through translate's header
+ while (hdr_line_match(text, rec_type, "ID", matches) == 0) {
+ // matches[0] is the whole @RG/PG line; matches[1] is the ID field value
+
+ kstring_t orig_id = { 0, 0, NULL }; // ID in original header
+ kstring_t transformed_id = { 0, 0, NULL }; // ID in output header
+ char *map_value; // Value to store in id_map
+ bool id_changed; // Have we changed the ID?
+ bool not_found_in_output; // ID isn't in the output header (yet)
+
+ // Take a copy of the ID as we'll need it for a hash key.
+ if (match_to_ks(text, &matches[1], &orig_id)) goto memfail;
+
+ // is our matched ID in our output ID set already?
+ iter = kh_get(cset, known_ids, ks_str(&orig_id));
+ not_found_in_output = (iter == kh_end(known_ids));
+
+ if (override) {
+ // Override original ID (RG only)
+#ifdef OVERRIDE_DOES_NOT_MERGE
+ if (gen_unique_id(override, known_ids, false, &transformed_id))
+ goto memfail;
+ not_found_in_output = true; // As ID now unique
+#else
+ if (kputs(override, &transformed_id) == EOF) goto memfail;
+ // Know about override already?
+ iter = kh_get(cset, known_ids, ks_str(&transformed_id));
+ not_found_in_output = (iter == kh_end(known_ids));
+#endif
+ id_changed = true;
+ } else {
+ if ( not_found_in_output || merge) {
+ // Not in there or merging so can add it as 1-1 mapping
+ if (ks_to_ks(&orig_id, &transformed_id)) goto memfail;
+ id_changed = false;
+ } else {
+ // It's in there so we need to transform it by appending
+ // a random number to the id
+ if (gen_unique_id(ks_str(&orig_id), known_ids,
+ true, &transformed_id))
+ goto memfail;
+ id_changed = true;
+ not_found_in_output = true; // As ID now unique
+ }
}
- else free(transformed_line.s);
- text += matches[0].rm_eo; // next!
- }
- regfree(&rg_id);
+ // Does this line need to go into our output header?
+ if (not_found_in_output) {
- // Do same for PG id's
- regex_t pg_id;
- regcomp(&pg_id, "^@PG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
- text = translate->text;
- klist_t(hdrln) *pg_list = kl_init(hdrln);
- while(1) { // foreach pg id in translate's header
- if (regexec(&pg_id, text, 2, matches, 0) != 0) break;
- kstring_t match_id = { 0, 0, NULL };
- kputsn(text+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &match_id);
-
- // is our matched ID in our output list already
- regex_t pg_id_search;
- kstring_t pg_regex = { 0, 0, NULL };
- ksprintf(&pg_regex, "^@PG.*\tID:%s(\t.*$|$)", match_id.s);
- regcomp(&pg_id_search, pg_regex.s, REG_EXTENDED|REG_NEWLINE|REG_NOSUB);
- free(pg_regex.s);
- kstring_t transformed_id = { 0, 0, NULL };
- bool transformed_equals_match;
- if (regexec(&pg_id_search, out->text, 0, NULL, 0) != 0 || merge_pg) {
- // Not in there so can add it as 1-1 mapping
- kputs(match_id.s, &transformed_id);
- transformed_equals_match = true;
+ // Take matched line and replace ID with transformed_id
+ kstring_t new_hdr_line = { 0, 0, NULL };
+
+ if (!id_changed) { // Can just copy
+ if (match_to_ks(text, &matches[0], &new_hdr_line)) goto memfail;
+ } else { // Substitute new name for original
+ if (range_to_ks(text, matches[0].rm_so, matches[1].rm_so,
+ &new_hdr_line)) goto memfail;
+ if (ks_to_ks(&transformed_id, &new_hdr_line)) goto memfail;
+ if (range_to_ks(text, matches[1].rm_eo, matches[0].rm_eo,
+ &new_hdr_line)) goto memfail;
+ }
+
+ // append line to output linked list
+ char** ln = kl_pushp(hdrln, hdr_lines);
+ *ln = ks_release(&new_hdr_line); // Give away to linked list
+
+ // Need to add it to known_ids set
+ int in_there = 0;
+ iter = kh_put(cset, known_ids, ks_str(&transformed_id), &in_there);
+ if (in_there < 0) goto memfail;
+ assert(in_there > 0); // Should not already be in the map
+ map_value = ks_release(&transformed_id);
} else {
- // It's in there so we need to transform it by appending random number to id
- ksprintf(&transformed_id, "%s-%0lX", match_id.s, lrand48());
- transformed_equals_match = false;
+ // Use existing string in id_map
+ assert(kh_exist(known_ids, iter));
+ map_value = kh_key(known_ids, iter);
+ free(ks_release(&transformed_id));
}
- regfree(&pg_id_search);
// Insert it into our translation map
int in_there = 0;
- khiter_t iter = kh_put(c2c, tbl->pg_trans, ks_release(&match_id), &in_there);
- char *transformed_id_s = ks_release(&transformed_id);
- kh_value(tbl->pg_trans,iter) = transformed_id_s;
- // take matched line and replace ID with transformed_id
- kstring_t transformed_line = { 0, 0, NULL };
- if (transformed_equals_match) {
- kputsn(text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &transformed_line);
- } else {
- kputsn(text+matches[0].rm_so, matches[1].rm_so-matches[0].rm_so, &transformed_line);
- kputs(transformed_id_s, &transformed_line);
- kputsn(text+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
+ iter = kh_put(c2c, id_map, ks_release(&orig_id), &in_there);
+ kh_value(id_map, iter) = map_value;
+
+ text += matches[0].rm_eo; // next!
+ }
+
+ // If there are no @RG lines in the file and we are overriding, add one
+ if (is_rg && override && kl_begin(hdr_lines) == NULL) {
+ kstring_t new_id = {0, 0, NULL};
+ kstring_t line = {0, 0, NULL};
+ kstring_t empty = {0, 0, NULL};
+ int in_there = 0;
+ char** ln;
+
+ // Get the new ID
+ if (gen_unique_id(override, known_ids, false, &new_id))
+ goto memfail;
+
+ // Make into a header line and add to linked list
+ ksprintf(&line, "@RG\tID:%s", ks_str(&new_id));
+ ln = kl_pushp(hdrln, hdr_lines);
+ *ln = ks_release(&line);
+
+ // Put into known_ids set
+ iter = kh_put(cset, known_ids, ks_str(&new_id), &in_there);
+ if (in_there < 0) goto memfail;
+ assert(in_there > 0); // Should be a new entry
+
+ // Put into translation map (key is empty string)
+ if (kputs("", &empty) == EOF) goto memfail;
+ iter = kh_put(c2c, id_map, ks_release(&empty), &in_there);
+ if (in_there < 0) goto memfail;
+ assert(in_there > 0); // Should be a new entry
+ kh_value(id_map, iter) = ks_release(&new_id);
+ }
+
+ return hdr_lines;
+
+ memfail:
+ perror(__func__);
+ if (hdr_lines) kl_destroy(hdrln, hdr_lines);
+ return NULL;
+}
+
+/*
+ * Common code for completing RG and PG record translation.
+ *
+ * Input is a list of header lines, and the mapping from input to
+ * output @PG record IDs.
+ *
+ * RG and PG records can contain tags that cross-reference to other @PG
+ * records. This fixes the tags to contain the new IDs before adding
+ * them to the output header text.
+ */
+
+static int finish_rg_pg(bool is_rg, klist_t(hdrln) *hdr_lines,
+ khash_t(c2c)* pg_map, kstring_t *out_text) {
+ const char *search = is_rg ? "\tPG:" : "\tPP:";
+ khiter_t idx;
+ char *line = NULL;
+
+ while ((kl_shift(hdrln, hdr_lines, &line)) == 0) {
+ char *id = strstr(line, search); // Look for tag to fix
+ int pos1 = 0, pos2 = 0;
+ char *new_id = NULL;
+
+ if (id) {
+ // Found a tag. Look up the value in the translation map
+ // to see what it should be changed to in the output file.
+ char *end, tmp;
+
+ id += 4; // Point to value
+ end = strchr(id, '\t'); // Find end of tag
+ if (!end) end = id + strlen(id);
+
+ tmp = *end;
+ *end = '\0'; // Temporarily get the value on its own.
+
+ // Look-up in translation table
+ idx = kh_get(c2c, pg_map, id);
+ if (idx == kh_end(pg_map)) {
+ // Not found, warn.
+ fprintf(stderr, "[W::%s] Tag %s%s not found in @PG records\n",
+ __func__, search + 1, id);
+ } else {
+ // Remember new id and splice points on original string
+ new_id = kh_value(pg_map, idx);
+ pos1 = id - line;
+ pos2 = end - line;
+ }
+
+ *end = tmp; // Restore string
}
- if (!(transformed_equals_match && merge_pg)) {
- // append line to linked list for PP processing
- char** ln = kl_pushp(hdrln, pg_list);
- *ln = ks_release(&transformed_line); // Give away to linked list
+ // Copy line to output:
+ // line[0..pos1), new_id (if not NULL), line[pos2..end), '\n'
+
+ if (pos1 && range_to_ks(line, 0, pos1, out_text)) goto memfail;
+ if (new_id && kputs(new_id, out_text) == EOF) goto memfail;
+ if (kputs(line + pos2, out_text) == EOF) goto memfail;
+ if (kputc('\n', out_text) == EOF) goto memfail;
+ free(line); // No longer needed
+ line = NULL;
+ }
+
+ return 0;
+
+ memfail:
+ perror(__func__);
+ free(line); // Prevent leakage as no longer on list
+ return -1;
+}
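/*
 * [Editorial aside, not part of the upstream patch] A concrete, hypothetical
 * example of what finish_rg_pg() does. Suppose the input file's @PG ID "bwa"
 * collided and trans_rg_pg() renamed it, so pg_map contains
 * "bwa" -> "bwa-1A2B". A translated @PG line arriving here as
 *
 *     @PG\tID:samtools\tPN:samtools\tPP:bwa
 *
 * is appended to out_text as
 *
 *     @PG\tID:samtools\tPN:samtools\tPP:bwa-1A2B
 *
 * Only the cross-referencing PP: value (or PG: for @RG records) is rewritten;
 * the rest of the line is copied verbatim and a trailing newline is added.
 */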
+
+/*
+ * Build the translation table for an input *am file. This stores mappings
+ * which allow IDs to be converted from those used in the input file
+ * to the ones which will be used in the output. The mappings are for:
+ * Reference sequence IDs (for @SQ records)
+ * @RG record ID tags
+ * @PG record ID tags
+ *
+ * At the same time, new header text is built up by copying records
+ * from the input bam file. This will eventually become the header for
+ * the output file. When copied, the ID tags for @RG and @PG records
+ * are replaced with their new values. The @PG PP: and @RG PG: tags
+ * are also modified if necessary.
+ *
+ * merged_hdr holds state on the output header (which IDs are present, etc.)
+ * translate is the input header
+ * tbl is the translation table that gets filled in.
+ * merge_rg controls merging of @RG records
+ * merge_pg controls merging of @PG records
+ * If rg_override is not NULL, it will be used to replace the existing @RG ID
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+
+static int trans_tbl_init(merged_header_t* merged_hdr, bam_hdr_t* translate,
+ trans_tbl_t* tbl, bool merge_rg, bool merge_pg,
+ char* rg_override)
+{
+ klist_t(hdrln) *rg_list = NULL;
+ klist_t(hdrln) *pg_list = NULL;
+
+ tbl->n_targets = translate->n_targets;
+ tbl->rg_trans = tbl->pg_trans = NULL;
+ tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int));
+ if (tbl->tid_trans == NULL) goto memfail;
+ tbl->rg_trans = kh_init(c2c);
+ if (tbl->rg_trans == NULL) goto memfail;
+ tbl->pg_trans = kh_init(c2c);
+ if (tbl->pg_trans == NULL) goto memfail;
+
+ tbl->lost_coord_sort = false;
+
+ // Get the @HD record (if not there already).
+ if (trans_tbl_add_hd(merged_hdr, translate)) goto fail;
+
+ // Fill in map and add header lines for @SQ records
+ if (trans_tbl_add_sq(merged_hdr, translate, tbl)) goto fail;
+
+ // Get translated header lines and fill in map for @RG records
+ rg_list = trans_rg_pg(true, translate, merge_rg, merged_hdr->rg_ids,
+ tbl->rg_trans, rg_override);
+ if (!rg_list) goto fail;
+
+ // Get translated header lines and fill in map for @PG records
+ pg_list = trans_rg_pg(false, translate, merge_pg, merged_hdr->pg_ids,
+ tbl->pg_trans, NULL);
+ if (!pg_list) goto fail;
+
+ // Fix-up PG: tags in the new @RG records and add to output
+ if (finish_rg_pg(true, rg_list, tbl->pg_trans, &merged_hdr->out_rg))
+ goto fail;
+
+ // Fix-up PP: tags in the new @PG records and add to output
+ if (finish_rg_pg(false, pg_list, tbl->pg_trans, &merged_hdr->out_pg))
+ goto fail;
+
+ kl_destroy(hdrln, rg_list); rg_list = NULL;
+ kl_destroy(hdrln, pg_list); pg_list = NULL;
+
+ // Just append @CO headers without translation
+ const char *line, *end_pointer;
+ for (line = translate->text; *line; line = end_pointer + 1) {
+ end_pointer = strchr(line, '\n');
+ if (strncmp(line, "@CO", 3) == 0) {
+ if (end_pointer) {
+ if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF)
+ goto memfail;
+ } else { // Last line with no trailing '\n'
+ if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail;
+ if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail;
+ }
}
- else free(transformed_line.s);
- text += matches[0].rm_eo; // next!
+ if (end_pointer == NULL) break;
+ }
+
+ return 0;
+
+ memfail:
+ perror(__func__);
+ fail:
+ trans_tbl_destroy(tbl);
+ if (rg_list) kl_destroy(hdrln, rg_list);
+ if (pg_list) kl_destroy(hdrln, pg_list);
+ return -1;
+}
+
+static inline void move_kstr_to_text(char **text, kstring_t *ks) {
+ memcpy(*text, ks_str(ks), ks_len(ks));
+ *text += ks_len(ks);
+ **text = '\0';
+ free(ks_release(ks));
+}
+
+/*
+ * Populate a bam_hdr_t struct from data in a merged_header_t.
+ */
+
+static bam_hdr_t * finish_merged_header(merged_header_t *merged_hdr) {
+ size_t txt_sz;
+ char *text;
+ bam_hdr_t *hdr;
+
+ // Check output text size
+ txt_sz = (ks_len(&merged_hdr->out_hd)
+ + ks_len(&merged_hdr->out_sq)
+ + ks_len(&merged_hdr->out_rg)
+ + ks_len(&merged_hdr->out_pg)
+ + ks_len(&merged_hdr->out_co));
+ if (txt_sz >= INT32_MAX) {
+ fprintf(stderr, "[%s] Output header text too long\n", __func__);
+ return NULL;
}
- regfree(&pg_id);
- // need to translate PP's on the fly in second pass because they may not be in correct order and need complete tbl->pg_trans to do this
- // for each line {
- // with ID replaced with tranformed_id and PP's transformed using the translation table
- // }
- regex_t pg_pp;
- regcomp(&pg_pp, "^@PG.*\tPP:([!-)+-<>-~][!-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
- kliter_t(hdrln) *iter = kl_begin(pg_list);
- while (iter != kl_end(pg_list)) {
- char* data = kl_val(iter);
-
- kstring_t transformed_line = { 0, 0, NULL };
- // Find PP tag
- if (regexec(&pg_pp, data, 2, matches, 0) == 0) {
- // Lookup in hash table
- kstring_t pp_id = { 0, 0, NULL };
- kputsn(data+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &pp_id);
-
- khiter_t k = kh_get(c2c, tbl->pg_trans, pp_id.s);
- free(pp_id.s);
- char* transformed_id = kh_value(tbl->pg_trans,k);
- // Replace
- kputsn(data, matches[1].rm_so-matches[0].rm_so, &transformed_line);
- kputs(transformed_id, &transformed_line);
- kputsn(data+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
- } else { kputs(data, &transformed_line); }
- // Produce our output line and append it to out_text
- kputc('\n', &out_text);
- kputsn(transformed_line.s, transformed_line.l, &out_text);
-
- free(transformed_line.s);
- free(data);
- iter = kl_next(iter);
+
+ // Allocate new header
+ hdr = bam_hdr_init();
+ if (hdr == NULL) goto memfail;
+
+ // Transfer targets arrays to new header
+ hdr->n_targets = merged_hdr->n_targets;
+ if (hdr->n_targets > 0) {
+ // Try to shrink targets arrays to correct size
+ hdr->target_name = realloc(merged_hdr->target_name,
+ hdr->n_targets * sizeof(char*));
+ if (!hdr->target_name) hdr->target_name = merged_hdr->target_name;
+
+ hdr->target_len = realloc(merged_hdr->target_len,
+ hdr->n_targets * sizeof(uint32_t));
+ if (!hdr->target_len) hdr->target_len = merged_hdr->target_len;
+
+ // These have either been freed by realloc() or, in the unlikely
+ // event that it failed, have had their ownership transferred to hdr.
+ merged_hdr->target_name = NULL;
+ merged_hdr->target_len = NULL;
}
- regfree(&pg_pp);
-
- // Need to also translate @RG PG's on the fly too
- regex_t rg_pg;
- regcomp(&rg_pg, "^@RG.*\tPG:([!-)+-<>-~][!-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
- kliter_t(hdrln) *rg_iter = kl_begin(rg_list);
- while (rg_iter != kl_end(rg_list)) {
- char* data = kl_val(rg_iter);
-
- kstring_t transformed_line = { 0, 0, NULL };
- // Find PG tag
- if (regexec(&rg_pg, data, 2, matches, 0) == 0) {
- // Lookup in hash table
- kstring_t pg_id = { 0, 0, NULL };
- kputsn(data+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &pg_id);
-
- khiter_t k = kh_get(c2c, tbl->pg_trans, pg_id.s);
- free(pg_id.s);
- char* transformed_id = kh_value(tbl->pg_trans,k);
- // Replace
- kputsn(data, matches[1].rm_so-matches[0].rm_so, &transformed_line);
- kputs(transformed_id, &transformed_line);
- kputsn(data+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
- } else { kputs(data, &transformed_line); }
- // Produce our output line and append it to out_text
- kputc('\n', &out_text);
- kputsn(transformed_line.s, transformed_line.l, &out_text);
-
- free(transformed_line.s);
- free(data);
- rg_iter = kl_next(rg_iter);
+ else {
+ hdr->target_name = NULL;
+ hdr->target_len = NULL;
}
- regfree(&rg_pg);
- kl_destroy(hdrln,pg_list);
- kl_destroy(hdrln,rg_list);
- free(matches);
+ // Allocate text
+ text = hdr->text = malloc(txt_sz + 1);
+ if (!text) goto memfail;
+
+ // Put header text in order @HD, @SQ, @RG, @PG, @CO
+ move_kstr_to_text(&text, &merged_hdr->out_hd);
+ move_kstr_to_text(&text, &merged_hdr->out_sq);
+ move_kstr_to_text(&text, &merged_hdr->out_rg);
+ move_kstr_to_text(&text, &merged_hdr->out_pg);
+ move_kstr_to_text(&text, &merged_hdr->out_co);
+ hdr->l_text = txt_sz;
- // Add trailing \n and write back to header
- free(out->text);
- kputc('\n', &out_text);
- out->l_text = out_text.l;
- out->text = ks_release(&out_text);
+ return hdr;
+
+ memfail:
+ perror(__func__);
+ bam_hdr_destroy(hdr);
+ return NULL;
+}
+
+/*
+ * Free a merged_header_t struct and all associated data.
+ *
+ * Note that the keys to the rg_ids and pg_ids sets are also used as
+ * values in the translation tables. This function should therefore not
+ * be called until the translation tables are no longer needed.
+ */
+
+static void free_merged_header(merged_header_t *merged_hdr) {
+ size_t i;
+ khiter_t iter;
+ if (!merged_hdr) return;
+ free(ks_release(&merged_hdr->out_hd));
+ free(ks_release(&merged_hdr->out_sq));
+ free(ks_release(&merged_hdr->out_rg));
+ free(ks_release(&merged_hdr->out_pg));
+ free(ks_release(&merged_hdr->out_co));
+ if (merged_hdr->target_name) {
+ for (i = 0; i < merged_hdr->n_targets; i++) {
+ free(merged_hdr->target_name[i]);
+ }
+ free(merged_hdr->target_name);
+ }
+ free(merged_hdr->target_len);
+ kh_destroy(c2i, merged_hdr->sq_tids);
+
+ if (merged_hdr->rg_ids) {
+ for (iter = kh_begin(merged_hdr->rg_ids);
+ iter != kh_end(merged_hdr->rg_ids); ++iter) {
+ if (kh_exist(merged_hdr->rg_ids, iter))
+ free(kh_key(merged_hdr->rg_ids, iter));
+ }
+ kh_destroy(cset, merged_hdr->rg_ids);
+ }
+
+ if (merged_hdr->pg_ids) {
+ for (iter = kh_begin(merged_hdr->pg_ids);
+ iter != kh_end(merged_hdr->pg_ids); ++iter) {
+ if (kh_exist(merged_hdr->pg_ids, iter))
+ free(kh_key(merged_hdr->pg_ids, iter));
+ }
+ kh_destroy(cset, merged_hdr->pg_ids);
+ }
+
+ free(merged_hdr);
}
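/*
 * [Editorial aside, not part of the upstream patch] A rough sketch, under
 * assumed conditions, of how the pieces above fit together when merging the
 * headers of several inputs. "hdrs", "n" and the function name are
 * hypothetical; error reporting and the record-level merge are omitted.
 */
static bam_hdr_t *example_merge_headers(bam_hdr_t **hdrs, int n)
{
    merged_header_t *mh = init_merged_header();
    trans_tbl_t *tbl = calloc(n, sizeof(*tbl));
    bam_hdr_t *hout = NULL;
    int i, tbls_done = 0;

    if (!mh || !tbl) goto done;

    // One translation table per input; this also appends the input's
    // @HD/@SQ/@RG/@PG/@CO text to the merged header state in mh.
    for (i = 0; i < n; i++) {
        if (trans_tbl_init(mh, hdrs[i], &tbl[i], false, false, NULL))
            goto done;              // trans_tbl_init() cleans up tbl[i] itself
        tbls_done++;
    }

    // Assemble the final bam_hdr_t from the accumulated kstrings.
    hout = finish_merged_header(mh);

 done:
    // Reads would normally be passed through bam_translate() with tbl[i]
    // before the tables and the merged header state are freed.
    for (i = 0; i < tbls_done; i++) trans_tbl_destroy(&tbl[i]);
    free(tbl);
    free_merged_header(mh);
    return hout;
}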
static void bam_translate(bam1_t* b, trans_tbl_t* tbl)
@@ -496,10 +978,25 @@ static void bam_translate(bam1_t* b, trans_tbl_t* tbl)
if (k != kh_end(tbl->rg_trans)) {
char* translate_rg = kh_value(tbl->rg_trans,k);
bam_aux_del(b, rg);
- bam_aux_append(b, "RG", 'Z', strlen(translate_rg) + 1, (uint8_t*)translate_rg);
+ if (translate_rg) {
+ bam_aux_append(b, "RG", 'Z', strlen(translate_rg) + 1,
+ (uint8_t*)translate_rg);
+ }
} else {
- fprintf(stderr, "[bam_translate] RG tag \"%s\" on read \"%s\" encountered with no corresponding entry in header, tag lost\n",decoded_rg, bam_get_qname(b));
+ char *tmp = strdup(decoded_rg);
+ fprintf(stderr,
+ "[bam_translate] RG tag \"%s\" on read \"%s\" encountered "
+ "with no corresponding entry in header, tag lost. "
+ "Unknown tags are only reported once per input file for "
+ "each tag ID.\n",
+ decoded_rg, bam_get_qname(b));
bam_aux_del(b, rg);
+ // Prevent future whinges
+ if (tmp) {
+ int in_there = 0;
+ k = kh_put(c2c, tbl->rg_trans, tmp, &in_there);
+ if (in_there > 0) kh_value(tbl->rg_trans, k) = NULL;
+ }
}
}
@@ -511,10 +1008,25 @@ static void bam_translate(bam1_t* b, trans_tbl_t* tbl)
if (k != kh_end(tbl->pg_trans)) {
char* translate_pg = kh_value(tbl->pg_trans,k);
bam_aux_del(b, pg);
- bam_aux_append(b, "PG", 'Z', strlen(translate_pg) + 1, (uint8_t*)translate_pg);
+ if (translate_pg) {
+ bam_aux_append(b, "PG", 'Z', strlen(translate_pg) + 1,
+ (uint8_t*)translate_pg);
+ }
} else {
- fprintf(stderr, "[bam_translate] PG tag \"%s\" on read \"%s\" encountered with no corresponding entry in header, tag lost\n",decoded_pg, bam_get_qname(b));
+ char *tmp = strdup(decoded_pg);
+ fprintf(stderr,
+ "[bam_translate] PG tag \"%s\" on read \"%s\" encountered "
+ "with no corresponding entry in header, tag lost. "
+ "Unknown tags are only reported once per input file for "
+ "each tag ID.\n",
+ decoded_pg, bam_get_qname(b));
bam_aux_del(b, pg);
+ // Prevent future whinges
+ if (tmp) {
+ int in_there = 0;
+ k = kh_put(c2c, tbl->pg_trans, tmp, &in_there);
+ if (in_there > 0) kh_value(tbl->pg_trans, k) = NULL;
+ }
}
}
}
@@ -579,20 +1091,28 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
@param flag flags that control how the merge is undertaken
@param reg region to merge
@param n_threads number of threads to use (passed to htslib)
+ @param in_fmt format options for input files
+ @param out_fmt output file format and options
@discussion Padding information may NOT be correctly maintained. This
function is NOT thread safe.
*/
-int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads)
+int bam_merge_core2(int by_qname, const char *out, const char *mode,
+ const char *headers, int n, char * const *fn, int flag,
+ const char *reg, int n_threads,
+ const htsFormat *in_fmt, const htsFormat *out_fmt)
{
samFile *fpout, **fp;
heap1_t *heap;
bam_hdr_t *hout = NULL;
+ bam_hdr_t *hin = NULL;
int i, j, *RG_len = NULL;
uint64_t idx = 0;
char **RG = NULL;
hts_itr_t **iter = NULL;
bam_hdr_t **hdr = NULL;
trans_tbl_t *translation_tbl = NULL;
+ merged_header_t *merged_hdr = init_merged_header();
+ if (!merged_hdr) return -1;
// Is there a specified pre-prepared header to use for output?
if (headers) {
@@ -602,8 +1122,13 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
return -1;
}
- hout = sam_hdr_read(fpheaders);
+ hin = sam_hdr_read(fpheaders);
sam_close(fpheaders);
+ if (hin == NULL) {
+ fprintf(stderr, "[bam_merge_core] couldn't read headers for '%s'\n",
+ headers);
+ return -1;
+ }
}
g_is_by_qname = by_qname;
@@ -612,14 +1137,15 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*));
hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*));
translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t));
+ RG = (char**)calloc(n, sizeof(char*));
// prepare RG tag from file names
if (flag & MERGE_RG) {
- RG = (char**)calloc(n, sizeof(char*));
RG_len = (int*)calloc(n, sizeof(int));
for (i = 0; i != n; ++i) {
int l = strlen(fn[i]);
const char *s = fn[i];
- if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
+ if (l > 4 && (strcmp(s + l - 4, ".bam") == 0 || strcmp(s + l - 4, ".sam") == 0)) l -= 4;
+ if (l > 5 && strcmp(s + l - 5, ".cram") == 0) l -= 5;
for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
++j; l -= j;
RG[i] = (char*)calloc(l + 1, 1);
@@ -627,28 +1153,50 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
strncpy(RG[i], s + j, l);
}
}
+
+ if (hin) {
+ // Populate merged_hdr from the pre-prepared header
+ trans_tbl_t dummy;
+ int res;
+ res = trans_tbl_init(merged_hdr, hin, &dummy, flag & MERGE_COMBINE_RG,
+ flag & MERGE_COMBINE_PG, NULL);
+ trans_tbl_destroy(&dummy);
+ if (res) return -1; // FIXME: memory leak
+ }
+
// open and read the header from each file
for (i = 0; i < n; ++i) {
bam_hdr_t *hin;
- fp[i] = sam_open(fn[i], "r");
+ fp[i] = sam_open_format(fn[i], "r", in_fmt);
if (fp[i] == NULL) {
int j;
fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
- for (j = 0; j < i; ++j) sam_close(fp[j]);
+ for (j = 0; j < i; ++j) {
+ bam_hdr_destroy(hdr[i]);
+ sam_close(fp[j]);
+ }
free(fp); free(heap);
// FIXME: possible memory leak
return -1;
}
hin = sam_hdr_read(fp[i]);
- if (hout)
- trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG);
- else {
- // As yet, no headers to merge into...
- hout = bam_hdr_dup(hin);
- // ...so no need to translate header into itself
- trans_tbl_init(hout, hin, translation_tbl+i, true, true);
+ if (hin == NULL) {
+ fprintf(stderr, "[bam_merge_core] failed to read header for '%s'\n",
+ fn[i]);
+ for (j = 0; j < i; ++j) {
+ bam_hdr_destroy(hdr[i]);
+ sam_close(fp[j]);
+ }
+ free(fp); free(heap);
+ // FIXME: possible memory leak
+ return -1;
}
+ if (trans_tbl_init(merged_hdr, hin, translation_tbl+i,
+ flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG,
+ RG[i]))
+ return -1; // FIXME: memory leak
+
// TODO sam_itr_next() doesn't yet work for SAM files,
// so for those keep the headers around for use with sam_read1()
if (hts_get_format(fp[i])->format == sam) hdr[i] = hin;
@@ -659,8 +1207,20 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
}
}
+ // Did we get an @HD line?
+ if (!merged_hdr->have_hd) {
+ fprintf(stderr, "[W::%s] No @HD tag found.\n", __func__);
+ /* FIXME: Should we add an @HD line here, and if so what should
+ we put in it? Ideally we want a way of getting htslib to tell
+ us the SAM version number to assume given no @HD line. Is
+ it also safe to assume that the output is coordinate sorted?
+ SO: is optional so we don't have to have it.*/
+ /* ksprintf(&merged_hdr->out_hd, "@HD\tVN:1.5\tSO:coordinate\n"); */
+ }
+
// Transform the header into standard form
- pretty_header(&hout->text,hout->l_text);
+ hout = finish_merged_header(merged_hdr);
+ if (!hout) return -1; // FIXME: memory leak
// If we're only merging a specified region move our iters to start at that point
if (reg) {
@@ -668,19 +1228,33 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
int tid, beg, end;
const char *name_lim = hts_parse_reg(reg, &beg, &end);
- char *name = malloc(name_lim - reg + 1);
- memcpy(name, reg, name_lim - reg);
- name[name_lim - reg] = '\0';
- tid = bam_name2id(hout, name);
- free(name);
+ if (name_lim) {
+ char *name = malloc(name_lim - reg + 1);
+ memcpy(name, reg, name_lim - reg);
+ name[name_lim - reg] = '\0';
+ tid = bam_name2id(hout, name);
+ free(name);
+ }
+ else {
+ // not parsable as a region, but possibly a sequence named "foo:a"
+ tid = bam_name2id(hout, reg);
+ beg = 0;
+ end = INT_MAX;
+ }
if (tid < 0) {
- fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__);
+ if (name_lim) fprintf(stderr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg);
+ else fprintf(stderr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg);
return -1;
}
for (i = 0; i < n; ++i) {
hts_idx_t *idx = sam_index_load(fp[i], fn[i]);
// (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
int mapped_tid = rtrans[i*hout->n_targets+tid];
+ if (idx == NULL) {
+ fprintf(stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n",
+ __func__, fn[i]);
+ return -1;
+ }
if (mapped_tid != INT32_MIN) {
iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end);
} else {
@@ -723,7 +1297,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
}
// Open output file and write header
- if ((fpout = sam_open(out, mode)) == 0) {
+ if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) {
fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
return -1;
}
@@ -755,7 +1329,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
// Clean up and close
if (flag & MERGE_RG) {
for (i = 0; i != n; ++i) free(RG[i]);
- free(RG); free(RG_len);
+ free(RG_len);
}
for (i = 0; i < n; ++i) {
trans_tbl_destroy(translation_tbl + i);
@@ -763,37 +1337,45 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
bam_hdr_destroy(hdr[i]);
sam_close(fp[i]);
}
+ bam_hdr_destroy(hin);
bam_hdr_destroy(hout);
+ free_merged_header(merged_hdr);
sam_close(fpout);
- free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
+ free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
return 0;
}
+// Unused here but may be used by legacy samtools-using third-party code
int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg)
{
char mode[12];
strcpy(mode, "wb");
if (flag & MERGE_UNCOMP) strcat(mode, "0");
else if (flag & MERGE_LEVEL1) strcat(mode, "1");
- return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0);
+ return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, NULL, NULL);
}
static void merge_usage(FILE *to)
{
- fprintf(to, "Usage: samtools merge [-nurlf] [-h inh.sam] [-b <bamlist.fofn>] <out.bam> <in1.bam> <in2.bam> [<in3.bam> ... <inN.bam>]\n\n");
- fprintf(to, "Options: -n sort by read names\n");
- fprintf(to, " -r attach RG tag (inferred from file names)\n");
- fprintf(to, " -u uncompressed BAM output\n");
- fprintf(to, " -f overwrite the output BAM if exist\n");
- fprintf(to, " -1 compress level 1\n");
- fprintf(to, " -l INT compression level, from 0 to 9 [-1]\n");
- fprintf(to, " -@ INT number of BAM compression threads [0]\n");
- fprintf(to, " -R STR merge file in the specified region STR [all]\n");
- fprintf(to, " -h FILE copy the header in FILE to <out.bam> [in1.bam]\n");
- fprintf(to, " -c combine RG tags with colliding IDs rather than amending them\n");
- fprintf(to, " -p combine PG tags with colliding IDs rather than amending them\n");
- fprintf(to, " -s VALUE override random seed\n");
- fprintf(to, " -b FILE list of input BAM filenames, one per line [null]\n\n");
+ fprintf(to,
+"Usage: samtools merge [-nurlf] [-h inh.sam] [-b <bamlist.fofn>] <out.bam> <in1.bam> [<in2.bam> ... <inN.bam>]\n"
+"\n"
+"Options:\n"
+" -n Sort by read names\n"
+" -r Attach RG tag (inferred from file names)\n"
+" -u Uncompressed BAM output\n"
+" -f Overwrite the output BAM if exist\n"
+" -1 Compress level 1\n"
+" -l INT Compression level, from 0 to 9 [-1]\n"
+" -R STR Merge file in the specified region STR [all]\n"
+" -h FILE Copy the header in FILE to <out.bam> [in1.bam]\n"
+" -c Combine @RG headers with colliding IDs [alter IDs to be distinct]\n"
+" -p Combine @PG headers with colliding IDs [alter IDs to be distinct]\n"
+" -s VALUE Override random seed\n"
+" -b FILE List of input BAM filenames, one per line [null]\n"
+" -@, --threads INT\n"
+" Number of BAM/CRAM compression threads [0]\n");
+ sam_global_opt_help(to, "-.O..");
}
int bam_merge(int argc, char *argv[])
@@ -804,12 +1386,19 @@ int bam_merge(int argc, char *argv[])
char** fn = NULL;
int fn_size = 0;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ { "threads", required_argument, NULL, '@' },
+ { NULL, 0, NULL, 0 }
+ };
+
if (argc == 1) {
merge_usage(stdout);
return 0;
}
- while ((c = getopt(argc, argv, "h:nru1R:f@:l:cps:b:")) >= 0) {
+ while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': flag |= MERGE_RG; break;
case 'f': flag |= MERGE_FORCE; break;
@@ -840,6 +1429,10 @@ int bam_merge(int argc, char *argv[])
}
break;
}
+
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': merge_usage(stderr); return 1;
}
}
if ( argc - optind < 1 ) {
@@ -865,22 +1458,28 @@ int bam_merge(int argc, char *argv[])
if (fn == NULL) { ret = 1; goto end; }
memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*));
}
- if (fn_size+nargcfiles < 2) {
- fprintf(stderr, "You must specify at least 2 input files.\n");
+ if (fn_size+nargcfiles < 1) {
+ fprintf(stderr, "You must specify at least one (and usually two or more) input files.\n");
merge_usage(stderr);
return 1;
}
strcpy(mode, "wb");
+ sam_open_mode(mode+1, argv[optind], NULL);
if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
- if (bam_merge_core2(is_by_qname, argv[optind], mode, fn_headers, fn_size+nargcfiles, fn, flag, reg, n_threads) < 0) ret = 1;
+ if (bam_merge_core2(is_by_qname, argv[optind], mode, fn_headers,
+ fn_size+nargcfiles, fn, flag, reg, n_threads,
+ &ga.in, &ga.out) < 0)
+ ret = 1;
+
end:
if (fn_size > 0) {
int i;
for (i=0; i<fn_size; i++) free(fn[i]);
- free(fn);
}
+ free(fn);
free(reg);
free(fn_headers);
+ sam_global_args_free(&ga);
return ret;
}
@@ -944,11 +1543,11 @@ typedef struct {
int index;
} worker_t;
-static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads)
+static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt)
{
size_t i;
samFile* fp;
- fp = sam_open(fn, mode);
+ fp = sam_open_format(fn, mode, fmt);
if (fp == NULL) return;
sam_hdr_write(fp, h);
if (n_threads > 1) hts_set_threads(fp, n_threads);
@@ -964,7 +1563,17 @@ static void *worker(void *data)
ks_mergesort(sort, w->buf_len, w->buf, 0);
name = (char*)calloc(strlen(w->prefix) + 20, 1);
sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
- write_buffer(name, "wb1", w->buf_len, w->buf, w->h, 0);
+ write_buffer(name, "wb1", w->buf_len, w->buf, w->h, 0, NULL);
+
+// Consider using CRAM temporary files if the final output is CRAM.
+// Typically it is comparable speed while being smaller.
+// hts_opt opt[2] = {
+// {"version=3.0", CRAM_OPT_VERSION, {"3.0"}, NULL},
+// {"no_ref", CRAM_OPT_NO_REF, {1}, NULL}
+// };
+// opt[0].next = &opt[1];
+// write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt);
+
free(name);
return 0;
}
@@ -1009,17 +1618,22 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
@param fnout name of the final output file to be written
@param modeout sam_open() mode to be used to create the final output file
@param max_mem approximate maximum memory (very inaccurate)
+ @param in_fmt input file format options
+ @param out_fmt output file format and options
@return 0 for successful sorting, negative on errors
@discussion It may create multiple temporary subalignment files
- and then merge them by calling bam_merge_core(). This function is
+ and then merge them by calling bam_merge_core2(). This function is
NOT thread safe.
*/
-int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const char *fnout, const char *modeout, size_t _max_mem, int n_threads)
+int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
+ const char *fnout, const char *modeout,
+ size_t _max_mem, int n_threads,
+ const htsFormat *in_fmt, const htsFormat *out_fmt)
{
- int ret, i, n_files = 0;
+ int ret = -1, i, n_files = 0;
size_t mem, max_k, k, max_mem;
- bam_hdr_t *header;
+ bam_hdr_t *header = NULL;
samFile *fp;
bam1_t *b, **buf;
@@ -1028,12 +1642,17 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const
max_k = k = 0; mem = 0;
max_mem = _max_mem * n_threads;
buf = NULL;
- fp = sam_open(fn, "r");
+ fp = sam_open_format(fn, "r", in_fmt);
if (fp == NULL) {
- fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn);
- return -1;
+ const char *message = strerror(errno);
+ fprintf(stderr, "[bam_sort_core] fail to open '%s': %s\n", fn, message);
+ return -2;
}
header = sam_hdr_read(fp);
+ if (header == NULL) {
+ fprintf(stderr, "[bam_sort_core] failed to read header for '%s'\n", fn);
+ goto err;
+ }
if (is_by_qname) change_SO(header, "queryname");
else change_SO(header, "coordinate");
// write sub files
@@ -1059,12 +1678,16 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const
mem = k = 0;
}
}
- if (ret != -1)
- fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n");
+ if (ret != -1) {
+ fprintf(stderr, "[bam_sort_core] truncated file. Aborting.\n");
+ ret = -1;
+ goto err;
+ }
+
// write the final output
if (n_files == 0) { // a single block
ks_mergesort(sort, k, buf, 0);
- write_buffer(fnout, modeout, k, buf, header, n_threads);
+ write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt);
} else { // then merge
char **fns;
n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
@@ -1074,10 +1697,12 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const
fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
sprintf(fns[i], "%s.%.4d.bam", prefix, i);
}
- if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns, MERGE_COMBINE_RG|MERGE_COMBINE_PG, NULL, n_threads) < 0) {
+ if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns,
+ MERGE_COMBINE_RG|MERGE_COMBINE_PG, NULL, n_threads,
+ in_fmt, out_fmt) < 0) {
// Propagate bam_merge_core2() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
- return -1;
+ goto err;
}
for (i = 0; i < n_files; ++i) {
unlink(fns[i]);
@@ -1085,25 +1710,30 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const
}
free(fns);
}
+
+ ret = 0;
+
+ err:
// free
for (k = 0; k < max_k; ++k) bam_destroy1(buf[k]);
free(buf);
bam_hdr_destroy(header);
sam_close(fp);
- return 0;
+ return ret;
}
+// Unused here but may be used by legacy samtools-using third-party code
int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem)
{
int ret;
char *fnout = calloc(strlen(prefix) + 4 + 1, 1);
sprintf(fnout, "%s.bam", prefix);
- ret = bam_sort_core_ext(is_by_qname, fn, prefix, fnout, "wb", max_mem, 0);
+ ret = bam_sort_core_ext(is_by_qname, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL);
free(fnout);
return ret;
}
-static int sort_usage(FILE *fp, int status)
+static void sort_usage(FILE *fp)
{
fprintf(fp,
"Usage: samtools sort [options...] [in.bam]\n"
@@ -1112,33 +1742,29 @@ static int sort_usage(FILE *fp, int status)
" -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
" -n Sort by read name\n"
" -o FILE Write final output to FILE rather than standard output\n"
-" -O FORMAT Write output as FORMAT ('sam'/'bam'/'cram') (either -O or\n"
-" -T PREFIX Write temporary files to PREFIX.nnnn.bam -T is required)\n"
-" -@ INT Set number of sorting and compression threads [1]\n"
-"\n"
-"Legacy usage: samtools sort [options...] <in.bam> <out.prefix>\n"
-"Options:\n"
-" -f Use <out.prefix> as full final filename rather than prefix\n"
-" -o Write final output to stdout rather than <out.prefix>.bam\n"
-" -l,m,n,@ Similar to corresponding options above\n");
- return status;
+" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"
+" -@, --threads INT\n"
+" Set number of sorting and compression threads [1]\n");
+ sam_global_opt_help(fp, "-.O..");
}
int bam_sort(int argc, char *argv[])
{
size_t max_mem = 768<<20; // 768MB
- int c, i, modern, nargs, is_by_qname = 0, is_stdout = 0, ret = EXIT_SUCCESS, n_threads = 0, level = -1, full_path = 0;
- char *fnout = "-", *fmtout = NULL, modeout[12], *tmpprefix = NULL;
- kstring_t fnout_buffer = { 0, 0, NULL };
-
- modern = 0;
- for (i = 1; i < argc; ++i)
- if (argv[i][0] == '-' && strpbrk(argv[i], "OT")) { modern = 1; break; }
-
- while ((c = getopt(argc, argv, modern? "l:m:no:O:T:@:" : "fnom:@:l:")) >= 0) {
+ int c, nargs, is_by_qname = 0, ret, o_seen = 0, n_threads = 0, level = -1;
+ char *fnout = "-", modeout[12];
+ kstring_t tmpprefix = { 0, 0, NULL };
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ { "threads", required_argument, NULL, '@' },
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:", lopts, NULL)) >= 0) {
switch (c) {
- case 'f': full_path = 1; break;
- case 'o': if (modern) fnout = optarg; else is_stdout = 1; break;
+ case 'o': fnout = optarg; o_seen = 1; break;
case 'n': is_by_qname = 1; break;
case 'm': {
char *q;
@@ -1148,49 +1774,57 @@ int bam_sort(int argc, char *argv[])
else if (*q == 'g' || *q == 'G') max_mem <<= 30;
break;
}
- case 'O': fmtout = optarg; break;
- case 'T': tmpprefix = optarg; break;
+ case 'T': kputs(optarg, &tmpprefix); break;
case '@': n_threads = atoi(optarg); break;
case 'l': level = atoi(optarg); break;
- default: return sort_usage(stderr, EXIT_FAILURE);
+
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': sort_usage(stderr); ret = EXIT_FAILURE; goto sort_end;
}
}
nargs = argc - optind;
- if (argc == 1)
- return sort_usage(stdout, EXIT_SUCCESS);
- else if (modern? (nargs > 1) : (nargs != 2))
- return sort_usage(stderr, EXIT_FAILURE);
-
- if (!modern) {
- fmtout = "bam";
- if (is_stdout) fnout = "-";
- else if (full_path) fnout = argv[optind+1];
- else {
- ksprintf(&fnout_buffer, "%s.%s", argv[optind+1], fmtout);
- fnout = fnout_buffer.s;
- }
- tmpprefix = argv[optind+1];
+ if (nargs == 0 && isatty(STDIN_FILENO)) {
+ sort_usage(stdout);
+ ret = EXIT_SUCCESS;
+ goto sort_end;
}
+ else if (nargs >= 2) {
+ // If exactly two, user probably tried to specify legacy <out.prefix>
+ if (nargs == 2)
+ fprintf(stderr, "[bam_sort] Use -T PREFIX / -o FILE to specify temporary and final output files\n");
- strcpy(modeout, "w");
- if (sam_open_mode(&modeout[1], fnout, fmtout) < 0) {
- if (fmtout) fprintf(stderr, "[bam_sort] can't parse output format \"%s\"\n", fmtout);
- else fprintf(stderr, "[bam_sort] can't determine output format\n");
+ sort_usage(stderr);
ret = EXIT_FAILURE;
goto sort_end;
}
+
+ strcpy(modeout, "wb");
+ sam_open_mode(modeout+1, fnout, NULL);
if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9);
- if (tmpprefix == NULL) {
- fprintf(stderr, "[bam_sort] no prefix specified for temporary files (use -T option)\n");
+ if (tmpprefix.l == 0)
+ ksprintf(&tmpprefix, "%s.tmp", (nargs > 0)? argv[optind] : "STDIN");
+
+ ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-",
+ tmpprefix.s, fnout, modeout, max_mem, n_threads,
+ &ga.in, &ga.out);
+ if (ret >= 0)
+ ret = EXIT_SUCCESS;
+ else {
+ char dummy[4];
+ // If we failed on opening the input file & it has no .bam/.cram/etc
+ // extension, the user probably tried legacy -o <infile> <out.prefix>
+ if (ret == -2 && o_seen && nargs > 0 && sam_open_mode(dummy, argv[optind], NULL) < 0)
+ fprintf(stderr, "[bam_sort] Note the <out.prefix> argument has been replaced by -T/-o options\n");
+
ret = EXIT_FAILURE;
- goto sort_end;
}
- if (bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-", tmpprefix, fnout, modeout, max_mem, n_threads) < 0) ret = EXIT_FAILURE;
-
sort_end:
- free(fnout_buffer.s);
+ free(tmpprefix.s);
+ sam_global_args_free(&ga);
+
return ret;
}
diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c
index 33d7f5c..d486beb 100644
--- a/samtools/bam_sort.c.pysam.c
+++ b/samtools/bam_sort.c.pysam.c
@@ -2,10 +2,11 @@
/* bam_sort.c -- sorting and merging.
- Copyright (C) 2008-2014 Genome Research Ltd.
+ Copyright (C) 2008-2015 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
+ Author: Martin Pollard <mp15 at sanger.ac.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -31,14 +32,16 @@ DEALINGS IN THE SOFTWARE. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
-#include <regex.h>
#include <time.h>
#include <unistd.h>
+#include <getopt.h>
+#include <assert.h>
#include "htslib/ksort.h"
#include "htslib/khash.h"
#include "htslib/klist.h"
#include "htslib/kstring.h"
#include "htslib/sam.h"
+#include "sam_opts.h"
#if !defined(__DARWIN_C_LEVEL) || __DARWIN_C_LEVEL < 900000L
#define NEED_MEMSET_PATTERN4
@@ -59,10 +62,11 @@ void memset_pattern4(void *target, const void *pattern, size_t size) {
#endif
KHASH_INIT(c2c, char*, char*, 1, kh_str_hash_func, kh_str_hash_equal)
+KHASH_INIT(cset, char*, char, 0, kh_str_hash_func, kh_str_hash_equal)
KHASH_MAP_INIT_STR(c2i, int)
-#define __free_char(p)
-KLIST_INIT(hdrln, char*, __free_char)
+#define hdrln_free_char(p)
+KLIST_INIT(hdrln, char*, hdrln_free_char)
static int g_is_by_qname = 0;
@@ -113,6 +117,22 @@ static inline int heap_lt(const heap1_t a, const heap1_t b)
KSORT_INIT(heap, heap1_t, heap_lt)
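/*
 * [Editorial aside, not part of the upstream patch] merged_header_t carries
 * the state of the output header while it is being built: one kstring of
 * text per record type (@HD, @SQ, @RG, @PG, @CO), the growing output target
 * list, a name-to-tid map for @SQ records, the sets of @RG/@PG IDs already
 * claimed (used to detect collisions), and a flag recording whether an @HD
 * line has been seen yet.
 */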
+typedef struct merged_header {
+ kstring_t out_hd;
+ kstring_t out_sq;
+ kstring_t out_rg;
+ kstring_t out_pg;
+ kstring_t out_co;
+ char **target_name;
+ uint32_t *target_len;
+ size_t n_targets;
+ size_t targets_sz;
+ khash_t(c2i) *sq_tids;
+ khash_t(cset) *rg_ids;
+ khash_t(cset) *pg_ids;
+ bool have_hd;
+} merged_header_t;
+
typedef struct trans_tbl {
int32_t n_targets;
int* tid_trans;
@@ -121,18 +141,100 @@ typedef struct trans_tbl {
bool lost_coord_sort;
} trans_tbl_t;
+/* Something to look like a regmatch_t */
+typedef struct hdr_match {
+ ptrdiff_t rm_so;
+ ptrdiff_t rm_eo;
+} hdr_match_t;
+
+/*
+ * Search for header lines of a particular record type.
+ *
+ * This replaces a regex search for something like /^@SQ.*\tSN:([^\t]+).*$/
+ * but is much quicker. The locations found are returned in *matches,
+ * which has the same layout as a regmatch_t.
+ *
+ * rec is the record type to match (i.e. @HD, @SQ, @PG or @RG)
+ * tag is a tag type in the record to match (SN for @SQ, ID for @PG or @RG)
+ *
+ * The location of the record (if found) is returned in matches[0]
+ * If tag is not NULL, the record is searched for the presence of the
+ * given tag. If found, the location of the value is returned in matches[1].
+ * If the tag isn't found then the record is ignored and the search resumes
+ * on the next header line.
+ *
+ * For simplicity, some assumptions are made about rec and tag:
+ * rec should include the leading '@' sign and be three characters long.
+ * tag should be exactly two characters long.
+ * These are always string constants when this is called below, so we don't
+ * bother to check here.
+ *
+ * Returns 0 if a match was found, -1 if not.
+ */
+
+
+static int hdr_line_match(const char *text, const char *rec,
+ const char *tag, hdr_match_t *matches) {
+ const char *line_start, *line_end = text;
+ const char *tag_start, *tag_end;
+
+ for (;;) {
+ // Find record, ensure either at start of text or follows '\n'
+ line_start = strstr(line_end, rec);
+ while (line_start && line_start > text && *(line_start - 1) != '\n') {
+ line_start = strstr(line_start + 3, rec);
+ }
+ if (!line_start) return -1;
+
+ // Find end of header line
+ line_end = strchr(line_start, '\n');
+ if (!line_end) line_end = line_start + strlen(line_start);
+
+ matches[0].rm_so = line_start - text;
+ matches[0].rm_eo = line_end - text;
+ if (!tag) return 0; // Match found if not looking for tag.
+
+ for (tag_start = line_start + 3; tag_start < line_end; tag_start++) {
+ // Find possible tag start. Hacky but quick.
+ while (*tag_start > '\n') tag_start++;
+
+ // Check it
+ if (tag_start[0] == '\t'
+ && strncmp(tag_start + 1, tag, 2) == 0
+ && tag_start[3] == ':') {
+ // Found tag, record location and return.
+ tag_end = tag_start + 4;
+ while (*tag_end && *tag_end != '\t' && *tag_end != '\n')
+ ++tag_end;
+ matches[1].rm_so = tag_start - text + 4;
+ matches[1].rm_eo = tag_end - text;
+ return 0;
+ }
+ }
+ // Couldn't find tag, try again from end of current record.
+ }
+}
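/*
 * [Editorial aside, not part of the upstream patch] Sketch of the calling
 * pattern used throughout this file: walk every @RG line in a header text
 * and inspect its ID value. "hdr_text" is a hypothetical '\0'-terminated
 * SAM header string.
 */
static int example_count_rg_lines(const char *hdr_text)
{
    hdr_match_t m[2];
    const char *p = hdr_text;
    int n = 0;

    while (hdr_line_match(p, "@RG", "ID", m) == 0) {
        // m[0] spans the whole line, m[1] just the ID value.
        n++;
        p += m[0].rm_eo;    // resume the search after this line
    }
    return n;
}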
+
static void trans_tbl_destroy(trans_tbl_t *tbl) {
- free(tbl->tid_trans);
khiter_t iter;
+
+ free(tbl->tid_trans);
+
+ /*
+ * The values for the tbl->rg_trans and tbl->pg_trans hashes are pointers
+ * to keys in the rg_ids and pg_ids sets of the merged_header_t, so
+ * they should not be freed here.
+ *
+ * The keys are unique to each hash entry, so they do have to go.
+ */
+
for (iter = kh_begin(tbl->rg_trans); iter != kh_end(tbl->rg_trans); ++iter) {
if (kh_exist(tbl->rg_trans, iter)) {
- free(kh_value(tbl->rg_trans, iter));
free(kh_key(tbl->rg_trans, iter));
}
}
for (iter = kh_begin(tbl->pg_trans); iter != kh_end(tbl->pg_trans); ++iter) {
if (kh_exist(tbl->pg_trans, iter)) {
- free(kh_value(tbl->pg_trans, iter));
free(kh_key(tbl->pg_trans, iter));
}
}
@@ -141,347 +243,727 @@ static void trans_tbl_destroy(trans_tbl_t *tbl) {
kh_destroy(c2c,tbl->pg_trans);
}
-// Takes in existing header and rewrites it in the usual order HD, SQ, RG, PG CO, other
-static void pretty_header(char** text_in_out, int32_t text_len)
-{
- char* output, *output_pointer;
- output = output_pointer = (char*)calloc(1,text_len+1);
- output[text_len] = '\0';
-
- // Read @HD and write
- regex_t hd_regex, sq_regex, pg_regex, rg_regex, co_regex, other_regex;
- regmatch_t matches[1];
- if (regcomp( &hd_regex, "^@HD.*$", REG_EXTENDED|REG_NEWLINE ))
- abort();
- if (regexec( &hd_regex, *text_in_out, 1, &matches[0], 0 ) == 0) {
- size_t match_size = matches[0].rm_eo - matches[0].rm_so;
- memcpy(output_pointer, *text_in_out+matches[0].rm_so, match_size);
- output_pointer[match_size] = '\n';
- output_pointer += match_size + 1;
- }
- regfree(&hd_regex);
-
- // Read @SQ's and write
- if (regcomp( &sq_regex, "^@SQ.*$", REG_EXTENDED|REG_NEWLINE )) abort();
- char* sq_pointer = *text_in_out;
- while (*text_in_out+text_len > sq_pointer && regexec( &sq_regex, sq_pointer, 1, &matches[0], 0) == 0) {
- size_t match_size = matches[0].rm_eo - matches[0].rm_so;
- memcpy(output_pointer, sq_pointer+matches[0].rm_so, match_size);
- output_pointer[match_size] = '\n';
- output_pointer += match_size + 1;
- sq_pointer += matches[0].rm_eo + 1;
- }
- regfree(&sq_regex);
-
- // Read @RG's and write
- if (regcomp( &rg_regex, "^@RG.*$", REG_EXTENDED|REG_NEWLINE )) abort();
- char* rg_pointer = *text_in_out;
- while (*text_in_out+text_len > rg_pointer && regexec( &rg_regex, rg_pointer, 1, &matches[0], 0) == 0) {
- size_t match_size = matches[0].rm_eo - matches[0].rm_so;
- memcpy(output_pointer, rg_pointer+matches[0].rm_so, match_size);
- output_pointer[match_size] = '\n';
- output_pointer += match_size + 1;
- rg_pointer += matches[0].rm_eo + 1;
- }
- regfree(&rg_regex);
-
- // Read @PG's and write
- if (regcomp( &pg_regex, "^@PG.*$", REG_EXTENDED|REG_NEWLINE )) abort();
- char* pg_pointer = *text_in_out;
- while (*text_in_out+text_len > pg_pointer && regexec( &pg_regex, pg_pointer, 1, &matches[0], 0) == 0) {
- size_t match_size = matches[0].rm_eo - matches[0].rm_so;
- memcpy(output_pointer, pg_pointer+matches[0].rm_so, match_size);
- output_pointer[match_size] = '\n';
- output_pointer += match_size + 1;
- pg_pointer += matches[0].rm_eo + 1;
- }
- regfree(&pg_regex);
-
- // Read @CO's and write
- if (regcomp( &co_regex, "^@CO.*$", REG_EXTENDED|REG_NEWLINE )) abort();
- char* co_pointer = *text_in_out;
- while (*text_in_out+text_len > co_pointer && regexec( &co_regex, co_pointer, 1, &matches[0], 0) == 0) {
- size_t match_size = matches[0].rm_eo - matches[0].rm_so;
- memcpy(output_pointer, co_pointer+matches[0].rm_so, match_size);
- output_pointer[match_size] = '\n';
- output_pointer += match_size + 1;
- co_pointer += matches[0].rm_eo + 1;
- }
- regfree(&co_regex);
-
- // Read any other not HD,SQ,RG,PG,CO tags and write
- if (regcomp( &other_regex, "^@([^HSCPR]|H[^D]|S[^Q]|[PR][^G]|C[^O]).*$", REG_EXTENDED|REG_NEWLINE )) abort();
- char* other_pointer = *text_in_out;
- while (*text_in_out+text_len > other_pointer && regexec( &other_regex, other_pointer, 1, &matches[0], 0) == 0) {
- size_t match_size = matches[0].rm_eo - matches[0].rm_so;
- memcpy(output_pointer, other_pointer+matches[0].rm_so, match_size);
- output_pointer[match_size] = '\n';
- output_pointer += match_size + 1;
- other_pointer += matches[0].rm_eo + 1;
- }
- regfree(&other_regex);
+/*
+ * Create a merged_header_t struct.
+ */
- // Safety check, make sure we copied it all, if we didn't something is wrong with the header
- if ( output+text_len != output_pointer ) {
- fprintf(pysamerr, "[pretty_header] invalid header\n");
- exit(1);
+static merged_header_t * init_merged_header() {
+ merged_header_t *merged_hdr;
+
+ merged_hdr = calloc(1, sizeof(*merged_hdr));
+ if (merged_hdr == NULL) return NULL;
+
+ merged_hdr->targets_sz = 16;
+ merged_hdr->target_name = malloc(merged_hdr->targets_sz
+ * sizeof(*merged_hdr->target_name));
+ if (NULL == merged_hdr->target_name) goto fail;
+
+ merged_hdr->target_len = malloc(merged_hdr->targets_sz
+ * sizeof(*merged_hdr->target_len));
+ if (NULL == merged_hdr->target_len) goto fail;
+
+ merged_hdr->sq_tids = kh_init(c2i);
+ if (merged_hdr->sq_tids == NULL) goto fail;
+
+ merged_hdr->rg_ids = kh_init(cset);
+ if (merged_hdr->rg_ids == NULL) goto fail;
+
+ merged_hdr->pg_ids = kh_init(cset);
+ if (merged_hdr->pg_ids == NULL) goto fail;
+
+ return merged_hdr;
+
+ fail:
+ perror("[init_merged_header]");
+ kh_destroy(cset, merged_hdr->pg_ids);
+ kh_destroy(cset, merged_hdr->rg_ids);
+ kh_destroy(c2i, merged_hdr->sq_tids);
+ free(merged_hdr->target_name);
+ free(merged_hdr->target_len);
+ free(merged_hdr);
+ return NULL;
+}
+
+/* Some handy kstring manipulating functions */
+
+// Append char range to kstring
+static inline int range_to_ks(const char *src, int from, int to,
+ kstring_t *dest) {
+ return kputsn(src + from, to - from, dest) != to - from;
+}
+
+// Append a header line match to kstring
+static inline int match_to_ks(const char *src, const hdr_match_t *match,
+ kstring_t *dest) {
+ return range_to_ks(src, match->rm_so, match->rm_eo, dest);
+}
+
+// Append a kstring to a kstring
+static inline int ks_to_ks(kstring_t *src, kstring_t *dest) {
+ return kputsn(ks_str(src), ks_len(src), dest) != ks_len(src);
+}
+
+/*
+ * Generate a unique ID by appending a random suffix to a given prefix.
+ * existing_ids is the set of IDs that are already in use.
+ * If always_add_suffix is true, the suffix will always be included.
+ * If false, prefix will be returned unchanged if it isn't in existing_ids.
+ */
+
+static int gen_unique_id(char *prefix, khash_t(cset) *existing_ids,
+ bool always_add_suffix, kstring_t *dest) {
+ khiter_t iter;
+
+ if (!always_add_suffix) {
+ // Try prefix on its own first
+ iter = kh_get(cset, existing_ids, prefix);
+ if (iter == kh_end(existing_ids)) { // prefix isn't used yet
+ dest->l = 0;
+ if (kputs(prefix, dest) == EOF) return -1;
+ return 0;
+ }
}
- free(*text_in_out);
- *text_in_out = output;
+
+ do {
+ dest->l = 0;
+ ksprintf(dest, "%s-%0lX", prefix, lrand48());
+ iter = kh_get(cset, existing_ids, ks_str(dest));
+ } while (iter != kh_end(existing_ids));
+
+ return 0;
}
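+
+/*
+ * Usage sketch (hypothetical names; the suffix comes from lrand48() so the
+ * exact output will vary): with a khash_t(cset) *ids already holding the
+ * key "lane1",
+ *
+ *   kstring_t uid = { 0, 0, NULL };
+ *   gen_unique_id("lane1", ids, false, &uid);
+ *
+ * leaves something like "lane1-2B9F4C1A" in uid, whereas an unused prefix
+ * would be returned unchanged.
+ */
+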
-static void trans_tbl_init(bam_hdr_t* out, bam_hdr_t* translate, trans_tbl_t* tbl, bool merge_rg, bool merge_pg)
-{
- tbl->n_targets = translate->n_targets;
- tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int));
- tbl->rg_trans = kh_init(c2c);
- tbl->pg_trans = kh_init(c2c);
- if (!tbl->tid_trans || !tbl->rg_trans || !tbl->pg_trans) { perror("out of memory"); exit(-1); }
+/*
+ * Add the @HD line to the new header
+ * In practice the @HD line will come from the first input header.
+ */
- int32_t out_len = out->l_text;
- while (out_len > 0 && out->text[out_len-1] == '\n') {--out_len; } // strip trailing \n's
- kstring_t out_text = { 0, 0, NULL };
- kputsn(out->text, out_len, &out_text);
+static int trans_tbl_add_hd(merged_header_t* merged_hdr,
+ bam_hdr_t *translate) {
+ hdr_match_t match = {0, 0};
- int i, min_tid = -1;
- tbl->lost_coord_sort = false;
+ // TODO: handle case when @HD needs merging.
+ if (merged_hdr->have_hd) return 0;
- khash_t(c2i) *out_tid = kh_init(c2i);
- for (i = 0; i < out->n_targets; ++i) {
- int ret;
- khiter_t iter = kh_put(c2i, out_tid, out->target_name[i], &ret);
- if (ret <= 0) abort();
- kh_value(out_tid, iter) = i;
+ if (hdr_line_match(translate->text, "@HD", NULL, &match) != 0) {
+ return 0;
}
+ if (match_to_ks(translate->text, &match, &merged_hdr->out_hd)) goto memfail;
+ if (kputc('\n', &merged_hdr->out_hd) == EOF) goto memfail;
+ merged_hdr->have_hd = true;
+
+ return 0;
+
+ memfail:
+ perror(__func__);
+ return -1;
+}
+
+static inline int grow_target_list(merged_header_t* merged_hdr) {
+ size_t new_size;
+ char **new_names;
+ uint32_t *new_len;
+
+ new_size = merged_hdr->targets_sz * 2;
+ new_names = realloc(merged_hdr->target_name, sizeof(*new_names) * new_size);
+ if (!new_names) goto fail;
+ merged_hdr->target_name = new_names;
+
+ new_len = realloc(merged_hdr->target_len, sizeof(*new_len) * new_size);
+ if (!new_len) goto fail;
+ merged_hdr->target_len = new_len;
+
+ merged_hdr->targets_sz = new_size;
+
+ return 0;
+
+ fail:
+ perror(__func__);
+ return -1;
+}
+
+/*
+ * Add @SQ records to the translation table.
+ *
+ * Go through the target list for the input header. Any new targets found
+ * are added to the output header target list. At the same time, a mapping
+ * from the input to output target ids is stored in tbl.
+ *
+ * If any new targets are found, the header text is scanned to find the
+ * corresponding @SQ records. They are then copied into the
+ * merged_hdr->out_text kstring (which will eventually become the
+ * output header text).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+
+static int trans_tbl_add_sq(merged_header_t* merged_hdr, bam_hdr_t *translate,
+ trans_tbl_t* tbl) {
+
+ kstring_t *out_text = &merged_hdr->out_sq;
+ khash_t(c2i)* sq_tids = merged_hdr->sq_tids;
+ hdr_match_t *new_sq_matches = NULL;
+ char *text;
+ hdr_match_t matches[2];
+ int32_t i, missing;
+ int32_t old_n_targets = merged_hdr->n_targets;
+ khiter_t iter;
+ int min_tid = -1;
+
+ // Fill in the tid part of the translation table, adding new targets
+ // to the merged header as we go.
+
for (i = 0; i < translate->n_targets; ++i) {
- khiter_t iter = kh_get(c2i, out_tid, translate->target_name[i]);
-
- if (iter == kh_end(out_tid)) { // Append missing entries to out
- tbl->tid_trans[i] = out->n_targets++;
- out->target_name = (char**)realloc(out->target_name, sizeof(char*)*out->n_targets);
- out->target_name[out->n_targets-1] = strdup(translate->target_name[i]);
- out->target_len = (uint32_t*)realloc(out->target_len, sizeof(uint32_t)*out->n_targets);
- out->target_len[out->n_targets-1] = translate->target_len[i];
- // grep line with regex '^@SQ.*\tSN:%s(\t.*$|$)', translate->target_name[i]
- // from translate->text
- regex_t sq_id;
- regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t));
- if (matches == NULL) { perror("out of memory"); exit(-1); }
- kstring_t seq_regex = { 0, 0, NULL };
- ksprintf(&seq_regex, "^@SQ.*\tSN:%s(\t.*$|$)", translate->target_name[i]);
- regcomp(&sq_id, seq_regex.s, REG_EXTENDED|REG_NEWLINE);
- free(seq_regex.s);
- if (regexec(&sq_id, translate->text, 1, matches, 0) != 0)
- {
- fprintf(pysamerr, "[trans_tbl_init] @SQ SN (%s) found in binary header but not text header.\n",translate->target_name[i]);
- exit(1);
+
+ // Check if it's a new target.
+ iter = kh_get(c2i, sq_tids, translate->target_name[i]);
+
+ if (iter == kh_end(sq_tids)) {
+ int ret;
+ // Append missing entries to out_hdr
+
+ if (merged_hdr->n_targets == merged_hdr->targets_sz) {
+ if (grow_target_list(merged_hdr)) goto fail;
}
- regfree(&sq_id);
- // Produce our output line and append it to out_text
- kputc('\n', &out_text);
- kputsn(translate->text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &out_text);
+ merged_hdr->target_name[merged_hdr->n_targets] = strdup(translate->target_name[i]);
+ if (merged_hdr->target_name[merged_hdr->n_targets] == NULL) goto memfail;
+ merged_hdr->target_len[merged_hdr->n_targets] = translate->target_len[i];
+
+ // Record the new identifier for reference below,
+ // and when building the ttable for other inputs.
+ iter = kh_put(c2i, sq_tids,
+ merged_hdr->target_name[merged_hdr->n_targets], &ret);
+ if (ret < 0) {
+ free(merged_hdr->target_name[merged_hdr->n_targets]);
+ goto memfail;
+ }
+ assert(ret > 0); // Should not be in hash already.
- free(matches);
+ kh_value(sq_tids, iter) = merged_hdr->n_targets;
+ tbl->tid_trans[i] = merged_hdr->n_targets++;
} else {
- tbl->tid_trans[i] = kh_value(out_tid, iter);
+ tbl->tid_trans[i] = kh_value(sq_tids, iter);
}
+
if (tbl->tid_trans[i] > min_tid) {
min_tid = tbl->tid_trans[i];
} else {
tbl->lost_coord_sort = true;
}
}
- kh_destroy(c2i, out_tid);
-
- // grep @RG id's
- regex_t rg_id;
- regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t));
- if (matches == NULL) { perror("out of memory"); exit(-1); }
- regcomp(&rg_id, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
- char* text = translate->text;
- klist_t(hdrln) *rg_list = kl_init(hdrln);
- while(1) { // foreach rg id in translate's header
- if (regexec(&rg_id, text, 2, matches, 0) != 0) break;
- // matches[0] is the whole @RG line; matches[1] is the ID field value
- kstring_t match_id = { 0, 0, NULL };
- kputsn(text+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &match_id);
-
- // is our matched ID in our output list already
- regex_t rg_id_search;
- kstring_t rg_regex = { 0, 0, NULL };
- ksprintf(&rg_regex, "^@RG.*\tID:%s(\t.*$|$)", match_id.s);
- regcomp(&rg_id_search, rg_regex.s, REG_EXTENDED|REG_NEWLINE|REG_NOSUB);
- free(rg_regex.s);
- kstring_t transformed_id = { 0, 0, NULL };
- bool transformed_equals_match;
- if (regexec(&rg_id_search, out->text, 0, NULL, 0) != 0 || merge_rg) {
- // Not in there so can add it as 1-1 mapping
- kputs(match_id.s, &transformed_id);
- transformed_equals_match = true;
- } else {
- // It's in there so we need to transform it by appending random number to id
- ksprintf(&transformed_id, "%s-%0lX", match_id.s, lrand48());
- transformed_equals_match = false;
+
+ if (merged_hdr->n_targets == old_n_targets)
+ return 0; // Everything done if no new targets.
+
+ // Otherwise, find @SQ lines in translate->text for all newly added targets.
+
+ new_sq_matches = malloc((merged_hdr->n_targets - old_n_targets)
+ * sizeof(*new_sq_matches));
+ if (new_sq_matches == NULL) goto memfail;
+
+ for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) {
+ new_sq_matches[i].rm_so = new_sq_matches[i].rm_eo = -1;
+ }
+
+ text = translate->text;
+ while (hdr_line_match(text, "@SQ", "SN", matches) == 0) {
+ // matches[0] is whole line, matches[1] is SN value.
+
+ // This is a bit disgusting, but avoids a copy...
+ char c = text[matches[1].rm_eo];
+ int idx;
+
+ text[matches[1].rm_eo] = '\0';
+
+ // Look up the SN value in the sq_tids hash.
+ iter = kh_get(c2i, sq_tids, text + matches[1].rm_so);
+ text[matches[1].rm_eo] = c; // restore text
+
+ if (iter == kh_end(sq_tids)) {
+ // Warn about this, but it's not really fatal.
+ fprintf(pysamerr, "[W::%s] @SQ SN (%.*s) found in text header but not binary header.\n",
+ __func__,
+ (int) (matches[1].rm_eo - matches[1].rm_so),
+ text + matches[1].rm_so);
+ text += matches[0].rm_eo;
+ continue; // Skip to next
}
- regfree(&rg_id_search);
- // Insert it into our translation map
- int in_there = 0;
- khiter_t iter = kh_put(c2c, tbl->rg_trans, ks_release(&match_id), &in_there);
- char *transformed_id_s = ks_release(&transformed_id);
- kh_value(tbl->rg_trans,iter) = transformed_id_s;
- // take matched line and replace ID with transformed_id
- kstring_t transformed_line = { 0, 0, NULL };
- if (transformed_equals_match) {
- kputsn(text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &transformed_line);
+ idx = kh_value(sq_tids, iter);
+ if (idx >= old_n_targets) {
+ // is a new SQ, so record position so we can add it to out_text.
+ assert(idx < merged_hdr->n_targets);
+ ptrdiff_t off = text - translate->text;
+ new_sq_matches[idx - old_n_targets].rm_so = matches[0].rm_so + off;
+ new_sq_matches[idx - old_n_targets].rm_eo = matches[0].rm_eo + off;
+ }
+
+ // Carry on searching from end of current match
+ text += matches[0].rm_eo;
+ }
+
+ // Check if any new targets have been missed
+ missing = 0;
+ for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) {
+ if (new_sq_matches[i].rm_so >= 0) {
+ if (match_to_ks(translate->text, &new_sq_matches[i], out_text))
+ goto memfail;
+ if (kputc('\n', out_text) == EOF) goto memfail;
} else {
- kputsn(text+matches[0].rm_so, matches[1].rm_so-matches[0].rm_so, &transformed_line);
- kputs(transformed_id_s, &transformed_line);
- kputsn(text+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
+ fprintf(pysamerr, "[E::%s] @SQ SN (%s) found in binary header but not text header.\n",
+ __func__, merged_hdr->target_name[i + old_n_targets]);
+ missing++;
}
+ }
+ if (missing) goto fail;
+
+ free(new_sq_matches);
+ return 0;
- if (!(transformed_equals_match && merge_rg)) {
- // append line to linked list for PG processing
- char** ln = kl_pushp(hdrln, rg_list);
- *ln = ks_release(&transformed_line); // Give away to linked list
+ memfail:
+ perror(__func__);
+ fail:
+ free(new_sq_matches);
+ return -1;
+}
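+
+/*
+ * Worked example (hypothetical data): if the merged header already holds
+ * targets {"chr1" -> 0, "chr2" -> 1} and the input header lists
+ * {"chr2", "chr3"}, this fills tbl->tid_trans with {1, 2}, assigns "chr3"
+ * output tid 2 and copies the input's "@SQ ... SN:chr3" line into
+ * merged_hdr->out_sq. Had the input order been {"chr2", "chr1"} instead,
+ * tid_trans would be {1, 0} and lost_coord_sort would be set, because the
+ * mapping is no longer monotonic.
+ */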
+
+/*
+ * Common code for setting up RG and PG record ID tag translation.
+ *
+ * is_rg is true for RG translation, false for PG.
+ * translate is the input bam header
+ * merge is true if tags with the same ID are to be merged.
+ * known_ids is the set of IDs already in the output header.
+ * id_map is the translation map from input header IDs to output header IDs
+ * If override is set, it will be used to replace the existing ID (RG only)
+ *
+ * known_ids and id_map have entries for the new IDs added to them.
+ *
+ * Return value is a linked list of header lines with the translated IDs,
+ * or NULL if something went wrong (probably out of memory).
+ *
+ */
+
+static klist_t(hdrln) * trans_rg_pg(bool is_rg, bam_hdr_t *translate,
+ bool merge, khash_t(cset)* known_ids,
+ khash_t(c2c)* id_map, char *override) {
+ hdr_match_t matches[2];
+ khiter_t iter;
+ const char *text = translate->text;
+ const char *rec_type = is_rg ? "@RG" : "@PG";
+ klist_t(hdrln) *hdr_lines;
+
+ hdr_lines = kl_init(hdrln);
+
+ // Search through translate's header
+ while (hdr_line_match(text, rec_type, "ID", matches) == 0) {
+ // matches[0] is the whole @RG/PG line; matches[1] is the ID field value
+
+ kstring_t orig_id = { 0, 0, NULL }; // ID in original header
+ kstring_t transformed_id = { 0, 0, NULL }; // ID in output header
+ char *map_value; // Value to store in id_map
+ bool id_changed; // Have we changed the ID?
+ bool not_found_in_output; // ID isn't in the output header (yet)
+
+ // Take a copy of the ID as we'll need it for a hash key.
+ if (match_to_ks(text, &matches[1], &orig_id)) goto memfail;
+
+ // is our matched ID in our output ID set already?
+ iter = kh_get(cset, known_ids, ks_str(&orig_id));
+ not_found_in_output = (iter == kh_end(known_ids));
+
+ if (override) {
+ // Override original ID (RG only)
+#ifdef OVERRIDE_DOES_NOT_MERGE
+ if (gen_unique_id(override, known_ids, false, &transformed_id))
+ goto memfail;
+ not_found_in_output = true; // As ID now unique
+#else
+ if (kputs(override, &transformed_id) == EOF) goto memfail;
+ // Know about override already?
+ iter = kh_get(cset, known_ids, ks_str(&transformed_id));
+ not_found_in_output = (iter == kh_end(known_ids));
+#endif
+ id_changed = true;
+ } else {
+ if ( not_found_in_output || merge) {
+ // Not in there or merging so can add it as 1-1 mapping
+ if (ks_to_ks(&orig_id, &transformed_id)) goto memfail;
+ id_changed = false;
+ } else {
+ // It's in there so we need to transform it by appending
+ // a random number to the id
+ if (gen_unique_id(ks_str(&orig_id), known_ids,
+ true, &transformed_id))
+ goto memfail;
+ id_changed = true;
+ not_found_in_output = true; // As ID now unique
+ }
}
- else free(transformed_line.s);
- text += matches[0].rm_eo; // next!
- }
- regfree(&rg_id);
+ // Does this line need to go into our output header?
+ if (not_found_in_output) {
- // Do same for PG id's
- regex_t pg_id;
- regcomp(&pg_id, "^@PG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
- text = translate->text;
- klist_t(hdrln) *pg_list = kl_init(hdrln);
- while(1) { // foreach pg id in translate's header
- if (regexec(&pg_id, text, 2, matches, 0) != 0) break;
- kstring_t match_id = { 0, 0, NULL };
- kputsn(text+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &match_id);
-
- // is our matched ID in our output list already
- regex_t pg_id_search;
- kstring_t pg_regex = { 0, 0, NULL };
- ksprintf(&pg_regex, "^@PG.*\tID:%s(\t.*$|$)", match_id.s);
- regcomp(&pg_id_search, pg_regex.s, REG_EXTENDED|REG_NEWLINE|REG_NOSUB);
- free(pg_regex.s);
- kstring_t transformed_id = { 0, 0, NULL };
- bool transformed_equals_match;
- if (regexec(&pg_id_search, out->text, 0, NULL, 0) != 0 || merge_pg) {
- // Not in there so can add it as 1-1 mapping
- kputs(match_id.s, &transformed_id);
- transformed_equals_match = true;
+ // Take matched line and replace ID with transformed_id
+ kstring_t new_hdr_line = { 0, 0, NULL };
+
+ if (!id_changed) { // Can just copy
+ if (match_to_ks(text, &matches[0], &new_hdr_line)) goto memfail;
+ } else { // Substitute new name for original
+ if (range_to_ks(text, matches[0].rm_so, matches[1].rm_so,
+ &new_hdr_line)) goto memfail;
+ if (ks_to_ks(&transformed_id, &new_hdr_line)) goto memfail;
+ if (range_to_ks(text, matches[1].rm_eo, matches[0].rm_eo,
+ &new_hdr_line)) goto memfail;
+ }
+
+ // append line to output linked list
+ char** ln = kl_pushp(hdrln, hdr_lines);
+ *ln = ks_release(&new_hdr_line); // Give away to linked list
+
+ // Need to add it to known_ids set
+ int in_there = 0;
+ iter = kh_put(cset, known_ids, ks_str(&transformed_id), &in_there);
+ if (in_there < 0) goto memfail;
+ assert(in_there > 0); // Should not already be in the map
+ map_value = ks_release(&transformed_id);
} else {
- // It's in there so we need to transform it by appending random number to id
- ksprintf(&transformed_id, "%s-%0lX", match_id.s, lrand48());
- transformed_equals_match = false;
+ // Use existing string in id_map
+ assert(kh_exist(known_ids, iter));
+ map_value = kh_key(known_ids, iter);
+ free(ks_release(&transformed_id));
}
- regfree(&pg_id_search);
// Insert it into our translation map
int in_there = 0;
- khiter_t iter = kh_put(c2c, tbl->pg_trans, ks_release(&match_id), &in_there);
- char *transformed_id_s = ks_release(&transformed_id);
- kh_value(tbl->pg_trans,iter) = transformed_id_s;
- // take matched line and replace ID with transformed_id
- kstring_t transformed_line = { 0, 0, NULL };
- if (transformed_equals_match) {
- kputsn(text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &transformed_line);
- } else {
- kputsn(text+matches[0].rm_so, matches[1].rm_so-matches[0].rm_so, &transformed_line);
- kputs(transformed_id_s, &transformed_line);
- kputsn(text+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
+ iter = kh_put(c2c, id_map, ks_release(&orig_id), &in_there);
+ kh_value(id_map, iter) = map_value;
+
+ text += matches[0].rm_eo; // next!
+ }
+
+ // If there are no RG lines in the file and we are overriding, add one
+ if (is_rg && override && kl_begin(hdr_lines) == NULL) {
+ kstring_t new_id = {0, 0, NULL};
+ kstring_t line = {0, 0, NULL};
+ kstring_t empty = {0, 0, NULL};
+ int in_there = 0;
+ char** ln;
+
+ // Get the new ID
+ if (gen_unique_id(override, known_ids, false, &new_id))
+ goto memfail;
+
+ // Make into a header line and add to linked list
+ ksprintf(&line, "@RG\tID:%s", ks_str(&new_id));
+ ln = kl_pushp(hdrln, hdr_lines);
+ *ln = ks_release(&line);
+
+ // Put into known_ids set
+ iter = kh_put(cset, known_ids, ks_str(&new_id), &in_there);
+ if (in_there < 0) goto memfail;
+ assert(in_there > 0); // Should be a new entry
+
+ // Put into translation map (key is empty string)
+ if (kputs("", &empty) == EOF) goto memfail;
+ iter = kh_put(c2c, id_map, ks_release(&empty), &in_there);
+ if (in_there < 0) goto memfail;
+ assert(in_there > 0); // Should be a new entry
+ kh_value(id_map, iter) = ks_release(&new_id);
+ }
+
+ return hdr_lines;
+
+ memfail:
+ perror(__func__);
+ if (hdr_lines) kl_destroy(hdrln, hdr_lines);
+ return NULL;
+}
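+
+/*
+ * Example of the ID rewriting (hypothetical lines; the random suffix will
+ * differ in practice): with merge == false and an input line
+ *
+ *   @RG  ID:sample1  SM:s1        (tab-separated)
+ *
+ * whose ID is already present in known_ids, the line pushed onto the
+ * returned list becomes "@RG  ID:sample1-3F2A9D  SM:s1" and id_map gains
+ * the entry "sample1" -> "sample1-3F2A9D".
+ */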
+
+/*
+ * Common code for completing RG and PG record translation.
+ *
+ * Input is a list of header lines, and the mapping from input to
+ * output @PG record IDs.
+ *
+ * RG and PG records can contain tags that cross-reference to other @PG
+ * records. This fixes the tags to contain the new IDs before adding
+ * them to the output header text.
+ */
+
+static int finish_rg_pg(bool is_rg, klist_t(hdrln) *hdr_lines,
+ khash_t(c2c)* pg_map, kstring_t *out_text) {
+ const char *search = is_rg ? "\tPG:" : "\tPP:";
+ khiter_t idx;
+ char *line = NULL;
+
+ while ((kl_shift(hdrln, hdr_lines, &line)) == 0) {
+ char *id = strstr(line, search); // Look for tag to fix
+ int pos1 = 0, pos2 = 0;
+ char *new_id = NULL;
+
+ if (id) {
+ // Found a tag. Look up the value in the translation map
+ // to see what it should be changed to in the output file.
+ char *end, tmp;
+
+ id += 4; // Point to value
+ end = strchr(id, '\t'); // Find end of tag
+ if (!end) end = id + strlen(id);
+
+ tmp = *end;
+ *end = '\0'; // Temporarily get the value on its own.
+
+ // Look-up in translation table
+ idx = kh_get(c2c, pg_map, id);
+ if (idx == kh_end(pg_map)) {
+ // Not found, warn.
+ fprintf(pysamerr, "[W::%s] Tag %s%s not found in @PG records\n",
+ __func__, search + 1, id);
+ } else {
+ // Remember new id and splice points on original string
+ new_id = kh_value(pg_map, idx);
+ pos1 = id - line;
+ pos2 = end - line;
+ }
+
+ *end = tmp; // Restore string
}
- if (!(transformed_equals_match && merge_pg)) {
- // append line to linked list for PP processing
- char** ln = kl_pushp(hdrln, pg_list);
- *ln = ks_release(&transformed_line); // Give away to linked list
+ // Copy line to output:
+ // line[0..pos1), new_id (if not NULL), line[pos2..end), '\n'
+
+ if (pos1 && range_to_ks(line, 0, pos1, out_text)) goto memfail;
+ if (new_id && kputs(new_id, out_text) == EOF) goto memfail;
+ if (kputs(line + pos2, out_text) == EOF) goto memfail;
+ if (kputc('\n', out_text) == EOF) goto memfail;
+ free(line); // No longer needed
+ line = NULL;
+ }
+
+ return 0;
+
+ memfail:
+ perror(__func__);
+ free(line); // Prevent leakage as no longer on list
+ return -1;
+}
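+
+/*
+ * Example (hypothetical lines): if pg_map maps "bwa" -> "bwa-1A2B", a
+ * queued "@PG  ID:dedup-7C  PN:markdup  PP:bwa" line is written to
+ * out_text as "@PG  ID:dedup-7C  PN:markdup  PP:bwa-1A2B" plus a newline;
+ * lines without a PP: (or PG: for @RG) tag are copied through unchanged.
+ */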
+
+/*
+ * Build the translation table for an input *am file. This stores mappings
+ * which allow IDs to be converted from those used in the input file
+ * to the ones which will be used in the output. The mappings are for:
+ * Reference sequence IDs (for @SQ records)
+ * @RG record ID tags
+ * @PG record ID tags
+ *
+ * At the same time, new header text is built up by copying records
+ * from the input bam file. This will eventually become the header for
+ * the output file. When copied, the ID tags for @RG and @PG records
+ * are replaced with their translated values. The @PG PP: and @RG PG: tags
+ * are also modified if necessary.
+ *
+ * merged_hdr holds state on the output header (which IDs are present, etc.)
+ * translate is the input header
+ * tbl is the translation table that gets filled in.
+ * merge_rg controls merging of @RG records
+ * merge_pg controls merging of @PG records
+ * If rg_override is not NULL, it will be used to replace the existing @RG ID
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+
+static int trans_tbl_init(merged_header_t* merged_hdr, bam_hdr_t* translate,
+ trans_tbl_t* tbl, bool merge_rg, bool merge_pg,
+ char* rg_override)
+{
+ klist_t(hdrln) *rg_list = NULL;
+ klist_t(hdrln) *pg_list = NULL;
+
+ tbl->n_targets = translate->n_targets;
+ tbl->rg_trans = tbl->pg_trans = NULL;
+ tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int));
+ if (tbl->tid_trans == NULL) goto memfail;
+ tbl->rg_trans = kh_init(c2c);
+ if (tbl->rg_trans == NULL) goto memfail;
+ tbl->pg_trans = kh_init(c2c);
+ if (tbl->pg_trans == NULL) goto memfail;
+
+ tbl->lost_coord_sort = false;
+
+ // Get the @HD record (if not there already).
+ if (trans_tbl_add_hd(merged_hdr, translate)) goto fail;
+
+ // Fill in map and add header lines for @SQ records
+ if (trans_tbl_add_sq(merged_hdr, translate, tbl)) goto fail;
+
+ // Get translated header lines and fill in map for @RG records
+ rg_list = trans_rg_pg(true, translate, merge_rg, merged_hdr->rg_ids,
+ tbl->rg_trans, rg_override);
+ if (!rg_list) goto fail;
+
+ // Get translated header lines and fill in map for @PG records
+ pg_list = trans_rg_pg(false, translate, merge_pg, merged_hdr->pg_ids,
+ tbl->pg_trans, NULL);
+ if (!pg_list) goto fail;
+
+ // Fix-up PG: tags in the new @RG records and add to output
+ if (finish_rg_pg(true, rg_list, tbl->pg_trans, &merged_hdr->out_rg))
+ goto fail;
+
+ // Fix-up PP: tags in the new @PG records and add to output
+ if (finish_rg_pg(false, pg_list, tbl->pg_trans, &merged_hdr->out_pg))
+ goto fail;
+
+ kl_destroy(hdrln, rg_list); rg_list = NULL;
+ kl_destroy(hdrln, pg_list); pg_list = NULL;
+
+ // Just append @CO headers without translation
+ const char *line, *end_pointer;
+ for (line = translate->text; *line; line = end_pointer + 1) {
+ end_pointer = strchr(line, '\n');
+ if (strncmp(line, "@CO", 3) == 0) {
+ if (end_pointer) {
+ if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF)
+ goto memfail;
+ } else { // Last line with no trailing '\n'
+ if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail;
+ if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail;
+ }
}
- else free(transformed_line.s);
- text += matches[0].rm_eo; // next!
+ if (end_pointer == NULL) break;
+ }
+
+ return 0;
+
+ memfail:
+ perror(__func__);
+ fail:
+ trans_tbl_destroy(tbl);
+ if (rg_list) kl_destroy(hdrln, rg_list);
+ if (pg_list) kl_destroy(hdrln, pg_list);
+ return -1;
+}
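+
+/*
+ * Typical call pattern, mirroring bam_merge_core2() below (sketch only;
+ * the variable names are illustrative):
+ *
+ *   merged_header_t *mh = init_merged_header();
+ *   for (i = 0; i < n; i++)
+ *       trans_tbl_init(mh, in_hdr[i], &tbl[i], merge_rg, merge_pg, NULL);
+ *   bam_hdr_t *hout = finish_merged_header(mh);
+ *   // ... bam_translate() each record against tbl[i] while merging,
+ *   // then trans_tbl_destroy(&tbl[i]) and finally free_merged_header(mh).
+ */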
+
+static inline void move_kstr_to_text(char **text, kstring_t *ks) {
+ memcpy(*text, ks_str(ks), ks_len(ks));
+ *text += ks_len(ks);
+ **text = '\0';
+ free(ks_release(ks));
+}
+
+/*
+ * Populate a bam_hdr_t struct from data in a merged_header_t.
+ */
+
+static bam_hdr_t * finish_merged_header(merged_header_t *merged_hdr) {
+ size_t txt_sz;
+ char *text;
+ bam_hdr_t *hdr;
+
+ // Check output text size
+ txt_sz = (ks_len(&merged_hdr->out_hd)
+ + ks_len(&merged_hdr->out_sq)
+ + ks_len(&merged_hdr->out_rg)
+ + ks_len(&merged_hdr->out_pg)
+ + ks_len(&merged_hdr->out_co));
+ if (txt_sz >= INT32_MAX) {
+ fprintf(pysamerr, "[%s] Output header text too long\n", __func__);
+ return NULL;
}
- regfree(&pg_id);
- // need to translate PP's on the fly in second pass because they may not be in correct order and need complete tbl->pg_trans to do this
- // for each line {
- // with ID replaced with tranformed_id and PP's transformed using the translation table
- // }
- regex_t pg_pp;
- regcomp(&pg_pp, "^@PG.*\tPP:([!-)+-<>-~][!-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
- kliter_t(hdrln) *iter = kl_begin(pg_list);
- while (iter != kl_end(pg_list)) {
- char* data = kl_val(iter);
-
- kstring_t transformed_line = { 0, 0, NULL };
- // Find PP tag
- if (regexec(&pg_pp, data, 2, matches, 0) == 0) {
- // Lookup in hash table
- kstring_t pp_id = { 0, 0, NULL };
- kputsn(data+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &pp_id);
-
- khiter_t k = kh_get(c2c, tbl->pg_trans, pp_id.s);
- free(pp_id.s);
- char* transformed_id = kh_value(tbl->pg_trans,k);
- // Replace
- kputsn(data, matches[1].rm_so-matches[0].rm_so, &transformed_line);
- kputs(transformed_id, &transformed_line);
- kputsn(data+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
- } else { kputs(data, &transformed_line); }
- // Produce our output line and append it to out_text
- kputc('\n', &out_text);
- kputsn(transformed_line.s, transformed_line.l, &out_text);
-
- free(transformed_line.s);
- free(data);
- iter = kl_next(iter);
+
+ // Allocate new header
+ hdr = bam_hdr_init();
+ if (hdr == NULL) goto memfail;
+
+ // Transfer targets arrays to new header
+ hdr->n_targets = merged_hdr->n_targets;
+ if (hdr->n_targets > 0) {
+ // Try to shrink targets arrays to correct size
+ hdr->target_name = realloc(merged_hdr->target_name,
+ hdr->n_targets * sizeof(char*));
+ if (!hdr->target_name) hdr->target_name = merged_hdr->target_name;
+
+ hdr->target_len = realloc(merged_hdr->target_len,
+ hdr->n_targets * sizeof(uint32_t));
+ if (!hdr->target_len) hdr->target_len = merged_hdr->target_len;
+
+ // These have either been freed by realloc() or, in the unlikely
+ // event that it failed, have had their ownership transferred to hdr
+ merged_hdr->target_name = NULL;
+ merged_hdr->target_len = NULL;
}
- regfree(&pg_pp);
-
- // Need to also translate @RG PG's on the fly too
- regex_t rg_pg;
- regcomp(&rg_pg, "^@RG.*\tPG:([!-)+-<>-~][!-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
- kliter_t(hdrln) *rg_iter = kl_begin(rg_list);
- while (rg_iter != kl_end(rg_list)) {
- char* data = kl_val(rg_iter);
-
- kstring_t transformed_line = { 0, 0, NULL };
- // Find PG tag
- if (regexec(&rg_pg, data, 2, matches, 0) == 0) {
- // Lookup in hash table
- kstring_t pg_id = { 0, 0, NULL };
- kputsn(data+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &pg_id);
-
- khiter_t k = kh_get(c2c, tbl->pg_trans, pg_id.s);
- free(pg_id.s);
- char* transformed_id = kh_value(tbl->pg_trans,k);
- // Replace
- kputsn(data, matches[1].rm_so-matches[0].rm_so, &transformed_line);
- kputs(transformed_id, &transformed_line);
- kputsn(data+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
- } else { kputs(data, &transformed_line); }
- // Produce our output line and append it to out_text
- kputc('\n', &out_text);
- kputsn(transformed_line.s, transformed_line.l, &out_text);
-
- free(transformed_line.s);
- free(data);
- rg_iter = kl_next(rg_iter);
+ else {
+ hdr->target_name = NULL;
+ hdr->target_len = NULL;
}
- regfree(&rg_pg);
- kl_destroy(hdrln,pg_list);
- kl_destroy(hdrln,rg_list);
- free(matches);
+ // Allocate text
+ text = hdr->text = malloc(txt_sz + 1);
+ if (!text) goto memfail;
+
+ // Put header text in order @HD, @SQ, @RG, @PG, @CO
+ move_kstr_to_text(&text, &merged_hdr->out_hd);
+ move_kstr_to_text(&text, &merged_hdr->out_sq);
+ move_kstr_to_text(&text, &merged_hdr->out_rg);
+ move_kstr_to_text(&text, &merged_hdr->out_pg);
+ move_kstr_to_text(&text, &merged_hdr->out_co);
+ hdr->l_text = txt_sz;
- // Add trailing \n and write back to header
- free(out->text);
- kputc('\n', &out_text);
- out->l_text = out_text.l;
- out->text = ks_release(&out_text);
+ return hdr;
+
+ memfail:
+ perror(__func__);
+ bam_hdr_destroy(hdr);
+ return NULL;
+}
+
+/*
+ * Free a merged_header_t struct and all associated data.
+ *
+ * Note that the keys to the rg_ids and pg_ids sets are also used as
+ * values in the translation tables. This function should therefore not
+ * be called until the translation tables are no longer needed.
+ */
+
+static void free_merged_header(merged_header_t *merged_hdr) {
+ size_t i;
+ khiter_t iter;
+ if (!merged_hdr) return;
+ free(ks_release(&merged_hdr->out_hd));
+ free(ks_release(&merged_hdr->out_sq));
+ free(ks_release(&merged_hdr->out_rg));
+ free(ks_release(&merged_hdr->out_pg));
+ free(ks_release(&merged_hdr->out_co));
+ if (merged_hdr->target_name) {
+ for (i = 0; i < merged_hdr->n_targets; i++) {
+ free(merged_hdr->target_name[i]);
+ }
+ free(merged_hdr->target_name);
+ }
+ free(merged_hdr->target_len);
+ kh_destroy(c2i, merged_hdr->sq_tids);
+
+ if (merged_hdr->rg_ids) {
+ for (iter = kh_begin(merged_hdr->rg_ids);
+ iter != kh_end(merged_hdr->rg_ids); ++iter) {
+ if (kh_exist(merged_hdr->rg_ids, iter))
+ free(kh_key(merged_hdr->rg_ids, iter));
+ }
+ kh_destroy(cset, merged_hdr->rg_ids);
+ }
+
+ if (merged_hdr->pg_ids) {
+ for (iter = kh_begin(merged_hdr->pg_ids);
+ iter != kh_end(merged_hdr->pg_ids); ++iter) {
+ if (kh_exist(merged_hdr->pg_ids, iter))
+ free(kh_key(merged_hdr->pg_ids, iter));
+ }
+ kh_destroy(cset, merged_hdr->pg_ids);
+ }
+
+ free(merged_hdr);
}
static void bam_translate(bam1_t* b, trans_tbl_t* tbl)
@@ -498,10 +980,25 @@ static void bam_translate(bam1_t* b, trans_tbl_t* tbl)
if (k != kh_end(tbl->rg_trans)) {
char* translate_rg = kh_value(tbl->rg_trans,k);
bam_aux_del(b, rg);
- bam_aux_append(b, "RG", 'Z', strlen(translate_rg) + 1, (uint8_t*)translate_rg);
+ if (translate_rg) {
+ bam_aux_append(b, "RG", 'Z', strlen(translate_rg) + 1,
+ (uint8_t*)translate_rg);
+ }
} else {
- fprintf(pysamerr, "[bam_translate] RG tag \"%s\" on read \"%s\" encountered with no corresponding entry in header, tag lost\n",decoded_rg, bam_get_qname(b));
+ char *tmp = strdup(decoded_rg);
+ fprintf(pysamerr,
+ "[bam_translate] RG tag \"%s\" on read \"%s\" encountered "
+ "with no corresponding entry in header, tag lost. "
+ "Unknown tags are only reported once per input file for "
+ "each tag ID.\n",
+ decoded_rg, bam_get_qname(b));
bam_aux_del(b, rg);
+ // Prevent future whinges
+ if (tmp) {
+ int in_there = 0;
+ k = kh_put(c2c, tbl->rg_trans, tmp, &in_there);
+ if (in_there > 0) kh_value(tbl->rg_trans, k) = NULL;
+ }
}
}
@@ -513,10 +1010,25 @@ static void bam_translate(bam1_t* b, trans_tbl_t* tbl)
if (k != kh_end(tbl->pg_trans)) {
char* translate_pg = kh_value(tbl->pg_trans,k);
bam_aux_del(b, pg);
- bam_aux_append(b, "PG", 'Z', strlen(translate_pg) + 1, (uint8_t*)translate_pg);
+ if (translate_pg) {
+ bam_aux_append(b, "PG", 'Z', strlen(translate_pg) + 1,
+ (uint8_t*)translate_pg);
+ }
} else {
- fprintf(pysamerr, "[bam_translate] PG tag \"%s\" on read \"%s\" encountered with no corresponding entry in header, tag lost\n",decoded_pg, bam_get_qname(b));
+ char *tmp = strdup(decoded_pg);
+ fprintf(pysamerr,
+ "[bam_translate] PG tag \"%s\" on read \"%s\" encountered "
+ "with no corresponding entry in header, tag lost. "
+ "Unknown tags are only reported once per input file for "
+ "each tag ID.\n",
+ decoded_pg, bam_get_qname(b));
bam_aux_del(b, pg);
+ // Prevent future whinges
+ if (tmp) {
+ int in_there = 0;
+ k = kh_put(c2c, tbl->pg_trans, tmp, &in_there);
+ if (in_there > 0) kh_value(tbl->pg_trans, k) = NULL;
+ }
}
}
}
@@ -581,20 +1093,28 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
@param flag flags that control how the merge is undertaken
@param reg region to merge
@param n_threads number of threads to use (passed to htslib)
+ @param in_fmt format options for input files
+ @param out_fmt output file format and options
 @discussion Padding information may NOT be correctly maintained. This
function is NOT thread safe.
*/
-int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads)
+int bam_merge_core2(int by_qname, const char *out, const char *mode,
+ const char *headers, int n, char * const *fn, int flag,
+ const char *reg, int n_threads,
+ const htsFormat *in_fmt, const htsFormat *out_fmt)
{
samFile *fpout, **fp;
heap1_t *heap;
bam_hdr_t *hout = NULL;
+ bam_hdr_t *hin = NULL;
int i, j, *RG_len = NULL;
uint64_t idx = 0;
char **RG = NULL;
hts_itr_t **iter = NULL;
bam_hdr_t **hdr = NULL;
trans_tbl_t *translation_tbl = NULL;
+ merged_header_t *merged_hdr = init_merged_header();
+ if (!merged_hdr) return -1;
// Is there a specified pre-prepared header to use for output?
if (headers) {
@@ -604,8 +1124,13 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
fprintf(pysamerr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
return -1;
}
- hout = sam_hdr_read(fpheaders);
+ hin = sam_hdr_read(fpheaders);
sam_close(fpheaders);
+ if (hin == NULL) {
+ fprintf(pysamerr, "[bam_merge_core] couldn't read headers for '%s'\n",
+ headers);
+ return -1;
+ }
}
g_is_by_qname = by_qname;
@@ -614,14 +1139,15 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*));
hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*));
translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t));
+ RG = (char**)calloc(n, sizeof(char*));
// prepare RG tag from file names
if (flag & MERGE_RG) {
- RG = (char**)calloc(n, sizeof(char*));
RG_len = (int*)calloc(n, sizeof(int));
for (i = 0; i != n; ++i) {
int l = strlen(fn[i]);
const char *s = fn[i];
- if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
+ if (l > 4 && (strcmp(s + l - 4, ".bam") == 0 || strcmp(s + l - 4, ".sam") == 0)) l -= 4;
+ if (l > 5 && strcmp(s + l - 5, ".cram") == 0) l -= 5;
for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
++j; l -= j;
RG[i] = (char*)calloc(l + 1, 1);
@@ -629,28 +1155,50 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
strncpy(RG[i], s + j, l);
}
}
+
+ if (hin) {
+ // Populate merged_hdr from the pre-prepared header
+ trans_tbl_t dummy;
+ int res;
+ res = trans_tbl_init(merged_hdr, hin, &dummy, flag & MERGE_COMBINE_RG,
+ flag & MERGE_COMBINE_PG, NULL);
+ trans_tbl_destroy(&dummy);
+ if (res) return -1; // FIXME: memory leak
+ }
+
// open and read the header from each file
for (i = 0; i < n; ++i) {
bam_hdr_t *hin;
- fp[i] = sam_open(fn[i], "r");
+ fp[i] = sam_open_format(fn[i], "r", in_fmt);
if (fp[i] == NULL) {
int j;
fprintf(pysamerr, "[bam_merge_core] fail to open file %s\n", fn[i]);
- for (j = 0; j < i; ++j) sam_close(fp[j]);
+ for (j = 0; j < i; ++j) {
+ bam_hdr_destroy(hdr[j]);
+ sam_close(fp[j]);
+ }
free(fp); free(heap);
// FIXME: possible memory leak
return -1;
}
hin = sam_hdr_read(fp[i]);
- if (hout)
- trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG);
- else {
- // As yet, no headers to merge into...
- hout = bam_hdr_dup(hin);
- // ...so no need to translate header into itself
- trans_tbl_init(hout, hin, translation_tbl+i, true, true);
+ if (hin == NULL) {
+ fprintf(pysamerr, "[bam_merge_core] failed to read header for '%s'\n",
+ fn[i]);
+ for (j = 0; j < i; ++j) {
+ bam_hdr_destroy(hdr[j]);
+ sam_close(fp[j]);
+ }
+ free(fp); free(heap);
+ // FIXME: possible memory leak
+ return -1;
}
+ if (trans_tbl_init(merged_hdr, hin, translation_tbl+i,
+ flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG,
+ RG[i]))
+ return -1; // FIXME: memory leak
+
// TODO sam_itr_next() doesn't yet work for SAM files,
// so for those keep the headers around for use with sam_read1()
if (hts_get_format(fp[i])->format == sam) hdr[i] = hin;
@@ -661,8 +1209,20 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
}
}
+ // Did we get an @HD line?
+ if (!merged_hdr->have_hd) {
+ fprintf(pysamerr, "[W::%s] No @HD tag found.\n", __func__);
+ /* FIXME: Should we add an @HD line here, and if so what should
+ we put in it? Ideally we want a way of getting htslib to tell
+ us the SAM version number to assume given no @HD line. Is
+ it also safe to assume that the output is coordinate sorted?
+ SO: is optional so we don't have to have it.*/
+ /* ksprintf(&merged_hdr->out_hd, "@HD\tVN:1.5\tSO:coordinate\n"); */
+ }
+
// Transform the header into standard form
- pretty_header(&hout->text,hout->l_text);
+ hout = finish_merged_header(merged_hdr);
+ if (!hout) return -1; // FIXME: memory leak
// If we're only merging a specified region move our iters to start at that point
if (reg) {
@@ -670,19 +1230,33 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
int tid, beg, end;
const char *name_lim = hts_parse_reg(reg, &beg, &end);
- char *name = malloc(name_lim - reg + 1);
- memcpy(name, reg, name_lim - reg);
- name[name_lim - reg] = '\0';
- tid = bam_name2id(hout, name);
- free(name);
+ if (name_lim) {
+ char *name = malloc(name_lim - reg + 1);
+ memcpy(name, reg, name_lim - reg);
+ name[name_lim - reg] = '\0';
+ tid = bam_name2id(hout, name);
+ free(name);
+ }
+ else {
+ // not parsable as a region, but possibly a sequence named "foo:a"
+ tid = bam_name2id(hout, reg);
+ beg = 0;
+ end = INT_MAX;
+ }
if (tid < 0) {
- fprintf(pysamerr, "[%s] Malformated region string or undefined reference name\n", __func__);
+ if (name_lim) fprintf(pysamerr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg);
+ else fprintf(pysamerr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg);
return -1;
}
for (i = 0; i < n; ++i) {
hts_idx_t *idx = sam_index_load(fp[i], fn[i]);
// (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
int mapped_tid = rtrans[i*hout->n_targets+tid];
+ if (idx == NULL) {
+ fprintf(pysamerr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n",
+ __func__, fn[i]);
+ return -1;
+ }
if (mapped_tid != INT32_MIN) {
iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end);
} else {
@@ -725,7 +1299,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
}
// Open output file and write header
- if ((fpout = sam_open(out, mode)) == 0) {
+ if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) {
fprintf(pysamerr, "[%s] fail to create the output file.\n", __func__);
return -1;
}
@@ -757,7 +1331,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
// Clean up and close
if (flag & MERGE_RG) {
for (i = 0; i != n; ++i) free(RG[i]);
- free(RG); free(RG_len);
+ free(RG_len);
}
for (i = 0; i < n; ++i) {
trans_tbl_destroy(translation_tbl + i);
@@ -765,37 +1339,45 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
bam_hdr_destroy(hdr[i]);
sam_close(fp[i]);
}
+ bam_hdr_destroy(hin);
bam_hdr_destroy(hout);
+ free_merged_header(merged_hdr);
sam_close(fpout);
- free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
+ free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
return 0;
}
+// Unused here but may be used by legacy samtools-using third-party code
int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg)
{
char mode[12];
strcpy(mode, "wb");
if (flag & MERGE_UNCOMP) strcat(mode, "0");
else if (flag & MERGE_LEVEL1) strcat(mode, "1");
- return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0);
+ return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, NULL, NULL);
}
static void merge_usage(FILE *to)
{
- fprintf(to, "Usage: samtools merge [-nurlf] [-h inh.sam] [-b <bamlist.fofn>] <out.bam> <in1.bam> <in2.bam> [<in3.bam> ... <inN.bam>]\n\n");
- fprintf(to, "Options: -n sort by read names\n");
- fprintf(to, " -r attach RG tag (inferred from file names)\n");
- fprintf(to, " -u uncompressed BAM output\n");
- fprintf(to, " -f overwrite the output BAM if exist\n");
- fprintf(to, " -1 compress level 1\n");
- fprintf(to, " -l INT compression level, from 0 to 9 [-1]\n");
- fprintf(to, " -@ INT number of BAM compression threads [0]\n");
- fprintf(to, " -R STR merge file in the specified region STR [all]\n");
- fprintf(to, " -h FILE copy the header in FILE to <out.bam> [in1.bam]\n");
- fprintf(to, " -c combine RG tags with colliding IDs rather than amending them\n");
- fprintf(to, " -p combine PG tags with colliding IDs rather than amending them\n");
- fprintf(to, " -s VALUE override random seed\n");
- fprintf(to, " -b FILE list of input BAM filenames, one per line [null]\n\n");
+ fprintf(to,
+"Usage: samtools merge [-nurlf] [-h inh.sam] [-b <bamlist.fofn>] <out.bam> <in1.bam> [<in2.bam> ... <inN.bam>]\n"
+"\n"
+"Options:\n"
+" -n Sort by read names\n"
+" -r Attach RG tag (inferred from file names)\n"
+" -u Uncompressed BAM output\n"
+" -f Overwrite the output BAM if exist\n"
+" -1 Compress level 1\n"
+" -l INT Compression level, from 0 to 9 [-1]\n"
+" -R STR Merge file in the specified region STR [all]\n"
+" -h FILE Copy the header in FILE to <out.bam> [in1.bam]\n"
+" -c Combine @RG headers with colliding IDs [alter IDs to be distinct]\n"
+" -p Combine @PG headers with colliding IDs [alter IDs to be distinct]\n"
+" -s VALUE Override random seed\n"
+" -b FILE List of input BAM filenames, one per line [null]\n"
+" -@, --threads INT\n"
+" Number of BAM/CRAM compression threads [0]\n");
+ sam_global_opt_help(to, "-.O..");
}
int bam_merge(int argc, char *argv[])
@@ -806,12 +1388,19 @@ int bam_merge(int argc, char *argv[])
char** fn = NULL;
int fn_size = 0;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ { "threads", required_argument, NULL, '@' },
+ { NULL, 0, NULL, 0 }
+ };
+
if (argc == 1) {
merge_usage(stdout);
return 0;
}
- while ((c = getopt(argc, argv, "h:nru1R:f@:l:cps:b:")) >= 0) {
+ while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': flag |= MERGE_RG; break;
case 'f': flag |= MERGE_FORCE; break;
@@ -842,6 +1431,10 @@ int bam_merge(int argc, char *argv[])
}
break;
}
+
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': merge_usage(pysamerr); return 1;
}
}
if ( argc - optind < 1 ) {
@@ -867,22 +1460,28 @@ int bam_merge(int argc, char *argv[])
if (fn == NULL) { ret = 1; goto end; }
memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*));
}
- if (fn_size+nargcfiles < 2) {
- fprintf(pysamerr, "You must specify at least 2 input files.\n");
+ if (fn_size+nargcfiles < 1) {
+ fprintf(pysamerr, "You must specify at least one (and usually two or more) input files.\n");
merge_usage(pysamerr);
return 1;
}
strcpy(mode, "wb");
+ sam_open_mode(mode+1, argv[optind], NULL);
if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
- if (bam_merge_core2(is_by_qname, argv[optind], mode, fn_headers, fn_size+nargcfiles, fn, flag, reg, n_threads) < 0) ret = 1;
+ if (bam_merge_core2(is_by_qname, argv[optind], mode, fn_headers,
+ fn_size+nargcfiles, fn, flag, reg, n_threads,
+ &ga.in, &ga.out) < 0)
+ ret = 1;
+
end:
if (fn_size > 0) {
int i;
for (i=0; i<fn_size; i++) free(fn[i]);
- free(fn);
}
+ free(fn);
free(reg);
free(fn_headers);
+ sam_global_args_free(&ga);
return ret;
}
@@ -946,11 +1545,11 @@ typedef struct {
int index;
} worker_t;
-static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads)
+static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt)
{
size_t i;
samFile* fp;
- fp = sam_open(fn, mode);
+ fp = sam_open_format(fn, mode, fmt);
if (fp == NULL) return;
sam_hdr_write(fp, h);
if (n_threads > 1) hts_set_threads(fp, n_threads);
@@ -966,7 +1565,17 @@ static void *worker(void *data)
ks_mergesort(sort, w->buf_len, w->buf, 0);
name = (char*)calloc(strlen(w->prefix) + 20, 1);
sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
- write_buffer(name, "wb1", w->buf_len, w->buf, w->h, 0);
+ write_buffer(name, "wb1", w->buf_len, w->buf, w->h, 0, NULL);
+
+// Consider using CRAM temporary files if the final output is CRAM.
+// Typically it is of comparable speed while being smaller.
+// hts_opt opt[2] = {
+// {"version=3.0", CRAM_OPT_VERSION, {"3.0"}, NULL},
+// {"no_ref", CRAM_OPT_NO_REF, {1}, NULL}
+// };
+// opt[0].next = &opt[1];
+// write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt);
+
free(name);
return 0;
}
@@ -1011,17 +1620,22 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
@param fnout name of the final output file to be written
@param modeout sam_open() mode to be used to create the final output file
 @param max_mem approximate maximum memory (very inaccurate)
+ @param in_fmt input file format options
+ @param out_fmt output file format and options
@return 0 for successful sorting, negative on errors
@discussion It may create multiple temporary subalignment files
- and then merge them by calling bam_merge_core(). This function is
+ and then merge them by calling bam_merge_core2(). This function is
NOT thread safe.
*/
-int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const char *fnout, const char *modeout, size_t _max_mem, int n_threads)
+int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
+ const char *fnout, const char *modeout,
+ size_t _max_mem, int n_threads,
+ const htsFormat *in_fmt, const htsFormat *out_fmt)
{
- int ret, i, n_files = 0;
+ int ret = -1, i, n_files = 0;
size_t mem, max_k, k, max_mem;
- bam_hdr_t *header;
+ bam_hdr_t *header = NULL;
samFile *fp;
bam1_t *b, **buf;
@@ -1030,12 +1644,17 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const
max_k = k = 0; mem = 0;
max_mem = _max_mem * n_threads;
buf = NULL;
- fp = sam_open(fn, "r");
+ fp = sam_open_format(fn, "r", in_fmt);
if (fp == NULL) {
- fprintf(pysamerr, "[bam_sort_core] fail to open file %s\n", fn);
- return -1;
+ const char *message = strerror(errno);
+ fprintf(pysamerr, "[bam_sort_core] fail to open '%s': %s\n", fn, message);
+ return -2;
}
header = sam_hdr_read(fp);
+ if (header == NULL) {
+ fprintf(pysamerr, "[bam_sort_core] failed to read header for '%s'\n", fn);
+ goto err;
+ }
if (is_by_qname) change_SO(header, "queryname");
else change_SO(header, "coordinate");
// write sub files
@@ -1061,12 +1680,16 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const
mem = k = 0;
}
}
- if (ret != -1)
- fprintf(pysamerr, "[bam_sort_core] truncated file. Continue anyway.\n");
+ if (ret != -1) {
+ fprintf(pysamerr, "[bam_sort_core] truncated file. Aborting.\n");
+ ret = -1;
+ goto err;
+ }
+
// write the final output
if (n_files == 0) { // a single block
ks_mergesort(sort, k, buf, 0);
- write_buffer(fnout, modeout, k, buf, header, n_threads);
+ write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt);
} else { // then merge
char **fns;
n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
@@ -1076,10 +1699,12 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const
fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
sprintf(fns[i], "%s.%.4d.bam", prefix, i);
}
- if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns, MERGE_COMBINE_RG|MERGE_COMBINE_PG, NULL, n_threads) < 0) {
+ if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns,
+ MERGE_COMBINE_RG|MERGE_COMBINE_PG, NULL, n_threads,
+ in_fmt, out_fmt) < 0) {
// Propagate bam_merge_core2() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
- return -1;
+ goto err;
}
for (i = 0; i < n_files; ++i) {
unlink(fns[i]);
@@ -1087,25 +1712,30 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const
}
free(fns);
}
+
+ ret = 0;
+
+ err:
// free
for (k = 0; k < max_k; ++k) bam_destroy1(buf[k]);
free(buf);
bam_hdr_destroy(header);
sam_close(fp);
- return 0;
+ return ret;
}
+// Unused here but may be used by legacy samtools-using third-party code
int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem)
{
int ret;
char *fnout = calloc(strlen(prefix) + 4 + 1, 1);
sprintf(fnout, "%s.bam", prefix);
- ret = bam_sort_core_ext(is_by_qname, fn, prefix, fnout, "wb", max_mem, 0);
+ ret = bam_sort_core_ext(is_by_qname, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL);
free(fnout);
return ret;
}
-static int sort_usage(FILE *fp, int status)
+static void sort_usage(FILE *fp)
{
fprintf(fp,
"Usage: samtools sort [options...] [in.bam]\n"
@@ -1114,33 +1744,29 @@ static int sort_usage(FILE *fp, int status)
" -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
" -n Sort by read name\n"
" -o FILE Write final output to FILE rather than standard output\n"
-" -O FORMAT Write output as FORMAT ('sam'/'bam'/'cram') (either -O or\n"
-" -T PREFIX Write temporary files to PREFIX.nnnn.bam -T is required)\n"
-" -@ INT Set number of sorting and compression threads [1]\n"
-"\n"
-"Legacy usage: samtools sort [options...] <in.bam> <out.prefix>\n"
-"Options:\n"
-" -f Use <out.prefix> as full final filename rather than prefix\n"
-" -o Write final output to stdout rather than <out.prefix>.bam\n"
-" -l,m,n,@ Similar to corresponding options above\n");
- return status;
+" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"
+" -@, --threads INT\n"
+" Set number of sorting and compression threads [1]\n");
+ sam_global_opt_help(fp, "-.O..");
}
int bam_sort(int argc, char *argv[])
{
 size_t max_mem = 768<<20; // 768MB
- int c, i, modern, nargs, is_by_qname = 0, is_stdout = 0, ret = EXIT_SUCCESS, n_threads = 0, level = -1, full_path = 0;
- char *fnout = "-", *fmtout = NULL, modeout[12], *tmpprefix = NULL;
- kstring_t fnout_buffer = { 0, 0, NULL };
-
- modern = 0;
- for (i = 1; i < argc; ++i)
- if (argv[i][0] == '-' && strpbrk(argv[i], "OT")) { modern = 1; break; }
-
- while ((c = getopt(argc, argv, modern? "l:m:no:O:T:@:" : "fnom:@:l:")) >= 0) {
+ int c, nargs, is_by_qname = 0, ret, o_seen = 0, n_threads = 0, level = -1;
+ char *fnout = "-", modeout[12];
+ kstring_t tmpprefix = { 0, 0, NULL };
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ { "threads", required_argument, NULL, '@' },
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:", lopts, NULL)) >= 0) {
switch (c) {
- case 'f': full_path = 1; break;
- case 'o': if (modern) fnout = optarg; else is_stdout = 1; break;
+ case 'o': fnout = optarg; o_seen = 1; break;
case 'n': is_by_qname = 1; break;
case 'm': {
char *q;
@@ -1150,49 +1776,57 @@ int bam_sort(int argc, char *argv[])
else if (*q == 'g' || *q == 'G') max_mem <<= 30;
break;
}
- case 'O': fmtout = optarg; break;
- case 'T': tmpprefix = optarg; break;
+ case 'T': kputs(optarg, &tmpprefix); break;
case '@': n_threads = atoi(optarg); break;
case 'l': level = atoi(optarg); break;
- default: return sort_usage(pysamerr, EXIT_FAILURE);
+
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': sort_usage(pysamerr); ret = EXIT_FAILURE; goto sort_end;
}
}
nargs = argc - optind;
- if (argc == 1)
- return sort_usage(stdout, EXIT_SUCCESS);
- else if (modern? (nargs > 1) : (nargs != 2))
- return sort_usage(pysamerr, EXIT_FAILURE);
-
- if (!modern) {
- fmtout = "bam";
- if (is_stdout) fnout = "-";
- else if (full_path) fnout = argv[optind+1];
- else {
- ksprintf(&fnout_buffer, "%s.%s", argv[optind+1], fmtout);
- fnout = fnout_buffer.s;
- }
- tmpprefix = argv[optind+1];
+ if (nargs == 0 && isatty(STDIN_FILENO)) {
+ sort_usage(stdout);
+ ret = EXIT_SUCCESS;
+ goto sort_end;
}
+ else if (nargs >= 2) {
+ // If exactly two, user probably tried to specify legacy <out.prefix>
+ if (nargs == 2)
+ fprintf(pysamerr, "[bam_sort] Use -T PREFIX / -o FILE to specify temporary and final output files\n");
- strcpy(modeout, "w");
- if (sam_open_mode(&modeout[1], fnout, fmtout) < 0) {
- if (fmtout) fprintf(pysamerr, "[bam_sort] can't parse output format \"%s\"\n", fmtout);
- else fprintf(pysamerr, "[bam_sort] can't determine output format\n");
+ sort_usage(pysamerr);
ret = EXIT_FAILURE;
goto sort_end;
}
+
+ strcpy(modeout, "wb");
+ sam_open_mode(modeout+1, fnout, NULL);
if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9);
- if (tmpprefix == NULL) {
- fprintf(pysamerr, "[bam_sort] no prefix specified for temporary files (use -T option)\n");
+ if (tmpprefix.l == 0)
+ ksprintf(&tmpprefix, "%s.tmp", (nargs > 0)? argv[optind] : "STDIN");
+
+ ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-",
+ tmpprefix.s, fnout, modeout, max_mem, n_threads,
+ &ga.in, &ga.out);
+ if (ret >= 0)
+ ret = EXIT_SUCCESS;
+ else {
+ char dummy[4];
+ // If we failed on opening the input file & it has no .bam/.cram/etc
+ // extension, the user probably tried legacy -o <infile> <out.prefix>
+ if (ret == -2 && o_seen && nargs > 0 && sam_open_mode(dummy, argv[optind], NULL) < 0)
+ fprintf(pysamerr, "[bam_sort] Note the <out.prefix> argument has been replaced by -T/-o options\n");
+
ret = EXIT_FAILURE;
- goto sort_end;
}
- if (bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-", tmpprefix, fnout, modeout, max_mem, n_threads) < 0) ret = EXIT_FAILURE;
-
sort_end:
- free(fnout_buffer.s);
+ free(tmpprefix.s);
+ sam_global_args_free(&ga);
+
return ret;
}
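
As an aside on the -m option handled in the hunk above: the K/M/G suffix logic can be exercised in isolation. The stand-alone C sketch below is illustrative only and is not part of the patch; the function name parse_mem_size is made up for this example.

    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative stand-alone equivalent of sort's -m handling:
     * parse a size such as "768M" into bytes.  Not samtools code. */
    static size_t parse_mem_size(const char *arg)
    {
        char *q;
        size_t n = strtol(arg, &q, 0);          /* leading number */
        if (*q == 'k' || *q == 'K') n <<= 10;
        else if (*q == 'm' || *q == 'M') n <<= 20;
        else if (*q == 'g' || *q == 'G') n <<= 30;
        return n;
    }

    int main(void)
    {
        printf("%zu\n", parse_mem_size("768M"));   /* prints 805306368 */
        return 0;
    }
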
diff --git a/samtools/bam_split.c b/samtools/bam_split.c
index 1d07f07..e44acc0 100644
--- a/samtools/bam_split.c
+++ b/samtools/bam_split.c
@@ -32,6 +32,7 @@ DEALINGS IN THE SOFTWARE. */
#include <regex.h>
#include <htslib/khash.h>
#include <htslib/kstring.h>
+#include "sam_opts.h"
KHASH_MAP_INIT_STR(c2i, int)
@@ -42,6 +43,7 @@ struct parsed_opts {
char* unaccounted_name;
char* output_format_string;
bool verbose;
+ sam_global_args ga;
};
typedef struct parsed_opts parsed_opts_t;
@@ -60,7 +62,7 @@ struct state {
typedef struct state state_t;
-static void cleanup_state(state_t* status);
+static int cleanup_state(state_t* status);
static void cleanup_opts(parsed_opts_t* opts);
static void usage(FILE *write_to)
@@ -69,16 +71,19 @@ static void usage(FILE *write_to)
"Usage: samtools split [-u <unaccounted.bam>[:<unaccounted_header.sam>]]\n"
" [-f <format_string>] [-v] <merged.bam>\n"
"Options:\n"
-" -f STRING output filename format string [\"%%*_%%#.bam\"]\n"
+" -f STRING output filename format string [\"%%*_%%#.%%.\"]\n"
" -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n"
" -u FILE1:FILE2 ...and override the header with FILE2\n"
-" -v verbose output\n"
+" -v verbose output\n");
+ sam_global_opt_help(write_to, "-....");
+ fprintf(write_to,
"\n"
"Format string expansions:\n"
" %%%% %%\n"
" %%* basename\n"
" %%# @RG index\n"
" %%! @RG ID\n"
+" %%. filename extension for output format\n"
);
}
@@ -90,11 +95,18 @@ static parsed_opts_t* parse_args(int argc, char** argv)
const char* optstring = "vf:u:";
char* delim;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
+
parsed_opts_t* retval = calloc(sizeof(parsed_opts_t), 1);
if (! retval ) { perror("cannot allocate option parsing memory"); return NULL; }
+ sam_global_args_init(&retval->ga);
+
int opt;
- while ((opt = getopt(argc, argv, optstring)) != -1) {
+ while ((opt = getopt_long(argc, argv, optstring, lopts, NULL)) != -1) {
switch (opt) {
case 'f':
retval->output_format_string = strdup(optarg);
@@ -113,13 +125,16 @@ static parsed_opts_t* parse_args(int argc, char** argv)
}
break;
default:
+ if (parse_sam_global_opt(opt, optarg, lopts, &retval->ga) == 0) break;
+ /* else fall-through */
+ case '?':
usage(stdout);
free(retval);
return NULL;
}
}
- if (retval->output_format_string == NULL) retval->output_format_string = strdup("%*_%#.bam");
+ if (retval->output_format_string == NULL) retval->output_format_string = strdup("%*_%#.%.");
argc -= optind;
argv += optind;
@@ -138,7 +153,7 @@ static parsed_opts_t* parse_args(int argc, char** argv)
}
// Expands an output filename format string
-static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx)
+static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx, const htsFormat *format)
{
kstring_t str = { 0, 0, NULL };
const char* pointer = format_string;
@@ -159,6 +174,13 @@ static char* expand_format_string(const char* format_string, const char* basenam
case '!':
kputs(rg_id, &str);
break;
+ case '.':
+ // Only really need to cope with sam, bam, cram
+ if (format->format != unknown_format)
+ kputs(hts_format_file_extension(format), &str);
+ else
+ kputs("bam", &str);
+ break;
case '\0':
// Error is: fprintf(stderr, "bad format string, trailing %%\n");
free(str.s);
@@ -302,29 +324,41 @@ static state_t* init(parsed_opts_t* opts)
return NULL;
}
- retval->merged_input_file = sam_open(opts->merged_input_name, "rb");
+ retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in);
if (!retval->merged_input_file) {
fprintf(stderr, "Could not open input file (%s)\n", opts->merged_input_name);
free(retval);
return NULL;
}
retval->merged_input_header = sam_hdr_read(retval->merged_input_file);
+ if (retval->merged_input_header == NULL) {
+ fprintf(stderr, "Could not read header for file '%s'\n",
+ opts->merged_input_name);
+ cleanup_state(retval);
+ return NULL;
+ }
if (opts->unaccounted_name) {
if (opts->unaccounted_header_name) {
- samFile* hdr_load = sam_open(opts->unaccounted_header_name, "r");
+ samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in);
if (!hdr_load) {
fprintf(stderr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name);
cleanup_state(retval);
return NULL;
}
retval->unaccounted_header = sam_hdr_read(hdr_load);
+ if (retval->unaccounted_header == NULL) {
+ fprintf(stderr, "Could not read header for file '%s'\n",
+ opts->unaccounted_header_name);
+ cleanup_state(retval);
+ return NULL;
+ }
sam_close(hdr_load);
} else {
retval->unaccounted_header = bam_hdr_dup(retval->merged_input_header);
}
- retval->unaccounted_file = sam_open(opts->unaccounted_name, "wb");
+ retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
if (retval->unaccounted_file == NULL) {
fprintf(stderr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name);
cleanup_state(retval);
@@ -359,14 +393,19 @@ static state_t* init(parsed_opts_t* opts)
for (i = 0; i < retval->output_count; i++) {
char* output_filename = NULL;
- if ( ( output_filename = expand_format_string(opts->output_format_string, input_base_name, retval->rg_id[i], i) ) == NULL) {
+ output_filename = expand_format_string(opts->output_format_string,
+ input_base_name,
+ retval->rg_id[i], i,
+ &opts->ga.out);
+
+ if ( output_filename == NULL ) {
fprintf(stderr, "Error expanding output filename format string.\r\n");
cleanup_state(retval);
free(input_base_name);
return NULL;
}
- retval->rg_output_file[i] = sam_open(output_filename, "wb");
+ retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
if (retval->rg_output_file[i] == NULL) {
fprintf(stderr, "Could not open output file: %s\r\n", output_filename);
cleanup_state(retval);
@@ -412,10 +451,15 @@ static bool split(state_t* state)
bam1_t* file_read = bam_init1();
// Read the first record
- if (sam_read1(state->merged_input_file, state->merged_input_header, file_read) < 0) {
+ int r;
+ if ((r=sam_read1(state->merged_input_file, state->merged_input_header, file_read)) < 0) {
// Nothing more to read? Ignore this file
bam_destroy1(file_read);
file_read = NULL;
+ if (r < -1) {
+ fprintf(stderr, "Could not write read sequence\n");
+ return false;
+ }
}
while (file_read != NULL) {
@@ -433,7 +477,10 @@ static bool split(state_t* state)
if (iter != kh_end(state->rg_hash)) {
// if found write to the appropriate untangled bam
int i = kh_val(state->rg_hash,iter);
- sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read);
+ if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
+ fprintf(stderr, "Could not write sequence\n");
+ return false;
+ }
} else {
// otherwise write to the unaccounted bam if there is one or fail
if (state->unaccounted_file == NULL) {
@@ -445,31 +492,40 @@ static bool split(state_t* state)
bam_destroy1(file_read);
return false;
} else {
- sam_write1(state->unaccounted_file, state->unaccounted_header, file_read);
+ if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
+ fprintf(stderr, "Could not write sequence\n");
+ return false;
+ }
}
}
// Replace written read with the next one to process
- if (sam_read1(state->merged_input_file, state->merged_input_header, file_read) < 0) {
+ if ((r=sam_read1(state->merged_input_file, state->merged_input_header, file_read)) < 0) {
// Nothing more to read? Ignore this file in future
bam_destroy1(file_read);
file_read = NULL;
+ if (r < -1) {
+ fprintf(stderr, "Could not write read sequence\n");
+ return false;
+ }
}
}
return true;
}
-static void cleanup_state(state_t* status)
+static int cleanup_state(state_t* status)
{
- if (!status) return;
+ int ret = 0;
+
+ if (!status) return 0;
if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header);
- if (status->unaccounted_file) sam_close(status->unaccounted_file);
+ if (status->unaccounted_file) ret |= sam_close(status->unaccounted_file);
sam_close(status->merged_input_file);
size_t i;
for (i = 0; i < status->output_count; i++) {
bam_hdr_destroy(status->rg_output_header[i]);
- sam_close(status->rg_output_file[i]);
+ ret |= sam_close(status->rg_output_file[i]);
free(status->rg_id[i]);
}
bam_hdr_destroy(status->merged_input_header);
@@ -478,6 +534,8 @@ static void cleanup_state(state_t* status)
kh_destroy_c2i(status->rg_hash);
free(status->rg_id);
free(status);
+
+ return ret;
}
static void cleanup_opts(parsed_opts_t* opts)
@@ -487,6 +545,7 @@ static void cleanup_opts(parsed_opts_t* opts)
free(opts->unaccounted_header_name);
free(opts->unaccounted_name);
free(opts->output_format_string);
+ sam_global_args_free(&opts->ga);
free(opts);
}
@@ -500,7 +559,7 @@ int main_split(int argc, char** argv)
if (split(status)) ret = 0;
- cleanup_state(status);
+ ret |= (cleanup_state(status) != 0);
cleanup_opts:
cleanup_opts(opts);
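
The new %. directive added above lets the split output extension follow the selected output format. As a rough, self-contained illustration of how such a format string expands (independent of htslib's kstring_t and of the real expand_format_string), here is a hedged sketch; expand_name, append and the fixed-size buffer are assumptions made only for this example.

    #include <stdio.h>

    /* Minimal sketch of "%*_%#.%."-style expansion.  Not the real
     * implementation: fixed buffer, extension passed as a plain string. */
    static void append(char *out, size_t outsz, size_t *len, const char *s)
    {
        while (*s && *len + 1 < outsz) out[(*len)++] = *s++;
    }

    static void expand_name(char *out, size_t outsz, const char *fmt,
                            const char *base, int rg_idx, const char *rg_id,
                            const char *ext)
    {
        char idx[16];
        size_t len = 0;
        snprintf(idx, sizeof idx, "%d", rg_idx);
        while (*fmt && len + 1 < outsz) {
            if (*fmt != '%') { out[len++] = *fmt++; continue; }
            switch (*++fmt) {
            case '%': out[len++] = '%';                 break;
            case '*': append(out, outsz, &len, base);   break;
            case '#': append(out, outsz, &len, idx);    break;
            case '!': append(out, outsz, &len, rg_id);  break;
            case '.': append(out, outsz, &len, ext);    break;
            case '\0': out[len] = '\0'; return;         /* trailing '%' */
            default: break;                             /* unknown: skipped */
            }
            ++fmt;
        }
        out[len] = '\0';
    }

    int main(void)
    {
        char name[256];
        expand_name(name, sizeof name, "%*_%#.%.", "merged", 0, "rg0", "cram");
        puts(name);   /* prints: merged_0.cram */
        return 0;
    }
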
diff --git a/samtools/bam_split.c.pysam.c b/samtools/bam_split.c.pysam.c
index ba03c3d..329556f 100644
--- a/samtools/bam_split.c.pysam.c
+++ b/samtools/bam_split.c.pysam.c
@@ -34,6 +34,7 @@ DEALINGS IN THE SOFTWARE. */
#include <regex.h>
#include <htslib/khash.h>
#include <htslib/kstring.h>
+#include "sam_opts.h"
KHASH_MAP_INIT_STR(c2i, int)
@@ -44,6 +45,7 @@ struct parsed_opts {
char* unaccounted_name;
char* output_format_string;
bool verbose;
+ sam_global_args ga;
};
typedef struct parsed_opts parsed_opts_t;
@@ -62,7 +64,7 @@ struct state {
typedef struct state state_t;
-static void cleanup_state(state_t* status);
+static int cleanup_state(state_t* status);
static void cleanup_opts(parsed_opts_t* opts);
static void usage(FILE *write_to)
@@ -71,16 +73,19 @@ static void usage(FILE *write_to)
"Usage: samtools split [-u <unaccounted.bam>[:<unaccounted_header.sam>]]\n"
" [-f <format_string>] [-v] <merged.bam>\n"
"Options:\n"
-" -f STRING output filename format string [\"%%*_%%#.bam\"]\n"
+" -f STRING output filename format string [\"%%*_%%#.%%.\"]\n"
" -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n"
" -u FILE1:FILE2 ...and override the header with FILE2\n"
-" -v verbose output\n"
+" -v verbose output\n");
+ sam_global_opt_help(write_to, "-....");
+ fprintf(write_to,
"\n"
"Format string expansions:\n"
" %%%% %%\n"
" %%* basename\n"
" %%# @RG index\n"
" %%! @RG ID\n"
+" %%. filename extension for output format\n"
);
}
@@ -92,11 +97,18 @@ static parsed_opts_t* parse_args(int argc, char** argv)
const char* optstring = "vf:u:";
char* delim;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
+
parsed_opts_t* retval = calloc(sizeof(parsed_opts_t), 1);
if (! retval ) { perror("cannot allocate option parsing memory"); return NULL; }
+ sam_global_args_init(&retval->ga);
+
int opt;
- while ((opt = getopt(argc, argv, optstring)) != -1) {
+ while ((opt = getopt_long(argc, argv, optstring, lopts, NULL)) != -1) {
switch (opt) {
case 'f':
retval->output_format_string = strdup(optarg);
@@ -115,13 +127,16 @@ static parsed_opts_t* parse_args(int argc, char** argv)
}
break;
default:
+ if (parse_sam_global_opt(opt, optarg, lopts, &retval->ga) == 0) break;
+ /* else fall-through */
+ case '?':
usage(stdout);
free(retval);
return NULL;
}
}
- if (retval->output_format_string == NULL) retval->output_format_string = strdup("%*_%#.bam");
+ if (retval->output_format_string == NULL) retval->output_format_string = strdup("%*_%#.%.");
argc -= optind;
argv += optind;
@@ -140,7 +155,7 @@ static parsed_opts_t* parse_args(int argc, char** argv)
}
// Expands an output filename format string
-static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx)
+static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx, const htsFormat *format)
{
kstring_t str = { 0, 0, NULL };
const char* pointer = format_string;
@@ -161,6 +176,13 @@ static char* expand_format_string(const char* format_string, const char* basenam
case '!':
kputs(rg_id, &str);
break;
+ case '.':
+ // Only really need to cope with sam, bam, cram
+ if (format->format != unknown_format)
+ kputs(hts_format_file_extension(format), &str);
+ else
+ kputs("bam", &str);
+ break;
case '\0':
// Error is: fprintf(pysamerr, "bad format string, trailing %%\n");
free(str.s);
@@ -304,29 +326,41 @@ static state_t* init(parsed_opts_t* opts)
return NULL;
}
- retval->merged_input_file = sam_open(opts->merged_input_name, "rb");
+ retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in);
if (!retval->merged_input_file) {
fprintf(pysamerr, "Could not open input file (%s)\n", opts->merged_input_name);
free(retval);
return NULL;
}
retval->merged_input_header = sam_hdr_read(retval->merged_input_file);
+ if (retval->merged_input_header == NULL) {
+ fprintf(pysamerr, "Could not read header for file '%s'\n",
+ opts->merged_input_name);
+ cleanup_state(retval);
+ return NULL;
+ }
if (opts->unaccounted_name) {
if (opts->unaccounted_header_name) {
- samFile* hdr_load = sam_open(opts->unaccounted_header_name, "r");
+ samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in);
if (!hdr_load) {
fprintf(pysamerr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name);
cleanup_state(retval);
return NULL;
}
retval->unaccounted_header = sam_hdr_read(hdr_load);
+ if (retval->unaccounted_header == NULL) {
+ fprintf(pysamerr, "Could not read header for file '%s'\n",
+ opts->unaccounted_header_name);
+ cleanup_state(retval);
+ return NULL;
+ }
sam_close(hdr_load);
} else {
retval->unaccounted_header = bam_hdr_dup(retval->merged_input_header);
}
- retval->unaccounted_file = sam_open(opts->unaccounted_name, "wb");
+ retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
if (retval->unaccounted_file == NULL) {
fprintf(pysamerr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name);
cleanup_state(retval);
@@ -361,14 +395,19 @@ static state_t* init(parsed_opts_t* opts)
for (i = 0; i < retval->output_count; i++) {
char* output_filename = NULL;
- if ( ( output_filename = expand_format_string(opts->output_format_string, input_base_name, retval->rg_id[i], i) ) == NULL) {
+ output_filename = expand_format_string(opts->output_format_string,
+ input_base_name,
+ retval->rg_id[i], i,
+ &opts->ga.out);
+
+ if ( output_filename == NULL ) {
fprintf(pysamerr, "Error expanding output filename format string.\r\n");
cleanup_state(retval);
free(input_base_name);
return NULL;
}
- retval->rg_output_file[i] = sam_open(output_filename, "wb");
+ retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
if (retval->rg_output_file[i] == NULL) {
fprintf(pysamerr, "Could not open output file: %s\r\n", output_filename);
cleanup_state(retval);
@@ -414,10 +453,15 @@ static bool split(state_t* state)
bam1_t* file_read = bam_init1();
// Read the first record
- if (sam_read1(state->merged_input_file, state->merged_input_header, file_read) < 0) {
+ int r;
+ if ((r=sam_read1(state->merged_input_file, state->merged_input_header, file_read)) < 0) {
// Nothing more to read? Ignore this file
bam_destroy1(file_read);
file_read = NULL;
+ if (r < -1) {
+ fprintf(pysamerr, "Could not write read sequence\n");
+ return false;
+ }
}
while (file_read != NULL) {
@@ -435,7 +479,10 @@ static bool split(state_t* state)
if (iter != kh_end(state->rg_hash)) {
// if found write to the appropriate untangled bam
int i = kh_val(state->rg_hash,iter);
- sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read);
+ if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
+ fprintf(pysamerr, "Could not write sequence\n");
+ return false;
+ }
} else {
// otherwise write to the unaccounted bam if there is one or fail
if (state->unaccounted_file == NULL) {
@@ -447,31 +494,40 @@ static bool split(state_t* state)
bam_destroy1(file_read);
return false;
} else {
- sam_write1(state->unaccounted_file, state->unaccounted_header, file_read);
+ if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
+ fprintf(pysamerr, "Could not write sequence\n");
+ return false;
+ }
}
}
// Replace written read with the next one to process
- if (sam_read1(state->merged_input_file, state->merged_input_header, file_read) < 0) {
+ if ((r=sam_read1(state->merged_input_file, state->merged_input_header, file_read)) < 0) {
// Nothing more to read? Ignore this file in future
bam_destroy1(file_read);
file_read = NULL;
+ if (r < -1) {
+ fprintf(pysamerr, "Could not write read sequence\n");
+ return false;
+ }
}
}
return true;
}
-static void cleanup_state(state_t* status)
+static int cleanup_state(state_t* status)
{
- if (!status) return;
+ int ret = 0;
+
+ if (!status) return 0;
if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header);
- if (status->unaccounted_file) sam_close(status->unaccounted_file);
+ if (status->unaccounted_file) ret |= sam_close(status->unaccounted_file);
sam_close(status->merged_input_file);
size_t i;
for (i = 0; i < status->output_count; i++) {
bam_hdr_destroy(status->rg_output_header[i]);
- sam_close(status->rg_output_file[i]);
+ ret |= sam_close(status->rg_output_file[i]);
free(status->rg_id[i]);
}
bam_hdr_destroy(status->merged_input_header);
@@ -480,6 +536,8 @@ static void cleanup_state(state_t* status)
kh_destroy_c2i(status->rg_hash);
free(status->rg_id);
free(status);
+
+ return ret;
}
static void cleanup_opts(parsed_opts_t* opts)
@@ -489,6 +547,7 @@ static void cleanup_opts(parsed_opts_t* opts)
free(opts->unaccounted_header_name);
free(opts->unaccounted_name);
free(opts->output_format_string);
+ sam_global_args_free(&opts->ga);
free(opts);
}
@@ -502,7 +561,7 @@ int main_split(int argc, char** argv)
if (split(status)) ret = 0;
- cleanup_state(status);
+ ret |= (cleanup_state(status) != 0);
cleanup_opts:
cleanup_opts(opts);
diff --git a/samtools/bam_stat.c b/samtools/bam_stat.c
index 1bbebdb..5cb3235 100644
--- a/samtools/bam_stat.c
+++ b/samtools/bam_stat.c
@@ -1,6 +1,6 @@
/* bam_stat.c -- flagstat subcommand.
- Copyright (C) 2009, 2011, 2013, 2014 Genome Research Ltd.
+ Copyright (C) 2009, 2011, 2013-2015 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -27,9 +27,10 @@ DEALINGS IN THE SOFTWARE. */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
+#include <limits.h>
+#include <getopt.h>
#include "htslib/sam.h"
-//#include "bam.h"
#include "samtools.h"
typedef struct {
@@ -81,18 +82,60 @@ bam_flagstat_t *bam_flagstat_core(samFile *fp, bam_hdr_t *h)
fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
return s;
}
+
+static const char *percent(char *buffer, long long n, long long total)
+{
+ if (total != 0) sprintf(buffer, "%.2f%%", (float)n / total * 100.0);
+ else strcpy(buffer, "N/A");
+ return buffer;
+}
+
+static void usage_exit(FILE *fp, int exit_status)
+{
+ fprintf(fp, "Usage: samtools flagstat [--input-fmt-option OPT=VAL] <in.bam>\n");
+ exit(exit_status);
+}
+
int bam_flagstat(int argc, char *argv[])
{
samFile *fp;
bam_hdr_t *header;
bam_flagstat_t *s;
- if (argc == optind) {
- fprintf(stderr, "Usage: samtools flagstat <in.bam>\n");
- return 1;
+ char b0[16], b1[16];
+ hts_opt *in_opts = NULL;
+ int c;
+
+ enum {
+ INPUT_FMT_OPTION = CHAR_MAX+1,
+ };
+
+ static const struct option lopts[] = {
+ {"input-fmt-option", required_argument, NULL, INPUT_FMT_OPTION},
+ {NULL, 0, NULL, 0}
+ };
+
+ while ((c = getopt_long(argc, argv, "", lopts, NULL)) >= 0) {
+ switch (c) {
+ case INPUT_FMT_OPTION:
+ if (hts_opt_add(&in_opts, optarg) < 0)
+ usage_exit(stderr, EXIT_FAILURE);
+ break;
+ default:
+ usage_exit(stderr, EXIT_FAILURE);
+ }
+ }
+
+ if (argc != optind+1) {
+ if (argc == optind) usage_exit(stdout, EXIT_SUCCESS);
+ else usage_exit(stderr, EXIT_FAILURE);
}
fp = sam_open(argv[optind], "r");
if (fp == NULL) {
- print_error_errno("Cannot open input file \"%s\"", argv[optind]);
+ print_error_errno("flagstat", "Cannot open input file \"%s\"", argv[optind]);
+ return 1;
+ }
+ if (hts_opt_apply(fp, in_opts)) {
+ fprintf(stderr, "Failed to apply input-fmt-options\n");
return 1;
}
@@ -108,22 +151,27 @@ int bam_flagstat(int argc, char *argv[])
}
header = sam_hdr_read(fp);
+ if (header == NULL) {
+ fprintf(stderr, "Failed to read header for \"%s\"\n", argv[optind]);
+ return 1;
+ }
s = bam_flagstat_core(fp, header);
printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]);
printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]);
printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
- printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0);
+ printf("%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1]));
printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]);
printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]);
- printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0);
+ printf("%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1]));
printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]);
- printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0);
+ printf("%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1]));
printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]);
printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]);
free(s);
bam_hdr_destroy(header);
sam_close(fp);
+ hts_opt_free(in_opts);
return 0;
}
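
The percent() helper introduced in the flagstat hunk above sidesteps the division by zero that the old inline %.2f%% arithmetic could hit when a category count is zero. A minimal stand-alone check of the same idea, assuming nothing beyond the C library:

    #include <stdio.h>
    #include <string.h>

    /* Same idea as the percent() helper added to flagstat: format n/total
     * as a percentage, or "N/A" when the denominator is zero. */
    static const char *percent(char *buffer, long long n, long long total)
    {
        if (total != 0) sprintf(buffer, "%.2f%%", (float)n / total * 100.0);
        else strcpy(buffer, "N/A");
        return buffer;
    }

    int main(void)
    {
        char b0[16], b1[16];
        printf("mapped: %s, of zero reads: %s\n",
               percent(b0, 98, 100), percent(b1, 0, 0));
        /* prints: mapped: 98.00%, of zero reads: N/A */
        return 0;
    }
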
diff --git a/samtools/bam_stat.c.pysam.c b/samtools/bam_stat.c.pysam.c
index 15a1242..a519312 100644
--- a/samtools/bam_stat.c.pysam.c
+++ b/samtools/bam_stat.c.pysam.c
@@ -2,7 +2,7 @@
/* bam_stat.c -- flagstat subcommand.
- Copyright (C) 2009, 2011, 2013, 2014 Genome Research Ltd.
+ Copyright (C) 2009, 2011, 2013-2015 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -29,9 +29,10 @@ DEALINGS IN THE SOFTWARE. */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
+#include <limits.h>
+#include <getopt.h>
#include "htslib/sam.h"
-//#include "bam.h"
#include "samtools.h"
typedef struct {
@@ -83,18 +84,60 @@ bam_flagstat_t *bam_flagstat_core(samFile *fp, bam_hdr_t *h)
fprintf(pysamerr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
return s;
}
+
+static const char *percent(char *buffer, long long n, long long total)
+{
+ if (total != 0) sprintf(buffer, "%.2f%%", (float)n / total * 100.0);
+ else strcpy(buffer, "N/A");
+ return buffer;
+}
+
+static void usage_exit(FILE *fp, int exit_status)
+{
+ fprintf(fp, "Usage: samtools flagstat [--input-fmt-option OPT=VAL] <in.bam>\n");
+ exit(exit_status);
+}
+
int bam_flagstat(int argc, char *argv[])
{
samFile *fp;
bam_hdr_t *header;
bam_flagstat_t *s;
- if (argc == optind) {
- fprintf(pysamerr, "Usage: samtools flagstat <in.bam>\n");
- return 1;
+ char b0[16], b1[16];
+ hts_opt *in_opts = NULL;
+ int c;
+
+ enum {
+ INPUT_FMT_OPTION = CHAR_MAX+1,
+ };
+
+ static const struct option lopts[] = {
+ {"input-fmt-option", required_argument, NULL, INPUT_FMT_OPTION},
+ {NULL, 0, NULL, 0}
+ };
+
+ while ((c = getopt_long(argc, argv, "", lopts, NULL)) >= 0) {
+ switch (c) {
+ case INPUT_FMT_OPTION:
+ if (hts_opt_add(&in_opts, optarg) < 0)
+ usage_exit(pysamerr, EXIT_FAILURE);
+ break;
+ default:
+ usage_exit(pysamerr, EXIT_FAILURE);
+ }
+ }
+
+ if (argc != optind+1) {
+ if (argc == optind) usage_exit(stdout, EXIT_SUCCESS);
+ else usage_exit(pysamerr, EXIT_FAILURE);
}
fp = sam_open(argv[optind], "r");
if (fp == NULL) {
- print_error_errno("Cannot open input file \"%s\"", argv[optind]);
+ print_error_errno("flagstat", "Cannot open input file \"%s\"", argv[optind]);
+ return 1;
+ }
+ if (hts_opt_apply(fp, in_opts)) {
+ fprintf(pysamerr, "Failed to apply input-fmt-options\n");
return 1;
}
@@ -110,22 +153,27 @@ int bam_flagstat(int argc, char *argv[])
}
header = sam_hdr_read(fp);
+ if (header == NULL) {
+ fprintf(pysamerr, "Failed to read header for \"%s\"\n", argv[optind]);
+ return 1;
+ }
s = bam_flagstat_core(fp, header);
printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]);
printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]);
printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
- printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0);
+ printf("%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1]));
printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]);
printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]);
- printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0);
+ printf("%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1]));
printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]);
- printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0);
+ printf("%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1]));
printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]);
printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]);
free(s);
bam_hdr_destroy(header);
sam_close(fp);
+ hts_opt_free(in_opts);
return 0;
}
diff --git a/samtools/bam_tview.c b/samtools/bam_tview.c
index 6d6bc23..f86ae43 100644
--- a/samtools/bam_tview.c
+++ b/samtools/bam_tview.c
@@ -1,6 +1,6 @@
/* bam_tview.c -- tview subcommand.
- Copyright (C) 2008-2014 Genome Research Ltd.
+ Copyright (C) 2008-2015 Genome Research Ltd.
Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -29,27 +29,7 @@ DEALINGS IN THE SOFTWARE. */
#include <htslib/faidx.h>
#include <htslib/sam.h>
#include <htslib/bgzf.h>
-
-/*! @typedef
- @abstract Type of function to be called by sam_fetch().
- @param b the alignment
- @param data user provided data
- */
-typedef int (*sam_fetch_f)(const bam1_t *b, void *data);
-
-int sam_fetch(samFile *fp, const hts_idx_t *idx, int tid, int beg, int end, void *data, sam_fetch_f func)
-{
- int ret;
- hts_itr_t* iter;
- bam1_t* b = bam_init1();
- iter = sam_itr_queryi(idx, tid, beg, end);
- while ((ret = sam_itr_next(fp, iter, b)) >= 0) func(b, data);
- hts_itr_destroy(iter);
- bam_destroy1(b);
- return (ret == -1)? 0 : ret;
-}
-
-
+#include "sam_opts.h"
khash_t(kh_rg)* get_rg_sample(const char* header, const char* sample)
{
@@ -74,7 +54,8 @@ khash_t(kh_rg)* get_rg_sample(const char* header, const char* sample)
return rg_hash;
}
-int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa, const char *samples)
+int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa,
+ const char *samples, const htsFormat *fmt)
{
assert(tv!=NULL);
assert(fn!=NULL);
@@ -82,7 +63,7 @@ int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa, const char *sam
tv->color_for = TV_COLOR_MAPQ;
tv->is_dot = 1;
- tv->fp = sam_open(fn, "r");
+ tv->fp = sam_open_format(fn, "r", fmt);
if(tv->fp == NULL)
{
fprintf(stderr,"sam_open %s. %s\n", fn,fn_fa);
@@ -132,7 +113,6 @@ void base_tv_destroy(tview_t* tv)
int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
{
- extern const char bam_nt16_nt4_table[];
tview_t *tv = (tview_t*)data;
int i, j, c, rb, attr, max_ins = 0;
uint32_t call = 0;
@@ -231,7 +211,7 @@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void
if (x > 4) x = 4;
attr |= tv->my_colorpair(tv,x);
} else if (tv->color_for == TV_COLOR_NUCL) {
- x = bam_nt16_nt4_table[bam_seqi(bam_get_seq(p->b), p->qpos)] + 5;
+ x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)] + 5;
attr |= tv->my_colorpair(tv,x);
} else if(tv->color_for == TV_COLOR_COL) {
x = 0;
@@ -241,7 +221,7 @@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void
case '2': x = 2; break;
case '3': x = 3; break;
case '4': x = 4; break;
- default: x = bam_nt16_nt4_table[bam_seqi(bam_get_seq(p->b), p->qpos)]; break;
+ default: x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)]; break;
}
x+=5;
attr |= tv->my_colorpair(tv,x);
@@ -272,9 +252,8 @@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void
-int tv_fetch_func(const bam1_t *b, void *data)
+static int tv_push_aln(const bam1_t *b, tview_t *tv)
{
- tview_t *tv = (tview_t*)data;
/* If we are restricted to specific readgroups check RG is in the list */
if ( tv->rg_hash )
{
@@ -322,7 +301,11 @@ int base_draw_aln(tview_t *tv, int tid, int pos)
}
// draw aln
bam_lplbuf_reset(tv->lplbuf);
- sam_fetch(tv->fp, tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol, tv, tv_fetch_func);
+ hts_itr_t *iter = sam_itr_queryi(tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol);
+ bam1_t *b = bam_init1();
+ while (sam_itr_next(tv->fp, iter, b) >= 0) tv_push_aln(b, tv);
+ bam_destroy1(b);
+ hts_itr_destroy(iter);
bam_lplbuf_push(0, tv->lplbuf);
while (tv->ccol < tv->mcol) {
@@ -347,6 +330,7 @@ static void error(const char *format, ...)
" -d display output as (H)tml or (C)urses or (T)ext \n"
" -p chr:pos go directly to this position\n"
" -s STR display only reads from this sample or group\n");
+ sam_global_opt_help(stderr, "-.--.");
}
else
{
@@ -359,17 +343,27 @@ static void error(const char *format, ...)
}
enum dipsay_mode {display_ncurses,display_html,display_text};
-extern tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples);
-extern tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples);
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples);
+extern tview_t* curses_tv_init(const char *fn, const char *fn_fa,
+ const char *samples, const htsFormat *fmt);
+extern tview_t* html_tv_init(const char *fn, const char *fn_fa,
+ const char *samples, const htsFormat *fmt);
+extern tview_t* text_tv_init(const char *fn, const char *fn_fa,
+ const char *samples, const htsFormat *fmt);
int bam_tview_main(int argc, char *argv[])
{
int view_mode=display_ncurses;
tview_t* tv=NULL;
- char *samples=NULL, *position=NULL;
+ char *samples=NULL, *position=NULL, *ref;
int c;
- while ((c = getopt(argc, argv, "s:p:d:")) >= 0) {
+
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ((c = getopt_long(argc, argv, "s:p:d:", lopts, NULL)) >= 0) {
switch (c) {
case 's': samples=optarg; break;
case 'p': position=optarg; break;
@@ -384,28 +378,28 @@ int bam_tview_main(int argc, char *argv[])
}
break;
}
- default: error(NULL);
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': error(NULL);
}
}
if (argc==optind) error(NULL);
+ ref = (optind+1>=argc)? ga.reference : argv[optind+1];
+
switch(view_mode)
{
case display_ncurses:
- {
- tv = curses_tv_init(argv[optind], (optind+1>=argc)? 0 : argv[optind+1], samples);
+ tv = curses_tv_init(argv[optind], ref, samples, &ga.in);
break;
- }
+
case display_text:
- {
- tv = text_tv_init(argv[optind], (optind+1>=argc)? 0 : argv[optind+1], samples);
+ tv = text_tv_init(argv[optind], ref, samples, &ga.in);
break;
- }
+
case display_html:
- {
- tv = html_tv_init(argv[optind], (optind+1>=argc)? 0 : argv[optind+1], samples);
+ tv = html_tv_init(argv[optind], ref, samples, &ga.in);
break;
- }
}
if (tv==NULL)
{
@@ -416,7 +410,9 @@ int bam_tview_main(int argc, char *argv[])
if ( position )
{
int tid, beg, end;
- *(char *)hts_parse_reg(position, &beg, &end) = '\0';
+ char *name_lim = (char *) hts_parse_reg(position, &beg, &end);
+ if (name_lim) *name_lim = '\0';
+ else beg = 0; // region parsing failed, but possibly a seq named "foo:a"
tid = bam_name2id(tv->header, position);
if (tid >= 0) { tv->curr_tid = tid; tv->left_pos = beg; }
}
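
The tview change above drops the local sam_fetch() callback wrapper in favour of a direct htslib iterator loop. A hedged sketch of that pattern in isolation follows (it assumes an indexed BAM, htslib headers on the include path and linking with -lhts; process() and fetch_region() are placeholder names, and error handling is abbreviated):

    #include <htslib/sam.h>

    /* Iterate over one region of an indexed BAM, calling process() per
     * record.  Mirrors the inline loop that replaced sam_fetch(). */
    static void process(const bam1_t *b) { (void)b; /* per-alignment work */ }

    static int fetch_region(samFile *fp, const hts_idx_t *idx,
                            int tid, int beg, int end)
    {
        hts_itr_t *iter = sam_itr_queryi(idx, tid, beg, end);
        bam1_t *b = bam_init1();
        int r;
        if (iter == NULL) { bam_destroy1(b); return -1; }
        while ((r = sam_itr_next(fp, iter, b)) >= 0) process(b);
        hts_itr_destroy(iter);
        bam_destroy1(b);
        return (r == -1) ? 0 : r;   /* -1 is the normal end-of-iterator */
    }
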
diff --git a/samtools/bam_tview.c.pysam.c b/samtools/bam_tview.c.pysam.c
index 82be6e3..736b588 100644
--- a/samtools/bam_tview.c.pysam.c
+++ b/samtools/bam_tview.c.pysam.c
@@ -2,7 +2,7 @@
/* bam_tview.c -- tview subcommand.
- Copyright (C) 2008-2014 Genome Research Ltd.
+ Copyright (C) 2008-2015 Genome Research Ltd.
Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -31,27 +31,7 @@ DEALINGS IN THE SOFTWARE. */
#include <htslib/faidx.h>
#include <htslib/sam.h>
#include <htslib/bgzf.h>
-
-/*! @typedef
- @abstract Type of function to be called by sam_fetch().
- @param b the alignment
- @param data user provided data
- */
-typedef int (*sam_fetch_f)(const bam1_t *b, void *data);
-
-int sam_fetch(samFile *fp, const hts_idx_t *idx, int tid, int beg, int end, void *data, sam_fetch_f func)
-{
- int ret;
- hts_itr_t* iter;
- bam1_t* b = bam_init1();
- iter = sam_itr_queryi(idx, tid, beg, end);
- while ((ret = sam_itr_next(fp, iter, b)) >= 0) func(b, data);
- hts_itr_destroy(iter);
- bam_destroy1(b);
- return (ret == -1)? 0 : ret;
-}
-
-
+#include "sam_opts.h"
khash_t(kh_rg)* get_rg_sample(const char* header, const char* sample)
{
@@ -76,7 +56,8 @@ khash_t(kh_rg)* get_rg_sample(const char* header, const char* sample)
return rg_hash;
}
-int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa, const char *samples)
+int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa,
+ const char *samples, const htsFormat *fmt)
{
assert(tv!=NULL);
assert(fn!=NULL);
@@ -84,7 +65,7 @@ int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa, const char *sam
tv->color_for = TV_COLOR_MAPQ;
tv->is_dot = 1;
- tv->fp = sam_open(fn, "r");
+ tv->fp = sam_open_format(fn, "r", fmt);
if(tv->fp == NULL)
{
fprintf(pysamerr,"sam_open %s. %s\n", fn,fn_fa);
@@ -134,7 +115,6 @@ void base_tv_destroy(tview_t* tv)
int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
{
- extern const char bam_nt16_nt4_table[];
tview_t *tv = (tview_t*)data;
int i, j, c, rb, attr, max_ins = 0;
uint32_t call = 0;
@@ -233,7 +213,7 @@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void
if (x > 4) x = 4;
attr |= tv->my_colorpair(tv,x);
} else if (tv->color_for == TV_COLOR_NUCL) {
- x = bam_nt16_nt4_table[bam_seqi(bam_get_seq(p->b), p->qpos)] + 5;
+ x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)] + 5;
attr |= tv->my_colorpair(tv,x);
} else if(tv->color_for == TV_COLOR_COL) {
x = 0;
@@ -243,7 +223,7 @@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void
case '2': x = 2; break;
case '3': x = 3; break;
case '4': x = 4; break;
- default: x = bam_nt16_nt4_table[bam_seqi(bam_get_seq(p->b), p->qpos)]; break;
+ default: x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)]; break;
}
x+=5;
attr |= tv->my_colorpair(tv,x);
@@ -274,9 +254,8 @@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void
-int tv_fetch_func(const bam1_t *b, void *data)
+static int tv_push_aln(const bam1_t *b, tview_t *tv)
{
- tview_t *tv = (tview_t*)data;
/* If we are restricted to specific readgroups check RG is in the list */
if ( tv->rg_hash )
{
@@ -324,7 +303,11 @@ int base_draw_aln(tview_t *tv, int tid, int pos)
}
// draw aln
bam_lplbuf_reset(tv->lplbuf);
- sam_fetch(tv->fp, tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol, tv, tv_fetch_func);
+ hts_itr_t *iter = sam_itr_queryi(tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol);
+ bam1_t *b = bam_init1();
+ while (sam_itr_next(tv->fp, iter, b) >= 0) tv_push_aln(b, tv);
+ bam_destroy1(b);
+ hts_itr_destroy(iter);
bam_lplbuf_push(0, tv->lplbuf);
while (tv->ccol < tv->mcol) {
@@ -349,6 +332,7 @@ static void error(const char *format, ...)
" -d display output as (H)tml or (C)urses or (T)ext \n"
" -p chr:pos go directly to this position\n"
" -s STR display only reads from this sample or group\n");
+ sam_global_opt_help(pysamerr, "-.--.");
}
else
{
@@ -361,17 +345,27 @@ static void error(const char *format, ...)
}
enum dipsay_mode {display_ncurses,display_html,display_text};
-extern tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples);
-extern tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples);
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples);
+extern tview_t* curses_tv_init(const char *fn, const char *fn_fa,
+ const char *samples, const htsFormat *fmt);
+extern tview_t* html_tv_init(const char *fn, const char *fn_fa,
+ const char *samples, const htsFormat *fmt);
+extern tview_t* text_tv_init(const char *fn, const char *fn_fa,
+ const char *samples, const htsFormat *fmt);
int bam_tview_main(int argc, char *argv[])
{
int view_mode=display_ncurses;
tview_t* tv=NULL;
- char *samples=NULL, *position=NULL;
+ char *samples=NULL, *position=NULL, *ref;
int c;
- while ((c = getopt(argc, argv, "s:p:d:")) >= 0) {
+
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ((c = getopt_long(argc, argv, "s:p:d:", lopts, NULL)) >= 0) {
switch (c) {
case 's': samples=optarg; break;
case 'p': position=optarg; break;
@@ -386,28 +380,28 @@ int bam_tview_main(int argc, char *argv[])
}
break;
}
- default: error(NULL);
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': error(NULL);
}
}
if (argc==optind) error(NULL);
+ ref = (optind+1>=argc)? ga.reference : argv[optind+1];
+
switch(view_mode)
{
case display_ncurses:
- {
- tv = curses_tv_init(argv[optind], (optind+1>=argc)? 0 : argv[optind+1], samples);
+ tv = curses_tv_init(argv[optind], ref, samples, &ga.in);
break;
- }
+
case display_text:
- {
- tv = text_tv_init(argv[optind], (optind+1>=argc)? 0 : argv[optind+1], samples);
+ tv = text_tv_init(argv[optind], ref, samples, &ga.in);
break;
- }
+
case display_html:
- {
- tv = html_tv_init(argv[optind], (optind+1>=argc)? 0 : argv[optind+1], samples);
+ tv = html_tv_init(argv[optind], ref, samples, &ga.in);
break;
- }
}
if (tv==NULL)
{
@@ -418,7 +412,9 @@ int bam_tview_main(int argc, char *argv[])
if ( position )
{
int tid, beg, end;
- *(char *)hts_parse_reg(position, &beg, &end) = '\0';
+ char *name_lim = (char *) hts_parse_reg(position, &beg, &end);
+ if (name_lim) *name_lim = '\0';
+ else beg = 0; // region parsing failed, but possibly a seq named "foo:a"
tid = bam_name2id(tv->header, position);
if (tid >= 0) { tv->curr_tid = tid; tv->left_pos = beg; }
}
diff --git a/samtools/bam_tview.h b/samtools/bam_tview.h
index 305e69f..e11e39d 100644
--- a/samtools/bam_tview.h
+++ b/samtools/bam_tview.h
@@ -53,7 +53,8 @@ typedef struct AbstractTview {
faidx_t* fai;
bcf_callaux_t* bca;
- int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name;
+ int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins;
+ int no_skip, show_name, inverse;
char *ref;
/* maps @RG ID => SM (sample), in practice only used to determine whether a particular RG is in the list of allowed ones */
khash_t(kh_rg) *rg_hash;
@@ -89,7 +90,8 @@ char bam_aux_getCQi(bam1_t *b, int i);
#define TV_BASE_COLOR_SPACE 1
int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
-int base_tv_init(tview_t*,const char *fn, const char *fn_fa, const char *samples);
+int base_tv_init(tview_t*,const char *fn, const char *fn_fa,
+ const char *samples, const htsFormat *fmt);
void base_tv_destroy(tview_t*);
int base_draw_aln(tview_t *tv, int tid, int pos);
diff --git a/samtools/bam_tview_curses.c b/samtools/bam_tview_curses.c
index 3b3db4f..d7edfe8 100644
--- a/samtools/bam_tview_curses.c
+++ b/samtools/bam_tview_curses.c
@@ -23,38 +23,29 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
-#undef _HAVE_CURSES
-
-#if _CURSES_LIB == 0
-#elif _CURSES_LIB == 1
-#include <curses.h>
-#ifndef NCURSES_VERSION
-#warning "_CURSES_LIB=1 but NCURSES_VERSION not defined; tview is NOT compiled"
-#else
-#define _HAVE_CURSES
-#endif
-#elif _CURSES_LIB == 2
-#include <xcurses.h>
-#define _HAVE_CURSES
-#else
-#warning "_CURSES_LIB is not 0, 1 or 2; tview is NOT compiled"
-#endif
-
+#include <config.h>
#include "bam_tview.h"
-#ifdef _HAVE_CURSES
-
-
+#ifdef HAVE_CURSES
+
+#if defined HAVE_NCURSESW_CURSES_H
+#include <ncursesw/curses.h>
+#elif defined HAVE_NCURSESW_H
+#include <ncursesw.h>
+#elif defined HAVE_NCURSES_CURSES_H
+#include <ncurses/curses.h>
+#elif defined HAVE_NCURSES_H
+#include <ncurses.h>
+#elif defined HAVE_CURSES_H
+#include <curses.h>
+#endif
typedef struct CursesTview {
tview_t view;
WINDOW *wgoto, *whelp;
} curses_tview_t;
-
-
-
#define FROM_TV(ptr) ((curses_tview_t*)ptr)
static void curses_destroy(tview_t* base)
@@ -110,6 +101,33 @@ static void curses_clear(struct AbstractTview* tv)
clear();
}
+static int curses_init_colors(int inverse)
+{
+ if (inverse) {
+ init_pair(1, COLOR_WHITE, COLOR_BLUE);
+ init_pair(2, COLOR_BLACK, COLOR_GREEN);
+ init_pair(3, COLOR_BLACK, COLOR_YELLOW);
+ init_pair(4, COLOR_BLACK, COLOR_WHITE);
+ init_pair(5, COLOR_BLACK, COLOR_GREEN);
+ init_pair(6, COLOR_BLACK, COLOR_CYAN);
+ init_pair(7, COLOR_WHITE, COLOR_MAGENTA);
+ init_pair(8, COLOR_WHITE, COLOR_RED);
+ init_pair(9, COLOR_WHITE, COLOR_BLUE);
+ } else {
+ init_pair(1, COLOR_BLUE, COLOR_BLACK);
+ init_pair(2, COLOR_GREEN, COLOR_BLACK);
+ init_pair(3, COLOR_YELLOW, COLOR_BLACK);
+ init_pair(4, COLOR_WHITE, COLOR_BLACK);
+ init_pair(5, COLOR_GREEN, COLOR_BLACK);
+ init_pair(6, COLOR_CYAN, COLOR_BLACK);
+ init_pair(7, COLOR_MAGENTA, COLOR_BLACK);
+ init_pair(8, COLOR_RED, COLOR_BLACK);
+ init_pair(9, COLOR_BLUE, COLOR_BLACK);
+ }
+
+ return 0;
+}
+
static int curses_colorpair(struct AbstractTview* tv,int flag)
{
return COLOR_PAIR(flag);
@@ -145,10 +163,17 @@ static void tv_win_goto(curses_tview_t *tv, int *tid, int *pos)
}
} else {
char *name_lim = (char *) hts_parse_reg(str, &_beg, &_end);
- char name_terminator = *name_lim;
- *name_lim = '\0';
- _tid = bam_name2id(base->header, str);
- *name_lim = name_terminator;
+ if (name_lim) {
+ char name_terminator = *name_lim;
+ *name_lim = '\0';
+ _tid = bam_name2id(base->header, str);
+ *name_lim = name_terminator;
+ }
+ else {
+ // Unparsable region, but possibly a sequence named "foo:a"
+ _tid = bam_name2id(base->header, str);
+ _beg = 0;
+ }
if (_tid >= 0) {
*tid = _tid; *pos = _beg;
@@ -199,6 +224,7 @@ static void tv_win_help(curses_tview_t *tv) {
mvwprintw(win, r++, 2, "N Turn on nt view");
mvwprintw(win, r++, 2, "C Turn on cs view");
mvwprintw(win, r++, 2, "i Toggle on/off ins");
+ mvwprintw(win, r++, 2, "v Inverse video");
mvwprintw(win, r++, 2, "q Exit");
r++;
mvwprintw(win, r++, 2, "Underline: Secondary or orphan");
@@ -231,6 +257,7 @@ static int curses_loop(tview_t* tv)
case 'n': tv->color_for = TV_COLOR_NUCL; break;
case 'c': tv->color_for = TV_COLOR_COL; break;
case 'z': tv->color_for = TV_COLOR_COLQ; break;
+ case 'v': curses_init_colors(tv->inverse = !tv->inverse); break;
case 's': tv->no_skip = !tv->no_skip; break;
case 'r': tv->show_name = !tv->show_name; break;
case KEY_LEFT:
@@ -268,7 +295,8 @@ end_loop:
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples)
+tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples,
+ const htsFormat *fmt)
{
curses_tview_t *tv = (curses_tview_t*)calloc(1, sizeof(curses_tview_t));
tview_t* base=(tview_t*)tv;
@@ -278,7 +306,7 @@ tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples)
return 0;
}
- base_tv_init(base,fn,fn_fa,samples);
+ base_tv_init(base,fn,fn_fa,samples,fmt);
/* initialize callbacks */
#define SET_CALLBACK(fun) base->my_##fun=curses_##fun;
SET_CALLBACK(destroy);
@@ -301,32 +329,24 @@ tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples)
getmaxyx(stdscr, base->mrow, base->mcol);
tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5);
- tv->whelp = newwin(29, 40, 5, 5);
+ tv->whelp = newwin(30, 40, 5, 5);
start_color();
- init_pair(1, COLOR_BLUE, COLOR_BLACK);
- init_pair(2, COLOR_GREEN, COLOR_BLACK);
- init_pair(3, COLOR_YELLOW, COLOR_BLACK);
- init_pair(4, COLOR_WHITE, COLOR_BLACK);
- init_pair(5, COLOR_GREEN, COLOR_BLACK);
- init_pair(6, COLOR_CYAN, COLOR_BLACK);
- init_pair(7, COLOR_YELLOW, COLOR_BLACK);
- init_pair(8, COLOR_RED, COLOR_BLACK);
- init_pair(9, COLOR_BLUE, COLOR_BLACK);
+ curses_init_colors(0);
return base;
}
+#else // !HAVE_CURSES
-#else // #ifdef _HAVE_CURSES
-#include <stdio.h>
#warning "No curses library is available; tview with curses is disabled."
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples);
+extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples,
+ const htsFormat *fmt);
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples)
+tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples,
+ const htsFormat *fmt)
{
- return text_tv_init(fn,fn_fa,samples);
+ return text_tv_init(fn,fn_fa,samples,fmt);
}
-#endif // #ifdef _HAVE_CURSES
-
+#endif
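
Both the goto window above and the -p handling in bam_tview.c now guard against hts_parse_reg() failing to parse a region, falling back to treating the whole string as a sequence name. A stand-alone sketch of that defensive pattern (it assumes an already-loaded bam_hdr_t; lookup_region is a hypothetical helper name used only here):

    #include <htslib/hts.h>
    #include <htslib/sam.h>

    /* Resolve "chr:beg-end" (or a bare sequence name, even one containing
     * a colon) to a target id and start position.  Mirrors the NULL check
     * added around hts_parse_reg() in bam_tview*.c. */
    static int lookup_region(bam_hdr_t *h, char *str, int *tid, int *beg)
    {
        int b = 0, e = 0;
        char *name_lim = (char *) hts_parse_reg(str, &b, &e);
        if (name_lim) {
            char saved = *name_lim;      /* temporarily terminate the name */
            *name_lim = '\0';
            *tid = bam_name2id(h, str);
            *name_lim = saved;
        } else {
            /* Unparsable as a region, but possibly a sequence named "foo:a" */
            *tid = bam_name2id(h, str);
            b = 0;
        }
        *beg = b;
        return (*tid >= 0) ? 0 : -1;
    }
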
diff --git a/samtools/bam_tview_curses.c.pysam.c b/samtools/bam_tview_curses.c.pysam.c
index 90f3673..bbeedf8 100644
--- a/samtools/bam_tview_curses.c.pysam.c
+++ b/samtools/bam_tview_curses.c.pysam.c
@@ -25,38 +25,29 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
-#undef _HAVE_CURSES
-
-#if _CURSES_LIB == 0
-#elif _CURSES_LIB == 1
-#include <curses.h>
-#ifndef NCURSES_VERSION
-#warning "_CURSES_LIB=1 but NCURSES_VERSION not defined; tview is NOT compiled"
-#else
-#define _HAVE_CURSES
-#endif
-#elif _CURSES_LIB == 2
-#include <xcurses.h>
-#define _HAVE_CURSES
-#else
-#warning "_CURSES_LIB is not 0, 1 or 2; tview is NOT compiled"
-#endif
-
+#include <config.h>
#include "bam_tview.h"
-#ifdef _HAVE_CURSES
-
-
+#ifdef HAVE_CURSES
+
+#if defined HAVE_NCURSESW_CURSES_H
+#include <ncursesw/curses.h>
+#elif defined HAVE_NCURSESW_H
+#include <ncursesw.h>
+#elif defined HAVE_NCURSES_CURSES_H
+#include <ncurses/curses.h>
+#elif defined HAVE_NCURSES_H
+#include <ncurses.h>
+#elif defined HAVE_CURSES_H
+#include <curses.h>
+#endif
typedef struct CursesTview {
tview_t view;
WINDOW *wgoto, *whelp;
} curses_tview_t;
-
-
-
#define FROM_TV(ptr) ((curses_tview_t*)ptr)
static void curses_destroy(tview_t* base)
@@ -112,6 +103,33 @@ static void curses_clear(struct AbstractTview* tv)
clear();
}
+static int curses_init_colors(int inverse)
+{
+ if (inverse) {
+ init_pair(1, COLOR_WHITE, COLOR_BLUE);
+ init_pair(2, COLOR_BLACK, COLOR_GREEN);
+ init_pair(3, COLOR_BLACK, COLOR_YELLOW);
+ init_pair(4, COLOR_BLACK, COLOR_WHITE);
+ init_pair(5, COLOR_BLACK, COLOR_GREEN);
+ init_pair(6, COLOR_BLACK, COLOR_CYAN);
+ init_pair(7, COLOR_WHITE, COLOR_MAGENTA);
+ init_pair(8, COLOR_WHITE, COLOR_RED);
+ init_pair(9, COLOR_WHITE, COLOR_BLUE);
+ } else {
+ init_pair(1, COLOR_BLUE, COLOR_BLACK);
+ init_pair(2, COLOR_GREEN, COLOR_BLACK);
+ init_pair(3, COLOR_YELLOW, COLOR_BLACK);
+ init_pair(4, COLOR_WHITE, COLOR_BLACK);
+ init_pair(5, COLOR_GREEN, COLOR_BLACK);
+ init_pair(6, COLOR_CYAN, COLOR_BLACK);
+ init_pair(7, COLOR_MAGENTA, COLOR_BLACK);
+ init_pair(8, COLOR_RED, COLOR_BLACK);
+ init_pair(9, COLOR_BLUE, COLOR_BLACK);
+ }
+
+ return 0;
+}
+
static int curses_colorpair(struct AbstractTview* tv,int flag)
{
return COLOR_PAIR(flag);
@@ -147,10 +165,17 @@ static void tv_win_goto(curses_tview_t *tv, int *tid, int *pos)
}
} else {
char *name_lim = (char *) hts_parse_reg(str, &_beg, &_end);
- char name_terminator = *name_lim;
- *name_lim = '\0';
- _tid = bam_name2id(base->header, str);
- *name_lim = name_terminator;
+ if (name_lim) {
+ char name_terminator = *name_lim;
+ *name_lim = '\0';
+ _tid = bam_name2id(base->header, str);
+ *name_lim = name_terminator;
+ }
+ else {
+ // Unparsable region, but possibly a sequence named "foo:a"
+ _tid = bam_name2id(base->header, str);
+ _beg = 0;
+ }
if (_tid >= 0) {
*tid = _tid; *pos = _beg;
@@ -201,6 +226,7 @@ static void tv_win_help(curses_tview_t *tv) {
mvwprintw(win, r++, 2, "N Turn on nt view");
mvwprintw(win, r++, 2, "C Turn on cs view");
mvwprintw(win, r++, 2, "i Toggle on/off ins");
+ mvwprintw(win, r++, 2, "v Inverse video");
mvwprintw(win, r++, 2, "q Exit");
r++;
mvwprintw(win, r++, 2, "Underline: Secondary or orphan");
@@ -233,6 +259,7 @@ static int curses_loop(tview_t* tv)
case 'n': tv->color_for = TV_COLOR_NUCL; break;
case 'c': tv->color_for = TV_COLOR_COL; break;
case 'z': tv->color_for = TV_COLOR_COLQ; break;
+ case 'v': curses_init_colors(tv->inverse = !tv->inverse); break;
case 's': tv->no_skip = !tv->no_skip; break;
case 'r': tv->show_name = !tv->show_name; break;
case KEY_LEFT:
@@ -270,7 +297,8 @@ end_loop:
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples)
+tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples,
+ const htsFormat *fmt)
{
curses_tview_t *tv = (curses_tview_t*)calloc(1, sizeof(curses_tview_t));
tview_t* base=(tview_t*)tv;
@@ -280,7 +308,7 @@ tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples)
return 0;
}
- base_tv_init(base,fn,fn_fa,samples);
+ base_tv_init(base,fn,fn_fa,samples,fmt);
/* initialize callbacks */
#define SET_CALLBACK(fun) base->my_##fun=curses_##fun;
SET_CALLBACK(destroy);
@@ -303,32 +331,24 @@ tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples)
getmaxyx(stdscr, base->mrow, base->mcol);
tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5);
- tv->whelp = newwin(29, 40, 5, 5);
+ tv->whelp = newwin(30, 40, 5, 5);
start_color();
- init_pair(1, COLOR_BLUE, COLOR_BLACK);
- init_pair(2, COLOR_GREEN, COLOR_BLACK);
- init_pair(3, COLOR_YELLOW, COLOR_BLACK);
- init_pair(4, COLOR_WHITE, COLOR_BLACK);
- init_pair(5, COLOR_GREEN, COLOR_BLACK);
- init_pair(6, COLOR_CYAN, COLOR_BLACK);
- init_pair(7, COLOR_YELLOW, COLOR_BLACK);
- init_pair(8, COLOR_RED, COLOR_BLACK);
- init_pair(9, COLOR_BLUE, COLOR_BLACK);
+ curses_init_colors(0);
return base;
}
+#else // !HAVE_CURSES
-#else // #ifdef _HAVE_CURSES
-#include <stdio.h>
#warning "No curses library is available; tview with curses is disabled."
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples);
+extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples,
+ const htsFormat *fmt);
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples)
+tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples,
+ const htsFormat *fmt)
{
- return text_tv_init(fn,fn_fa,samples);
+ return text_tv_init(fn,fn_fa,samples,fmt);
}
-#endif // #ifdef _HAVE_CURSES
-
+#endif
diff --git a/samtools/bam_tview_html.c b/samtools/bam_tview_html.c
index c71672b..9db8fce 100644
--- a/samtools/bam_tview_html.c
+++ b/samtools/bam_tview_html.c
@@ -312,7 +312,8 @@ static void init_pair(html_tview_t *tv,int id_ge_1, const char* pen, const char*
}
*/
-tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples)
+tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples,
+ const htsFormat *fmt)
{
char* colstr=getenv("COLUMNS");
html_tview_t *tv = (html_tview_t*)calloc(1, sizeof(html_tview_t));
@@ -326,7 +327,7 @@ tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples)
tv->screen=NULL;
tv->out=stdout;
tv->attributes=0;
- base_tv_init(base,fn,fn_fa,samples);
+ base_tv_init(base,fn,fn_fa,samples,fmt);
/* initialize callbacks */
#define SET_CALLBACK(fun) base->my_##fun=html_##fun;
SET_CALLBACK(destroy);
@@ -364,9 +365,10 @@ tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples)
}
-tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples)
+tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples,
+ const htsFormat *fmt)
{
- tview_t* tv=html_tv_init(fn,fn_fa,samples);
+ tview_t* tv=html_tv_init(fn,fn_fa,samples,fmt);
tv->my_drawaln=text_drawaln;
return tv;
}
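
text_tv_init() above reuses the HTML tview and overrides a single callback, which works because tview_t carries its operations as function pointers (filled in via the SET_CALLBACK macro). A generic, self-contained illustration of that pattern, deliberately unrelated to the real tview types (viewer_t, html_init and text_init are invented names):

    #include <stdio.h>

    /* Generic sketch of the function-pointer "vtable" style used by tview_t:
     * a base initialiser fills the callbacks, a variant overrides one. */
    typedef struct viewer {
        void (*drawaln)(struct viewer *v);
        void (*destroy)(struct viewer *v);
    } viewer_t;

    static void html_drawaln(viewer_t *v)   { (void)v; puts("html output"); }
    static void text_drawaln(viewer_t *v)   { (void)v; puts("text output"); }
    static void generic_destroy(viewer_t *v){ (void)v; }

    static void html_init(viewer_t *v)
    {
        v->drawaln = html_drawaln;
        v->destroy = generic_destroy;
    }

    static void text_init(viewer_t *v)
    {
        html_init(v);                /* start from the HTML behaviour */
        v->drawaln = text_drawaln;   /* ...and override one callback  */
    }

    int main(void)
    {
        viewer_t v;
        text_init(&v);
        v.drawaln(&v);   /* prints: text output */
        v.destroy(&v);
        return 0;
    }
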
diff --git a/samtools/bam_tview_html.c.pysam.c b/samtools/bam_tview_html.c.pysam.c
index e4529a5..b42c737 100644
--- a/samtools/bam_tview_html.c.pysam.c
+++ b/samtools/bam_tview_html.c.pysam.c
@@ -314,7 +314,8 @@ static void init_pair(html_tview_t *tv,int id_ge_1, const char* pen, const char*
}
*/
-tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples)
+tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples,
+ const htsFormat *fmt)
{
char* colstr=getenv("COLUMNS");
html_tview_t *tv = (html_tview_t*)calloc(1, sizeof(html_tview_t));
@@ -328,7 +329,7 @@ tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples)
tv->screen=NULL;
tv->out=stdout;
tv->attributes=0;
- base_tv_init(base,fn,fn_fa,samples);
+ base_tv_init(base,fn,fn_fa,samples,fmt);
/* initialize callbacks */
#define SET_CALLBACK(fun) base->my_##fun=html_##fun;
SET_CALLBACK(destroy);
@@ -366,9 +367,10 @@ tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples)
}
-tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples)
+tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples,
+ const htsFormat *fmt)
{
- tview_t* tv=html_tv_init(fn,fn_fa,samples);
+ tview_t* tv=html_tv_init(fn,fn_fa,samples,fmt);
tv->my_drawaln=text_drawaln;
return tv;
}
diff --git a/samtools/bamshuf.c b/samtools/bamshuf.c
index 1443eb2..ac97bb8 100644
--- a/samtools/bamshuf.c
+++ b/samtools/bamshuf.c
@@ -1,4 +1,4 @@
-/* bamshuf.c -- bamshuf subcommand.
+/* bamshuf.c -- collate subcommand.
Copyright (C) 2012 Broad Institute.
Copyright (C) 2013 Genome Research Ltd.
@@ -29,9 +29,10 @@ DEALINGS IN THE SOFTWARE. */
#include <string.h>
#include <assert.h>
#include "htslib/sam.h"
-#include "htslib/bgzf.h"
+#include "htslib/hts.h"
#include "htslib/ksort.h"
#include "samtools.h"
+#include "sam_opts.h"
#define DEF_CLEVEL 1
@@ -73,9 +74,10 @@ static inline int elem_lt(elem_t x, elem_t y)
KSORT_INIT(bamshuf, elem_t, elem_lt)
-static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, int is_stdout)
+static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
+ int is_stdout, sam_global_args *ga)
{
- BGZF *fp, *fpw, **fpt;
+ samFile *fp, *fpw, **fpt;
char **fnt, modew[8];
bam1_t *b;
int i, l;
@@ -83,101 +85,129 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, int
int64_t *cnt;
// split
- fp = strcmp(fn, "-")? bgzf_open(fn, "r") : bgzf_dopen(fileno(stdin), "r");
+ fp = sam_open_format(fn, "r", &ga->in);
if (fp == NULL) {
- print_error_errno("Cannot open input file \"%s\"", fn);
+ print_error_errno("collate", "Cannot open input file \"%s\"", fn);
return 1;
}
- h = bam_hdr_read(fp);
+ h = sam_hdr_read(fp);
+ if (h == NULL) {
+ fprintf(stderr, "Couldn't read header for '%s'\n", fn);
+ return 1;
+ }
fnt = (char**)calloc(n_files, sizeof(char*));
- fpt = (BGZF**)calloc(n_files, sizeof(BGZF*));
+ fpt = (samFile**)calloc(n_files, sizeof(samFile*));
cnt = (int64_t*)calloc(n_files, 8);
l = strlen(pre);
+
for (i = 0; i < n_files; ++i) {
fnt[i] = (char*)calloc(l + 10, 1);
sprintf(fnt[i], "%s.%.4d.bam", pre, i);
- fpt[i] = bgzf_open(fnt[i], "w1");
+ fpt[i] = sam_open(fnt[i], "wb1");
if (fpt[i] == NULL) {
- print_error_errno("Cannot open intermediate file \"%s\"", fnt[i]);
+ print_error_errno("collate", "Cannot open intermediate file \"%s\"", fnt[i]);
return 1;
}
- bam_hdr_write(fpt[i], h);
+ sam_hdr_write(fpt[i], h);
}
b = bam_init1();
- while (bam_read1(fp, b) >= 0) {
+ while (sam_read1(fp, h, b) >= 0) {
uint32_t x;
x = hash_X31_Wang(bam_get_qname(b)) % n_files;
- bam_write1(fpt[x], b);
+ sam_write1(fpt[x], h, b);
++cnt[x];
}
bam_destroy1(b);
- for (i = 0; i < n_files; ++i) bgzf_close(fpt[i]);
+ for (i = 0; i < n_files; ++i) sam_close(fpt[i]);
free(fpt);
- bgzf_close(fp);
+ sam_close(fp);
+
// merge
- sprintf(modew, "w%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
+ sprintf(modew, "wb%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
if (!is_stdout) { // output to a file
char *fnw = (char*)calloc(l + 5, 1);
- sprintf(fnw, "%s.bam", pre);
- fpw = bgzf_open(fnw, modew);
+ if (ga->out.format == unknown_format)
+ sprintf(fnw, "%s.bam", pre); // "wb" above makes BAM the default
+ else
+ sprintf(fnw, "%s.%s", pre, hts_format_file_extension(&ga->out));
+ fpw = sam_open_format(fnw, modew, &ga->out);
free(fnw);
- } else fpw = bgzf_dopen(fileno(stdout), modew); // output to stdout
+ } else fpw = sam_open_format("-", modew, &ga->out); // output to stdout
if (fpw == NULL) {
- if (is_stdout) print_error_errno("Cannot open standard output");
- else print_error_errno("Cannot open output file \"%s.bam\"", pre);
+ if (is_stdout) print_error_errno("collate", "Cannot open standard output");
+ else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre);
return 1;
}
- bam_hdr_write(fpw, h);
- bam_hdr_destroy(h);
+ sam_hdr_write(fpw, h);
for (i = 0; i < n_files; ++i) {
int64_t j, c = cnt[i];
elem_t *a;
- fp = bgzf_open(fnt[i], "r");
- bam_hdr_destroy(bam_hdr_read(fp));
+ fp = sam_open_format(fnt[i], "r", &ga->in);
+ bam_hdr_destroy(sam_hdr_read(fp));
a = (elem_t*)calloc(c, sizeof(elem_t));
for (j = 0; j < c; ++j) {
a[j].b = bam_init1();
- assert(bam_read1(fp, a[j].b) >= 0);
+ sam_read1(fp, h, a[j].b);
a[j].key = hash_X31_Wang(bam_get_qname(a[j].b));
}
- bgzf_close(fp);
+ sam_close(fp);
unlink(fnt[i]);
free(fnt[i]);
ks_introsort(bamshuf, c, a);
for (j = 0; j < c; ++j) {
- bam_write1(fpw, a[j].b);
+ sam_write1(fpw, h, a[j].b);
bam_destroy1(a[j].b);
}
free(a);
}
- bgzf_close(fpw);
+ sam_close(fpw);
+ bam_hdr_destroy(h);
free(fnt); free(cnt);
+ sam_global_args_free(ga);
+
return 0;
}
+static int usage(FILE *fp, int n_files) {
+ fprintf(fp,
+ "Usage: samtools collate [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n"
+ "Options:\n"
+ " -O output to stdout\n"
+ " -u uncompressed BAM output\n"
+ " -l INT compression level [%d]\n" // DEF_CLEVEL
+ " -n INT number of temporary files [%d]\n", // n_files
+ DEF_CLEVEL, n_files);
+
+ sam_global_opt_help(fp, "-....");
+
+ return 1;
+}
+
int main_bamshuf(int argc, char *argv[])
{
int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0;
- while ((c = getopt(argc, argv, "n:l:uO")) >= 0) {
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ((c = getopt_long(argc, argv, "n:l:uO", lopts, NULL)) >= 0) {
switch (c) {
case 'n': n_files = atoi(optarg); break;
case 'l': clevel = atoi(optarg); break;
case 'u': is_un = 1; break;
case 'O': is_stdout = 1; break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': return usage(stderr, n_files);
}
}
if (is_un) clevel = 0;
- if (optind + 2 > argc) {
- fprintf(stderr,
-"Usage: samtools bamshuf [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n"
-"Options: -O output to stdout\n"
-" -u uncompressed BAM output\n"
-" -l INT compression level [%d]\n" // DEF_CLEVEL
-" -n INT number of temporary files [%d]\n", // n_files
- DEF_CLEVEL, n_files);
- return 1;
- }
- return bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout);
+ if (optind + 2 > argc)
+ return usage(stderr, n_files);
+
+ return bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout, &ga);
}
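
The conversion above swaps the low-level BGZF calls (bgzf_open/bam_read1/bam_write1) for the format-aware samFile layer, which is what lets collate read SAM, BAM or CRAM and honour the new shared input/output format options. The core read/write pattern, as a stand-alone sketch (copy_alignments() is illustrative; error handling is reduced to a minimum):

    #include <htslib/sam.h>

    /* Copy records from one alignment file to another via the sam_* API. */
    static int copy_alignments(const char *in_fn, const char *out_fn)
    {
        samFile *in  = sam_open(in_fn, "r");
        samFile *out = sam_open(out_fn, "wb1");     /* BAM, compression level 1 */
        if (!in || !out) return 1;

        bam_hdr_t *h = sam_hdr_read(in);
        if (!h || sam_hdr_write(out, h) < 0) return 1;

        bam1_t *b = bam_init1();
        while (sam_read1(in, h, b) >= 0)            /* <0 means EOF or error */
            if (sam_write1(out, h, b) < 0) break;

        bam_destroy1(b);
        bam_hdr_destroy(h);
        sam_close(in);
        sam_close(out);
        return 0;
    }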
diff --git a/samtools/bamshuf.c.pysam.c b/samtools/bamshuf.c.pysam.c
index ccc7bb1..d17cf9b 100644
--- a/samtools/bamshuf.c.pysam.c
+++ b/samtools/bamshuf.c.pysam.c
@@ -1,6 +1,6 @@
#include "pysam.h"
-/* bamshuf.c -- bamshuf subcommand.
+/* bamshuf.c -- collate subcommand.
Copyright (C) 2012 Broad Institute.
Copyright (C) 2013 Genome Research Ltd.
@@ -31,9 +31,10 @@ DEALINGS IN THE SOFTWARE. */
#include <string.h>
#include <assert.h>
#include "htslib/sam.h"
-#include "htslib/bgzf.h"
+#include "htslib/hts.h"
#include "htslib/ksort.h"
#include "samtools.h"
+#include "sam_opts.h"
#define DEF_CLEVEL 1
@@ -75,9 +76,10 @@ static inline int elem_lt(elem_t x, elem_t y)
KSORT_INIT(bamshuf, elem_t, elem_lt)
-static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, int is_stdout)
+static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
+ int is_stdout, sam_global_args *ga)
{
- BGZF *fp, *fpw, **fpt;
+ samFile *fp, *fpw, **fpt;
char **fnt, modew[8];
bam1_t *b;
int i, l;
@@ -85,101 +87,129 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, int
int64_t *cnt;
// split
- fp = strcmp(fn, "-")? bgzf_open(fn, "r") : bgzf_dopen(fileno(stdin), "r");
+ fp = sam_open_format(fn, "r", &ga->in);
if (fp == NULL) {
- print_error_errno("Cannot open input file \"%s\"", fn);
+ print_error_errno("collate", "Cannot open input file \"%s\"", fn);
return 1;
}
- h = bam_hdr_read(fp);
+ h = sam_hdr_read(fp);
+ if (h == NULL) {
+ fprintf(pysamerr, "Couldn't read header for '%s'\n", fn);
+ return 1;
+ }
fnt = (char**)calloc(n_files, sizeof(char*));
- fpt = (BGZF**)calloc(n_files, sizeof(BGZF*));
+ fpt = (samFile**)calloc(n_files, sizeof(samFile*));
cnt = (int64_t*)calloc(n_files, 8);
l = strlen(pre);
+
for (i = 0; i < n_files; ++i) {
fnt[i] = (char*)calloc(l + 10, 1);
sprintf(fnt[i], "%s.%.4d.bam", pre, i);
- fpt[i] = bgzf_open(fnt[i], "w1");
+ fpt[i] = sam_open(fnt[i], "wb1");
if (fpt[i] == NULL) {
- print_error_errno("Cannot open intermediate file \"%s\"", fnt[i]);
+ print_error_errno("collate", "Cannot open intermediate file \"%s\"", fnt[i]);
return 1;
}
- bam_hdr_write(fpt[i], h);
+ sam_hdr_write(fpt[i], h);
}
b = bam_init1();
- while (bam_read1(fp, b) >= 0) {
+ while (sam_read1(fp, h, b) >= 0) {
uint32_t x;
x = hash_X31_Wang(bam_get_qname(b)) % n_files;
- bam_write1(fpt[x], b);
+ sam_write1(fpt[x], h, b);
++cnt[x];
}
bam_destroy1(b);
- for (i = 0; i < n_files; ++i) bgzf_close(fpt[i]);
+ for (i = 0; i < n_files; ++i) sam_close(fpt[i]);
free(fpt);
- bgzf_close(fp);
+ sam_close(fp);
+
// merge
- sprintf(modew, "w%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
+ sprintf(modew, "wb%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
if (!is_stdout) { // output to a file
char *fnw = (char*)calloc(l + 5, 1);
- sprintf(fnw, "%s.bam", pre);
- fpw = bgzf_open(fnw, modew);
+ if (ga->out.format == unknown_format)
+ sprintf(fnw, "%s.bam", pre); // "wb" above makes BAM the default
+ else
+ sprintf(fnw, "%s.%s", pre, hts_format_file_extension(&ga->out));
+ fpw = sam_open_format(fnw, modew, &ga->out);
free(fnw);
- } else fpw = bgzf_dopen(fileno(stdout), modew); // output to stdout
+ } else fpw = sam_open_format("-", modew, &ga->out); // output to stdout
if (fpw == NULL) {
- if (is_stdout) print_error_errno("Cannot open standard output");
- else print_error_errno("Cannot open output file \"%s.bam\"", pre);
+ if (is_stdout) print_error_errno("collate", "Cannot open standard output");
+ else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre);
return 1;
}
- bam_hdr_write(fpw, h);
- bam_hdr_destroy(h);
+ sam_hdr_write(fpw, h);
for (i = 0; i < n_files; ++i) {
int64_t j, c = cnt[i];
elem_t *a;
- fp = bgzf_open(fnt[i], "r");
- bam_hdr_destroy(bam_hdr_read(fp));
+ fp = sam_open_format(fnt[i], "r", &ga->in);
+ bam_hdr_destroy(sam_hdr_read(fp));
a = (elem_t*)calloc(c, sizeof(elem_t));
for (j = 0; j < c; ++j) {
a[j].b = bam_init1();
- assert(bam_read1(fp, a[j].b) >= 0);
+ sam_read1(fp, h, a[j].b);
a[j].key = hash_X31_Wang(bam_get_qname(a[j].b));
}
- bgzf_close(fp);
+ sam_close(fp);
unlink(fnt[i]);
free(fnt[i]);
ks_introsort(bamshuf, c, a);
for (j = 0; j < c; ++j) {
- bam_write1(fpw, a[j].b);
+ sam_write1(fpw, h, a[j].b);
bam_destroy1(a[j].b);
}
free(a);
}
- bgzf_close(fpw);
+ sam_close(fpw);
+ bam_hdr_destroy(h);
free(fnt); free(cnt);
+ sam_global_args_free(ga);
+
return 0;
}
+static int usage(FILE *fp, int n_files) {
+ fprintf(fp,
+ "Usage: samtools collate [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n"
+ "Options:\n"
+ " -O output to stdout\n"
+ " -u uncompressed BAM output\n"
+ " -l INT compression level [%d]\n" // DEF_CLEVEL
+ " -n INT number of temporary files [%d]\n", // n_files
+ DEF_CLEVEL, n_files);
+
+ sam_global_opt_help(fp, "-....");
+
+ return 1;
+}
+
int main_bamshuf(int argc, char *argv[])
{
int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0;
- while ((c = getopt(argc, argv, "n:l:uO")) >= 0) {
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ((c = getopt_long(argc, argv, "n:l:uO", lopts, NULL)) >= 0) {
switch (c) {
case 'n': n_files = atoi(optarg); break;
case 'l': clevel = atoi(optarg); break;
case 'u': is_un = 1; break;
case 'O': is_stdout = 1; break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': return usage(pysamerr, n_files);
}
}
if (is_un) clevel = 0;
- if (optind + 2 > argc) {
- fprintf(pysamerr,
-"Usage: samtools bamshuf [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n"
-"Options: -O output to stdout\n"
-" -u uncompressed BAM output\n"
-" -l INT compression level [%d]\n" // DEF_CLEVEL
-" -n INT number of temporary files [%d]\n", // n_files
- DEF_CLEVEL, n_files);
- return 1;
- }
- return bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout);
+ if (optind + 2 > argc)
+ return usage(pysamerr, n_files);
+
+ return bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout, &ga);
}
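
Both copies of the file also switch from getopt() to getopt_long() so that the SAM_OPT_GLOBAL_OPTIONS macro can append the shared long options (input/output format, reference, and so on) to the per-command table. The general pattern, with a hand-written option table standing in for the samtools macro (the long option names below are illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <getopt.h>

    int main(int argc, char *argv[])
    {
        int c, n_files = 64, clevel = 1;
        /* Long options mapped onto the same characters as the short ones. */
        static const struct option lopts[] = {
            { "n-files", required_argument, NULL, 'n' },
            { "clevel",  required_argument, NULL, 'l' },
            { NULL, 0, NULL, 0 }
        };

        while ((c = getopt_long(argc, argv, "n:l:", lopts, NULL)) >= 0) {
            switch (c) {
            case 'n': n_files = atoi(optarg); break;
            case 'l': clevel  = atoi(optarg); break;
            default:  return 1;               /* '?' -> print usage and fail */
            }
        }
        printf("n_files=%d clevel=%d\n", n_files, clevel);
        return 0;
    }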
diff --git a/samtools/bamtk.c b/samtools/bamtk.c
new file mode 100644
index 0000000..4b4df77
--- /dev/null
+++ b/samtools/bamtk.c
@@ -0,0 +1,227 @@
+/* bamtk.c -- main samtools command front-end.
+
+ Copyright (C) 2008-2015 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include "htslib/hts.h"
+#include "samtools.h"
+#include "version.h"
+
+int bam_taf2baf(int argc, char *argv[]);
+int bam_mpileup(int argc, char *argv[]);
+int bam_merge(int argc, char *argv[]);
+int bam_index(int argc, char *argv[]);
+int bam_sort(int argc, char *argv[]);
+int bam_tview_main(int argc, char *argv[]);
+int bam_mating(int argc, char *argv[]);
+int bam_rmdup(int argc, char *argv[]);
+int bam_flagstat(int argc, char *argv[]);
+int bam_fillmd(int argc, char *argv[]);
+int bam_idxstats(int argc, char *argv[]);
+int main_samview(int argc, char *argv[]);
+int main_import(int argc, char *argv[]);
+int main_reheader(int argc, char *argv[]);
+int main_cut_target(int argc, char *argv[]);
+int main_phase(int argc, char *argv[]);
+int main_cat(int argc, char *argv[]);
+int main_depth(int argc, char *argv[]);
+int main_bam2fq(int argc, char *argv[]);
+int main_pad2unpad(int argc, char *argv[]);
+int main_bedcov(int argc, char *argv[]);
+int main_bamshuf(int argc, char *argv[]);
+int main_stats(int argc, char *argv[]);
+int main_flags(int argc, char *argv[]);
+int main_split(int argc, char *argv[]);
+int main_quickcheck(int argc, char *argv[]);
+int main_addreplacerg(int argc, char *argv[]);
+int faidx_main(int argc, char *argv[]);
+int dict_main(int argc, char *argv[]);
+
+const char *samtools_version()
+{
+ return SAMTOOLS_VERSION;
+}
+
+static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
+{
+ fflush(stdout);
+ if (subcommand && *subcommand) fprintf(stderr, "samtools %s: ", subcommand);
+ else fprintf(stderr, "samtools: ");
+ vfprintf(stderr, format, args);
+ if (extra) fprintf(stderr, ": %s\n", extra);
+ else fprintf(stderr, "\n");
+ fflush(stderr);
+}
+
+void print_error(const char *subcommand, const char *format, ...)
+{
+ va_list args;
+ va_start(args, format);
+ vprint_error_core(subcommand, format, args, NULL);
+ va_end(args);
+}
+
+void print_error_errno(const char *subcommand, const char *format, ...)
+{
+ int err = errno;
+ va_list args;
+ va_start(args, format);
+ vprint_error_core(subcommand, format, args, strerror(err));
+ va_end(args);
+}
+
+static void usage(FILE *fp)
+{
+ /* Please improve the grouping */
+
+ fprintf(fp,
+"\n"
+"Program: samtools (Tools for alignments in the SAM format)\n"
+"Version: %s (using htslib %s)\n\n", samtools_version(), hts_version());
+ fprintf(fp,
+"Usage: samtools <command> [options]\n"
+"\n"
+"Commands:\n"
+" -- Indexing\n"
+" dict create a sequence dictionary file\n"
+" faidx index/extract FASTA\n"
+" index index alignment\n"
+"\n"
+" -- Editing\n"
+" calmd recalculate MD/NM tags and '=' bases\n"
+" fixmate fix mate information\n"
+" reheader replace BAM header\n"
+" rmdup remove PCR duplicates\n"
+" targetcut cut fosmid regions (for fosmid pool only)\n"
+" addreplacerg adds or replaces RG tags\n"
+"\n"
+" -- File operations\n"
+" collate shuffle and group alignments by name\n"
+" cat concatenate BAMs\n"
+" merge merge sorted alignments\n"
+" mpileup multi-way pileup\n"
+" sort sort alignment file\n"
+" split splits a file by read group\n"
+" quickcheck quickly check if SAM/BAM/CRAM file appears intact\n"
+" fastq converts a BAM to a FASTQ\n"
+" fasta converts a BAM to a FASTA\n"
+"\n"
+" -- Statistics\n"
+" bedcov read depth per BED region\n"
+" depth compute the depth\n"
+" flagstat simple stats\n"
+" idxstats BAM index stats\n"
+" phase phase heterozygotes\n"
+" stats generate stats (former bamcheck)\n"
+"\n"
+" -- Viewing\n"
+" flags explain BAM flags\n"
+" tview text alignment viewer\n"
+" view SAM<->BAM<->CRAM conversion\n"
+" depad convert padded BAM to unpadded BAM\n"
+"\n");
+#ifdef _WIN32
+ fprintf(fp,
+"Note: The Windows version of SAMtools is mainly designed for read-only\n"
+" operations, such as viewing the alignments and generating the pileup.\n"
+" Binary files generated by the Windows version may be buggy.\n\n");
+#endif
+}
+
+int main(int argc, char *argv[])
+{
+#ifdef _WIN32
+ setmode(fileno(stdout), O_BINARY);
+ setmode(fileno(stdin), O_BINARY);
+#endif
+ if (argc < 2) { usage(stderr); return 1; }
+
+ if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0) {
+ if (argc == 2) { usage(stdout); return 0; }
+
+ // Otherwise change "samtools help COMMAND [...]" to "samtools COMMAND";
+ // main_xyz() functions by convention display the subcommand's usage
+ // when invoked without any arguments.
+ argv++;
+ argc = 2;
+ }
+
+ int ret = 0;
+ if (strcmp(argv[1], "view") == 0) ret = main_samview(argc-1, argv+1);
+ else if (strcmp(argv[1], "import") == 0) ret = main_import(argc-1, argv+1);
+ else if (strcmp(argv[1], "mpileup") == 0) ret = bam_mpileup(argc-1, argv+1);
+ else if (strcmp(argv[1], "merge") == 0) ret = bam_merge(argc-1, argv+1);
+ else if (strcmp(argv[1], "sort") == 0) ret = bam_sort(argc-1, argv+1);
+ else if (strcmp(argv[1], "index") == 0) ret = bam_index(argc-1, argv+1);
+ else if (strcmp(argv[1], "idxstats") == 0) ret = bam_idxstats(argc-1, argv+1);
+ else if (strcmp(argv[1], "faidx") == 0) ret = faidx_main(argc-1, argv+1);
+ else if (strcmp(argv[1], "dict") == 0) ret = dict_main(argc-1, argv+1);
+ else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1);
+ else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1);
+ else if (strcmp(argv[1], "flagstat") == 0) ret = bam_flagstat(argc-1, argv+1);
+ else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1);
+ else if (strcmp(argv[1], "fillmd") == 0) ret = bam_fillmd(argc-1, argv+1);
+ else if (strcmp(argv[1], "reheader") == 0) ret = main_reheader(argc-1, argv+1);
+ else if (strcmp(argv[1], "cat") == 0) ret = main_cat(argc-1, argv+1);
+ else if (strcmp(argv[1], "targetcut") == 0) ret = main_cut_target(argc-1, argv+1);
+ else if (strcmp(argv[1], "phase") == 0) ret = main_phase(argc-1, argv+1);
+ else if (strcmp(argv[1], "depth") == 0) ret = main_depth(argc-1, argv+1);
+ else if (strcmp(argv[1], "bam2fq") == 0 ||
+ strcmp(argv[1], "fastq") == 0 ||
+ strcmp(argv[1], "fasta") == 0) ret = main_bam2fq(argc-1, argv+1);
+ else if (strcmp(argv[1], "pad2unpad") == 0) ret = main_pad2unpad(argc-1, argv+1);
+ else if (strcmp(argv[1], "depad") == 0) ret = main_pad2unpad(argc-1, argv+1);
+ else if (strcmp(argv[1], "bedcov") == 0) ret = main_bedcov(argc-1, argv+1);
+ else if (strcmp(argv[1], "bamshuf") == 0) ret = main_bamshuf(argc-1, argv+1);
+ else if (strcmp(argv[1], "collate") == 0) ret = main_bamshuf(argc-1, argv+1);
+ else if (strcmp(argv[1], "stats") == 0) ret = main_stats(argc-1, argv+1);
+ else if (strcmp(argv[1], "flags") == 0) ret = main_flags(argc-1, argv+1);
+ else if (strcmp(argv[1], "split") == 0) ret = main_split(argc-1, argv+1);
+ else if (strcmp(argv[1], "quickcheck") == 0) ret = main_quickcheck(argc-1, argv+1);
+ else if (strcmp(argv[1], "addreplacerg") == 0) ret = main_addreplacerg(argc-1, argv+1);
+ else if (strcmp(argv[1], "pileup") == 0) {
+ fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
+ return 1;
+ }
+ else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1);
+ else if (strcmp(argv[1], "--version") == 0) {
+ printf(
+"samtools %s\n"
+"Using htslib %s\n"
+"Copyright (C) 2015 Genome Research Ltd.\n",
+ samtools_version(), hts_version());
+ }
+ else if (strcmp(argv[1], "--version-only") == 0) {
+ printf("%s+htslib-%s\n", samtools_version(), hts_version());
+ }
+ else {
+ fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
+ return 1;
+ }
+ return ret;
+}
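
main() above dispatches subcommands through a long if/else strcmp chain; the pysam build (next file) keeps the same body but renames the entry point to samtools_main(). For comparison, an equivalent table-driven dispatcher, a sketch and not what samtools actually ships, looks like:

    #include <stdio.h>
    #include <string.h>

    typedef int (*cmd_fn)(int argc, char *argv[]);

    struct command { const char *name; cmd_fn fn; };

    /* Placeholder handlers standing in for main_samview(), bam_sort(), ... */
    static int demo_view(int argc, char *argv[]) { (void)argc; (void)argv; puts("view"); return 0; }
    static int demo_sort(int argc, char *argv[]) { (void)argc; (void)argv; puts("sort"); return 0; }

    static const struct command commands[] = {
        { "view", demo_view },
        { "sort", demo_sort },
        { NULL, NULL }
    };

    int main(int argc, char *argv[])
    {
        if (argc < 2) { fprintf(stderr, "usage: prog <command>\n"); return 1; }
        for (const struct command *c = commands; c->name; c++)
            if (strcmp(argv[1], c->name) == 0)
                return c->fn(argc - 1, argv + 1);
        fprintf(stderr, "unrecognized command '%s'\n", argv[1]);
        return 1;
    }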
diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c
new file mode 100644
index 0000000..a369810
--- /dev/null
+++ b/samtools/bamtk.c.pysam.c
@@ -0,0 +1,229 @@
+#include "pysam.h"
+
+/* bamtk.c -- main samtools command front-end.
+
+ Copyright (C) 2008-2015 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include "htslib/hts.h"
+#include "samtools.h"
+#include "version.h"
+
+int bam_taf2baf(int argc, char *argv[]);
+int bam_mpileup(int argc, char *argv[]);
+int bam_merge(int argc, char *argv[]);
+int bam_index(int argc, char *argv[]);
+int bam_sort(int argc, char *argv[]);
+int bam_tview_main(int argc, char *argv[]);
+int bam_mating(int argc, char *argv[]);
+int bam_rmdup(int argc, char *argv[]);
+int bam_flagstat(int argc, char *argv[]);
+int bam_fillmd(int argc, char *argv[]);
+int bam_idxstats(int argc, char *argv[]);
+int main_samview(int argc, char *argv[]);
+int main_import(int argc, char *argv[]);
+int main_reheader(int argc, char *argv[]);
+int main_cut_target(int argc, char *argv[]);
+int main_phase(int argc, char *argv[]);
+int main_cat(int argc, char *argv[]);
+int main_depth(int argc, char *argv[]);
+int main_bam2fq(int argc, char *argv[]);
+int main_pad2unpad(int argc, char *argv[]);
+int main_bedcov(int argc, char *argv[]);
+int main_bamshuf(int argc, char *argv[]);
+int main_stats(int argc, char *argv[]);
+int main_flags(int argc, char *argv[]);
+int main_split(int argc, char *argv[]);
+int main_quickcheck(int argc, char *argv[]);
+int main_addreplacerg(int argc, char *argv[]);
+int faidx_main(int argc, char *argv[]);
+int dict_main(int argc, char *argv[]);
+
+const char *samtools_version()
+{
+ return SAMTOOLS_VERSION;
+}
+
+static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
+{
+ fflush(stdout);
+ if (subcommand && *subcommand) fprintf(pysamerr, "samtools %s: ", subcommand);
+ else fprintf(pysamerr, "samtools: ");
+ vfprintf(pysamerr, format, args);
+ if (extra) fprintf(pysamerr, ": %s\n", extra);
+ else fprintf(pysamerr, "\n");
+ fflush(pysamerr);
+}
+
+void print_error(const char *subcommand, const char *format, ...)
+{
+ va_list args;
+ va_start(args, format);
+ vprint_error_core(subcommand, format, args, NULL);
+ va_end(args);
+}
+
+void print_error_errno(const char *subcommand, const char *format, ...)
+{
+ int err = errno;
+ va_list args;
+ va_start(args, format);
+ vprint_error_core(subcommand, format, args, strerror(err));
+ va_end(args);
+}
+
+static void usage(FILE *fp)
+{
+ /* Please improve the grouping */
+
+ fprintf(fp,
+"\n"
+"Program: samtools (Tools for alignments in the SAM format)\n"
+"Version: %s (using htslib %s)\n\n", samtools_version(), hts_version());
+ fprintf(fp,
+"Usage: samtools <command> [options]\n"
+"\n"
+"Commands:\n"
+" -- Indexing\n"
+" dict create a sequence dictionary file\n"
+" faidx index/extract FASTA\n"
+" index index alignment\n"
+"\n"
+" -- Editing\n"
+" calmd recalculate MD/NM tags and '=' bases\n"
+" fixmate fix mate information\n"
+" reheader replace BAM header\n"
+" rmdup remove PCR duplicates\n"
+" targetcut cut fosmid regions (for fosmid pool only)\n"
+" addreplacerg adds or replaces RG tags\n"
+"\n"
+" -- File operations\n"
+" collate shuffle and group alignments by name\n"
+" cat concatenate BAMs\n"
+" merge merge sorted alignments\n"
+" mpileup multi-way pileup\n"
+" sort sort alignment file\n"
+" split splits a file by read group\n"
+" quickcheck quickly check if SAM/BAM/CRAM file appears intact\n"
+" fastq converts a BAM to a FASTQ\n"
+" fasta converts a BAM to a FASTA\n"
+"\n"
+" -- Statistics\n"
+" bedcov read depth per BED region\n"
+" depth compute the depth\n"
+" flagstat simple stats\n"
+" idxstats BAM index stats\n"
+" phase phase heterozygotes\n"
+" stats generate stats (former bamcheck)\n"
+"\n"
+" -- Viewing\n"
+" flags explain BAM flags\n"
+" tview text alignment viewer\n"
+" view SAM<->BAM<->CRAM conversion\n"
+" depad convert padded BAM to unpadded BAM\n"
+"\n");
+#ifdef _WIN32
+ fprintf(fp,
+"Note: The Windows version of SAMtools is mainly designed for read-only\n"
+" operations, such as viewing the alignments and generating the pileup.\n"
+" Binary files generated by the Windows version may be buggy.\n\n");
+#endif
+}
+
+int samtools_main(int argc, char *argv[])
+{
+#ifdef _WIN32
+ setmode(fileno(stdout), O_BINARY);
+ setmode(fileno(stdin), O_BINARY);
+#endif
+
+ if (argc < 2) { usage(pysamerr); return 1; }
+
+ if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0) {
+ if (argc == 2) { usage(stdout); return 0; }
+
+ // Otherwise change "samtools help COMMAND [...]" to "samtools COMMAND";
+ // main_xyz() functions by convention display the subcommand's usage
+ // when invoked without any arguments.
+ argv++;
+ argc = 2;
+ }
+ int ret = 0;
+ if (strcmp(argv[1], "view") == 0) ret = main_samview(argc-1, argv+1);
+ else if (strcmp(argv[1], "import") == 0) ret = main_import(argc-1, argv+1);
+ else if (strcmp(argv[1], "mpileup") == 0) ret = bam_mpileup(argc-1, argv+1);
+ else if (strcmp(argv[1], "merge") == 0) ret = bam_merge(argc-1, argv+1);
+ else if (strcmp(argv[1], "sort") == 0) ret = bam_sort(argc-1, argv+1);
+ else if (strcmp(argv[1], "index") == 0) ret = bam_index(argc-1, argv+1);
+ else if (strcmp(argv[1], "idxstats") == 0) ret = bam_idxstats(argc-1, argv+1);
+ else if (strcmp(argv[1], "faidx") == 0) ret = faidx_main(argc-1, argv+1);
+ else if (strcmp(argv[1], "dict") == 0) ret = dict_main(argc-1, argv+1);
+ else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1);
+ else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1);
+ else if (strcmp(argv[1], "flagstat") == 0) ret = bam_flagstat(argc-1, argv+1);
+ else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1);
+ else if (strcmp(argv[1], "fillmd") == 0) ret = bam_fillmd(argc-1, argv+1);
+ else if (strcmp(argv[1], "reheader") == 0) ret = main_reheader(argc-1, argv+1);
+ else if (strcmp(argv[1], "cat") == 0) ret = main_cat(argc-1, argv+1);
+ else if (strcmp(argv[1], "targetcut") == 0) ret = main_cut_target(argc-1, argv+1);
+ else if (strcmp(argv[1], "phase") == 0) ret = main_phase(argc-1, argv+1);
+ else if (strcmp(argv[1], "depth") == 0) ret = main_depth(argc-1, argv+1);
+ else if (strcmp(argv[1], "bam2fq") == 0 ||
+ strcmp(argv[1], "fastq") == 0 ||
+ strcmp(argv[1], "fasta") == 0) ret = main_bam2fq(argc-1, argv+1);
+ else if (strcmp(argv[1], "pad2unpad") == 0) ret = main_pad2unpad(argc-1, argv+1);
+ else if (strcmp(argv[1], "depad") == 0) ret = main_pad2unpad(argc-1, argv+1);
+ else if (strcmp(argv[1], "bedcov") == 0) ret = main_bedcov(argc-1, argv+1);
+ else if (strcmp(argv[1], "bamshuf") == 0) ret = main_bamshuf(argc-1, argv+1);
+ else if (strcmp(argv[1], "collate") == 0) ret = main_bamshuf(argc-1, argv+1);
+ else if (strcmp(argv[1], "stats") == 0) ret = main_stats(argc-1, argv+1);
+ else if (strcmp(argv[1], "flags") == 0) ret = main_flags(argc-1, argv+1);
+ else if (strcmp(argv[1], "split") == 0) ret = main_split(argc-1, argv+1);
+ else if (strcmp(argv[1], "quickcheck") == 0) ret = main_quickcheck(argc-1, argv+1);
+ else if (strcmp(argv[1], "addreplacerg") == 0) ret = main_addreplacerg(argc-1, argv+1);
+ else if (strcmp(argv[1], "pileup") == 0) {
+ fprintf(pysamerr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
+ return 1;
+ }
+ else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1);
+ else if (strcmp(argv[1], "--version") == 0) {
+ printf(
+"samtools %s\n"
+"Using htslib %s\n"
+"Copyright (C) 2015 Genome Research Ltd.\n",
+ samtools_version(), hts_version());
+ }
+ else if (strcmp(argv[1], "--version-only") == 0) {
+ printf("%s+htslib-%s\n", samtools_version(), hts_version());
+ }
+ else {
+ fprintf(pysamerr, "[main] unrecognized command '%s'\n", argv[1]);
+ return 1;
+ }
+ return ret;
+}
diff --git a/samtools/bedcov.c b/samtools/bedcov.c
index 13e7d2a..e2f0db8 100644
--- a/samtools/bedcov.c
+++ b/samtools/bedcov.c
@@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
#include "htslib/kstring.h"
#include "htslib/sam.h"
+#include "sam_opts.h"
#include "htslib/kseq.h"
KSTREAM_INIT(gzFile, gzread, 16384)
@@ -67,14 +68,27 @@ int main_bedcov(int argc, char *argv[])
int *n_plp, dret, i, n, c, min_mapQ = 0;
int64_t *cnt;
const bam_pileup1_t **plp;
+ int usage = 0;
- while ((c = getopt(argc, argv, "Q:")) >= 0) {
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ((c = getopt_long(argc, argv, "Q:", lopts, NULL)) >= 0) {
switch (c) {
case 'Q': min_mapQ = atoi(optarg); break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': usage = 1; break;
}
+ if (usage) break;
}
- if (optind + 2 > argc) {
- fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n");
+ if (usage || optind + 2 > argc) {
+ fprintf(stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
+ fprintf(stderr, " -Q INT Only count bases of at least INT quality [0]\n");
+ sam_global_opt_help(stderr, "-.--.");
return 1;
}
memset(&str, 0, sizeof(kstring_t));
@@ -84,14 +98,20 @@ int main_bedcov(int argc, char *argv[])
for (i = 0; i < n; ++i) {
aux[i] = calloc(1, sizeof(aux_t));
aux[i]->min_mapQ = min_mapQ;
- aux[i]->fp = sam_open(argv[i+optind+1], "r");
- idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]);
+ aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in);
+ if (aux[i]->fp)
+ idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]);
if (aux[i]->fp == 0 || idx[i] == 0) {
fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
return 2;
}
// TODO bgzf_set_cache_size(aux[i]->fp, 20);
aux[i]->header = sam_hdr_read(aux[i]->fp);
+ if (aux[i]->header == NULL) {
+ fprintf(stderr, "ERROR: failed to read header for '%s'\n",
+ argv[i+optind+1]);
+ return 2;
+ }
}
cnt = calloc(n, 8);
@@ -152,5 +172,6 @@ bed_error:
}
free(aux); free(idx);
free(str.s);
+ sam_global_args_free(&ga);
return 0;
}
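
bedcov now opens each alignment file with sam_open_format(), attempts sam_index_load() only when the open succeeded, and checks that the header could be read. The index is what makes the later per-region pileups possible; the basic open/index/iterate pattern, shown in isolation (count_in_region() is illustrative):

    #include <stdio.h>
    #include <htslib/sam.h>

    /* Count records overlapping a region string such as "chr1:1000-2000".
     * Sketch only; real code should also distinguish EOF from read errors. */
    static int count_in_region(const char *fn, const char *region)
    {
        samFile *fp = sam_open(fn, "r");
        if (!fp) return -1;
        bam_hdr_t *h = sam_hdr_read(fp);
        hts_idx_t *idx = sam_index_load(fp, fn);   /* finds .bai/.csi/.crai */
        if (!h || !idx) { sam_close(fp); return -1; }

        hts_itr_t *itr = sam_itr_querys(idx, h, region);
        bam1_t *b = bam_init1();
        int n = 0;
        while (itr && sam_itr_next(fp, itr, b) >= 0) n++;

        bam_destroy1(b);
        if (itr) hts_itr_destroy(itr);
        hts_idx_destroy(idx);
        bam_hdr_destroy(h);
        sam_close(fp);
        return n;
    }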
diff --git a/samtools/bedcov.c.pysam.c b/samtools/bedcov.c.pysam.c
index 6d20b17..6faa7bf 100644
--- a/samtools/bedcov.c.pysam.c
+++ b/samtools/bedcov.c.pysam.c
@@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
#include "htslib/kstring.h"
#include "htslib/sam.h"
+#include "sam_opts.h"
#include "htslib/kseq.h"
KSTREAM_INIT(gzFile, gzread, 16384)
@@ -69,14 +70,27 @@ int main_bedcov(int argc, char *argv[])
int *n_plp, dret, i, n, c, min_mapQ = 0;
int64_t *cnt;
const bam_pileup1_t **plp;
+ int usage = 0;
- while ((c = getopt(argc, argv, "Q:")) >= 0) {
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ((c = getopt_long(argc, argv, "Q:", lopts, NULL)) >= 0) {
switch (c) {
case 'Q': min_mapQ = atoi(optarg); break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': usage = 1; break;
}
+ if (usage) break;
}
- if (optind + 2 > argc) {
- fprintf(pysamerr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n");
+ if (usage || optind + 2 > argc) {
+ fprintf(pysamerr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
+ fprintf(pysamerr, " -Q INT Only count bases of at least INT quality [0]\n");
+ sam_global_opt_help(pysamerr, "-.--.");
return 1;
}
memset(&str, 0, sizeof(kstring_t));
@@ -86,14 +100,20 @@ int main_bedcov(int argc, char *argv[])
for (i = 0; i < n; ++i) {
aux[i] = calloc(1, sizeof(aux_t));
aux[i]->min_mapQ = min_mapQ;
- aux[i]->fp = sam_open(argv[i+optind+1], "r");
- idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]);
+ aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in);
+ if (aux[i]->fp)
+ idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]);
if (aux[i]->fp == 0 || idx[i] == 0) {
fprintf(pysamerr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
return 2;
}
// TODO bgzf_set_cache_size(aux[i]->fp, 20);
aux[i]->header = sam_hdr_read(aux[i]->fp);
+ if (aux[i]->header == NULL) {
+ fprintf(pysamerr, "ERROR: failed to read header for '%s'\n",
+ argv[i+optind+1]);
+ return 2;
+ }
}
cnt = calloc(n, 8);
@@ -154,5 +174,6 @@ bed_error:
}
free(aux); free(idx);
free(str.s);
+ sam_global_args_free(&ga);
return 0;
}
diff --git a/samtools/cut_target.c b/samtools/cut_target.c
index e160277..56ec9f9 100644
--- a/samtools/cut_target.c
+++ b/samtools/cut_target.c
@@ -1,7 +1,7 @@
/* cut_target.c -- targetcut subcommand.
Copyright (C) 2011 Broad Institute.
- Copyright (C) 2012-2013 Genome Research Ltd.
+ Copyright (C) 2012-2013, 2015 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -26,9 +26,10 @@ DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
-#include "bam.h"
+#include "htslib/sam.h"
#include "errmod.h"
#include "htslib/faidx.h"
+#include "sam_opts.h"
#define ERR_DEP 0.83
@@ -44,16 +45,16 @@ static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} };
typedef struct {
int min_baseQ, tid, max_bases;
uint16_t *bases;
- bamFile fp;
- bam_header_t *h;
+ samFile *fp;
+ bam_hdr_t *h;
char *ref;
+ int len;
faidx_t *fai;
errmod_t *em;
} ct_t;
static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
{
- extern const char bam_nt16_nt4_table[];
int i, j, ret, tmp, k, sum[4], qual;
float q[16];
if (n > g->max_bases) { // enlarge g->bases
@@ -66,15 +67,15 @@ static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
uint8_t *seq;
int q, baseQ, b;
if (p->is_refskip || p->is_del) continue;
- baseQ = bam1_qual(p->b)[p->qpos];
+ baseQ = bam_get_qual(p->b)[p->qpos];
if (baseQ < g->min_baseQ) continue;
- seq = bam1_seq(p->b);
- b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
+ seq = bam_get_seq(p->b);
+ b = seq_nt16_int[bam_seqi(seq, p->qpos)];
if (b > 3) continue;
q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
if (q < 4) q = 4;
if (q > 63) q = 63;
- g->bases[k++] = q<<5 | bam1_strand(p->b)<<4 | b;
+ g->bases[k++] = q<<5 | bam_is_rev(p->b)<<4 | b;
}
if (k == 0) return 0;
errmod_cal(g->em, k, 4, g->bases, q);
@@ -88,7 +89,7 @@ static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
return ret<<8|k;
}
-static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns)
+static void process_cns(bam_hdr_t *h, int tid, int l, uint16_t *cns)
{
int i, f[2][2], *prev, *curr, *swap_tmp, s;
uint8_t *b; // backtrack array
@@ -143,21 +144,21 @@ static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns)
static int read_aln(void *data, bam1_t *b)
{
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag);
+ extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag);
ct_t *g = (ct_t*)data;
- int ret, len;
+ int ret;
while (1)
{
- ret = bam_read1(g->fp, b);
+ ret = sam_read1(g->fp, g->h, b);
if ( ret<0 ) break;
if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
if ( g->fai && b->core.tid >= 0 ) {
if (b->core.tid != g->tid) { // then load the sequence
free(g->ref);
- g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len);
+ g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &g->len);
g->tid = b->core.tid;
}
- bam_prob_realn_core(b, g->ref, 1<<1|1);
+ bam_prob_realn_core(b, g->ref, g->len, 1<<1|1);
}
break;
}
@@ -166,33 +167,49 @@ static int read_aln(void *data, bam1_t *b)
int main_cut_target(int argc, char *argv[])
{
- int c, tid, pos, n, lasttid = -1, l, max_l;
+ int c, tid, pos, n, lasttid = -1, l, max_l, usage = 0;
const bam_pileup1_t *p;
bam_plp_t plp;
uint16_t *cns;
ct_t g;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f'),
+ { NULL, 0, NULL, 0 }
+ };
+
memset(&g, 0, sizeof(ct_t));
g.min_baseQ = 13; g.tid = -1;
- while ((c = getopt(argc, argv, "f:Q:i:o:0:1:2:")) >= 0) {
+ while ((c = getopt_long(argc, argv, "f:Q:i:o:0:1:2:", lopts, NULL)) >= 0) {
switch (c) {
case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff
case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY
case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE
case '1': g_param.e[1][1] = atoi(optarg); break;
case '2': g_param.e[1][2] = atoi(optarg); break;
- case 'f': g.fai = fai_load(optarg);
- if (g.fai == 0) fprintf(stderr, "[%s] fail to load the fasta index.\n", __func__);
- break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': usage=1; break;
}
}
- if (argc == optind) {
- fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] [-f ref] <in.bam>\n");
+ if (ga.reference) {
+ g.fai = fai_load(ga.reference);
+ if (g.fai == 0) fprintf(stderr, "[%s] fail to load the fasta index.\n", __func__);
+ }
+ if (usage || argc == optind) {
+ fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] <in.bam>\n");
+ sam_global_opt_help(stderr, "-.--f");
return 1;
}
l = max_l = 0; cns = 0;
- g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
- g.h = bam_header_read(g.fp);
+ g.fp = sam_open_format(argv[optind], "r", &ga.in);
+ g.h = sam_hdr_read(g.fp);
+ if (g.h == NULL) {
+ fprintf(stderr, "Couldn't read header for '%s'\n", argv[optind]);
+ sam_close(g.fp);
+ return 1;
+ }
g.em = errmod_init(1. - ERR_DEP);
plp = bam_plp_init(read_aln, &g);
while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) {
@@ -212,13 +229,14 @@ int main_cut_target(int argc, char *argv[])
}
process_cns(g.h, lasttid, l, cns);
free(cns);
- bam_header_destroy(g.h);
+ bam_hdr_destroy(g.h);
bam_plp_destroy(plp);
- bam_close(g.fp);
+ sam_close(g.fp);
if (g.fai) {
fai_destroy(g.fai); free(g.ref);
}
errmod_destroy(g.em);
free(g.bases);
+ sam_global_args_free(&ga);
return 0;
}
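
targetcut now takes the reference through the shared -f/--reference option and caches the current contig's length in g.len, so bam_prob_realn_core() receives the reference length alongside the sequence. The faidx fetch pattern it relies on, in isolation (show_contig_length() and the contig name are illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <htslib/faidx.h>

    /* Fetch one contig from an indexed FASTA (.fai as built by `samtools faidx`). */
    static void show_contig_length(const char *fasta, const char *contig)
    {
        faidx_t *fai = fai_load(fasta);            /* builds the .fai if missing */
        if (!fai) { fprintf(stderr, "failed to load index for %s\n", fasta); return; }

        int len = 0;
        char *seq = fai_fetch(fai, contig, &len);  /* caller frees the sequence */
        if (seq) {
            printf("%s: %d bp\n", contig, len);
            free(seq);
        }
        fai_destroy(fai);
    }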
diff --git a/samtools/cut_target.c.pysam.c b/samtools/cut_target.c.pysam.c
index de6b17c..92b15a0 100644
--- a/samtools/cut_target.c.pysam.c
+++ b/samtools/cut_target.c.pysam.c
@@ -3,7 +3,7 @@
/* cut_target.c -- targetcut subcommand.
Copyright (C) 2011 Broad Institute.
- Copyright (C) 2012-2013 Genome Research Ltd.
+ Copyright (C) 2012-2013, 2015 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -28,9 +28,10 @@ DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
-#include "bam.h"
+#include "htslib/sam.h"
#include "errmod.h"
#include "htslib/faidx.h"
+#include "sam_opts.h"
#define ERR_DEP 0.83
@@ -46,16 +47,16 @@ static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} };
typedef struct {
int min_baseQ, tid, max_bases;
uint16_t *bases;
- bamFile fp;
- bam_header_t *h;
+ samFile *fp;
+ bam_hdr_t *h;
char *ref;
+ int len;
faidx_t *fai;
errmod_t *em;
} ct_t;
static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
{
- extern const char bam_nt16_nt4_table[];
int i, j, ret, tmp, k, sum[4], qual;
float q[16];
if (n > g->max_bases) { // enlarge g->bases
@@ -68,15 +69,15 @@ static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
uint8_t *seq;
int q, baseQ, b;
if (p->is_refskip || p->is_del) continue;
- baseQ = bam1_qual(p->b)[p->qpos];
+ baseQ = bam_get_qual(p->b)[p->qpos];
if (baseQ < g->min_baseQ) continue;
- seq = bam1_seq(p->b);
- b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
+ seq = bam_get_seq(p->b);
+ b = seq_nt16_int[bam_seqi(seq, p->qpos)];
if (b > 3) continue;
q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
if (q < 4) q = 4;
if (q > 63) q = 63;
- g->bases[k++] = q<<5 | bam1_strand(p->b)<<4 | b;
+ g->bases[k++] = q<<5 | bam_is_rev(p->b)<<4 | b;
}
if (k == 0) return 0;
errmod_cal(g->em, k, 4, g->bases, q);
@@ -90,7 +91,7 @@ static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
return ret<<8|k;
}
-static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns)
+static void process_cns(bam_hdr_t *h, int tid, int l, uint16_t *cns)
{
int i, f[2][2], *prev, *curr, *swap_tmp, s;
uint8_t *b; // backtrack array
@@ -145,21 +146,21 @@ static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns)
static int read_aln(void *data, bam1_t *b)
{
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag);
+ extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag);
ct_t *g = (ct_t*)data;
- int ret, len;
+ int ret;
while (1)
{
- ret = bam_read1(g->fp, b);
+ ret = sam_read1(g->fp, g->h, b);
if ( ret<0 ) break;
if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
if ( g->fai && b->core.tid >= 0 ) {
if (b->core.tid != g->tid) { // then load the sequence
free(g->ref);
- g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len);
+ g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &g->len);
g->tid = b->core.tid;
}
- bam_prob_realn_core(b, g->ref, 1<<1|1);
+ bam_prob_realn_core(b, g->ref, g->len, 1<<1|1);
}
break;
}
@@ -168,33 +169,49 @@ static int read_aln(void *data, bam1_t *b)
int main_cut_target(int argc, char *argv[])
{
- int c, tid, pos, n, lasttid = -1, l, max_l;
+ int c, tid, pos, n, lasttid = -1, l, max_l, usage = 0;
const bam_pileup1_t *p;
bam_plp_t plp;
uint16_t *cns;
ct_t g;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f'),
+ { NULL, 0, NULL, 0 }
+ };
+
memset(&g, 0, sizeof(ct_t));
g.min_baseQ = 13; g.tid = -1;
- while ((c = getopt(argc, argv, "f:Q:i:o:0:1:2:")) >= 0) {
+ while ((c = getopt_long(argc, argv, "f:Q:i:o:0:1:2:", lopts, NULL)) >= 0) {
switch (c) {
case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff
case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY
case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE
case '1': g_param.e[1][1] = atoi(optarg); break;
case '2': g_param.e[1][2] = atoi(optarg); break;
- case 'f': g.fai = fai_load(optarg);
- if (g.fai == 0) fprintf(pysamerr, "[%s] fail to load the fasta index.\n", __func__);
- break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': usage=1; break;
}
}
- if (argc == optind) {
- fprintf(pysamerr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] [-f ref] <in.bam>\n");
+ if (ga.reference) {
+ g.fai = fai_load(ga.reference);
+ if (g.fai == 0) fprintf(pysamerr, "[%s] fail to load the fasta index.\n", __func__);
+ }
+ if (usage || argc == optind) {
+ fprintf(pysamerr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] <in.bam>\n");
+ sam_global_opt_help(pysamerr, "-.--f");
return 1;
}
l = max_l = 0; cns = 0;
- g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
- g.h = bam_header_read(g.fp);
+ g.fp = sam_open_format(argv[optind], "r", &ga.in);
+ g.h = sam_hdr_read(g.fp);
+ if (g.h == NULL) {
+ fprintf(pysamerr, "Couldn't read header for '%s'\n", argv[optind]);
+ sam_close(g.fp);
+ return 1;
+ }
g.em = errmod_init(1. - ERR_DEP);
plp = bam_plp_init(read_aln, &g);
while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) {
@@ -214,13 +231,14 @@ int main_cut_target(int argc, char *argv[])
}
process_cns(g.h, lasttid, l, cns);
free(cns);
- bam_header_destroy(g.h);
+ bam_hdr_destroy(g.h);
bam_plp_destroy(plp);
- bam_close(g.fp);
+ sam_close(g.fp);
if (g.fai) {
fai_destroy(g.fai); free(g.ref);
}
errmod_destroy(g.em);
free(g.bases);
+ sam_global_args_free(&ga);
return 0;
}
diff --git a/samtools/dict.c b/samtools/dict.c
new file mode 100644
index 0000000..241d119
--- /dev/null
+++ b/samtools/dict.c
@@ -0,0 +1,151 @@
+/* dict.c -- create a sequence dictionary file.
+
+ Copyright (C) 2015 Genome Research Ltd.
+
+ Author: Shane McCarthy <sm15 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdio.h>
+#include <zlib.h>
+#include <getopt.h>
+#include "htslib/kseq.h"
+#include "htslib/hts.h"
+
+KSEQ_INIT(gzFile, gzread)
+
+typedef struct _args_t
+{
+ char *output_fname, *fname;
+ char *assembly, *species, *uri;
+ int header;
+}
+args_t;
+
+static void write_dict(const char *fn, args_t *args)
+{
+ hts_md5_context *md5;
+ int l, i, k;
+ gzFile fp;
+ kseq_t *seq;
+ unsigned char digest[16];
+ char hex[33];
+
+ fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ if (fp == 0) {
+ fprintf(stderr, "dict: %s: No such file or directory\n", fn);
+ exit(1);
+ }
+ FILE *out = stdout;
+ if (args->output_fname) {
+ out = fopen(args->output_fname, "w");
+ if (out == NULL) {
+ fprintf(stderr, "dict: %s: Cannot open file for writing\n", args->output_fname);
+ exit(1);
+ }
+ }
+
+ if (!(md5 = hts_md5_init()))
+ exit(1);
+
+ seq = kseq_init(fp);
+ if (args->header) fprintf(out, "@HD\tVN:1.0\tSO:unsorted\n");
+ while ((l = kseq_read(seq)) >= 0) {
+ for (i = k = 0; i < seq->seq.l; ++i) {
+ if (islower(seq->seq.s[i])) seq->seq.s[k++] = toupper(seq->seq.s[i]);
+ else if (isupper(seq->seq.s[i])) seq->seq.s[k++] = seq->seq.s[i];
+ }
+ hts_md5_reset(md5);
+ hts_md5_update(md5, (unsigned char*)seq->seq.s, k);
+ hts_md5_final(digest, md5);
+ hts_md5_hex(hex, digest);
+ fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
+ if (args->uri)
+ fprintf(out, "\tUR:%s", args->uri);
+ else if (strcmp(fn, "-") != 0) {
+ char *real_path = realpath(fn, NULL);
+ fprintf(out, "\tUR:file://%s", real_path);
+ free(real_path);
+ }
+ if (args->assembly) fprintf(out, "\tAS:%s", args->assembly);
+ if (args->species) fprintf(out, "\tSP:%s", args->species);
+ fprintf(out, "\n");
+ }
+ kseq_destroy(seq);
+ hts_md5_destroy(md5);
+
+ if (args->output_fname) fclose(out);
+}
+
+static int dict_usage(void)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Create a sequence dictionary file from a fasta file\n");
+ fprintf(stderr, "Usage: samtools dict [options] <file.fa|file.fa.gz>\n\n");
+ fprintf(stderr, "Options: -a, --assembly STR assembly\n");
+ fprintf(stderr, " -H, --no-header do not print @HD line\n");
+ fprintf(stderr, " -o, --output STR file to write out dict file [stdout]\n");
+ fprintf(stderr, " -s, --species STR species\n");
+ fprintf(stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n");
+ fprintf(stderr, "\n");
+ return 1;
+}
+
+int dict_main(int argc, char *argv[])
+{
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->header = 1;
+
+ static const struct option loptions[] =
+ {
+ {"help", no_argument, NULL, 'h'},
+ {"no-header", no_argument, NULL, 'H'},
+ {"assembly", required_argument, NULL, 'a'},
+ {"species", required_argument, NULL, 's'},
+ {"uri", required_argument, NULL, 'u'},
+ {"output", required_argument, NULL, 'o'},
+ {NULL, 0, NULL, 0}
+ };
+ int c;
+ while ( (c=getopt_long(argc,argv,"?hHa:s:u:o:",loptions,NULL))>0 )
+ {
+ switch (c)
+ {
+ case 'a': args->assembly = optarg; break;
+ case 's': args->species = optarg; break;
+ case 'u': args->uri = optarg; break;
+ case 'o': args->output_fname = optarg; break;
+ case 'H': args->header = 0; break;
+ case 'h': return dict_usage();
+ default: return dict_usage();
+ }
+ }
+
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else return dict_usage();
+ }
+ else fname = argv[optind];
+
+ write_dict(fname, args);
+ free(args);
+ return 0;
+}
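
The new dict command upper-cases each sequence in place, hashes it with htslib's MD5 wrapper, and emits an @SQ line carrying LN, M5 and UR tags (plus optional AS/SP). The MD5 helper calls used above, shown on their own (md5_hex_of() is an illustrative wrapper, not part of samtools):

    #include <stdio.h>
    #include <htslib/hts.h>

    /* MD5-hex of a byte buffer via the htslib wrappers used by dict.c. */
    static void md5_hex_of(const unsigned char *data, size_t len, char hex[33])
    {
        unsigned char digest[16];
        hts_md5_context *md5 = hts_md5_init();
        if (!md5) { hex[0] = '\0'; return; }
        hts_md5_reset(md5);                 /* start a fresh digest */
        hts_md5_update(md5, data, len);     /* may be called repeatedly */
        hts_md5_final(digest, md5);
        hts_md5_hex(hex, digest);           /* 32 hex chars + NUL */
        hts_md5_destroy(md5);
    }

    int main(void)
    {
        char hex[33];
        md5_hex_of((const unsigned char *)"ACGT", 4, hex);
        printf("%s\n", hex);
        return 0;
    }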
diff --git a/samtools/dict.c.pysam.c b/samtools/dict.c.pysam.c
new file mode 100644
index 0000000..6b4a25a
--- /dev/null
+++ b/samtools/dict.c.pysam.c
@@ -0,0 +1,153 @@
+#include "pysam.h"
+
+/* dict.c -- create a sequence dictionary file.
+
+ Copyright (C) 2015 Genome Research Ltd.
+
+ Author: Shane McCarthy <sm15 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdio.h>
+#include <zlib.h>
+#include <getopt.h>
+#include "htslib/kseq.h"
+#include "htslib/hts.h"
+
+KSEQ_INIT(gzFile, gzread)
+
+typedef struct _args_t
+{
+ char *output_fname, *fname;
+ char *assembly, *species, *uri;
+ int header;
+}
+args_t;
+
+static void write_dict(const char *fn, args_t *args)
+{
+ hts_md5_context *md5;
+ int l, i, k;
+ gzFile fp;
+ kseq_t *seq;
+ unsigned char digest[16];
+ char hex[33];
+
+ fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ if (fp == 0) {
+ fprintf(pysamerr, "dict: %s: No such file or directory\n", fn);
+ exit(1);
+ }
+ FILE *out = stdout;
+ if (args->output_fname) {
+ out = fopen(args->output_fname, "w");
+ if (out == NULL) {
+ fprintf(pysamerr, "dict: %s: Cannot open file for writing\n", args->output_fname);
+ exit(1);
+ }
+ }
+
+ if (!(md5 = hts_md5_init()))
+ exit(1);
+
+ seq = kseq_init(fp);
+ if (args->header) fprintf(out, "@HD\tVN:1.0\tSO:unsorted\n");
+ while ((l = kseq_read(seq)) >= 0) {
+ for (i = k = 0; i < seq->seq.l; ++i) {
+ if (islower(seq->seq.s[i])) seq->seq.s[k++] = toupper(seq->seq.s[i]);
+ else if (isupper(seq->seq.s[i])) seq->seq.s[k++] = seq->seq.s[i];
+ }
+ hts_md5_reset(md5);
+ hts_md5_update(md5, (unsigned char*)seq->seq.s, k);
+ hts_md5_final(digest, md5);
+ hts_md5_hex(hex, digest);
+ fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
+ if (args->uri)
+ fprintf(out, "\tUR:%s", args->uri);
+ else if (strcmp(fn, "-") != 0) {
+ char *real_path = realpath(fn, NULL);
+ fprintf(out, "\tUR:file://%s", real_path);
+ free(real_path);
+ }
+ if (args->assembly) fprintf(out, "\tAS:%s", args->assembly);
+ if (args->species) fprintf(out, "\tSP:%s", args->species);
+ fprintf(out, "\n");
+ }
+ kseq_destroy(seq);
+ hts_md5_destroy(md5);
+
+ if (args->output_fname) fclose(out);
+}
+
+static int dict_usage(void)
+{
+ fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "About: Create a sequence dictionary file from a fasta file\n");
+ fprintf(pysamerr, "Usage: samtools dict [options] <file.fa|file.fa.gz>\n\n");
+ fprintf(pysamerr, "Options: -a, --assembly STR assembly\n");
+ fprintf(pysamerr, " -H, --no-header do not print @HD line\n");
+ fprintf(pysamerr, " -o, --output STR file to write out dict file [stdout]\n");
+ fprintf(pysamerr, " -s, --species STR species\n");
+ fprintf(pysamerr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n");
+ fprintf(pysamerr, "\n");
+ return 1;
+}
+
+int dict_main(int argc, char *argv[])
+{
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->header = 1;
+
+ static const struct option loptions[] =
+ {
+ {"help", no_argument, NULL, 'h'},
+ {"no-header", no_argument, NULL, 'H'},
+ {"assembly", required_argument, NULL, 'a'},
+ {"species", required_argument, NULL, 's'},
+ {"uri", required_argument, NULL, 'u'},
+ {"output", required_argument, NULL, 'o'},
+ {NULL, 0, NULL, 0}
+ };
+ int c;
+ while ( (c=getopt_long(argc,argv,"?hHa:s:u:o:",loptions,NULL))>0 )
+ {
+ switch (c)
+ {
+ case 'a': args->assembly = optarg; break;
+ case 's': args->species = optarg; break;
+ case 'u': args->uri = optarg; break;
+ case 'o': args->output_fname = optarg; break;
+ case 'H': args->header = 0; break;
+ case 'h': return dict_usage();
+ default: return dict_usage();
+ }
+ }
+
+ char *fname = NULL;
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else return dict_usage();
+ }
+ else fname = argv[optind];
+
+ write_dict(fname, args);
+ free(args);
+ return 0;
+}
diff --git a/samtools/errmod.c b/samtools/errmod.c
index e7759a0..f8b5aa7 100644
--- a/samtools/errmod.c
+++ b/samtools/errmod.c
@@ -67,14 +67,14 @@ static errmod_coef_t *cal_coef(double depcorr, double eta)
// initialize ->fk
ec->fk = (double*)calloc(256, sizeof(double));
ec->fk[0] = 1.0;
- for (n = 1; n != 256; ++n)
+ for (n = 1; n < 256; ++n)
ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta;
// initialize ->coef
ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double));
lC = logbinomial_table( 256 );
- for (q = 1; q != 64; ++q) {
+ for (q = 1; q < 64; ++q) {
double e = pow(10.0, -q/10.0);
double le = log(e);
double le1 = log(1.0 - e);
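
Besides tightening the loop bounds from != to < as a guard against overshooting, these tables encode the usual Phred relationship between a quality value and an error probability, e = 10^(-q/10); for example q = 20 gives e = 0.01 and q = 30 gives e = 0.001. A two-line check:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        for (int q = 10; q <= 40; q += 10)
            printf("Q%-2d -> error probability %g\n", q, pow(10.0, -q / 10.0));
        return 0;
    }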
diff --git a/samtools/errmod.c.pysam.c b/samtools/errmod.c.pysam.c
index db57001..fce3042 100644
--- a/samtools/errmod.c.pysam.c
+++ b/samtools/errmod.c.pysam.c
@@ -69,14 +69,14 @@ static errmod_coef_t *cal_coef(double depcorr, double eta)
// initialize ->fk
ec->fk = (double*)calloc(256, sizeof(double));
ec->fk[0] = 1.0;
- for (n = 1; n != 256; ++n)
+ for (n = 1; n < 256; ++n)
ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta;
// initialize ->coef
ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double));
lC = logbinomial_table( 256 );
- for (q = 1; q != 64; ++q) {
+ for (q = 1; q < 64; ++q) {
double e = pow(10.0, -q/10.0);
double le = log(e);
double le1 = log(1.0 - e);
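
The only change to errmod.c (and its .pysam.c twin) is tightening the loop guards from != to <; the surrounding context is the Phred-to-error-probability conversion e = 10^(-q/10) used to fill the coefficient tables. A tiny standalone illustration of that formula, not part of the commit (compile with -lm):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* Phred quality Q encodes an error probability e = 10^(-Q/10):
         * Q=10 -> 0.1, Q=20 -> 0.01, Q=30 -> 0.001, exactly the value
         * cal_coef() derives before taking log(e) and log(1-e). */
        int q;
        for (q = 10; q <= 40; q += 10)
            printf("Q=%2d  e=%g\n", q, pow(10.0, -q / 10.0));
        return 0;
    }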
diff --git a/samtools/misc/md5.c b/samtools/misc/md5.c
deleted file mode 100644
index 7f1ce0e..0000000
--- a/samtools/misc/md5.c
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * This code implements the MD5 message-digest algorithm.
- * The algorithm is due to Ron Rivest. This code was
- * written by Colin Plumb in 1993, no copyright is claimed.
- * This code is in the public domain; do with it what you wish.
- *
- * Equivalent code is available from RSA Data Security, Inc.
- * This code has been tested against that, and is equivalent,
- * except that you don't need to include two pages of legalese
- * with every copy.
- *
- * To compute the message digest of a chunk of bytes, declare an
- * MD5Context structure, pass it to MD5Init, call MD5Update as
- * needed on buffers full of bytes, and then call MD5Final, which
- * will fill a supplied 16-byte array with the digest.
- */
-
-/* Brutally hacked by John Walker back from ANSI C to K&R (no
- prototypes) to maintain the tradition that Netfone will compile
- with Sun's original "cc". */
-
-#include <string.h>
-#include "md5.h"
-
-#ifndef HIGHFIRST
-#define byteReverse(buf, len) /* Nothing */
-#else
-/*
- * Note: this code is harmless on little-endian machines.
- */
-void byteReverse(unsigned char *buf, unsigned longs);
-
-void byteReverse(buf, longs)
- unsigned char *buf; unsigned longs;
-{
- uint32_t t;
- do {
- t = (uint32_t) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
- ((unsigned) buf[1] << 8 | buf[0]);
- *(uint32_t *) buf = t;
- buf += 4;
- } while (--longs);
-}
-#endif
-
-void MD5Transform(uint32_t buf[4], uint32_t in[16]);
-
-
-/*
- * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
- * initialization constants.
- */
-void MD5Init(ctx)
- struct MD5Context *ctx;
-{
- ctx->buf[0] = 0x67452301;
- ctx->buf[1] = 0xefcdab89;
- ctx->buf[2] = 0x98badcfe;
- ctx->buf[3] = 0x10325476;
-
- ctx->bits[0] = 0;
- ctx->bits[1] = 0;
-}
-
-/*
- * Update context to reflect the concatenation of another buffer full
- * of bytes.
- */
-void MD5Update(ctx, buf, len)
- struct MD5Context *ctx; unsigned char *buf; unsigned len;
-{
- uint32_t t;
-
- /* Update bitcount */
-
- t = ctx->bits[0];
- if ((ctx->bits[0] = t + ((uint32_t) len << 3)) < t)
- ctx->bits[1]++; /* Carry from low to high */
- ctx->bits[1] += len >> 29;
-
- t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */
-
- /* Handle any leading odd-sized chunks */
-
- if (t) {
- unsigned char *p = (unsigned char *) ctx->in + t;
-
- t = 64 - t;
- if (len < t) {
- memcpy(p, buf, len);
- return;
- }
- memcpy(p, buf, t);
- byteReverse(ctx->in, 16);
- MD5Transform(ctx->buf, (uint32_t *) ctx->in);
- buf += t;
- len -= t;
- }
- /* Process data in 64-byte chunks */
-
- while (len >= 64) {
- memcpy(ctx->in, buf, 64);
- byteReverse(ctx->in, 16);
- MD5Transform(ctx->buf, (uint32_t *) ctx->in);
- buf += 64;
- len -= 64;
- }
-
- /* Handle any remaining bytes of data. */
-
- memcpy(ctx->in, buf, len);
-}
-
-/*
- * Final wrapup - pad to 64-byte boundary with the bit pattern
- * 1 0* (64-bit count of bits processed, MSB-first)
- */
-void MD5Final(digest, ctx)
- unsigned char digest[16]; struct MD5Context *ctx;
-{
- unsigned count;
- unsigned char *p;
-
- /* Compute number of bytes mod 64 */
- count = (ctx->bits[0] >> 3) & 0x3F;
-
- /* Set the first char of padding to 0x80. This is safe since there is
- always at least one byte free */
- p = ctx->in + count;
- *p++ = 0x80;
-
- /* Bytes of padding needed to make 64 bytes */
- count = 64 - 1 - count;
-
- /* Pad out to 56 mod 64 */
- if (count < 8) {
- /* Two lots of padding: Pad the first block to 64 bytes */
- memset(p, 0, count);
- byteReverse(ctx->in, 16);
- MD5Transform(ctx->buf, (uint32_t *) ctx->in);
-
- /* Now fill the next block with 56 bytes */
- memset(ctx->in, 0, 56);
- } else {
- /* Pad block to 56 bytes */
- memset(p, 0, count - 8);
- }
- byteReverse(ctx->in, 14);
-
- /* Append length in bits and transform */
- ((uint32_t *) ctx->in)[14] = ctx->bits[0];
- ((uint32_t *) ctx->in)[15] = ctx->bits[1];
-
- MD5Transform(ctx->buf, (uint32_t *) ctx->in);
- byteReverse((unsigned char *) ctx->buf, 4);
- memcpy(digest, ctx->buf, 16);
- memset(ctx, 0, sizeof(struct MD5Context)); /* In case it's sensitive */
-}
-
-
-/* The four core functions - F1 is optimized somewhat */
-
-/* #define F1(x, y, z) (x & y | ~x & z) */
-#define F1(x, y, z) (z ^ (x & (y ^ z)))
-#define F2(x, y, z) F1(z, x, y)
-#define F3(x, y, z) (x ^ y ^ z)
-#define F4(x, y, z) (y ^ (x | ~z))
-
-/* This is the central step in the MD5 algorithm. */
-#define MD5STEP(f, w, x, y, z, data, s) \
- ( w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x )
-
-/*
- * The core of the MD5 algorithm, this alters an existing MD5 hash to
- * reflect the addition of 16 longwords of new data. MD5Update blocks
- * the data and converts bytes into longwords for this routine.
- */
-void MD5Transform(buf, in)
- uint32_t buf[4]; uint32_t in[16];
-{
- register uint32_t a, b, c, d;
-
- a = buf[0];
- b = buf[1];
- c = buf[2];
- d = buf[3];
-
- MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
- MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
- MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
- MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
- MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
- MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
- MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
- MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
- MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
- MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
- MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
- MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
- MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
- MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
- MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
- MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
-
- MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
- MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
- MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
- MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
- MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
- MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
- MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
- MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
- MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
- MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
- MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
- MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
- MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
- MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
- MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
- MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
-
- MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
- MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
- MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
- MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
- MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
- MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
- MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
- MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
- MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
- MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
- MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
- MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
- MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
- MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
- MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
- MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
-
- MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
- MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
- MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
- MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
- MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
- MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
- MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
- MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
- MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
- MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
- MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
- MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
- MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
- MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
- MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
- MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
-
- buf[0] += a;
- buf[1] += b;
- buf[2] += c;
- buf[3] += d;
-}
-
-/* lh3: the following code is added by me */
-
-#ifdef MD5SUM_MAIN
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#define HEX_STR "0123456789abcdef"
-
-static void md5_one(const char *fn)
-{
- unsigned char buf[4096], digest[16];
- MD5_CTX md5;
- int l;
- FILE *fp;
-
- fp = strcmp(fn, "-")? fopen(fn, "r") : stdin;
- if (fp == 0) {
- fprintf(stderr, "md5sum: %s: No such file or directory\n", fn);
- exit(1);
- }
- MD5Init(&md5);
- while ((l = fread(buf, 1, 4096, fp)) > 0)
- MD5Update(&md5, buf, l);
- MD5Final(digest, &md5);
- if (fp != stdin) fclose(fp);
- for (l = 0; l < 16; ++l)
- printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]);
- printf(" %s\n", fn);
-}
-int main(int argc, char *argv[])
-{
- int i;
- if (argc == 1) md5_one("-");
- else for (i = 1; i < argc; ++i) md5_one(argv[i]);
- return 0;
-}
-#endif
diff --git a/samtools/misc/md5.c.pysam.c b/samtools/misc/md5.c.pysam.c
deleted file mode 100644
index 10469fc..0000000
--- a/samtools/misc/md5.c.pysam.c
+++ /dev/null
@@ -1,300 +0,0 @@
-#include "pysam.h"
-
-/*
- * This code implements the MD5 message-digest algorithm.
- * The algorithm is due to Ron Rivest. This code was
- * written by Colin Plumb in 1993, no copyright is claimed.
- * This code is in the public domain; do with it what you wish.
- *
- * Equivalent code is available from RSA Data Security, Inc.
- * This code has been tested against that, and is equivalent,
- * except that you don't need to include two pages of legalese
- * with every copy.
- *
- * To compute the message digest of a chunk of bytes, declare an
- * MD5Context structure, pass it to MD5Init, call MD5Update as
- * needed on buffers full of bytes, and then call MD5Final, which
- * will fill a supplied 16-byte array with the digest.
- */
-
-/* Brutally hacked by John Walker back from ANSI C to K&R (no
- prototypes) to maintain the tradition that Netfone will compile
- with Sun's original "cc". */
-
-#include <string.h>
-#include "md5.h"
-
-#ifndef HIGHFIRST
-#define byteReverse(buf, len) /* Nothing */
-#else
-/*
- * Note: this code is harmless on little-endian machines.
- */
-void byteReverse(unsigned char *buf, unsigned longs);
-
-void byteReverse(buf, longs)
- unsigned char *buf; unsigned longs;
-{
- uint32_t t;
- do {
- t = (uint32_t) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
- ((unsigned) buf[1] << 8 | buf[0]);
- *(uint32_t *) buf = t;
- buf += 4;
- } while (--longs);
-}
-#endif
-
-void MD5Transform(uint32_t buf[4], uint32_t in[16]);
-
-
-/*
- * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
- * initialization constants.
- */
-void MD5Init(ctx)
- struct MD5Context *ctx;
-{
- ctx->buf[0] = 0x67452301;
- ctx->buf[1] = 0xefcdab89;
- ctx->buf[2] = 0x98badcfe;
- ctx->buf[3] = 0x10325476;
-
- ctx->bits[0] = 0;
- ctx->bits[1] = 0;
-}
-
-/*
- * Update context to reflect the concatenation of another buffer full
- * of bytes.
- */
-void MD5Update(ctx, buf, len)
- struct MD5Context *ctx; unsigned char *buf; unsigned len;
-{
- uint32_t t;
-
- /* Update bitcount */
-
- t = ctx->bits[0];
- if ((ctx->bits[0] = t + ((uint32_t) len << 3)) < t)
- ctx->bits[1]++; /* Carry from low to high */
- ctx->bits[1] += len >> 29;
-
- t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */
-
- /* Handle any leading odd-sized chunks */
-
- if (t) {
- unsigned char *p = (unsigned char *) ctx->in + t;
-
- t = 64 - t;
- if (len < t) {
- memcpy(p, buf, len);
- return;
- }
- memcpy(p, buf, t);
- byteReverse(ctx->in, 16);
- MD5Transform(ctx->buf, (uint32_t *) ctx->in);
- buf += t;
- len -= t;
- }
- /* Process data in 64-byte chunks */
-
- while (len >= 64) {
- memcpy(ctx->in, buf, 64);
- byteReverse(ctx->in, 16);
- MD5Transform(ctx->buf, (uint32_t *) ctx->in);
- buf += 64;
- len -= 64;
- }
-
- /* Handle any remaining bytes of data. */
-
- memcpy(ctx->in, buf, len);
-}
-
-/*
- * Final wrapup - pad to 64-byte boundary with the bit pattern
- * 1 0* (64-bit count of bits processed, MSB-first)
- */
-void MD5Final(digest, ctx)
- unsigned char digest[16]; struct MD5Context *ctx;
-{
- unsigned count;
- unsigned char *p;
-
- /* Compute number of bytes mod 64 */
- count = (ctx->bits[0] >> 3) & 0x3F;
-
- /* Set the first char of padding to 0x80. This is safe since there is
- always at least one byte free */
- p = ctx->in + count;
- *p++ = 0x80;
-
- /* Bytes of padding needed to make 64 bytes */
- count = 64 - 1 - count;
-
- /* Pad out to 56 mod 64 */
- if (count < 8) {
- /* Two lots of padding: Pad the first block to 64 bytes */
- memset(p, 0, count);
- byteReverse(ctx->in, 16);
- MD5Transform(ctx->buf, (uint32_t *) ctx->in);
-
- /* Now fill the next block with 56 bytes */
- memset(ctx->in, 0, 56);
- } else {
- /* Pad block to 56 bytes */
- memset(p, 0, count - 8);
- }
- byteReverse(ctx->in, 14);
-
- /* Append length in bits and transform */
- ((uint32_t *) ctx->in)[14] = ctx->bits[0];
- ((uint32_t *) ctx->in)[15] = ctx->bits[1];
-
- MD5Transform(ctx->buf, (uint32_t *) ctx->in);
- byteReverse((unsigned char *) ctx->buf, 4);
- memcpy(digest, ctx->buf, 16);
- memset(ctx, 0, sizeof(struct MD5Context)); /* In case it's sensitive */
-}
-
-
-/* The four core functions - F1 is optimized somewhat */
-
-/* #define F1(x, y, z) (x & y | ~x & z) */
-#define F1(x, y, z) (z ^ (x & (y ^ z)))
-#define F2(x, y, z) F1(z, x, y)
-#define F3(x, y, z) (x ^ y ^ z)
-#define F4(x, y, z) (y ^ (x | ~z))
-
-/* This is the central step in the MD5 algorithm. */
-#define MD5STEP(f, w, x, y, z, data, s) \
- ( w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x )
-
-/*
- * The core of the MD5 algorithm, this alters an existing MD5 hash to
- * reflect the addition of 16 longwords of new data. MD5Update blocks
- * the data and converts bytes into longwords for this routine.
- */
-void MD5Transform(buf, in)
- uint32_t buf[4]; uint32_t in[16];
-{
- register uint32_t a, b, c, d;
-
- a = buf[0];
- b = buf[1];
- c = buf[2];
- d = buf[3];
-
- MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
- MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
- MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
- MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
- MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
- MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
- MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
- MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
- MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
- MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
- MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
- MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
- MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
- MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
- MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
- MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
-
- MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
- MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
- MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
- MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
- MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
- MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
- MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
- MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
- MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
- MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
- MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
- MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
- MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
- MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
- MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
- MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
-
- MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
- MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
- MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
- MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
- MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
- MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
- MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
- MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
- MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
- MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
- MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
- MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
- MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
- MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
- MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
- MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
-
- MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
- MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
- MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
- MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
- MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
- MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
- MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
- MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
- MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
- MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
- MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
- MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
- MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
- MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
- MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
- MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
-
- buf[0] += a;
- buf[1] += b;
- buf[2] += c;
- buf[3] += d;
-}
-
-/* lh3: the following code is added by me */
-
-#ifdef MD5SUM_MAIN
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#define HEX_STR "0123456789abcdef"
-
-static void md5_one(const char *fn)
-{
- unsigned char buf[4096], digest[16];
- MD5_CTX md5;
- int l;
- FILE *fp;
-
- fp = strcmp(fn, "-")? fopen(fn, "r") : stdin;
- if (fp == 0) {
- fprintf(pysamerr, "md5sum: %s: No such file or directory\n", fn);
- exit(1);
- }
- MD5Init(&md5);
- while ((l = fread(buf, 1, 4096, fp)) > 0)
- MD5Update(&md5, buf, l);
- MD5Final(digest, &md5);
- if (fp != stdin) fclose(fp);
- for (l = 0; l < 16; ++l)
- printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]);
- printf(" %s\n", fn);
-}
-int main(int argc, char *argv[])
-{
- int i;
- if (argc == 1) md5_one("-");
- else for (i = 1; i < argc; ++i) md5_one(argv[i]);
- return 0;
-}
-#endif
diff --git a/samtools/misc/md5.h b/samtools/misc/md5.h
deleted file mode 100644
index 44121e4..0000000
--- a/samtools/misc/md5.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- This file is adapted from a program in this page:
-
- http://www.fourmilab.ch/md5/
-
- The original source code does not work on 64-bit machines due to the
- wrong typedef "uint32". I also added prototypes.
-
- -lh3
- */
-
-#ifndef MD5_H
-#define MD5_H
-
-/* The following tests optimise behaviour on little-endian
- machines, where there is no need to reverse the byte order
- of 32 bit words in the MD5 computation. By default,
- HIGHFIRST is defined, which indicates we're running on a
- big-endian (most significant byte first) machine, on which
- the byteReverse function in md5.c must be invoked. However,
- byteReverse is coded in such a way that it is an identity
- function when run on a little-endian machine, so calling it
- on such a platform causes no harm apart from wasting time.
- If the platform is known to be little-endian, we speed
- things up by undefining HIGHFIRST, which defines
- byteReverse as a null macro. Doing things in this manner
- insures we work on new platforms regardless of their byte
- order. */
-
-#define HIGHFIRST
-
-#if __LITTLE_ENDIAN__ != 0
-#undef HIGHFIRST
-#endif
-
-#include <stdint.h>
-
-struct MD5Context {
- uint32_t buf[4];
- uint32_t bits[2];
- unsigned char in[64];
-};
-
-void MD5Init(struct MD5Context *ctx);
-void MD5Update(struct MD5Context *ctx, unsigned char *buf, unsigned len);
-void MD5Final(unsigned char digest[16], struct MD5Context *ctx);
-
-/*
- * This is needed to make RSAREF happy on some MS-DOS compilers.
- */
-typedef struct MD5Context MD5_CTX;
-
-/* Define CHECK_HARDWARE_PROPERTIES to have main,c verify
- byte order and uint32_t settings. */
-#define CHECK_HARDWARE_PROPERTIES
-
-#endif /* !MD5_H */
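
The private MD5 implementation removed here (md5.c, md5.c.pysam.c and md5.h, including the optional MD5SUM_MAIN driver) is redundant now that htslib exposes the same functionality through hts_md5_init/update/final/hex/destroy, which the new dict code earlier in this commit already uses. A rough md5sum-style sketch of that API, assuming htslib headers and library are available; md5_file is illustrative only:

    #include <stdio.h>
    #include <htslib/hts.h>   /* hts_md5_init/update/final/hex/destroy */

    /* Print the hex MD5 of one file, roughly what the removed MD5SUM_MAIN
     * driver in md5.c did, but via the htslib API. */
    static int md5_file(const char *fn)
    {
        unsigned char buf[4096], digest[16];
        char hex[33];
        size_t n;
        hts_md5_context *md5;
        FILE *fp = fopen(fn, "rb");

        if (!fp) { perror(fn); return 1; }
        if (!(md5 = hts_md5_init())) { fclose(fp); return 1; }
        while ((n = fread(buf, 1, sizeof buf, fp)) > 0)
            hts_md5_update(md5, buf, n);
        hts_md5_final(digest, md5);
        hts_md5_hex(hex, digest);
        printf("%s  %s\n", hex, fn);
        hts_md5_destroy(md5);
        fclose(fp);
        return 0;
    }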
diff --git a/samtools/padding.c b/samtools/padding.c
index ea1c933..436d716 100644
--- a/samtools/padding.c
+++ b/samtools/padding.c
@@ -1,7 +1,7 @@
/* padding.c -- depad subcommand.
Copyright (C) 2011, 2012 Broad Institute.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014, 2015 Genome Research Ltd.
Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -27,26 +27,30 @@ DEALINGS IN THE SOFTWARE. */
#include <string.h>
#include <assert.h>
#include <unistd.h>
-#include "htslib/kstring.h"
+#include <htslib/kstring.h>
+#include <htslib/sam.h>
+#include <htslib/faidx.h>
#include "sam_header.h"
-#include "sam.h"
-#include "bam.h"
-#include "htslib/faidx.h"
+#include "sam_opts.h"
-bam_header_t *bam_header_dup(const bam_header_t *h0); /*in sam.c*/
+#define bam_reg2bin(b,e) hts_reg2bin((b),(e), 14, 5)
+
+// The one and only function needed from sam.c.
+// Explicitly here to avoid including bam.h translation layer.
+extern char *samfaipath(const char *fn_ref);
static void replace_cigar(bam1_t *b, int n, uint32_t *cigar)
{
if (n != b->core.n_cigar) {
int o = b->core.l_qname + b->core.n_cigar * 4;
- if (b->data_len + (n - b->core.n_cigar) * 4 > b->m_data) {
- b->m_data = b->data_len + (n - b->core.n_cigar) * 4;
+ if (b->l_data + (n - b->core.n_cigar) * 4 > b->m_data) {
+ b->m_data = b->l_data + (n - b->core.n_cigar) * 4;
kroundup32(b->m_data);
b->data = (uint8_t*)realloc(b->data, b->m_data);
}
- memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->data_len - o);
+ memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->l_data - o);
memcpy(b->data + b->core.l_qname, cigar, n * 4);
- b->data_len += (n - b->core.n_cigar) * 4;
+ b->l_data += (n - b->core.n_cigar) * 4;
b->core.n_cigar = n;
} else memcpy(b->data + b->core.l_qname, cigar, n * 4);
}
@@ -59,42 +63,45 @@ static void replace_cigar(bam1_t *b, int n, uint32_t *cigar)
_c[_n++] = (_v); \
} while (0)
-static void unpad_seq(bam1_t *b, kstring_t *s)
+static int unpad_seq(bam1_t *b, kstring_t *s)
{
+ // Returns 0 on success, -1 on an error
int k, j, i;
int length;
- uint32_t *cigar = bam1_cigar(b);
- uint8_t *seq = bam1_seq(b);
+ int cigar_n_warning = 0; /* Make this a global and limit to one CIGAR N warning? */
+ uint32_t *cigar = bam_get_cigar(b);
+ uint8_t *seq = bam_get_seq(b);
+
// b->core.l_qseq gives length of the SEQ entry (including soft clips, S)
// We need the padded length after alignment from the CIGAR (excluding
// soft clips S, but including pads from CIGAR D operations)
- length = 0;
- for (k = 0; k < b->core.n_cigar; ++k) {
- int op, ol;
- op= bam_cigar_op(cigar[k]);
- ol = bam_cigar_oplen(cigar[k]);
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF || op == BAM_CDEL)
- length += ol;
- }
+ length = bam_cigar2rlen(b->core.n_cigar, cigar);
ks_resize(s, length);
for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
int op, ol;
op = bam_cigar_op(cigar[k]);
ol = bam_cigar_oplen(cigar[k]);
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam1_seqi(seq, j);
+ for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam_seqi(seq, j);
} else if (op == BAM_CSOFT_CLIP) {
j += ol;
} else if (op == BAM_CHARD_CLIP) {
/* do nothing */
} else if (op == BAM_CDEL) {
for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
+ } else if (op == BAM_CREF_SKIP) {
+ /* Treat CIGAR N as D (not ideal, but better than ignoring it) */
+ for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
+ if (0 == cigar_n_warning) {
+ cigar_n_warning = -1;
+ fprintf(stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
+ }
} else {
- fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam1_qname(b));
- exit(1);
+ fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
+ return -1;
}
}
- assert(length == s->l);
+ return length != s->l;
}
int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq)
@@ -117,7 +124,7 @@ int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq)
// Map gaps to null to match unpad_seq function
seq->s[seq->l++] = 0;
} else {
- int i = bam_nt16_table[(int)base];
+ int i = seq_nt16_table[(int)base];
if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name);
free(fai_ref);
@@ -150,7 +157,7 @@ int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len)
if (base == '-' || base == '*') {
gaps += 1;
} else {
- int i = bam_nt16_table[(int)base];
+ int i = seq_nt16_table[(int)base];
if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name);
free(fai_ref);
@@ -175,9 +182,8 @@ static inline int * update_posmap(int *posmap, kstring_t ref)
return posmap;
}
-int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
+int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai)
{
- bam_header_t *h = 0;
bam1_t *b = 0;
kstring_t r, q;
int r_tid = -1;
@@ -187,16 +193,22 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
b = bam_init1();
r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
int read_ret;
- h = in->header;
- while ((read_ret = samread(in, b)) >= 0) { // read one alignment from `in'
- uint32_t *cigar = bam1_cigar(b);
+ while ((read_ret = sam_read1(in, h, b)) >= 0) { // read one alignment from `in'
+ // Cannot depad unmapped CRAM data
+ if (b->core.flag & BAM_FUNMAP)
+ goto next_seq;
+
+ uint32_t *cigar = bam_get_cigar(b);
n2 = 0;
- if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) {
- // fprintf(stderr, "[depad] Found embedded reference '%s'\n", bam1_qname(b));
+ if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) {
+ // fprintf(stderr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b));
r_tid = b->core.tid;
- unpad_seq(b, &r);
+ if (0!=unpad_seq(b, &r)) {
+ fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b));
+ return -1;
+ };
if (h->target_len[r_tid] != r.l) {
- fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam1_qname(b), h->target_len[r_tid], (unsigned long long)(r.l));
+ fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l));
return -1;
}
if (fai) {
@@ -212,8 +224,8 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
// Show gaps as ASCII 45
fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n",
h->target_name[b->core.tid], i+1,
- r.s[i] ? bam_nt16_rev_table[(int)r.s[i]] : 45,
- q.s[i] ? bam_nt16_rev_table[(int)q.s[i]] : 45);
+ r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45,
+ q.s[i] ? seq_nt16_str[(int)q.s[i]] : 45);
return -1;
}
}
@@ -224,11 +236,11 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
} else if (b->core.n_cigar > 0) {
int i, k, op;
if (b->core.tid < 0) {
- fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam1_qname(b));
+ fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam_get_qname(b));
return -1;
} else if (b->core.tid == r_tid) {
; // good case, reference available
- //fprintf(stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam1_qname(b));
+ //fprintf(stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b));
} else if (fai) {
if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
@@ -241,7 +253,10 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]);
return -1;
}
- unpad_seq(b, &q);
+ if (0!=unpad_seq(b, &q)) {
+ fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in read %s\n", bam_get_qname(b));
+ return -1;
+ };
if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) {
write_cigar(cigar2, n2, m2, cigar[0]);
} else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) {
@@ -257,9 +272,15 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
if (q.s[0] == BAM_CINS) {
for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD));
+ k = 0;
+ } else if (q.s[0] == BAM_CPAD) {
+ // Join 'k' CPAD to our first cigar op CPAD too.
+ for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
+ } else {
+ k = 0;
}
/* Count consecutive CIGAR operators to turn into a CIGAR string */
- for (i = k = 1, op = q.s[0]; i < q.l; ++i) {
+ for (i = 1, k++, op = q.s[0]; i < q.l; ++i) {
if (op != q.s[i]) {
write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
op = q.s[i]; k = 1;
@@ -302,14 +323,14 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
if (b->core.pos != -1) b->core.pos = posmap[b->core.pos];
if (b->core.mtid < 0 || b->core.mpos < 0) {
/* Nice case, no mate to worry about*/
- // fprintf(stderr, "[depad] Read '%s' mate not mapped\n", bam1_qname(b));
+ // fprintf(stderr, "[depad] Read '%s' mate not mapped\n", bam_get_qname(b));
/* TODO - Warning if FLAG says mate should be mapped? */
/* Clean up funny input where mate position is given but mate reference is missing: */
b->core.mtid = -1;
b->core.mpos = -1;
} else if (b->core.mtid == b->core.tid) {
/* Nice case, same reference */
- // fprintf(stderr, "[depad] Read '%s' mate mapped to same ref\n", bam1_qname(b));
+ // fprintf(stderr, "[depad] Read '%s' mate mapped to same ref\n", bam_get_qname(b));
b->core.mpos = posmap[b->core.mpos];
} else {
/* Nasty case, Must load alternative posmap */
@@ -333,8 +354,10 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
posmap = update_posmap(posmap, r);
}
/* Most reads will have been moved so safest to always recalculate the BIN value */
- b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
- samwrite(out, b);
+ b->core.bin = bam_reg2bin(b->core.pos, bam_endpos(b));
+
+ next_seq:
+ sam_write1(out, h, b);
}
if (read_ret < -1) {
fprintf(stderr, "[depad] truncated file.\n");
@@ -345,13 +368,12 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
return ret;
}
-bam_header_t * fix_header(bam_header_t *old, faidx_t *fai)
+bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai)
{
-#if 0
int i = 0, unpadded_len = 0;
- bam_header_t *header = 0 ;
+ bam_hdr_t *header = 0 ;
- header = bam_header_dup(old);
+ header = bam_hdr_dup(old);
for (i = 0; i < old->n_targets; ++i) {
unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]);
if (unpadded_len < 0) {
@@ -376,11 +398,54 @@ bam_header_t * fix_header(bam_header_t *old, faidx_t *fai)
end = strchr(text, '\n');
assert(end != 0);
if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') {
- /* TODO - edit the @SQ line here to remove MD and fix LN. */
- /* For now just remove the @SQ line, and samtools will */
- /* automatically generate a minimal replacement with LN. */
- /* However, that discards any other tags like AS, SP, UR. */
- //fprintf(stderr, "[depad] Removing @SQ line\n");
+ const char *cp = text+3;
+ char *name = strstr(text, "\tSN:");
+ char *name_end;
+ if (!name) {
+ fprintf(stderr, "Unable to find SN: header field\n");
+ return NULL;
+ }
+ name += 4;
+ for (name_end = name; name_end != end && *name_end != '\t'; name_end++);
+ strcat(newtext, "@SQ");
+
+ /* Parse the @SQ lines */
+ while (cp != end) {
+ if (end-cp >= 2 && strncmp(cp, "LN", 2) == 0) {
+ // Rewrite the length
+ char len_buf[100];
+ int tid;
+ for (tid = 0; tid < header->n_targets; tid++) {
+ // may want to hash this, but new header API incoming.
+ if (strncmp(name, header->target_name[tid], name_end - name) == 0) {
+ sprintf(len_buf, "LN:%d", header->target_len[tid]);
+ strcat(newtext, len_buf);
+ break;
+ }
+ }
+ while (cp != end && *cp++ != '\t');
+ if (cp != end)
+ strcat(newtext, "\t");
+ } else if (end-cp >= 2 &&
+ (strncmp(cp, "M5", 2) == 0 ||
+ strncmp(cp, "UR", 2) == 0)) {
+ // MD5 changed during depadding; ditch it.
+ // URLs are also invalid.
+ while (cp != end && *cp++ != '\t');
+ } else {
+ // Otherwise copy this sub-field verbatim
+ const char *cp_start = cp;
+ while (cp != end && *cp++ != '\t');
+ strncat(newtext, cp_start, cp-cp_start);
+ }
+ }
+
+ // Add newline, replacing trailing '\t' if last on line was the LN:
+ char *text_end = newtext + strlen(newtext);
+ if (text_end[-1] == '\t')
+ text_end[-1] = '\n';
+ else
+ *text_end++ = '\n', *text_end = '\0';
} else {
/* Copy this line to the new header */
strncat(newtext, text, end - text + 1);
@@ -401,41 +466,51 @@ bam_header_t * fix_header(bam_header_t *old, faidx_t *fai)
}
//fprintf(stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text);
return header;
-#else
- fprintf(stderr, "Samtools-htslib: fix_header() header parsing not yet implemented\n");
- abort();
-#endif
}
static int usage(int is_long_help);
int main_pad2unpad(int argc, char *argv[])
{
- samfile_t *in = 0, *out = 0;
- bam_header_t *h = 0;
+ samFile *in = 0, *out = 0;
+ bam_hdr_t *h = 0, *h_fix = 0;
faidx_t *fai = 0;
- int c, is_bamin = 1, compress_level = -1, is_bamout = 1, is_long_help = 0;
- char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0;
+ int c, compress_level = -1, is_long_help = 0;
+ char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0;
int ret=0;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T'),
+ { NULL, 0, NULL, 0 }
+ };
/* parse command-line options */
strcpy(in_mode, "r"); strcpy(out_mode, "w");
- while ((c = getopt(argc, argv, "Sso:u1T:?")) >= 0) {
+ while ((c = getopt_long(argc, argv, "SCso:u1T:?", lopts, NULL)) >= 0) {
switch (c) {
- case 'S': is_bamin = 0; break;
- case 's': assert(compress_level == -1); is_bamout = 0; break;
+ case 'S': break;
+ case 'C': hts_parse_format(&ga.out, "cram"); break;
+ case 's': assert(compress_level == -1); hts_parse_format(&ga.out, "sam"); break;
case 'o': fn_out = strdup(optarg); break;
- case 'u': assert(is_bamout == 1); compress_level = 0; break;
- case '1': assert(is_bamout == 1); compress_level = 1; break;
- case 'T': fn_ref = strdup(optarg); break;
+ case 'u':
+ compress_level = 0;
+ if (ga.out.format == unknown_format)
+ hts_parse_format(&ga.out, "bam");
+ break;
+ case '1':
+ compress_level = 1;
+ if (ga.out.format == unknown_format)
+ hts_parse_format(&ga.out, "bam");
+ break;
case '?': is_long_help = 1; break;
- default: return usage(is_long_help);
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ fprintf(stderr, "[depad] unrecognized option '-%c'\n\n", c);
+ return usage(is_long_help);
}
}
if (argc == optind) return usage(is_long_help);
- if (is_bamin) strcat(in_mode, "b");
- if (is_bamout) strcat(out_mode, "b");
strcat(out_mode, "h");
if (compress_level >= 0) {
char tmp[2];
@@ -444,47 +519,60 @@ int main_pad2unpad(int argc, char *argv[])
}
// Load FASTA reference (also needed for SAM -> BAM if missing header)
- if (fn_ref) {
- fn_list = samfaipath(fn_ref);
- fai = fai_load(fn_ref);
+ if (ga.reference) {
+ fn_list = samfaipath(ga.reference);
+ fai = fai_load(ga.reference);
}
// open file handlers
- if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) {
+ if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) {
fprintf(stderr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]);
ret = 1;
goto depad_end;
}
- if (in->header == 0) {
- fprintf(stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]);
+ if (fn_list && hts_set_fai_filename(in, fn_list) != 0) {
+ fprintf(stderr, "[depad] failed to load reference file \"%s\".\n", fn_list);
ret = 1;
goto depad_end;
}
- if (in->header->text == 0 || in->header->l_text == 0) {
- fprintf(stderr, "[depad] Warning - failed to read any header text from \"%s\".\n", argv[optind]);
- assert (0 == in->header->l_text);
- assert (0 == in->header->text);
+ if ((h = sam_hdr_read(in)) == 0) {
+ fprintf(stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]);
+ ret = 1;
+ goto depad_end;
}
- if (fn_ref) {
- h = fix_header(in->header, fai);
+ if (fai) {
+ h_fix = fix_header(h, fai);
} else {
fprintf(stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n");
- h = in->header;
+ h_fix = h;
}
- if ((out = samopen(fn_out? fn_out : "-", out_mode, h)) == 0) {
+ char wmode[2];
+ strcat(out_mode, sam_open_mode(wmode, fn_out, NULL)==0 ? wmode : "b");
+ if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) {
fprintf(stderr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
ret = 1;
goto depad_end;
}
+ // Reference-based CRAM won't work unless we also create a new reference.
+ // We could embed this, but for now we take the easy option.
+ if (ga.out.format == cram)
+ hts_set_opt(out, CRAM_OPT_NO_REF, 1);
+
+ if (sam_hdr_write(out, h_fix) != 0) {
+ fprintf(stderr, "[depad] failed to write header.\n");
+ ret = 1;
+ goto depad_end;
+ }
+
// Do the depad
- ret = bam_pad2unpad(in, out, fai);
+ ret = bam_pad2unpad(in, out, h, fai);
depad_end:
// close files, free and return
if (fai) fai_destroy(fai);
- if (! (in && h == in->header)) bam_header_destroy(h);
- samclose(in);
- samclose(out);
+ if (h) bam_hdr_destroy(h);
+ sam_close(in);
+ sam_close(out);
free(fn_list); free(fn_out);
return ret;
}
@@ -493,14 +581,17 @@ static int usage(int is_long_help)
{
fprintf(stderr, "\n");
fprintf(stderr, "Usage: samtools depad <in.bam>\n\n");
- fprintf(stderr, "Options: -s output is SAM (default is BAM)\n");
- fprintf(stderr, " -S input is SAM (default is BAM)\n");
- fprintf(stderr, " -u uncompressed BAM output (can't use with -s)\n");
- fprintf(stderr, " -1 fast compression BAM output (can't use with -s)\n");
- fprintf(stderr, " -T FILE padded reference sequence file [null]\n");
- fprintf(stderr, " -o FILE output file name [stdout]\n");
- fprintf(stderr, " -? longer help\n");
- fprintf(stderr, "\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -s Output is SAM (default is BAM)\n");
+ fprintf(stderr, " -S Input is SAM (default is BAM)\n");
+ fprintf(stderr, " -u Uncompressed BAM output (can't use with -s)\n");
+ fprintf(stderr, " -1 Fast compression BAM output (can't use with -s)\n");
+ fprintf(stderr, " -T, --reference FILE\n");
+ fprintf(stderr, " Padded reference sequence file [null]\n");
+ fprintf(stderr, " -o FILE Output file name [stdout]\n");
+ fprintf(stderr, " -? Longer help\n");
+ sam_global_opt_help(stderr, "-...-");
+
if (is_long_help)
fprintf(stderr, "Notes:\n\
\n\
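
Because depadding shifts read coordinates, the loop in bam_pad2unpad() above now recomputes the BAM bin from the new position and bam_endpos() instead of the old bam_calend(); bam_reg2bin is reduced to a thin macro over htslib's hts_reg2bin with the standard BAI parameters. A standalone sketch of that call with made-up coordinates, purely illustrative:

    #include <stdio.h>
    #include <stdint.h>
    #include <htslib/hts.h>   /* hts_reg2bin() */

    int main(void)
    {
        /* Standard BAI binning scheme: 16 kb leaf bins (min_shift 14),
         * 5 levels, i.e. the bam_reg2bin(b,e) macro defined above. */
        int64_t beg = 100000, end = 100150;   /* 0-based, half-open interval */
        int bin = hts_reg2bin(beg, end, 14, 5);
        printf("region [%lld,%lld) -> bin %d\n",
               (long long)beg, (long long)end, bin);
        return 0;
    }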
diff --git a/samtools/padding.c.pysam.c b/samtools/padding.c.pysam.c
index 562ceba..fd889f3 100644
--- a/samtools/padding.c.pysam.c
+++ b/samtools/padding.c.pysam.c
@@ -3,7 +3,7 @@
/* padding.c -- depad subcommand.
Copyright (C) 2011, 2012 Broad Institute.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014, 2015 Genome Research Ltd.
Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -29,26 +29,30 @@ DEALINGS IN THE SOFTWARE. */
#include <string.h>
#include <assert.h>
#include <unistd.h>
-#include "htslib/kstring.h"
+#include <htslib/kstring.h>
+#include <htslib/sam.h>
+#include <htslib/faidx.h>
#include "sam_header.h"
-#include "sam.h"
-#include "bam.h"
-#include "htslib/faidx.h"
+#include "sam_opts.h"
-bam_header_t *bam_header_dup(const bam_header_t *h0); /*in sam.c*/
+#define bam_reg2bin(b,e) hts_reg2bin((b),(e), 14, 5)
+
+// The one and only function needed from sam.c.
+// Explicitly here to avoid including bam.h translation layer.
+extern char *samfaipath(const char *fn_ref);
static void replace_cigar(bam1_t *b, int n, uint32_t *cigar)
{
if (n != b->core.n_cigar) {
int o = b->core.l_qname + b->core.n_cigar * 4;
- if (b->data_len + (n - b->core.n_cigar) * 4 > b->m_data) {
- b->m_data = b->data_len + (n - b->core.n_cigar) * 4;
+ if (b->l_data + (n - b->core.n_cigar) * 4 > b->m_data) {
+ b->m_data = b->l_data + (n - b->core.n_cigar) * 4;
kroundup32(b->m_data);
b->data = (uint8_t*)realloc(b->data, b->m_data);
}
- memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->data_len - o);
+ memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->l_data - o);
memcpy(b->data + b->core.l_qname, cigar, n * 4);
- b->data_len += (n - b->core.n_cigar) * 4;
+ b->l_data += (n - b->core.n_cigar) * 4;
b->core.n_cigar = n;
} else memcpy(b->data + b->core.l_qname, cigar, n * 4);
}
@@ -61,42 +65,45 @@ static void replace_cigar(bam1_t *b, int n, uint32_t *cigar)
_c[_n++] = (_v); \
} while (0)
-static void unpad_seq(bam1_t *b, kstring_t *s)
+static int unpad_seq(bam1_t *b, kstring_t *s)
{
+ // Returns 0 on success, -1 on an error
int k, j, i;
int length;
- uint32_t *cigar = bam1_cigar(b);
- uint8_t *seq = bam1_seq(b);
+ int cigar_n_warning = 0; /* Make this a global and limit to one CIGAR N warning? */
+ uint32_t *cigar = bam_get_cigar(b);
+ uint8_t *seq = bam_get_seq(b);
+
// b->core.l_qseq gives length of the SEQ entry (including soft clips, S)
// We need the padded length after alignment from the CIGAR (excluding
// soft clips S, but including pads from CIGAR D operations)
- length = 0;
- for (k = 0; k < b->core.n_cigar; ++k) {
- int op, ol;
- op= bam_cigar_op(cigar[k]);
- ol = bam_cigar_oplen(cigar[k]);
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF || op == BAM_CDEL)
- length += ol;
- }
+ length = bam_cigar2rlen(b->core.n_cigar, cigar);
ks_resize(s, length);
for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
int op, ol;
op = bam_cigar_op(cigar[k]);
ol = bam_cigar_oplen(cigar[k]);
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam1_seqi(seq, j);
+ for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam_seqi(seq, j);
} else if (op == BAM_CSOFT_CLIP) {
j += ol;
} else if (op == BAM_CHARD_CLIP) {
/* do nothing */
} else if (op == BAM_CDEL) {
for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
+ } else if (op == BAM_CREF_SKIP) {
+ /* Treat CIGAR N as D (not ideal, but better than ignoring it) */
+ for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
+ if (0 == cigar_n_warning) {
+ cigar_n_warning = -1;
+ fprintf(pysamerr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
+ }
} else {
- fprintf(pysamerr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam1_qname(b));
- exit(1);
+ fprintf(pysamerr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
+ return -1;
}
}
- assert(length == s->l);
+ return length != s->l;
}
int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq)
@@ -119,7 +126,7 @@ int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq)
// Map gaps to null to match unpad_seq function
seq->s[seq->l++] = 0;
} else {
- int i = bam_nt16_table[(int)base];
+ int i = seq_nt16_table[(int)base];
if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
fprintf(pysamerr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name);
free(fai_ref);
@@ -152,7 +159,7 @@ int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len)
if (base == '-' || base == '*') {
gaps += 1;
} else {
- int i = bam_nt16_table[(int)base];
+ int i = seq_nt16_table[(int)base];
if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
fprintf(pysamerr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name);
free(fai_ref);
@@ -177,9 +184,8 @@ static inline int * update_posmap(int *posmap, kstring_t ref)
return posmap;
}
-int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
+int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai)
{
- bam_header_t *h = 0;
bam1_t *b = 0;
kstring_t r, q;
int r_tid = -1;
@@ -189,16 +195,22 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
b = bam_init1();
r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
int read_ret;
- h = in->header;
- while ((read_ret = samread(in, b)) >= 0) { // read one alignment from `in'
- uint32_t *cigar = bam1_cigar(b);
+ while ((read_ret = sam_read1(in, h, b)) >= 0) { // read one alignment from `in'
+ // Cannot depad unmapped CRAM data
+ if (b->core.flag & BAM_FUNMAP)
+ goto next_seq;
+
+ uint32_t *cigar = bam_get_cigar(b);
n2 = 0;
- if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) {
- // fprintf(pysamerr, "[depad] Found embedded reference '%s'\n", bam1_qname(b));
+ if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) {
+ // fprintf(pysamerr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b));
r_tid = b->core.tid;
- unpad_seq(b, &r);
+ if (0!=unpad_seq(b, &r)) {
+ fprintf(pysamerr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b));
+ return -1;
+ };
if (h->target_len[r_tid] != r.l) {
- fprintf(pysamerr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam1_qname(b), h->target_len[r_tid], (unsigned long long)(r.l));
+ fprintf(pysamerr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l));
return -1;
}
if (fai) {
@@ -214,8 +226,8 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
// Show gaps as ASCII 45
fprintf(pysamerr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n",
h->target_name[b->core.tid], i+1,
- r.s[i] ? bam_nt16_rev_table[(int)r.s[i]] : 45,
- q.s[i] ? bam_nt16_rev_table[(int)q.s[i]] : 45);
+ r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45,
+ q.s[i] ? seq_nt16_str[(int)q.s[i]] : 45);
return -1;
}
}
@@ -226,11 +238,11 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
} else if (b->core.n_cigar > 0) {
int i, k, op;
if (b->core.tid < 0) {
- fprintf(pysamerr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam1_qname(b));
+ fprintf(pysamerr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam_get_qname(b));
return -1;
} else if (b->core.tid == r_tid) {
; // good case, reference available
- //fprintf(pysamerr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam1_qname(b));
+ //fprintf(pysamerr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b));
} else if (fai) {
if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
fprintf(pysamerr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
@@ -243,7 +255,10 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
fprintf(pysamerr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]);
return -1;
}
- unpad_seq(b, &q);
+ if (0!=unpad_seq(b, &q)) {
+ fprintf(pysamerr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in read %s\n", bam_get_qname(b));
+ return -1;
+ };
if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) {
write_cigar(cigar2, n2, m2, cigar[0]);
} else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) {
@@ -259,9 +274,15 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
if (q.s[0] == BAM_CINS) {
for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD));
+ k = 0;
+ } else if (q.s[0] == BAM_CPAD) {
+ // Join 'k' CPAD to our first cigar op CPAD too.
+ for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
+ } else {
+ k = 0;
}
/* Count consecutive CIGAR operators to turn into a CIGAR string */
- for (i = k = 1, op = q.s[0]; i < q.l; ++i) {
+ for (i = 1, k++, op = q.s[0]; i < q.l; ++i) {
if (op != q.s[i]) {
write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
op = q.s[i]; k = 1;
@@ -304,14 +325,14 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
if (b->core.pos != -1) b->core.pos = posmap[b->core.pos];
if (b->core.mtid < 0 || b->core.mpos < 0) {
/* Nice case, no mate to worry about*/
- // fprintf(pysamerr, "[depad] Read '%s' mate not mapped\n", bam1_qname(b));
+ // fprintf(pysamerr, "[depad] Read '%s' mate not mapped\n", bam_get_qname(b));
/* TODO - Warning if FLAG says mate should be mapped? */
/* Clean up funny input where mate position is given but mate reference is missing: */
b->core.mtid = -1;
b->core.mpos = -1;
} else if (b->core.mtid == b->core.tid) {
/* Nice case, same reference */
- // fprintf(pysamerr, "[depad] Read '%s' mate mapped to same ref\n", bam1_qname(b));
+ // fprintf(pysamerr, "[depad] Read '%s' mate mapped to same ref\n", bam_get_qname(b));
b->core.mpos = posmap[b->core.mpos];
} else {
/* Nasty case, Must load alternative posmap */
@@ -335,8 +356,10 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
posmap = update_posmap(posmap, r);
}
/* Most reads will have been moved so safest to always recalculate the BIN value */
- b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
- samwrite(out, b);
+ b->core.bin = bam_reg2bin(b->core.pos, bam_endpos(b));
+
+ next_seq:
+ sam_write1(out, h, b);
}
if (read_ret < -1) {
fprintf(pysamerr, "[depad] truncated file.\n");
@@ -347,13 +370,12 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
return ret;
}
-bam_header_t * fix_header(bam_header_t *old, faidx_t *fai)
+bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai)
{
-#if 0
int i = 0, unpadded_len = 0;
- bam_header_t *header = 0 ;
+ bam_hdr_t *header = 0 ;
- header = bam_header_dup(old);
+ header = bam_hdr_dup(old);
for (i = 0; i < old->n_targets; ++i) {
unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]);
if (unpadded_len < 0) {
@@ -378,11 +400,54 @@ bam_header_t * fix_header(bam_header_t *old, faidx_t *fai)
end = strchr(text, '\n');
assert(end != 0);
if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') {
- /* TODO - edit the @SQ line here to remove MD and fix LN. */
- /* For now just remove the @SQ line, and samtools will */
- /* automatically generate a minimal replacement with LN. */
- /* However, that discards any other tags like AS, SP, UR. */
- //fprintf(pysamerr, "[depad] Removing @SQ line\n");
+ const char *cp = text+3;
+ char *name = strstr(text, "\tSN:");
+ char *name_end;
+ if (!name) {
+ fprintf(pysamerr, "Unable to find SN: header field\n");
+ return NULL;
+ }
+ name += 4;
+ for (name_end = name; name_end != end && *name_end != '\t'; name_end++);
+ strcat(newtext, "@SQ");
+
+ /* Parse the @SQ lines */
+ while (cp != end) {
+ if (end-cp >= 2 && strncmp(cp, "LN", 2) == 0) {
+ // Rewrite the length
+ char len_buf[100];
+ int tid;
+ for (tid = 0; tid < header->n_targets; tid++) {
+ // may want to hash this, but new header API incoming.
+ if (strncmp(name, header->target_name[tid], name_end - name) == 0) {
+ sprintf(len_buf, "LN:%d", header->target_len[tid]);
+ strcat(newtext, len_buf);
+ break;
+ }
+ }
+ while (cp != end && *cp++ != '\t');
+ if (cp != end)
+ strcat(newtext, "\t");
+ } else if (end-cp >= 2 &&
+ (strncmp(cp, "M5", 2) == 0 ||
+ strncmp(cp, "UR", 2) == 0)) {
+ // MD5 changed during depadding; ditch it.
+ // URLs are also invalid.
+ while (cp != end && *cp++ != '\t');
+ } else {
+ // Otherwise copy this sub-field verbatim
+ const char *cp_start = cp;
+ while (cp != end && *cp++ != '\t');
+ strncat(newtext, cp_start, cp-cp_start);
+ }
+ }
+
+ // Add newline, replacing trailing '\t' if last on line was the LN:
+ char *text_end = newtext + strlen(newtext);
+ if (text_end[-1] == '\t')
+ text_end[-1] = '\n';
+ else
+ *text_end++ = '\n', *text_end = '\0';
} else {
/* Copy this line to the new header */
strncat(newtext, text, end - text + 1);
@@ -403,41 +468,51 @@ bam_header_t * fix_header(bam_header_t *old, faidx_t *fai)
}
//fprintf(pysamerr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text);
return header;
-#else
- fprintf(pysamerr, "Samtools-htslib: fix_header() header parsing not yet implemented\n");
- abort();
-#endif
}
static int usage(int is_long_help);
int main_pad2unpad(int argc, char *argv[])
{
- samfile_t *in = 0, *out = 0;
- bam_header_t *h = 0;
+ samFile *in = 0, *out = 0;
+ bam_hdr_t *h = 0, *h_fix = 0;
faidx_t *fai = 0;
- int c, is_bamin = 1, compress_level = -1, is_bamout = 1, is_long_help = 0;
- char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0;
+ int c, compress_level = -1, is_long_help = 0;
+ char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0;
int ret=0;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T'),
+ { NULL, 0, NULL, 0 }
+ };
/* parse command-line options */
strcpy(in_mode, "r"); strcpy(out_mode, "w");
- while ((c = getopt(argc, argv, "Sso:u1T:?")) >= 0) {
+ while ((c = getopt_long(argc, argv, "SCso:u1T:?", lopts, NULL)) >= 0) {
switch (c) {
- case 'S': is_bamin = 0; break;
- case 's': assert(compress_level == -1); is_bamout = 0; break;
+ case 'S': break;
+ case 'C': hts_parse_format(&ga.out, "cram"); break;
+ case 's': assert(compress_level == -1); hts_parse_format(&ga.out, "sam"); break;
case 'o': fn_out = strdup(optarg); break;
- case 'u': assert(is_bamout == 1); compress_level = 0; break;
- case '1': assert(is_bamout == 1); compress_level = 1; break;
- case 'T': fn_ref = strdup(optarg); break;
+ case 'u':
+ compress_level = 0;
+ if (ga.out.format == unknown_format)
+ hts_parse_format(&ga.out, "bam");
+ break;
+ case '1':
+ compress_level = 1;
+ if (ga.out.format == unknown_format)
+ hts_parse_format(&ga.out, "bam");
+ break;
case '?': is_long_help = 1; break;
- default: return usage(is_long_help);
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ fprintf(pysamerr, "[depad] unrecognized option '-%c'\n\n", c);
+ return usage(is_long_help);
}
}
if (argc == optind) return usage(is_long_help);
- if (is_bamin) strcat(in_mode, "b");
- if (is_bamout) strcat(out_mode, "b");
strcat(out_mode, "h");
if (compress_level >= 0) {
char tmp[2];
@@ -446,47 +521,60 @@ int main_pad2unpad(int argc, char *argv[])
}
// Load FASTA reference (also needed for SAM -> BAM if missing header)
- if (fn_ref) {
- fn_list = samfaipath(fn_ref);
- fai = fai_load(fn_ref);
+ if (ga.reference) {
+ fn_list = samfaipath(ga.reference);
+ fai = fai_load(ga.reference);
}
// open file handlers
- if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) {
+ if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) {
fprintf(pysamerr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]);
ret = 1;
goto depad_end;
}
- if (in->header == 0) {
- fprintf(pysamerr, "[depad] failed to read the header from \"%s\".\n", argv[optind]);
+ if (fn_list && hts_set_fai_filename(in, fn_list) != 0) {
+ fprintf(pysamerr, "[depad] failed to load reference file \"%s\".\n", fn_list);
ret = 1;
goto depad_end;
}
- if (in->header->text == 0 || in->header->l_text == 0) {
- fprintf(pysamerr, "[depad] Warning - failed to read any header text from \"%s\".\n", argv[optind]);
- assert (0 == in->header->l_text);
- assert (0 == in->header->text);
+ if ((h = sam_hdr_read(in)) == 0) {
+ fprintf(pysamerr, "[depad] failed to read the header from \"%s\".\n", argv[optind]);
+ ret = 1;
+ goto depad_end;
}
- if (fn_ref) {
- h = fix_header(in->header, fai);
+ if (fai) {
+ h_fix = fix_header(h, fai);
} else {
fprintf(pysamerr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n");
- h = in->header;
+ h_fix = h;
}
- if ((out = samopen(fn_out? fn_out : "-", out_mode, h)) == 0) {
+ char wmode[2];
+ strcat(out_mode, sam_open_mode(wmode, fn_out, NULL)==0 ? wmode : "b");
+ if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) {
fprintf(pysamerr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
ret = 1;
goto depad_end;
}
+ // Reference-based CRAM won't work unless we also create a new reference.
+ // We could embed this, but for now we take the easy option.
+ if (ga.out.format == cram)
+ hts_set_opt(out, CRAM_OPT_NO_REF, 1);
+
+ if (sam_hdr_write(out, h_fix) != 0) {
+ fprintf(pysamerr, "[depad] failed to write header.\n");
+ ret = 1;
+ goto depad_end;
+ }
+
// Do the depad
- ret = bam_pad2unpad(in, out, fai);
+ ret = bam_pad2unpad(in, out, h, fai);
depad_end:
// close files, free and return
if (fai) fai_destroy(fai);
- if (! (in && h == in->header)) bam_header_destroy(h);
- samclose(in);
- samclose(out);
+ if (h) bam_hdr_destroy(h);
+ sam_close(in);
+ sam_close(out);
free(fn_list); free(fn_out);
return ret;
}
@@ -495,14 +583,17 @@ static int usage(int is_long_help)
{
fprintf(pysamerr, "\n");
fprintf(pysamerr, "Usage: samtools depad <in.bam>\n\n");
- fprintf(pysamerr, "Options: -s output is SAM (default is BAM)\n");
- fprintf(pysamerr, " -S input is SAM (default is BAM)\n");
- fprintf(pysamerr, " -u uncompressed BAM output (can't use with -s)\n");
- fprintf(pysamerr, " -1 fast compression BAM output (can't use with -s)\n");
- fprintf(pysamerr, " -T FILE padded reference sequence file [null]\n");
- fprintf(pysamerr, " -o FILE output file name [stdout]\n");
- fprintf(pysamerr, " -? longer help\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysamerr, "Options:\n");
+ fprintf(pysamerr, " -s Output is SAM (default is BAM)\n");
+ fprintf(pysamerr, " -S Input is SAM (default is BAM)\n");
+ fprintf(pysamerr, " -u Uncompressed BAM output (can't use with -s)\n");
+ fprintf(pysamerr, " -1 Fast compression BAM output (can't use with -s)\n");
+ fprintf(pysamerr, " -T, --reference FILE\n");
+ fprintf(pysamerr, " Padded reference sequence file [null]\n");
+ fprintf(pysamerr, " -o FILE Output file name [stdout]\n");
+ fprintf(pysamerr, " -? Longer help\n");
+ sam_global_opt_help(pysamerr, "-...-");
+
if (is_long_help)
fprintf(pysamerr, "Notes:\n\
\n\
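
/*
 * A minimal sketch (not from the upstream sources) of the write-mode handling
 * that the reworked main_pad2unpad above relies on: sam_open_mode() guesses
 * the format letter from the output filename, BAM is the fallback, and the
 * -u/-1 compression level is appended as a digit. build_write_mode and its
 * arguments are illustrative names only.
 */
#include <stdio.h>
#include <string.h>
#include "htslib/sam.h"

static void build_write_mode(char *out_mode, size_t sz,
                             const char *fn_out, int compress_level)
{
    char fmt[8] = "";                        /* receives "b", "c" or "" */
    snprintf(out_mode, sz, "wh");            /* write, always emit a header */
    if (fn_out && sam_open_mode(fmt, fn_out, NULL) == 0)
        strncat(out_mode, fmt, sz - strlen(out_mode) - 1);   /* .bam -> "b", .cram -> "c" */
    else
        strncat(out_mode, "b", sz - strlen(out_mode) - 1);   /* unknown or stdout: default to BAM */
    if (compress_level >= 0) {               /* -u sets 0, -1 sets 1 */
        char lvl[2] = { (char)('0' + compress_level), '\0' };
        strncat(out_mode, lvl, sz - strlen(out_mode) - 1);
    }
}
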
diff --git a/samtools/phase.c b/samtools/phase.c
index 23a15a7..0667ea5 100644
--- a/samtools/phase.c
+++ b/samtools/phase.c
@@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */
#include <zlib.h>
#include "htslib/sam.h"
#include "errmod.h"
+#include "sam_opts.h"
#include "htslib/kseq.h"
KSTREAM_INIT(gzFile, gzread, 16384)
@@ -77,8 +78,6 @@ typedef khash_t(64) nseq_t;
#include "htslib/ksort.h"
KSORT_INIT(rseq, frag_p, rseq_lt)
-extern const char bam_nt16_nt4_table[];
-
static inline uint64_t X31_hash_string(const char *s)
{
uint64_t h = *s;
@@ -539,7 +538,7 @@ static int gl2cns(float q[16])
int main_phase(int argc, char *argv[])
{
- int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0;
+ int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0;
const bam_pileup1_t *plp;
bam_plp_t iter;
nseq_t *seqs;
@@ -550,10 +549,16 @@ int main_phase(int argc, char *argv[])
errmod_t *em;
uint16_t *bases;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
+
memset(&g, 0, sizeof(phaseg_t));
g.flag = FLAG_FIX_CHIMERA;
g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256;
- while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:A:")) >= 0) {
+ while ((c = getopt_long(argc, argv, "Q:eFq:k:b:l:D:A:", lopts, NULL)) >= 0) {
switch (c) {
case 'D': g.max_depth = atoi(optarg); break;
case 'q': g.min_varLOD = atoi(optarg); break;
@@ -564,9 +569,13 @@ int main_phase(int argc, char *argv[])
case 'A': g.flag |= FLAG_DROP_AMBI; break;
case 'b': g.pre = strdup(optarg); break;
case 'l': fn_list = strdup(optarg); break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': usage=1; break;
}
+ if (usage) break;
}
- if (argc == optind) {
+ if (usage || argc == optind) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: samtools phase [options] <in.bam>\n\n");
fprintf(stderr, "Options: -k INT block length [%d]\n", g.k);
@@ -579,19 +588,31 @@ int main_phase(int argc, char *argv[])
fprintf(stderr, " -A drop reads with ambiguous phase\n");
// fprintf(stderr, " -e do not discover SNPs (effective with -l)\n");
fprintf(stderr, "\n");
+
+ sam_global_opt_help(stderr, "-....");
+
return 1;
}
- g.fp = sam_open(argv[optind], "r");
+ g.fp = sam_open_format(argv[optind], "r", &ga.in);
g.fp_hdr = sam_hdr_read(g.fp);
+ if (g.fp_hdr == NULL) {
+ fprintf(stderr, "Failed to read header for '%s'\n", argv[optind]);
+ return 1;
+ }
if (fn_list) { // read the list of sites to phase
set = loadpos(fn_list, g.fp_hdr);
free(fn_list);
} else g.flag &= ~FLAG_LIST_EXCL;
if (g.pre) { // open BAMs to write
char *s = (char*)malloc(strlen(g.pre) + 20);
- strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = sam_open(s, "wb");
- strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = sam_open(s, "wb");
- strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = sam_open(s, "wb");
+ if (ga.out.format == unknown_format)
+ ga.out.format = bam; // default via "wb".
+ strcpy(s, g.pre); strcat(s, ".0."); strcat(s, hts_format_file_extension(&ga.out));
+ g.out[0] = sam_open_format(s, "wb", &ga.out);
+ strcpy(s, g.pre); strcat(s, ".1."); strcat(s, hts_format_file_extension(&ga.out));
+ g.out[1] = sam_open_format(s, "wb", &ga.out);
+ strcpy(s, g.pre); strcat(s, ".chimera."); strcat(s, hts_format_file_extension(&ga.out));
+ g.out[2] = sam_open_format(s, "wb", &ga.out);
for (c = 0; c <= 2; ++c) {
g.out_hdr[c] = bam_hdr_dup(g.fp_hdr);
sam_hdr_write(g.out[c], g.out_hdr[c]);
@@ -643,7 +664,7 @@ int main_phase(int argc, char *argv[])
baseQ = bam_get_qual(p->b)[p->qpos];
if (baseQ < g.min_baseQ) continue;
seq = bam_get_seq(p->b);
- b = bam_nt16_nt4_table[bam_seqi(seq, p->qpos)];
+ b = seq_nt16_int[bam_seqi(seq, p->qpos)];
if (b > 3) continue;
q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
if (q < 4) q = 4;
@@ -671,7 +692,7 @@ int main_phase(int argc, char *argv[])
if (p->is_del || p->is_refskip) continue;
if (p->b->core.qual == 0) continue;
// get the base code
- c = bam_nt16_nt4_table[(int)bam_seqi(seq, p->qpos)];
+ c = seq_nt16_int[bam_seqi(seq, p->qpos)];
if (c == (cns[vpos]&3)) c = 1;
else if (c == (cns[vpos]>>16&3)) c = 2;
else c = 0;
@@ -718,5 +739,6 @@ int main_phase(int argc, char *argv[])
}
free(g.pre); free(g.b);
}
+ sam_global_args_free(&ga);
return 0;
}
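
/*
 * A small sketch (not from the upstream sources) of the output-naming scheme
 * that phase now uses above: the suffix comes from hts_format_file_extension()
 * so "--output-fmt cram" produces prefix.0.cram, prefix.1.cram and
 * prefix.chimera.cram instead of a hard-coded ".bam". open_split_output is an
 * illustrative helper name.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "htslib/sam.h"

static samFile *open_split_output(const char *prefix, const char *tag,
                                  const htsFormat *fmt)
{
    const char *ext = hts_format_file_extension(fmt);   /* "sam", "bam" or "cram" */
    char *fn = malloc(strlen(prefix) + strlen(tag) + strlen(ext) + 3);
    if (fn == NULL) return NULL;
    sprintf(fn, "%s.%s.%s", prefix, tag, ext);
    samFile *out = sam_open_format(fn, "wb", fmt);       /* fmt, when set, overrides the 'b' */
    free(fn);
    return out;
}
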
diff --git a/samtools/phase.c.pysam.c b/samtools/phase.c.pysam.c
index 420f759..bc1d455 100644
--- a/samtools/phase.c.pysam.c
+++ b/samtools/phase.c.pysam.c
@@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */
#include <zlib.h>
#include "htslib/sam.h"
#include "errmod.h"
+#include "sam_opts.h"
#include "htslib/kseq.h"
KSTREAM_INIT(gzFile, gzread, 16384)
@@ -79,8 +80,6 @@ typedef khash_t(64) nseq_t;
#include "htslib/ksort.h"
KSORT_INIT(rseq, frag_p, rseq_lt)
-extern const char bam_nt16_nt4_table[];
-
static inline uint64_t X31_hash_string(const char *s)
{
uint64_t h = *s;
@@ -541,7 +540,7 @@ static int gl2cns(float q[16])
int main_phase(int argc, char *argv[])
{
- int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0;
+ int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0;
const bam_pileup1_t *plp;
bam_plp_t iter;
nseq_t *seqs;
@@ -552,10 +551,16 @@ int main_phase(int argc, char *argv[])
errmod_t *em;
uint16_t *bases;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ { NULL, 0, NULL, 0 }
+ };
+
memset(&g, 0, sizeof(phaseg_t));
g.flag = FLAG_FIX_CHIMERA;
g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256;
- while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:A:")) >= 0) {
+ while ((c = getopt_long(argc, argv, "Q:eFq:k:b:l:D:A:", lopts, NULL)) >= 0) {
switch (c) {
case 'D': g.max_depth = atoi(optarg); break;
case 'q': g.min_varLOD = atoi(optarg); break;
@@ -566,9 +571,13 @@ int main_phase(int argc, char *argv[])
case 'A': g.flag |= FLAG_DROP_AMBI; break;
case 'b': g.pre = strdup(optarg); break;
case 'l': fn_list = strdup(optarg); break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': usage=1; break;
}
+ if (usage) break;
}
- if (argc == optind) {
+ if (usage || argc == optind) {
fprintf(pysamerr, "\n");
fprintf(pysamerr, "Usage: samtools phase [options] <in.bam>\n\n");
fprintf(pysamerr, "Options: -k INT block length [%d]\n", g.k);
@@ -581,19 +590,31 @@ int main_phase(int argc, char *argv[])
fprintf(pysamerr, " -A drop reads with ambiguous phase\n");
// fprintf(pysamerr, " -e do not discover SNPs (effective with -l)\n");
fprintf(pysamerr, "\n");
+
+ sam_global_opt_help(pysamerr, "-....");
+
return 1;
}
- g.fp = sam_open(argv[optind], "r");
+ g.fp = sam_open_format(argv[optind], "r", &ga.in);
g.fp_hdr = sam_hdr_read(g.fp);
+ if (g.fp_hdr == NULL) {
+ fprintf(pysamerr, "Failed to read header for '%s'\n", argv[optind]);
+ return 1;
+ }
if (fn_list) { // read the list of sites to phase
set = loadpos(fn_list, g.fp_hdr);
free(fn_list);
} else g.flag &= ~FLAG_LIST_EXCL;
if (g.pre) { // open BAMs to write
char *s = (char*)malloc(strlen(g.pre) + 20);
- strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = sam_open(s, "wb");
- strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = sam_open(s, "wb");
- strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = sam_open(s, "wb");
+ if (ga.out.format == unknown_format)
+ ga.out.format = bam; // default via "wb".
+ strcpy(s, g.pre); strcat(s, ".0."); strcat(s, hts_format_file_extension(&ga.out));
+ g.out[0] = sam_open_format(s, "wb", &ga.out);
+ strcpy(s, g.pre); strcat(s, ".1."); strcat(s, hts_format_file_extension(&ga.out));
+ g.out[1] = sam_open_format(s, "wb", &ga.out);
+ strcpy(s, g.pre); strcat(s, ".chimera."); strcat(s, hts_format_file_extension(&ga.out));
+ g.out[2] = sam_open_format(s, "wb", &ga.out);
for (c = 0; c <= 2; ++c) {
g.out_hdr[c] = bam_hdr_dup(g.fp_hdr);
sam_hdr_write(g.out[c], g.out_hdr[c]);
@@ -645,7 +666,7 @@ int main_phase(int argc, char *argv[])
baseQ = bam_get_qual(p->b)[p->qpos];
if (baseQ < g.min_baseQ) continue;
seq = bam_get_seq(p->b);
- b = bam_nt16_nt4_table[bam_seqi(seq, p->qpos)];
+ b = seq_nt16_int[bam_seqi(seq, p->qpos)];
if (b > 3) continue;
q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
if (q < 4) q = 4;
@@ -673,7 +694,7 @@ int main_phase(int argc, char *argv[])
if (p->is_del || p->is_refskip) continue;
if (p->b->core.qual == 0) continue;
// get the base code
- c = bam_nt16_nt4_table[(int)bam_seqi(seq, p->qpos)];
+ c = seq_nt16_int[bam_seqi(seq, p->qpos)];
if (c == (cns[vpos]&3)) c = 1;
else if (c == (cns[vpos]>>16&3)) c = 2;
else c = 0;
@@ -720,5 +741,6 @@ int main_phase(int argc, char *argv[])
}
free(g.pre); free(g.b);
}
+ sam_global_args_free(&ga);
return 0;
}
diff --git a/samtools/sam.c b/samtools/sam.c
index 9f5f6a0..d6cc9f6 100644
--- a/samtools/sam.c
+++ b/samtools/sam.c
@@ -1,6 +1,6 @@
/* sam.c -- format-neutral SAM/BAM API.
- Copyright (C) 2009, 2012-2014 Genome Research Ltd.
+ Copyright (C) 2009, 2012-2015 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -45,16 +45,28 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux)
fp->file = hts_fp;
fp->x.bam = hts_fp->fp.bgzf;
if (strchr(mode, 'r')) {
- if (aux) hts_set_fai_filename(fp->file, aux);
+ if (aux) {
+ if (hts_set_fai_filename(fp->file, aux) != 0) {
+ sam_close(hts_fp);
+ free(fp);
+ return NULL;
+ }
+ }
fp->header = sam_hdr_read(fp->file); // samclose() will free this
+ if (fp->header == NULL) {
+ sam_close(hts_fp);
+ free(fp);
+ return NULL;
+ }
fp->is_write = 0;
if (fp->header->n_targets == 0 && bam_verbose >= 1)
fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
}
else {
+ enum htsExactFormat fmt = hts_get_format(fp->file)->format;
fp->header = (bam_hdr_t *)aux; // For writing, we won't free it
fp->is_write = 1;
- if (hts_get_format(fp->file)->format != sam || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header);
+ if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header);
}
return fp;
@@ -69,6 +81,17 @@ void samclose(samfile_t *fp)
}
}
+int samfetch(samfile_t *fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+{
+ bam1_t *b = bam_init1();
+ hts_itr_t *iter = sam_itr_queryi(idx, tid, beg, end);
+ int ret;
+ while ((ret = sam_itr_next(fp->file, iter, b)) >= 0) func(b, data);
+ hts_itr_destroy(iter);
+ bam_destroy1(b);
+ return (ret == -1)? 0 : ret;
+}
+
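
/*
 * A usage sketch (not from the upstream sources) for the samfetch() wrapper
 * reinstated above. The callback has the legacy bam_fetch_f shape and is
 * invoked once per alignment overlapping the region; count_cb, count_region
 * and the coordinates are illustrative only.
 */
static int count_cb(const bam1_t *b, void *data)
{
    (void)b;
    ++*(long *)data;                 /* tally every record in the region */
    return 0;
}

static long count_region(samfile_t *fp, const bam_index_t *idx,
                         int tid, int beg, int end)
{
    long n = 0;
    samfetch(fp, idx, tid, beg, end, &n, count_cb);
    return n;
}
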
int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data)
{
bam_plbuf_t *buf;
diff --git a/samtools/sam.c.pysam.c b/samtools/sam.c.pysam.c
index 3a2d860..e7c4cac 100644
--- a/samtools/sam.c.pysam.c
+++ b/samtools/sam.c.pysam.c
@@ -2,7 +2,7 @@
/* sam.c -- format-neutral SAM/BAM API.
- Copyright (C) 2009, 2012-2014 Genome Research Ltd.
+ Copyright (C) 2009, 2012-2015 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -47,16 +47,28 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux)
fp->file = hts_fp;
fp->x.bam = hts_fp->fp.bgzf;
if (strchr(mode, 'r')) {
- if (aux) hts_set_fai_filename(fp->file, aux);
+ if (aux) {
+ if (hts_set_fai_filename(fp->file, aux) != 0) {
+ sam_close(hts_fp);
+ free(fp);
+ return NULL;
+ }
+ }
fp->header = sam_hdr_read(fp->file); // samclose() will free this
+ if (fp->header == NULL) {
+ sam_close(hts_fp);
+ free(fp);
+ return NULL;
+ }
fp->is_write = 0;
if (fp->header->n_targets == 0 && bam_verbose >= 1)
fprintf(pysamerr, "[samopen] no @SQ lines in the header.\n");
}
else {
+ enum htsExactFormat fmt = hts_get_format(fp->file)->format;
fp->header = (bam_hdr_t *)aux; // For writing, we won't free it
fp->is_write = 1;
- if (hts_get_format(fp->file)->format != sam || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header);
+ if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header);
}
return fp;
@@ -71,6 +83,17 @@ void samclose(samfile_t *fp)
}
}
+int samfetch(samfile_t *fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+{
+ bam1_t *b = bam_init1();
+ hts_itr_t *iter = sam_itr_queryi(idx, tid, beg, end);
+ int ret;
+ while ((ret = sam_itr_next(fp->file, iter, b)) >= 0) func(b, data);
+ hts_itr_destroy(iter);
+ bam_destroy1(b);
+ return (ret == -1)? 0 : ret;
+}
+
int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data)
{
bam_plbuf_t *buf;
diff --git a/samtools/sam.h b/samtools/sam.h
index e642920..5130105 100644
--- a/samtools/sam.h
+++ b/samtools/sam.h
@@ -1,6 +1,6 @@
/* sam.h -- format-neutral SAM/BAM API.
- Copyright (C) 2009, 2013, 2014 Genome Research Ltd.
+ Copyright (C) 2009, 2013-2015 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -103,6 +103,30 @@ extern "C" {
static inline int samwrite(samfile_t *fp, const bam1_t *b) { return sam_write1(fp->file, fp->header, b); }
/*!
+ @abstract Load BAM/CRAM index for use with samfetch()
+ @param fp file handler
+ @param fn name of the BAM or CRAM file (NOT the index file)
+ @return pointer to the index structure
+ */
+ static inline bam_index_t *samtools_sam_index_load(samfile_t *fp, const char *fn) { return sam_index_load(fp->file, fn); }
+ #undef sam_index_load
+ #define sam_index_load(fp,fn) (samtools_sam_index_load((fp), (fn)))
+
+ /*!
+ @abstract Retrieve the alignments overlapping the specified region.
+ @discussion A user defined function will be called for each
+ retrieved alignment ordered by its start position.
+ @param fp file handler
+ @param idx index returned by sam_index_load()
+ @param tid chromosome ID as is defined in the header
+ @param beg start coordinate, 0-based
+ @param end end coordinate, 0-based
+ @param data user provided data (will be transferred to func)
+ @param func user defined function
+ */
+ int samfetch(samfile_t *fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
+
+ /*!
@abstract Get the pileup for a whole alignment file
@param fp file handler
@param mask mask transferred to bam_plbuf_set_mask()
diff --git a/samtools/sam_opts.c b/samtools/sam_opts.c
new file mode 100644
index 0000000..0ed197e
--- /dev/null
+++ b/samtools/sam_opts.c
@@ -0,0 +1,153 @@
+/* sam_opts.c -- utilities to aid parsing common command line options.
+
+ Copyright (C) 2015 Genome Research Ltd.
+
+ Author: James Bonfield <jkb at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sam_opts.h"
+
+/*
+ * Processes a standard "global" samtools long option.
+ *
+ * The 'c' value is the return value from a getopt_long() call. It is checked
+ * against the lopt[] array to find the corresponding value as this may have
+ * been reassigned by the individual subcommand.
+ *
+ * Having found the entry, the corresponding long form is used to apply the
+ * option, storing the setting in sam_global_args *ga.
+ *
+ * Returns 0 on success,
+ * -1 on failure.
+ */
+int parse_sam_global_opt(int c, const char *optarg, const struct option *lopt,
+ sam_global_args *ga) {
+ int r = 0;
+
+ while (lopt->name) {
+ if (c != lopt->val) {
+ lopt++;
+ continue;
+ }
+
+ if (strcmp(lopt->name, "input-fmt") == 0) {
+ r = hts_parse_format(&ga->in, optarg);
+ break;
+ } else if (strcmp(lopt->name, "input-fmt-option") == 0) {
+ r = hts_opt_add((hts_opt **)&ga->in.specific, optarg);
+ break;
+ } else if (strcmp(lopt->name, "output-fmt") == 0) {
+ r = hts_parse_format(&ga->out, optarg);
+ break;
+ } else if (strcmp(lopt->name, "output-fmt-option") == 0) {
+ r = hts_opt_add((hts_opt **)&ga->out.specific, optarg);
+ break;
+ } else if (strcmp(lopt->name, "reference") == 0) {
+ char *ref = malloc(10 + strlen(optarg) + 1);
+ sprintf(ref, "reference=%s", optarg);
+ ga->reference = strdup(optarg);
+ r = hts_opt_add((hts_opt **)&ga->in.specific, ref);
+ r |= hts_opt_add((hts_opt **)&ga->out.specific, ref);
+ free(ref);
+ break;
+// } else if (strcmp(lopt->name, "verbose") == 0) {
+// ga->verbosity++;
+// break;
+ }
+ }
+
+ if (!lopt->name) {
+ fprintf(stderr, "Unexpected global option: %s\n", lopt->name);
+ return -1;
+ }
+
+ return r;
+}
+
+/*
+ * Report the usage for global options.
+ *
+ * This accepts a string with one character per SAM_OPT_GLOBAL_OPTIONS option
+ * to determine which options need to be printed and how.
+ * Each character should be one of:
+ * '.' No short option has been assigned. Use --long-opt only.
+ * '-' The long (and short) option has been disabled.
+ * <c> Otherwise the short option is character <c>.
+ */
+void sam_global_opt_help(FILE *fp, const char *shortopts) {
+ int i = 0;
+
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0),
+ { NULL, 0, NULL, 0 }
+ };
+
+ for (i = 0; shortopts && shortopts[i] && lopts[i].name; i++) {
+ if (shortopts[i] == '-')
+ continue;
+
+ if (shortopts[i] == '.')
+ fprintf(fp, " --");
+ else
+ fprintf(fp, " -%c, --", shortopts[i]);
+
+ if (strcmp(lopts[i].name, "input-fmt") == 0)
+ fprintf(fp,"input-fmt FORMAT[,OPT[=VAL]]...\n"
+ " Specify input format (SAM, BAM, CRAM)\n");
+ else if (strcmp(lopts[i].name, "input-fmt-option") == 0)
+ fprintf(fp,"input-fmt-option OPT[=VAL]\n"
+ " Specify a single input file format option in the form\n"
+ " of OPTION or OPTION=VALUE\n");
+ else if (strcmp(lopts[i].name, "output-fmt") == 0)
+ fprintf(fp,"output-fmt FORMAT[,OPT[=VAL]]...\n"
+ " Specify output format (SAM, BAM, CRAM)\n");
+ else if (strcmp(lopts[i].name, "output-fmt-option") == 0)
+ fprintf(fp,"output-fmt-option OPT[=VAL]\n"
+ " Specify a single output file format option in the form\n"
+ " of OPTION or OPTION=VALUE\n");
+ else if (strcmp(lopts[i].name, "reference") == 0)
+ fprintf(fp,"reference FILE\n"
+ " Reference sequence FASTA FILE [null]\n");
+// else if (strcmp(lopts[i].name, "verbose") == 0)
+// fprintf(fp,"verbose\n"
+// " Increment level of verbosity\n");
+ }
+}
+
+void sam_global_args_init(sam_global_args *ga) {
+ if (!ga)
+ return;
+
+ memset(ga, 0, sizeof(*ga));
+}
+
+void sam_global_args_free(sam_global_args *ga) {
+ if (ga->in.specific)
+ hts_opt_free(ga->in.specific);
+
+ if (ga->out.specific)
+ hts_opt_free(ga->out.specific);
+
+ if (ga->reference)
+ free(ga->reference);
+}
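
/*
 * A short sketch (not from the upstream sources) of how the --reference
 * handling above is consumed downstream: the "reference=FILE" option pushed
 * onto the htsFormat's specific list with hts_opt_add() is applied by
 * sam_open_format() to every file opened with that format, so CRAM readers
 * and writers pick up the FASTA automatically. File names are hypothetical.
 */
#include "htslib/sam.h"
#include "sam_opts.h"

static int open_with_reference_demo(void)
{
    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
    if (hts_opt_add((hts_opt **)&ga.in.specific, "reference=ref.fa") != 0)
        return -1;
    samFile *in = sam_open_format("in.cram", "r", &ga.in);   /* decodes against ref.fa */
    if (in == NULL) { sam_global_args_free(&ga); return -1; }
    sam_close(in);
    sam_global_args_free(&ga);
    return 0;
}
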
diff --git a/samtools/sam_opts.c.pysam.c b/samtools/sam_opts.c.pysam.c
new file mode 100644
index 0000000..c976438
--- /dev/null
+++ b/samtools/sam_opts.c.pysam.c
@@ -0,0 +1,155 @@
+#include "pysam.h"
+
+/* sam_opts.c -- utilities to aid parsing common command line options.
+
+ Copyright (C) 2015 Genome Research Ltd.
+
+ Author: James Bonfield <jkb at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sam_opts.h"
+
+/*
+ * Processes a standard "global" samtools long option.
+ *
+ * The 'c' value is the return value from a getopt_long() call. It is checked
+ * against the lopt[] array to find the corresponding value as this may have
+ * been reassigned by the individual subcommand.
+ *
+ * Having found the entry, the corresponding long form is used to apply the
+ * option, storing the setting in sam_global_args *ga.
+ *
+ * Returns 0 on success,
+ * -1 on failure.
+ */
+int parse_sam_global_opt(int c, const char *optarg, const struct option *lopt,
+ sam_global_args *ga) {
+ int r = 0;
+
+ while (lopt->name) {
+ if (c != lopt->val) {
+ lopt++;
+ continue;
+ }
+
+ if (strcmp(lopt->name, "input-fmt") == 0) {
+ r = hts_parse_format(&ga->in, optarg);
+ break;
+ } else if (strcmp(lopt->name, "input-fmt-option") == 0) {
+ r = hts_opt_add((hts_opt **)&ga->in.specific, optarg);
+ break;
+ } else if (strcmp(lopt->name, "output-fmt") == 0) {
+ r = hts_parse_format(&ga->out, optarg);
+ break;
+ } else if (strcmp(lopt->name, "output-fmt-option") == 0) {
+ r = hts_opt_add((hts_opt **)&ga->out.specific, optarg);
+ break;
+ } else if (strcmp(lopt->name, "reference") == 0) {
+ char *ref = malloc(10 + strlen(optarg) + 1);
+ sprintf(ref, "reference=%s", optarg);
+ ga->reference = strdup(optarg);
+ r = hts_opt_add((hts_opt **)&ga->in.specific, ref);
+ r |= hts_opt_add((hts_opt **)&ga->out.specific, ref);
+ free(ref);
+ break;
+// } else if (strcmp(lopt->name, "verbose") == 0) {
+// ga->verbosity++;
+// break;
+ }
+ }
+
+ if (!lopt->name) {
+ fprintf(pysamerr, "Unexpected global option: %s\n", lopt->name);
+ return -1;
+ }
+
+ return r;
+}
+
+/*
+ * Report the usage for global options.
+ *
+ * This accepts a string with one character per SAM_OPT_GLOBAL_OPTIONS option
+ * to determine which options need to be printed and how.
+ * Each character should be one of:
+ * '.' No short option has been assigned. Use --long-opt only.
+ * '-' The long (and short) option has been disabled.
+ * <c> Otherwise the short option is character <c>.
+ */
+void sam_global_opt_help(FILE *fp, const char *shortopts) {
+ int i = 0;
+
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0),
+ { NULL, 0, NULL, 0 }
+ };
+
+ for (i = 0; shortopts && shortopts[i] && lopts[i].name; i++) {
+ if (shortopts[i] == '-')
+ continue;
+
+ if (shortopts[i] == '.')
+ fprintf(fp, " --");
+ else
+ fprintf(fp, " -%c, --", shortopts[i]);
+
+ if (strcmp(lopts[i].name, "input-fmt") == 0)
+ fprintf(fp,"input-fmt FORMAT[,OPT[=VAL]]...\n"
+ " Specify input format (SAM, BAM, CRAM)\n");
+ else if (strcmp(lopts[i].name, "input-fmt-option") == 0)
+ fprintf(fp,"input-fmt-option OPT[=VAL]\n"
+ " Specify a single input file format option in the form\n"
+ " of OPTION or OPTION=VALUE\n");
+ else if (strcmp(lopts[i].name, "output-fmt") == 0)
+ fprintf(fp,"output-fmt FORMAT[,OPT[=VAL]]...\n"
+ " Specify output format (SAM, BAM, CRAM)\n");
+ else if (strcmp(lopts[i].name, "output-fmt-option") == 0)
+ fprintf(fp,"output-fmt-option OPT[=VAL]\n"
+ " Specify a single output file format option in the form\n"
+ " of OPTION or OPTION=VALUE\n");
+ else if (strcmp(lopts[i].name, "reference") == 0)
+ fprintf(fp,"reference FILE\n"
+ " Reference sequence FASTA FILE [null]\n");
+// else if (strcmp(lopts[i].name, "verbose") == 0)
+// fprintf(fp,"verbose\n"
+// " Increment level of verbosity\n");
+ }
+}
+
+void sam_global_args_init(sam_global_args *ga) {
+ if (!ga)
+ return;
+
+ memset(ga, 0, sizeof(*ga));
+}
+
+void sam_global_args_free(sam_global_args *ga) {
+ if (ga->in.specific)
+ hts_opt_free(ga->in.specific);
+
+ if (ga->out.specific)
+ hts_opt_free(ga->out.specific);
+
+ if (ga->reference)
+ free(ga->reference);
+}
diff --git a/samtools/sam_opts.h b/samtools/sam_opts.h
new file mode 100644
index 0000000..25e9279
--- /dev/null
+++ b/samtools/sam_opts.h
@@ -0,0 +1,99 @@
+/* sam_opts.h -- utilities to aid parsing common command line options.
+
+ Copyright (C) 2015 Genome Research Ltd.
+
+ Author: James Bonfield <jkb at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef SAM_OPTS_H
+#define SAM_OPTS_H
+
+#include <stdio.h>
+#include <limits.h>
+#include <getopt.h>
+#include <htslib/hts.h>
+
+typedef struct sam_global_args {
+ htsFormat in;
+ htsFormat out;
+ char *reference;
+ //int verbosity;
+} sam_global_args;
+
+#define SAM_GLOBAL_ARGS_INIT {{0},{0}}
+
+enum {
+ SAM_OPT_INPUT_FMT = CHAR_MAX+1,
+ SAM_OPT_INPUT_FMT_OPTION,
+ SAM_OPT_OUTPUT_FMT,
+ SAM_OPT_OUTPUT_FMT_OPTION,
+ SAM_OPT_REFERENCE,
+ //SAM_OPT_VERBOSE
+};
+
+#define SAM_OPT_VAL(val, defval) ((val) == '-')? '?' : (val)? (val) : (defval)
+
+// Use this within struct option lopts[] = {...} to add the standard global
+// options. The arguments determine whether the corresponding option is
+// enabled and, if so, whether it has a short option equivalent:
+// 0 No short option has been assigned. Use --long-opt only.
+// '-' Both long and short options are disabled.
+// <c> Otherwise the equivalent short option is character <c>.
+#define SAM_OPT_GLOBAL_OPTIONS(o1, o2, o3, o4, o5) \
+ {"input-fmt", required_argument, NULL, SAM_OPT_VAL(o1, SAM_OPT_INPUT_FMT)}, \
+ {"input-fmt-option", required_argument, NULL, SAM_OPT_VAL(o2, SAM_OPT_INPUT_FMT_OPTION)}, \
+ {"output-fmt", required_argument, NULL, SAM_OPT_VAL(o3, SAM_OPT_OUTPUT_FMT)}, \
+ {"output-fmt-option", required_argument, NULL, SAM_OPT_VAL(o4, SAM_OPT_OUTPUT_FMT_OPTION)}, \
+ {"reference", required_argument, NULL, SAM_OPT_VAL(o5, SAM_OPT_REFERENCE)}
+ //{"verbose", no_argument, NULL, SAM_OPT_VERBOSE}
+
+/*
+ * Processes a standard "global" samtools long option.
+ *
+ * The 'c' value is the return value from a getopt_long() call. It is checked
+ * against the lopt[] array to find the corresponding value as this may have
+ * been reassigned by the individual subcommand.
+ *
+ * Having found the entry, the corresponding long form is used to apply the
+ * option, storing the setting in sam_global_args *ga.
+ *
+ * Returns 0 on success,
+ * -1 on failure.
+ */
+int parse_sam_global_opt(int c, const char *optarg, const struct option *lopt,
+ sam_global_args *ga);
+
+/*
+ * Report the usage for global options.
+ *
+ * This accepts a string with one character per SAM_OPT_GLOBAL_OPTIONS option
+ * to determine which options need to be printed and how.
+ * Each character should be one of:
+ * '.' No short option has been assigned. Use --long-opt only.
+ * '-' The long (and short) option has been disabled.
+ * <c> Otherwise the short option is character <c>.
+ */
+void sam_global_opt_help(FILE *fp, const char *shortopts);
+
+
+void sam_global_args_init(sam_global_args *ga);
+void sam_global_args_free(sam_global_args *ga);
+
+#endif
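
/*
 * A wiring sketch (not from the upstream sources) of the pattern every
 * converted subcommand in this commit follows. The five SAM_OPT_GLOBAL_OPTIONS
 * arguments and the sam_global_opt_help() string correspond position for
 * position: 0/'.' means long option only, '-' disables the option, any other
 * character becomes the short option. "mytool" and its -v flag are
 * hypothetical.
 */
#include <getopt.h>
#include <stdio.h>
#include "htslib/sam.h"
#include "sam_opts.h"

int main_mytool(int argc, char *argv[])
{
    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
    static const struct option lopts[] = {
        SAM_OPT_GLOBAL_OPTIONS(0, '-', 'O', 0, 'T'),   /* --input-fmt long only, */
        { NULL, 0, NULL, 0 }                           /* -O/--output-fmt, -T/--reference */
    };
    int c;
    while ((c = getopt_long(argc, argv, "O:T:v", lopts, NULL)) >= 0) {
        switch (c) {
        case 'v': /* tool-specific option */ break;
        default:
            if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
            /* else fall through to usage */
        case '?':
            sam_global_opt_help(stderr, ".-O.T");      /* mirrors the macro arguments */
            return 1;
        }
    }
    samFile *in = sam_open_format(optind < argc ? argv[optind] : "-", "r", &ga.in);
    if (in == NULL) { sam_global_args_free(&ga); return 1; }
    sam_close(in);
    sam_global_args_free(&ga);
    return 0;
}
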
diff --git a/samtools/sam_view.c b/samtools/sam_view.c
index 55e7e3d..4358a1c 100644
--- a/samtools/sam_view.c
+++ b/samtools/sam_view.c
@@ -1,6 +1,6 @@
/* sam_view.c -- SAM<->BAM<->CRAM conversion.
- Copyright (C) 2009-2014 Genome Research Ltd.
+ Copyright (C) 2009-2015 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -31,11 +31,13 @@ DEALINGS IN THE SOFTWARE. */
#include <inttypes.h>
#include <stdbool.h>
#include <assert.h>
+#include <getopt.h>
#include "htslib/sam.h"
#include "htslib/faidx.h"
#include "htslib/kstring.h"
#include "htslib/khash.h"
#include "samtools.h"
+#include "sam_opts.h"
KHASH_SET_INIT_STR(rg)
typedef khash_t(rg) *rghash_t;
@@ -139,9 +141,9 @@ static char *drop_rg(char *hdtxt, rghash_t h, int *len)
return str.s;
}
-static int usage(int is_long_help);
+static int usage(FILE *fp, int exit_status, int is_long_help);
-static int add_read_group_single(samview_settings_t *settings, char *name)
+static int add_read_group_single(const char *subcmd, samview_settings_t *settings, char *name)
{
char *d = strdup(name);
int ret = 0;
@@ -159,12 +161,12 @@ static int add_read_group_single(samview_settings_t *settings, char *name)
return 0;
err:
- print_error("Couldn't add \"%s\" to read group list: memory exhausted?", name);
+ print_error(subcmd, "Couldn't add \"%s\" to read group list: memory exhausted?", name);
free(d);
return -1;
}
-static int add_read_groups_file(samview_settings_t *settings, char *fn)
+static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
{
FILE *fp;
char buf[1024];
@@ -179,7 +181,7 @@ static int add_read_groups_file(samview_settings_t *settings, char *fn)
fp = fopen(fn, "r");
if (fp == NULL) {
- print_error_errno("failed to open \"%s\" for reading", fn);
+ print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
return -1;
}
@@ -194,7 +196,7 @@ static int add_read_groups_file(samview_settings_t *settings, char *fn)
}
if (ferror(fp)) ret = -1;
if (ret == -1) {
- print_error_errno("failed to read \"%s\"", fn);
+ print_error_errno(subcmd, "failed to read \"%s\"", fn);
}
fclose(fp);
return (ret != -1) ? 0 : -1;
@@ -205,21 +207,21 @@ static inline int check_sam_write1(samFile *fp, const bam_hdr_t *h, const bam1_t
int r = sam_write1(fp, h, b);
if (r >= 0) return r;
- if (fname) print_error_errno("writing to \"%s\" failed", fname);
- else print_error_errno("writing to standard output failed");
+ if (fname) print_error_errno("view", "writing to \"%s\" failed", fname);
+ else print_error_errno("view", "writing to standard output failed");
*retp = EXIT_FAILURE;
return r;
}
-static void check_sam_close(samFile *fp, const char *fname, const char *null_fname, int *retp)
+static void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp)
{
int r = sam_close(fp);
if (r >= 0) return;
// TODO Need error infrastructure so we can print a message instead of r
- if (fname) print_error("error closing \"%s\": %d", fname, r);
- else print_error("error closing %s: %d", null_fname, r);
+ if (fname) print_error(subcmd, "error closing \"%s\": %d", fname, r);
+ else print_error(subcmd, "error closing %s: %d", null_fname, r);
*retp = EXIT_FAILURE;
}
@@ -231,7 +233,9 @@ int main_samview(int argc, char *argv[])
int64_t count = 0;
samFile *in = 0, *out = 0, *un_out=0;
bam_hdr_t *header = NULL;
- char out_mode[5], *out_format = "", *fn_out = 0, *fn_list = 0, *fn_ref = 0, *q, *fn_un_out = 0;
+ char out_mode[5], out_un_mode[5], *out_format = "";
+ char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
samview_settings_t settings = {
.rghash = NULL,
@@ -246,10 +250,18 @@ int main_samview(int argc, char *argv[])
.bed = NULL,
};
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T'),
+ { "threads", required_argument, NULL, '@' },
+ { NULL, 0, NULL, 0 }
+ };
+
/* parse command-line options */
- /* TODO: convert this to getopt_long we're running out of letters */
strcpy(out_mode, "w");
- while ((c = getopt(argc, argv, "SbBcCt:h1Ho:q:f:F:ul:r:?T:R:L:s:@:m:x:U:")) >= 0) {
+ strcpy(out_un_mode, "w");
+ while ((c = getopt_long(argc, argv,
+ "SbBcCt:h1Ho:O:q:f:F:ul:r:?T:R:L:s:@:m:x:U:",
+ lopts, NULL)) >= 0) {
switch (c) {
case 's':
if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) {
@@ -276,19 +288,19 @@ int main_samview(int argc, char *argv[])
case 'l': settings.library = strdup(optarg); break;
case 'L':
if ((settings.bed = bed_read(optarg)) == NULL) {
- print_error_errno("Could not read file \"%s\"", optarg);
+ print_error_errno("view", "Could not read file \"%s\"", optarg);
ret = 1;
goto view_end;
}
break;
case 'r':
- if (add_read_group_single(&settings, optarg) != 0) {
+ if (add_read_group_single("view", &settings, optarg) != 0) {
ret = 1;
goto view_end;
}
break;
case 'R':
- if (add_read_groups_file(&settings, optarg) != 0) {
+ if (add_read_groups_file("view", &settings, optarg) != 0) {
ret = 1;
goto view_end;
}
@@ -298,43 +310,62 @@ int main_samview(int argc, char *argv[])
//case 'X': out_format = "X"; break;
*/
case '?': is_long_help = 1; break;
- case 'T': fn_ref = strdup(optarg); break;
case 'B': settings.remove_B = 1; break;
case '@': n_threads = strtol(optarg, 0, 0); break;
case 'x':
{
if (strlen(optarg) != 2) {
fprintf(stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n");
- return usage(is_long_help);
+ return usage(stderr, EXIT_FAILURE, is_long_help);
}
settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len));
settings.remove_aux[settings.remove_aux_len-1] = optarg;
}
break;
- default: return usage(is_long_help);
+
+ default:
+ if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0)
+ return usage(stderr, EXIT_FAILURE, is_long_help);
+ break;
}
}
- if (compress_level >= 0) out_format = "b";
+ if (compress_level >= 0 && !*out_format) out_format = "b";
if (is_header_only) is_header = 1;
- strcat(out_mode, out_format);
+ // File format auto-detection first
+ if (fn_out) sam_open_mode(out_mode+1, fn_out, NULL);
+ if (fn_un_out) sam_open_mode(out_un_mode+1, fn_un_out, NULL);
+ // Overridden by manual -b, -C
+ if (*out_format)
+ out_mode[1] = out_un_mode[1] = *out_format;
+ out_mode[2] = out_un_mode[2] = '\0';
+ // out_(un_)mode now 1 or 2 bytes long, followed by nul.
if (compress_level >= 0) {
char tmp[2];
tmp[0] = compress_level + '0'; tmp[1] = '\0';
strcat(out_mode, tmp);
+ strcat(out_un_mode, tmp);
}
- if (argc == optind) return usage(is_long_help); // potential memory leak...
+ if (argc == optind && isatty(STDIN_FILENO)) return usage(stdout, EXIT_SUCCESS, is_long_help); // potential memory leak...
+ fn_in = (optind < argc)? argv[optind] : "-";
// generate the fn_list if necessary
- if (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref);
+ if (fn_list == 0 && ga.reference) fn_list = samfaipath(ga.reference);
// open file handlers
- if ((in = sam_open(argv[optind], "r")) == 0) {
- print_error_errno("failed to open \"%s\" for reading", argv[optind]);
+ if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) {
+ print_error_errno("view", "failed to open \"%s\" for reading", fn_in);
ret = 1;
goto view_end;
}
- if (fn_list) hts_set_fai_filename(in, fn_list);
+
+ if (fn_list) {
+ if (hts_set_fai_filename(in, fn_list) != 0) {
+ fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ ret = 1;
+ goto view_end;
+ }
+ }
if ((header = sam_hdr_read(in)) == 0) {
- fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", argv[optind]);
+ fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", fn_in);
ret = 1;
goto view_end;
}
@@ -347,13 +378,21 @@ int main_samview(int argc, char *argv[])
header->l_text = l;
}
if (!is_count) {
- if ((out = sam_open(fn_out? fn_out : "-", out_mode)) == 0) {
- print_error_errno("failed to open \"%s\" for writing", fn_out? fn_out : "standard output");
+ if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) {
+ print_error_errno("view", "failed to open \"%s\" for writing", fn_out? fn_out : "standard output");
ret = 1;
goto view_end;
}
- if (fn_list) hts_set_fai_filename(out, fn_list);
- if (*out_format || is_header) {
+ if (fn_list) {
+ if (hts_set_fai_filename(out, fn_list) != 0) {
+ fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ ret = 1;
+ goto view_end;
+ }
+ }
+ if (*out_format || is_header ||
+ out_mode[1] == 'b' || out_mode[1] == 'c' ||
+ (ga.out.format != sam && ga.out.format != unknown_format)) {
if (sam_hdr_write(out, header) != 0) {
fprintf(stderr, "[main_samview] failed to write the SAM header\n");
ret = 1;
@@ -361,12 +400,21 @@ int main_samview(int argc, char *argv[])
}
}
if (fn_un_out) {
- if ((un_out = sam_open(fn_un_out, out_mode)) == 0) {
- print_error_errno("failed to open \"%s\" for writing", fn_un_out);
+ if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) {
+ print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out);
ret = 1;
goto view_end;
}
- if (*out_format || is_header) {
+ if (fn_list) {
+ if (hts_set_fai_filename(un_out, fn_list) != 0) {
+ fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ ret = 1;
+ goto view_end;
+ }
+ }
+ if (*out_format || is_header ||
+ out_un_mode[1] == 'b' || out_un_mode[1] == 'c' ||
+ (ga.out.format != sam && ga.out.format != unknown_format)) {
if (sam_hdr_write(un_out, header) != 0) {
fprintf(stderr, "[main_samview] failed to write the SAM header\n");
ret = 1;
@@ -375,10 +423,11 @@ int main_samview(int argc, char *argv[])
}
}
}
+
if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); }
if (is_header_only) goto view_end; // no need to print alignments
- if (argc == optind + 1) { // convert/print the entire file
+ if (optind + 1 >= argc) { // convert/print the entire file
bam1_t *b = bam_init1();
int r;
while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in'
@@ -397,7 +446,7 @@ int main_samview(int argc, char *argv[])
} else { // retrieve alignments in specified regions
int i;
bam1_t *b;
- hts_idx_t *idx = sam_index_load(in, argv[optind]); // load index
+ hts_idx_t *idx = sam_index_load(in, fn_in); // load index
if (idx == 0) { // index is unavailable
fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n");
ret = 1;
@@ -407,8 +456,12 @@ int main_samview(int argc, char *argv[])
for (i = optind + 1; i < argc; ++i) {
int result;
hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200'
- if (iter == NULL) { // reference name is not found
- fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]);
+ if (iter == NULL) { // region invalid or reference name not found
+ int beg, end;
+ if (hts_parse_reg(argv[i], &beg, &end))
+ fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]);
+ else
+ fprintf(stderr, "[main_samview] region \"%s\" could not be parsed. Continue anyway.\n", argv[i]);
continue;
}
// fetch alignments
@@ -436,11 +489,12 @@ view_end:
printf("%" PRId64 "\n", count);
// close files, free and return
- if (in) check_sam_close(in, argv[optind], "standard input", &ret);
- if (out) check_sam_close(out, fn_out, "standard output", &ret);
- if (un_out) check_sam_close(un_out, fn_un_out, "file", &ret);
+ if (in) check_sam_close("view", in, fn_in, "standard input", &ret);
+ if (out) check_sam_close("view", out, fn_out, "standard output", &ret);
+ if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret);
- free(fn_list); free(fn_ref); free(fn_out); free(settings.library); free(fn_un_out);
+ free(fn_list); free(fn_out); free(settings.library); free(fn_un_out);
+ sam_global_args_free(&ga);
if ( header ) bam_hdr_destroy(header);
if (settings.bed) bed_destroy(settings.bed);
if (settings.rghash) {
@@ -455,68 +509,85 @@ view_end:
return ret;
}
-static int usage(int is_long_help)
+static int usage(FILE *fp, int exit_status, int is_long_help)
{
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n\n");
- // output options
- fprintf(stderr, "Options: -b output BAM\n");
- fprintf(stderr, " -C output CRAM (requires -T)\n");
- fprintf(stderr, " -1 use fast BAM compression (implies -b)\n");
- fprintf(stderr, " -u uncompressed BAM output (implies -b)\n");
- fprintf(stderr, " -h include header in SAM output\n");
- fprintf(stderr, " -H print SAM header only (no alignments)\n");
- fprintf(stderr, " -c print only the count of matching records\n");
- fprintf(stderr, " -o FILE output file name [stdout]\n");
- fprintf(stderr, " -U FILE output reads not selected by filters to FILE [null]\n");
- // extra input
- fprintf(stderr, " -t FILE FILE listing reference names and lengths (see long help) [null]\n");
- fprintf(stderr, " -T FILE reference sequence FASTA FILE [null]\n");
- // read filters
- fprintf(stderr, " -L FILE only include reads overlapping this BED FILE [null]\n");
- fprintf(stderr, " -r STR only include reads in read group STR [null]\n");
- fprintf(stderr, " -R FILE only include reads with read group listed in FILE [null]\n");
- fprintf(stderr, " -q INT only include reads with mapping quality >= INT [0]\n");
- fprintf(stderr, " -l STR only include reads in library STR [null]\n");
- fprintf(stderr, " -m INT only include reads with number of CIGAR operations\n");
- fprintf(stderr, " consuming query sequence >= INT [0]\n");
- fprintf(stderr, " -f INT only include reads with all bits set in INT set in FLAG [0]\n");
- fprintf(stderr, " -F INT only include reads with none of the bits set in INT\n");
- fprintf(stderr, " set in FLAG [0]\n");
- // read processing
- fprintf(stderr, " -x STR read tag to strip (repeatable) [null]\n");
- fprintf(stderr, " -B collapse the backward CIGAR operation\n");
- fprintf(stderr, " -s FLOAT integer part sets seed of random number generator [0];\n");
- fprintf(stderr, " rest sets fraction of templates to subsample [no subsampling]\n");
- // general options
- fprintf(stderr, " -@ INT number of BAM compression threads [0]\n");
- fprintf(stderr, " -? print long help, including note about region specification\n");
- fprintf(stderr, " -S ignored (input format is auto-detected)\n");
- fprintf(stderr, "\n");
+ fprintf(fp,
+"\n"
+"Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n"
+"\n"
+"Options:\n"
+// output options
+" -b output BAM\n"
+" -C output CRAM (requires -T)\n"
+" -1 use fast BAM compression (implies -b)\n"
+" -u uncompressed BAM output (implies -b)\n"
+" -h include header in SAM output\n"
+" -H print SAM header only (no alignments)\n"
+" -c print only the count of matching records\n"
+" -o FILE output file name [stdout]\n"
+" -U FILE output reads not selected by filters to FILE [null]\n"
+// extra input
+" -t FILE FILE listing reference names and lengths (see long help) [null]\n"
+// read filters
+" -L FILE only include reads overlapping this BED FILE [null]\n"
+" -r STR only include reads in read group STR [null]\n"
+" -R FILE only include reads with read group listed in FILE [null]\n"
+" -q INT only include reads with mapping quality >= INT [0]\n"
+" -l STR only include reads in library STR [null]\n"
+" -m INT only include reads with number of CIGAR operations consuming\n"
+" query sequence >= INT [0]\n"
+" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
+" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
+// read processing
+" -x STR read tag to strip (repeatable) [null]\n"
+" -B collapse the backward CIGAR operation\n"
+" -s FLOAT integer part sets seed of random number generator [0];\n"
+" rest sets fraction of templates to subsample [no subsampling]\n"
+// general options
+" -@, --threads INT\n"
+" number of BAM/CRAM compression threads [0]\n"
+" -? print long help, including note about region specification\n"
+" -S ignored (input format is auto-detected)\n");
+
+ sam_global_opt_help(fp, "-.O.T");
+ fprintf(fp, "\n");
+
if (is_long_help)
- fprintf(stderr, "Notes:\n\
-\n\
- 1. This command now auto-detects the input format (BAM/CRAM/SAM).\n\
-\n\
- 2. The file supplied with `-t' is SPACE/TAB delimited with the first\n\
- two fields of each line consisting of the reference name and the\n\
- corresponding sequence length. The `.fai' file generated by \n\
- `samtools faidx' is suitable for use as this file. This may be an\n\
- empty file if reads are unaligned.\n\
-\n\
- 3. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n\
-\n\
- 4. BAM->SAM conversion: `samtools view -h in.bam'.\n\
-\n\
- 5. A region should be presented in one of the following formats:\n\
- `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n\
- specified, the input alignment file must be a sorted and indexed\n\
- alignment (BAM/CRAM) file.\n\
-\n\
- 6. Option `-u' is preferred over `-b' when the output is piped to\n\
- another samtools command.\n\
-\n");
- return 1;
+ fprintf(fp,
+"Notes:\n"
+"\n"
+" 1. This command now auto-detects the input format (BAM/CRAM/SAM).\n"
+" Further control over the CRAM format can be specified by using the\n"
+" --output-fmt-option, e.g. to specify the number of sequences per slice\n"
+" and to use avoid reference based compression:\n"
+" `samtools view -C --output-fmt-option seqs_per_slice=5000 \\\n"
+" --output-fmt-option no_ref -o out.cram in.bam'\n"
+"\n"
+" Options can also be specified as a comma separated list within the\n"
+" --output-fmt value too. For example this is equivalent to the above\n"
+" `samtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n"
+" -o out.cram in.bam'\n"
+"\n"
+" 2. The file supplied with `-t' is SPACE/TAB delimited with the first\n"
+" two fields of each line consisting of the reference name and the\n"
+" corresponding sequence length. The `.fai' file generated by \n"
+" `samtools faidx' is suitable for use as this file. This may be an\n"
+" empty file if reads are unaligned.\n"
+"\n"
+" 3. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n"
+"\n"
+" 4. BAM->SAM conversion: `samtools view -h in.bam'.\n"
+"\n"
+" 5. A region should be presented in one of the following formats:\n"
+" `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n"
+" specified, the input alignment file must be a sorted and indexed\n"
+" alignment (BAM/CRAM) file.\n"
+"\n"
+" 6. Option `-u' is preferred over `-b' when the output is piped to\n"
+" another samtools command.\n"
+"\n");
+
+ return exit_status;
}
int main_import(int argc, char *argv[])
@@ -536,192 +607,424 @@ int main_import(int argc, char *argv[])
}
int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
+static const char *copied_tags[] = { "RG", "BC", "QT", NULL };
-static void bam2fq_usage(FILE *to)
+static void bam2fq_usage(FILE *to, const char *command)
{
- fprintf(to, "\nUsage: samtools bam2fq [-nO] [-s <outSE.fq>] <in.bam>\n\n");
- fprintf(to, "Options: -n don't append /1 and /2 to the read name\n");
- fprintf(to, " -O output quality in the OQ tag if present\n");
- fprintf(to, " -s FILE write singleton reads to FILE [assume single-end]\n");
- fprintf(to, "\n");
+ fprintf(to,
+"Usage: samtools %s [options...] <in.bam>\n", command);
+ fprintf(to,
+"Options:\n"
+" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n"
+" -1 FILE write paired reads flagged READ1 to FILE\n"
+" -2 FILE write paired reads flagged READ2 to FILE\n"
+" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
+" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
+" -n don't append /1 and /2 to the read name\n"
+" -O output quality in the OQ tag if present\n"
+" -s FILE write singleton reads to FILE [assume single-end]\n"
+" -t copy RG, BC and QT tags to the FASTQ header line\n"
+" -v INT default quality score if not given in file [1]\n");
+ sam_global_opt_help(to, "-.--.");
}
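
/*
 * A tiny sketch (not from the upstream sources) of the record filter implied
 * by the new -f/-F options documented above: a read is emitted only when all
 * flag_on bits are set and no flag_off bit is set. filter_ok is a hypothetical
 * helper; it assumes the <stdbool.h> and htslib/sam.h includes already present
 * in this file.
 */
static bool filter_ok(const bam1_t *b, int flag_on, int flag_off)
{
    return (b->core.flag & flag_on) == flag_on    /* every required bit present */
        && (b->core.flag & flag_off) == 0;        /* none of the excluded bits */
}
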
-int main_bam2fq(int argc, char *argv[])
-{
+typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart;
+typedef enum { FASTA, FASTQ } fastfile;
+typedef struct bam2fq_opts {
+ char *fnse;
+ char *fnr[3];
+ char *fn_input; // pointer to input filename in argv do not free
+ bool has12, use_oq, copy_tags;
+ int flag_on, flag_off;
+ sam_global_args ga;
+ fastfile filetype;
+ int def_qual;
+} bam2fq_opts_t;
+
+typedef struct bam2fq_state {
samFile *fp;
+ FILE *fpse;
+ FILE *fpr[3];
bam_hdr_t *h;
- bam1_t *b;
- int8_t *buf;
- int status = EXIT_SUCCESS;
- size_t max_buf;
- FILE* fpse;
+ bool has12, use_oq, copy_tags;
+ int flag_on, flag_off;
+ fastfile filetype;
+ int def_qual;
+} bam2fq_state_t;
+
+static readpart which_readpart(const bam1_t *b)
+{
+ if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
+ return READ_1;
+ } else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) {
+ return READ_2;
+ } else {
+ return READ_UNKNOWN;
+ }
+}
+
+// Transform a bam1_t record into a string with the FASTQ representation of it
+// @returns false for error, true for success
+static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
+{
+ int i;
+ int32_t qlen = b->core.l_qseq;
+ assert(qlen >= 0);
+ uint8_t *seq;
+ uint8_t *qual = bam_get_qual(b);
+ const uint8_t *oq = NULL;
+ if (state->use_oq) { oq = bam_aux_get(b, "OQ"); if (oq) ++oq; } // skip the type byte; stays NULL if the tag is absent
+ bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality
+
+ linebuf->l = 0;
+ // Write read name
+ readpart readpart = which_readpart(b);
+ kputc(state->filetype == FASTA? '>' : '@', linebuf);
+ kputs(bam_get_qname(b), linebuf);
+ // Add the /1 /2 if requested
+ if (state->has12) {
+ if (readpart == READ_1) kputs("/1", linebuf);
+ else if (readpart == READ_2) kputs("/2", linebuf);
+ }
+ if (state->copy_tags) {
+ for (i = 0; copied_tags[i]; ++i) {
+ uint8_t *s;
+ if ((s = bam_aux_get(b, copied_tags[i])) != 0) {
+ kputc('\t', linebuf);
+ kputsn(copied_tags[i], 2, linebuf);
+ kputsn(":Z:", 3, linebuf);
+ kputs(bam_aux2Z(s), linebuf);
+ }
+ }
+ }
+ kputc('\n', linebuf);
+
+ seq = bam_get_seq(b);
+
+ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
+ for (i = qlen-1; i > -1; --i) {
+ char c = seq_nt16_str[seq_comp_table[bam_seqi(seq,i)]];
+ kputc(c, linebuf);
+ }
+ } else {
+ for (i = 0; i < qlen; ++i) {
+ char c = seq_nt16_str[bam_seqi(seq,i)];
+ kputc(c, linebuf);
+ }
+ }
+ kputc('\n', linebuf);
+
+ if (state->filetype == FASTQ) {
+ // Write quality
+ kputs("+\n", linebuf);
+ if (has_qual) {
+ if (state->use_oq && oq) {
+ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
+ for (i = qlen-1; i > -1; --i) {
+ kputc(oq[i], linebuf);
+ }
+ } else {
+ kputs((char*)oq, linebuf);
+ }
+ } else {
+ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
+ for (i = qlen-1; i > -1; --i) {
+ kputc(33 + qual[i], linebuf);
+ }
+ } else {
+ for (i = 0; i < qlen; ++i) {
+ kputc(33 + qual[i], linebuf);
+ }
+ }
+ }
+ } else {
+ for (i = 0; i < qlen; ++i) {
+ kputc(33 + state->def_qual, linebuf);
+ }
+ }
+ kputc('\n', linebuf);
+ }
+ return true;
+}
+
+// return true if valid
+static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
+{
// Parse args
- char* fnse = NULL;
- bool has12 = true, use_oq = false;
+ bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t));
+ opts->has12 = true;
+ opts->filetype = FASTQ;
+ opts->def_qual = 1;
+
int c;
- while ((c = getopt(argc, argv, "nOs:")) > 0) {
+ sam_global_args_init(&opts->ga);
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ { NULL, 0, NULL, 0 }
+ };
+ while ((c = getopt_long(argc, argv, "0:1:2:f:F:nOs:tv:", lopts, NULL)) > 0) {
switch (c) {
- case 'n': has12 = false; break;
- case 'O': use_oq = true; break;
- case 's': fnse = optarg; break;
- default: bam2fq_usage(stderr); return 1;
+ case '0': opts->fnr[0] = optarg; break;
+ case '1': opts->fnr[1] = optarg; break;
+ case '2': opts->fnr[2] = optarg; break;
+ case 'f': opts->flag_on |= strtol(optarg, 0, 0); break;
+ case 'F': opts->flag_off |= strtol(optarg, 0, 0); break;
+ case 'n': opts->has12 = false; break;
+ case 'O': opts->use_oq = true; break;
+ case 's': opts->fnse = optarg; break;
+ case 't': opts->copy_tags = true; break;
+ case 'v': opts->def_qual = atoi(optarg); break;
+ case '?': bam2fq_usage(stderr, argv[0]); free(opts); return false;
+ default:
+ if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) {
+ bam2fq_usage(stderr, argv[0]); free(opts); return false;
+ }
+ break;
}
}
+ if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false;
+
+ if (opts->def_qual < 0 || 93 < opts->def_qual) {
+ fprintf(stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual);
+ bam2fq_usage(stderr, argv[0]);
+ free(opts);
+ return false;
+ }
+
+ const char* type_str = argv[0];
+ if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) {
+ opts->filetype = FASTQ;
+ } else if (strcasecmp("fasta", type_str) == 0) {
+ opts->filetype = FASTA;
+ } else {
+ print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str);
+ bam2fq_usage(stderr, argv[0]);
+ free(opts);
+ return false;
+ }
+
if ((argc - (optind)) == 0) {
- bam2fq_usage(stdout);
- return 0;
+ bam2fq_usage(stdout, argv[0]);
+ free(opts);
+ return false;
}
if ((argc - (optind)) != 1) {
fprintf(stderr, "Too many arguments.\n");
- bam2fq_usage(stderr);
- return 1;
+ bam2fq_usage(stderr, argv[0]);
+ free(opts);
+ return false;
}
+ opts->fn_input = argv[optind];
+ *opts_out = opts;
+ return true;
+}
- fp = sam_open(argv[optind], "r");
- if (fp == NULL) {
- print_error_errno("Cannot read file \"%s\"", argv[optind]);
- return 1;
+static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
+{
+ bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
+ state->flag_on = opts->flag_on;
+ state->flag_off = opts->flag_off;
+ state->has12 = opts->has12;
+ state->use_oq = opts->use_oq;
+ state->copy_tags = opts->copy_tags;
+ state->filetype = opts->filetype;
+ state->def_qual = opts->def_qual;
+
+ state->fp = sam_open(opts->fn_input, "r");
+ if (state->fp == NULL) {
+ print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input);
+ free(state);
+ return false;
}
- if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS,
- SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL)) {
+ uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
+ if (opts->use_oq) rf |= SAM_AUX;
+ if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
- return 1;
+ free(state);
+ return false;
}
- if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) {
+ if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) {
fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
- return 1;
+ free(state);
+ return false;
}
- fpse = NULL;
- if (fnse) {
- fpse = fopen(fnse,"w");
- if (fpse == NULL) {
- print_error_errno("Cannot write to singleton file \"%s\"", fnse);
- return 1;
+ if (opts->fnse) {
+ state->fpse = fopen(opts->fnse,"w");
+ if (state->fpse == NULL) {
+ print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse);
+ free(state);
+ return false;
}
}
- h = sam_hdr_read(fp);
- b = bam_init1();
- buf = NULL;
- max_buf = 0;
-
- int64_t n_singletons = 0, n_reads = 0;
- char* previous = NULL;
- kstring_t linebuf = { 0, 0, NULL };
- kputsn("", 0, &linebuf);
-
- while (sam_read1(fp, h, b) >= 0) {
- if (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY)) continue; // skip secondary and supplementary alignments
- ++n_reads;
-
- int i;
- int32_t qlen = b->core.l_qseq;
- assert(qlen >= 0);
- uint8_t* seq;
- uint8_t* qual = bam_get_qual(b);
- const uint8_t *oq = NULL;
- if (use_oq) oq = bam_aux_get(b, "OQ");
- bool has_qual = (qual[0] != 0xff || (use_oq && oq)); // test if there is quality
-
- // If there was a previous readname
- if ( fpse && previous ) {
- if (!strcmp(bam_get_qname(b), previous ) ) {
- fputs(linebuf.s, stdout); // Write previous read
- free(previous);
- previous = NULL;
- } else { // Doesn't match it's a singleton
- ++n_singletons;
- fputs(linebuf.s, fpse); // Write previous read to singletons
- free(previous);
- previous = strdup(bam_get_qname(b));
+ int i;
+ for (i = 0; i < 3; ++i) {
+ if (opts->fnr[i]) {
+ state->fpr[i] = fopen(opts->fnr[i], "w");
+ if (state->fpr[i] == NULL) {
+ print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", i, opts->fnr[i]);
+ free(state);
+ return false;
}
} else {
- fputs(linebuf.s, stdout); // Write pending read
- if (fpse) previous = strdup(bam_get_qname(b));
+ state->fpr[i] = stdout;
}
+ }
- linebuf.l = 0;
- // Write read name
- kputc(!has_qual? '>' : '@', &linebuf);
- kputs(bam_get_qname(b), &linebuf);
- // Add the /1 /2 if requested
- if (has12) {
- if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) kputs("/1\n", &linebuf);
- else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) kputs("/2\n", &linebuf);
- else kputc('\n', &linebuf);
- } else {
- kputc('\n', &linebuf);
- }
+ state->h = sam_hdr_read(state->fp);
+ if (state->h == NULL) {
+ fprintf(stderr, "Failed to read header for \"%s\"\n", opts->fn_input);
+ free(state);
+ return false;
+ }
- if (max_buf < qlen + 1) {
- max_buf = qlen + 1;
- kroundup32(max_buf);
- buf = realloc(buf, max_buf);
- if (buf == NULL) {
- fprintf(stderr, "Out of memory");
- return 1;
- }
- }
- buf[qlen] = '\0';
- seq = bam_get_seq(b);
- for (i = 0; i < qlen; ++i)
- buf[i] = bam_seqi(seq, i);
- if (b->core.flag & BAM_FREVERSE) { // reverse complement
- for (i = 0; i < qlen>>1; ++i) {
- int8_t t = seq_comp_table[buf[qlen - 1 - i]];
- buf[qlen - 1 - i] = seq_comp_table[buf[i]];
- buf[i] = t;
+ *state_out = state;
+ return true;
+}
+
+static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* status)
+{
+ bool valid = true;
+ bam_hdr_destroy(state->h);
+ check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status);
+ if (state->fpse && fclose(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; }
+ int i;
+ for (i = 0; i < 3; ++i) {
+ if (state->fpr[i] != stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; }
+ }
+ free(state);
+ return valid;
+}
+
+static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
+{
+ return (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments
+ || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
+ || (b->core.flag&(state->flag_off)) != 0);
+
+}
+
+static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state)
+{
+ bam1_t* b = bam_init1();
+ char *current_qname = NULL;
+ int64_t n_reads = 0, n_singletons = 0; // Statistics
+ kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}};
+ int score[3];
+ int at_eof;
+ if (b == NULL ) {
+ perror("[bam2fq_mainloop_singletontrack] Malloc error for bam record buffer.");
+ return false;
+ }
+
+ bool valid = true;
+ while (true) {
+ at_eof = sam_read1(state->fp, state->h, b) < 0;
+
+ if (!at_eof && filter_it_out(b, state)) continue;
+ if (!at_eof) ++n_reads;
+
+ if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) {
+ if (current_qname) {
+ if (score[1] > 0 && score[2] > 0) {
+ // print linebuf[1] to fpr[1], linebuf[2] to fpr[2]
+ if (fputs(linebuf[1].s, state->fpr[1]) == EOF) { valid = false; break; }
+ if (fputs(linebuf[2].s, state->fpr[2]) == EOF) { valid = false; break; }
+ } else if (score[1] > 0 || score[2] > 0) {
+ // print whichever one exists to fpse
+ if (score[1] > 0) {
+ if (fputs(linebuf[1].s, state->fpse) == EOF) { valid = false; break; }
+ } else {
+ if (fputs(linebuf[2].s, state->fpse) == EOF) { valid = false; break; }
+ }
+ ++n_singletons;
+ }
+ if (score[0]) { // TODO: check this
+ // print linebuf[0] to fpr[0]
+ if (fputs(linebuf[0].s, state->fpr[0]) == EOF) { valid = false; break; }
+ }
}
- if (qlen&1) buf[i] = seq_comp_table[buf[i]];
+
+ if (at_eof) break;
+
+ free(current_qname);
+ current_qname = strdup(bam_get_qname(b));
+ score[0] = score[1] = score[2] = 0;
}
- for (i = 0; i < qlen; ++i)
- buf[i] = seq_nt16_str[buf[i]];
- kputs((char*)buf, &linebuf);
- kputc('\n', &linebuf);
- if (has_qual) {
- // Write quality
- kputs("+\n", &linebuf);
- if (use_oq && oq) memcpy(buf, oq + 1, qlen);
- else {
- for (i = 0; i < qlen; ++i)
- buf[i] = 33 + qual[i];
- }
- if (b->core.flag & BAM_FREVERSE) { // reverse
- for (i = 0; i < qlen>>1; ++i) {
- int8_t t = buf[qlen - 1 - i];
- buf[qlen - 1 - i] = buf[i];
- buf[i] = t;
- }
+ // Prefer a copy of the read that has base qualities
+ int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1;
+ if (b_score > score[which_readpart(b)]) {
+ if(!bam1_to_fq(b, &linebuf[which_readpart(b)], state)) {
+ fprintf(stderr, "[%s] Error converting read to FASTA/Q\n", __func__);
+ return false;
}
- kputs((char*)buf, &linebuf);
- kputc('\n', &linebuf);
+ score[which_readpart(b)] = b_score;
}
}
+ if (!valid)
+ {
+ perror("[bam2fq_mainloop_singletontrack] Error writing to FASTx files.");
+ }
+ bam_destroy1(b);
+ free(current_qname);
+ free(linebuf[0].s);
+ free(linebuf[1].s);
+ free(linebuf[2].s);
+ fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons);
+ fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
- if (fpse) {
- if ( previous ) { // Nothing left to match it's a singleton
- ++n_singletons;
- fputs(linebuf.s, fpse); // Write previous read to singletons
- } else {
- fputs(linebuf.s, stdout); // Write previous read
- }
+ return valid;
+}
- fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons);
- fclose(fpse);
- } else {
- fputs(linebuf.s, stdout); // Write previous read
+static bool bam2fq_mainloop(bam2fq_state_t *state)
+{
+ // process a name collated BAM into fastq
+ bam1_t* b = bam_init1();
+ if (b == NULL) {
+ perror(NULL);
+ return false;
+ }
+ int64_t n_reads = 0; // Statistics
+ kstring_t linebuf = { 0, 0, NULL }; // Buffer
+ while (sam_read1(state->fp, state->h, b) >= 0) {
+ if (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments
+ || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
+ || (b->core.flag&(state->flag_off)) != 0) continue;
+ ++n_reads;
+
+ if (!bam1_to_fq(b, &linebuf, state)) return false;
+ fputs(linebuf.s, state->fpr[which_readpart(b)]);
}
free(linebuf.s);
- free(previous);
+ bam_destroy1(b);
fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
+ return true;
+}
+
+int main_bam2fq(int argc, char *argv[])
+{
+ int status = EXIT_SUCCESS;
+ bam2fq_opts_t* opts = NULL;
+ bam2fq_state_t* state = NULL;
+
+ bool valid = parse_opts(argc, argv, &opts);
+ if (!valid || opts == NULL) return valid ? EXIT_SUCCESS : EXIT_FAILURE;
+
+ if (!init_state(opts, &state)) return EXIT_FAILURE;
+
+ if (state->fpse) {
+ if (!bam2fq_mainloop_singletontrack(state)) status = EXIT_FAILURE;
+ } else {
+ if (!bam2fq_mainloop(state)) status = EXIT_FAILURE;
+ }
+
+ if (!destroy_state(opts, state, &status)) return EXIT_FAILURE;
+ sam_global_args_free(&opts->ga);
+ free(opts);
- free(buf);
- bam_destroy1(b);
- bam_hdr_destroy(h);
- check_sam_close(fp, argv[optind], "file", &status);
return status;
}
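
A note on the -f/-F filtering shared by the loops above: a record is kept only when every bit of the -f mask is present in its FLAG and none of the bits of the -F mask are, and because the masks are parsed with strtol(optarg, 0, 0) they can be given in decimal, hex or octal. A minimal standalone sketch of that bit test (plain C, illustrative flag values, not part of the patch):

    #include <stdbool.h>
    #include <stdio.h>

    /* keep a record only if all bits of flag_on are set in flag and none of
       the bits of flag_off are (the complement of filter_it_out above) */
    static bool keep_record(int flag, int flag_on, int flag_off)
    {
        return (flag & flag_on) == flag_on && (flag & flag_off) == 0;
    }

    int main(void)
    {
        int flag_on = 0x2, flag_off = 0x100;  /* e.g. -f 0x2 -F 0x100 */
        printf("%d\n", keep_record(0x63,  flag_on, flag_off));  /* 1: paired, proper, primary */
        printf("%d\n", keep_record(0x163, flag_on, flag_off));  /* 0: secondary bit set */
        return 0;
    }
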
diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c
index 34840b9..dfc8065 100644
--- a/samtools/sam_view.c.pysam.c
+++ b/samtools/sam_view.c.pysam.c
@@ -2,7 +2,7 @@
/* sam_view.c -- SAM<->BAM<->CRAM conversion.
- Copyright (C) 2009-2014 Genome Research Ltd.
+ Copyright (C) 2009-2015 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -33,11 +33,13 @@ DEALINGS IN THE SOFTWARE. */
#include <inttypes.h>
#include <stdbool.h>
#include <assert.h>
+#include <getopt.h>
#include "htslib/sam.h"
#include "htslib/faidx.h"
#include "htslib/kstring.h"
#include "htslib/khash.h"
#include "samtools.h"
+#include "sam_opts.h"
KHASH_SET_INIT_STR(rg)
typedef khash_t(rg) *rghash_t;
@@ -141,9 +143,9 @@ static char *drop_rg(char *hdtxt, rghash_t h, int *len)
return str.s;
}
-static int usage(int is_long_help);
+static int usage(FILE *fp, int exit_status, int is_long_help);
-static int add_read_group_single(samview_settings_t *settings, char *name)
+static int add_read_group_single(const char *subcmd, samview_settings_t *settings, char *name)
{
char *d = strdup(name);
int ret = 0;
@@ -161,12 +163,12 @@ static int add_read_group_single(samview_settings_t *settings, char *name)
return 0;
err:
- print_error("Couldn't add \"%s\" to read group list: memory exhausted?", name);
+ print_error(subcmd, "Couldn't add \"%s\" to read group list: memory exhausted?", name);
free(d);
return -1;
}
-static int add_read_groups_file(samview_settings_t *settings, char *fn)
+static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
{
FILE *fp;
char buf[1024];
@@ -181,7 +183,7 @@ static int add_read_groups_file(samview_settings_t *settings, char *fn)
fp = fopen(fn, "r");
if (fp == NULL) {
- print_error_errno("failed to open \"%s\" for reading", fn);
+ print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
return -1;
}
@@ -196,7 +198,7 @@ static int add_read_groups_file(samview_settings_t *settings, char *fn)
}
if (ferror(fp)) ret = -1;
if (ret == -1) {
- print_error_errno("failed to read \"%s\"", fn);
+ print_error_errno(subcmd, "failed to read \"%s\"", fn);
}
fclose(fp);
return (ret != -1) ? 0 : -1;
@@ -207,21 +209,21 @@ static inline int check_sam_write1(samFile *fp, const bam_hdr_t *h, const bam1_t
int r = sam_write1(fp, h, b);
if (r >= 0) return r;
- if (fname) print_error_errno("writing to \"%s\" failed", fname);
- else print_error_errno("writing to standard output failed");
+ if (fname) print_error_errno("view", "writing to \"%s\" failed", fname);
+ else print_error_errno("view", "writing to standard output failed");
*retp = EXIT_FAILURE;
return r;
}
-static void check_sam_close(samFile *fp, const char *fname, const char *null_fname, int *retp)
+static void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp)
{
int r = sam_close(fp);
if (r >= 0) return;
// TODO Need error infrastructure so we can print a message instead of r
- if (fname) print_error("error closing \"%s\": %d", fname, r);
- else print_error("error closing %s: %d", null_fname, r);
+ if (fname) print_error(subcmd, "error closing \"%s\": %d", fname, r);
+ else print_error(subcmd, "error closing %s: %d", null_fname, r);
*retp = EXIT_FAILURE;
}
@@ -233,7 +235,9 @@ int main_samview(int argc, char *argv[])
int64_t count = 0;
samFile *in = 0, *out = 0, *un_out=0;
bam_hdr_t *header = NULL;
- char out_mode[5], *out_format = "", *fn_out = 0, *fn_list = 0, *fn_ref = 0, *q, *fn_un_out = 0;
+ char out_mode[5], out_un_mode[5], *out_format = "";
+ char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
samview_settings_t settings = {
.rghash = NULL,
@@ -248,10 +252,18 @@ int main_samview(int argc, char *argv[])
.bed = NULL,
};
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T'),
+ { "threads", required_argument, NULL, '@' },
+ { NULL, 0, NULL, 0 }
+ };
+
/* parse command-line options */
- /* TODO: convert this to getopt_long we're running out of letters */
strcpy(out_mode, "w");
- while ((c = getopt(argc, argv, "SbBcCt:h1Ho:q:f:F:ul:r:?T:R:L:s:@:m:x:U:")) >= 0) {
+ strcpy(out_un_mode, "w");
+ while ((c = getopt_long(argc, argv,
+ "SbBcCt:h1Ho:O:q:f:F:ul:r:?T:R:L:s:@:m:x:U:",
+ lopts, NULL)) >= 0) {
switch (c) {
case 's':
if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) {
@@ -278,19 +290,19 @@ int main_samview(int argc, char *argv[])
case 'l': settings.library = strdup(optarg); break;
case 'L':
if ((settings.bed = bed_read(optarg)) == NULL) {
- print_error_errno("Could not read file \"%s\"", optarg);
+ print_error_errno("view", "Could not read file \"%s\"", optarg);
ret = 1;
goto view_end;
}
break;
case 'r':
- if (add_read_group_single(&settings, optarg) != 0) {
+ if (add_read_group_single("view", &settings, optarg) != 0) {
ret = 1;
goto view_end;
}
break;
case 'R':
- if (add_read_groups_file(&settings, optarg) != 0) {
+ if (add_read_groups_file("view", &settings, optarg) != 0) {
ret = 1;
goto view_end;
}
@@ -300,43 +312,62 @@ int main_samview(int argc, char *argv[])
//case 'X': out_format = "X"; break;
*/
case '?': is_long_help = 1; break;
- case 'T': fn_ref = strdup(optarg); break;
case 'B': settings.remove_B = 1; break;
case '@': n_threads = strtol(optarg, 0, 0); break;
case 'x':
{
if (strlen(optarg) != 2) {
fprintf(pysamerr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n");
- return usage(is_long_help);
+ return usage(pysamerr, EXIT_FAILURE, is_long_help);
}
settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len));
settings.remove_aux[settings.remove_aux_len-1] = optarg;
}
break;
- default: return usage(is_long_help);
+
+ default:
+ if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0)
+ return usage(pysamerr, EXIT_FAILURE, is_long_help);
+ break;
}
}
- if (compress_level >= 0) out_format = "b";
+ if (compress_level >= 0 && !*out_format) out_format = "b";
if (is_header_only) is_header = 1;
- strcat(out_mode, out_format);
+ // File format auto-detection first
+ if (fn_out) sam_open_mode(out_mode+1, fn_out, NULL);
+ if (fn_un_out) sam_open_mode(out_un_mode+1, fn_un_out, NULL);
+ // Overridden by manual -b, -C
+ if (*out_format)
+ out_mode[1] = out_un_mode[1] = *out_format;
+ out_mode[2] = out_un_mode[2] = '\0';
+ // out_(un_)mode now 1 or 2 bytes long, followed by nul.
if (compress_level >= 0) {
char tmp[2];
tmp[0] = compress_level + '0'; tmp[1] = '\0';
strcat(out_mode, tmp);
+ strcat(out_un_mode, tmp);
}
- if (argc == optind) return usage(is_long_help); // potential memory leak...
+ if (argc == optind && isatty(STDIN_FILENO)) return usage(stdout, EXIT_SUCCESS, is_long_help); // potential memory leak...
+ fn_in = (optind < argc)? argv[optind] : "-";
// generate the fn_list if necessary
- if (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref);
+ if (fn_list == 0 && ga.reference) fn_list = samfaipath(ga.reference);
// open file handlers
- if ((in = sam_open(argv[optind], "r")) == 0) {
- print_error_errno("failed to open \"%s\" for reading", argv[optind]);
+ if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) {
+ print_error_errno("view", "failed to open \"%s\" for reading", fn_in);
ret = 1;
goto view_end;
}
- if (fn_list) hts_set_fai_filename(in, fn_list);
+
+ if (fn_list) {
+ if (hts_set_fai_filename(in, fn_list) != 0) {
+ fprintf(pysamerr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ ret = 1;
+ goto view_end;
+ }
+ }
if ((header = sam_hdr_read(in)) == 0) {
- fprintf(pysamerr, "[main_samview] fail to read the header from \"%s\".\n", argv[optind]);
+ fprintf(pysamerr, "[main_samview] fail to read the header from \"%s\".\n", fn_in);
ret = 1;
goto view_end;
}
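
For reference, the mode-string assembly earlier in this hunk produces hts open-mode strings such as "w", "wb", "wc" or "wb0": a 'w', one optional format letter (auto-detected from the output filename by sam_open_mode() or forced by -b/-C), then an optional compression digit. A tiny standalone sketch of the same assembly (illustrative values, not part of the patch):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char out_mode[5] = "w";
        char fmt = 'b';          /* from -b, or sam_open_mode() on a ".bam" name */
        int compress_level = 0;  /* -u gives 0, -1 gives 1; stays -1 if unset */

        out_mode[1] = fmt;
        out_mode[2] = '\0';
        if (compress_level >= 0) {
            char tmp[2] = { (char)(compress_level + '0'), '\0' };
            strcat(out_mode, tmp);
        }
        puts(out_mode);          /* prints "wb0", i.e. uncompressed BAM */
        return 0;
    }
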
@@ -349,13 +380,21 @@ int main_samview(int argc, char *argv[])
header->l_text = l;
}
if (!is_count) {
- if ((out = sam_open(fn_out? fn_out : "-", out_mode)) == 0) {
- print_error_errno("failed to open \"%s\" for writing", fn_out? fn_out : "standard output");
+ if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) {
+ print_error_errno("view", "failed to open \"%s\" for writing", fn_out? fn_out : "standard output");
ret = 1;
goto view_end;
}
- if (fn_list) hts_set_fai_filename(out, fn_list);
- if (*out_format || is_header) {
+ if (fn_list) {
+ if (hts_set_fai_filename(out, fn_list) != 0) {
+ fprintf(pysamerr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ ret = 1;
+ goto view_end;
+ }
+ }
+ if (*out_format || is_header ||
+ out_mode[1] == 'b' || out_mode[1] == 'c' ||
+ (ga.out.format != sam && ga.out.format != unknown_format)) {
if (sam_hdr_write(out, header) != 0) {
fprintf(pysamerr, "[main_samview] failed to write the SAM header\n");
ret = 1;
@@ -363,12 +402,21 @@ int main_samview(int argc, char *argv[])
}
}
if (fn_un_out) {
- if ((un_out = sam_open(fn_un_out, out_mode)) == 0) {
- print_error_errno("failed to open \"%s\" for writing", fn_un_out);
+ if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) {
+ print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out);
ret = 1;
goto view_end;
}
- if (*out_format || is_header) {
+ if (fn_list) {
+ if (hts_set_fai_filename(un_out, fn_list) != 0) {
+ fprintf(pysamerr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ ret = 1;
+ goto view_end;
+ }
+ }
+ if (*out_format || is_header ||
+ out_un_mode[1] == 'b' || out_un_mode[1] == 'c' ||
+ (ga.out.format != sam && ga.out.format != unknown_format)) {
if (sam_hdr_write(un_out, header) != 0) {
fprintf(pysamerr, "[main_samview] failed to write the SAM header\n");
ret = 1;
@@ -377,10 +425,11 @@ int main_samview(int argc, char *argv[])
}
}
}
+
if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); }
if (is_header_only) goto view_end; // no need to print alignments
- if (argc == optind + 1) { // convert/print the entire file
+ if (optind + 1 >= argc) { // convert/print the entire file
bam1_t *b = bam_init1();
int r;
while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in'
@@ -399,7 +448,7 @@ int main_samview(int argc, char *argv[])
} else { // retrieve alignments in specified regions
int i;
bam1_t *b;
- hts_idx_t *idx = sam_index_load(in, argv[optind]); // load index
+ hts_idx_t *idx = sam_index_load(in, fn_in); // load index
if (idx == 0) { // index is unavailable
fprintf(pysamerr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n");
ret = 1;
@@ -409,8 +458,12 @@ int main_samview(int argc, char *argv[])
for (i = optind + 1; i < argc; ++i) {
int result;
hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200'
- if (iter == NULL) { // reference name is not found
- fprintf(pysamerr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]);
+ if (iter == NULL) { // region invalid or reference name not found
+ int beg, end;
+ if (hts_parse_reg(argv[i], &beg, &end))
+ fprintf(pysamerr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]);
+ else
+ fprintf(pysamerr, "[main_samview] region \"%s\" could not be parsed. Continue anyway.\n", argv[i]);
continue;
}
// fetch alignments
@@ -438,11 +491,12 @@ view_end:
printf("%" PRId64 "\n", count);
// close files, free and return
- if (in) check_sam_close(in, argv[optind], "standard input", &ret);
- if (out) check_sam_close(out, fn_out, "standard output", &ret);
- if (un_out) check_sam_close(un_out, fn_un_out, "file", &ret);
+ if (in) check_sam_close("view", in, fn_in, "standard input", &ret);
+ if (out) check_sam_close("view", out, fn_out, "standard output", &ret);
+ if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret);
- free(fn_list); free(fn_ref); free(fn_out); free(settings.library); free(fn_un_out);
+ free(fn_list); free(fn_out); free(settings.library); free(fn_un_out);
+ sam_global_args_free(&ga);
if ( header ) bam_hdr_destroy(header);
if (settings.bed) bed_destroy(settings.bed);
if (settings.rghash) {
@@ -457,68 +511,85 @@ view_end:
return ret;
}
-static int usage(int is_long_help)
+static int usage(FILE *fp, int exit_status, int is_long_help)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n\n");
- // output options
- fprintf(pysamerr, "Options: -b output BAM\n");
- fprintf(pysamerr, " -C output CRAM (requires -T)\n");
- fprintf(pysamerr, " -1 use fast BAM compression (implies -b)\n");
- fprintf(pysamerr, " -u uncompressed BAM output (implies -b)\n");
- fprintf(pysamerr, " -h include header in SAM output\n");
- fprintf(pysamerr, " -H print SAM header only (no alignments)\n");
- fprintf(pysamerr, " -c print only the count of matching records\n");
- fprintf(pysamerr, " -o FILE output file name [stdout]\n");
- fprintf(pysamerr, " -U FILE output reads not selected by filters to FILE [null]\n");
- // extra input
- fprintf(pysamerr, " -t FILE FILE listing reference names and lengths (see long help) [null]\n");
- fprintf(pysamerr, " -T FILE reference sequence FASTA FILE [null]\n");
- // read filters
- fprintf(pysamerr, " -L FILE only include reads overlapping this BED FILE [null]\n");
- fprintf(pysamerr, " -r STR only include reads in read group STR [null]\n");
- fprintf(pysamerr, " -R FILE only include reads with read group listed in FILE [null]\n");
- fprintf(pysamerr, " -q INT only include reads with mapping quality >= INT [0]\n");
- fprintf(pysamerr, " -l STR only include reads in library STR [null]\n");
- fprintf(pysamerr, " -m INT only include reads with number of CIGAR operations\n");
- fprintf(pysamerr, " consuming query sequence >= INT [0]\n");
- fprintf(pysamerr, " -f INT only include reads with all bits set in INT set in FLAG [0]\n");
- fprintf(pysamerr, " -F INT only include reads with none of the bits set in INT\n");
- fprintf(pysamerr, " set in FLAG [0]\n");
- // read processing
- fprintf(pysamerr, " -x STR read tag to strip (repeatable) [null]\n");
- fprintf(pysamerr, " -B collapse the backward CIGAR operation\n");
- fprintf(pysamerr, " -s FLOAT integer part sets seed of random number generator [0];\n");
- fprintf(pysamerr, " rest sets fraction of templates to subsample [no subsampling]\n");
- // general options
- fprintf(pysamerr, " -@ INT number of BAM compression threads [0]\n");
- fprintf(pysamerr, " -? print long help, including note about region specification\n");
- fprintf(pysamerr, " -S ignored (input format is auto-detected)\n");
- fprintf(pysamerr, "\n");
+ fprintf(fp,
+"\n"
+"Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n"
+"\n"
+"Options:\n"
+// output options
+" -b output BAM\n"
+" -C output CRAM (requires -T)\n"
+" -1 use fast BAM compression (implies -b)\n"
+" -u uncompressed BAM output (implies -b)\n"
+" -h include header in SAM output\n"
+" -H print SAM header only (no alignments)\n"
+" -c print only the count of matching records\n"
+" -o FILE output file name [stdout]\n"
+" -U FILE output reads not selected by filters to FILE [null]\n"
+// extra input
+" -t FILE FILE listing reference names and lengths (see long help) [null]\n"
+// read filters
+" -L FILE only include reads overlapping this BED FILE [null]\n"
+" -r STR only include reads in read group STR [null]\n"
+" -R FILE only include reads with read group listed in FILE [null]\n"
+" -q INT only include reads with mapping quality >= INT [0]\n"
+" -l STR only include reads in library STR [null]\n"
+" -m INT only include reads with number of CIGAR operations consuming\n"
+" query sequence >= INT [0]\n"
+" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
+" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
+// read processing
+" -x STR read tag to strip (repeatable) [null]\n"
+" -B collapse the backward CIGAR operation\n"
+" -s FLOAT integer part sets seed of random number generator [0];\n"
+" rest sets fraction of templates to subsample [no subsampling]\n"
+// general options
+" -@, --threads INT\n"
+" number of BAM/CRAM compression threads [0]\n"
+" -? print long help, including note about region specification\n"
+" -S ignored (input format is auto-detected)\n");
+
+ sam_global_opt_help(fp, "-.O.T");
+ fprintf(fp, "\n");
+
if (is_long_help)
- fprintf(pysamerr, "Notes:\n\
-\n\
- 1. This command now auto-detects the input format (BAM/CRAM/SAM).\n\
-\n\
- 2. The file supplied with `-t' is SPACE/TAB delimited with the first\n\
- two fields of each line consisting of the reference name and the\n\
- corresponding sequence length. The `.fai' file generated by \n\
- `samtools faidx' is suitable for use as this file. This may be an\n\
- empty file if reads are unaligned.\n\
-\n\
- 3. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n\
-\n\
- 4. BAM->SAM conversion: `samtools view -h in.bam'.\n\
-\n\
- 5. A region should be presented in one of the following formats:\n\
- `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n\
- specified, the input alignment file must be a sorted and indexed\n\
- alignment (BAM/CRAM) file.\n\
-\n\
- 6. Option `-u' is preferred over `-b' when the output is piped to\n\
- another samtools command.\n\
-\n");
- return 1;
+ fprintf(fp,
+"Notes:\n"
+"\n"
+" 1. This command now auto-detects the input format (BAM/CRAM/SAM).\n"
+" Further control over the CRAM format can be specified by using the\n"
+" --output-fmt-option, e.g. to specify the number of sequences per slice\n"
+" and to use avoid reference based compression:\n"
+" `samtools view -C --output-fmt-option seqs_per_slice=5000 \\\n"
+" --output-fmt-option no_ref -o out.cram in.bam'\n"
+"\n"
+" Options can also be specified as a comma separated list within the\n"
+" --output-fmt value too. For example this is equivalent to the above\n"
+" `samtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n"
+" -o out.cram in.bam'\n"
+"\n"
+" 2. The file supplied with `-t' is SPACE/TAB delimited with the first\n"
+" two fields of each line consisting of the reference name and the\n"
+" corresponding sequence length. The `.fai' file generated by \n"
+" `samtools faidx' is suitable for use as this file. This may be an\n"
+" empty file if reads are unaligned.\n"
+"\n"
+" 3. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n"
+"\n"
+" 4. BAM->SAM conversion: `samtools view -h in.bam'.\n"
+"\n"
+" 5. A region should be presented in one of the following formats:\n"
+" `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n"
+" specified, the input alignment file must be a sorted and indexed\n"
+" alignment (BAM/CRAM) file.\n"
+"\n"
+" 6. Option `-u' is preferred over `-b' when the output is piped to\n"
+" another samtools command.\n"
+"\n");
+
+ return exit_status;
}
int main_import(int argc, char *argv[])
@@ -538,192 +609,424 @@ int main_import(int argc, char *argv[])
}
int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
+static const char *copied_tags[] = { "RG", "BC", "QT", NULL };
-static void bam2fq_usage(FILE *to)
+static void bam2fq_usage(FILE *to, const char *command)
{
- fprintf(to, "\nUsage: samtools bam2fq [-nO] [-s <outSE.fq>] <in.bam>\n\n");
- fprintf(to, "Options: -n don't append /1 and /2 to the read name\n");
- fprintf(to, " -O output quality in the OQ tag if present\n");
- fprintf(to, " -s FILE write singleton reads to FILE [assume single-end]\n");
- fprintf(to, "\n");
+ fprintf(to,
+"Usage: samtools %s [options...] <in.bam>\n", command);
+ fprintf(to,
+"Options:\n"
+" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n"
+" -1 FILE write paired reads flagged READ1 to FILE\n"
+" -2 FILE write paired reads flagged READ2 to FILE\n"
+" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
+" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
+" -n don't append /1 and /2 to the read name\n"
+" -O output quality in the OQ tag if present\n"
+" -s FILE write singleton reads to FILE [assume single-end]\n"
+" -t copy RG, BC and QT tags to the FASTQ header line\n"
+" -v INT default quality score if not given in file [1]\n");
+ sam_global_opt_help(to, "-.--.");
}
-int main_bam2fq(int argc, char *argv[])
-{
+typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart;
+typedef enum { FASTA, FASTQ } fastfile;
+typedef struct bam2fq_opts {
+ char *fnse;
+ char *fnr[3];
+ char *fn_input; // pointer to input filename in argv; do not free
+ bool has12, use_oq, copy_tags;
+ int flag_on, flag_off;
+ sam_global_args ga;
+ fastfile filetype;
+ int def_qual;
+} bam2fq_opts_t;
+
+typedef struct bam2fq_state {
samFile *fp;
+ FILE *fpse;
+ FILE *fpr[3];
bam_hdr_t *h;
- bam1_t *b;
- int8_t *buf;
- int status = EXIT_SUCCESS;
- size_t max_buf;
- FILE* fpse;
+ bool has12, use_oq, copy_tags;
+ int flag_on, flag_off;
+ fastfile filetype;
+ int def_qual;
+} bam2fq_state_t;
+
+static readpart which_readpart(const bam1_t *b)
+{
+ if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
+ return READ_1;
+ } else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) {
+ return READ_2;
+ } else {
+ return READ_UNKNOWN;
+ }
+}
+
+// Transform a bam1_t record into a string with the FASTQ representation of it
+// @returns false for error, true for success
+static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
+{
+ int i;
+ int32_t qlen = b->core.l_qseq;
+ assert(qlen >= 0);
+ uint8_t *seq;
+ uint8_t *qual = bam_get_qual(b);
+ const uint8_t *oq = NULL;
+ if (state->use_oq) oq = bam_aux_get(b, "OQ") + 1;
+ bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality
+
+ linebuf->l = 0;
+ // Write read name
+ readpart readpart = which_readpart(b);
+ kputc(state->filetype == FASTA? '>' : '@', linebuf);
+ kputs(bam_get_qname(b), linebuf);
+ // Add the /1 /2 if requested
+ if (state->has12) {
+ if (readpart == READ_1) kputs("/1", linebuf);
+ else if (readpart == READ_2) kputs("/2", linebuf);
+ }
+ if (state->copy_tags) {
+ for (i = 0; copied_tags[i]; ++i) {
+ uint8_t *s;
+ if ((s = bam_aux_get(b, copied_tags[i])) != 0) {
+ kputc('\t', linebuf);
+ kputsn(copied_tags[i], 2, linebuf);
+ kputsn(":Z:", 3, linebuf);
+ kputs(bam_aux2Z(s), linebuf);
+ }
+ }
+ }
+ kputc('\n', linebuf);
+
+ seq = bam_get_seq(b);
+
+ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
+ for (i = qlen-1; i > -1; --i) {
+ char c = seq_nt16_str[seq_comp_table[bam_seqi(seq,i)]];
+ kputc(c, linebuf);
+ }
+ } else {
+ for (i = 0; i < qlen; ++i) {
+ char c = seq_nt16_str[bam_seqi(seq,i)];
+ kputc(c, linebuf);
+ }
+ }
+ kputc('\n', linebuf);
+
+ if (state->filetype == FASTQ) {
+ // Write quality
+ kputs("+\n", linebuf);
+ if (has_qual) {
+ if (state->use_oq && oq) {
+ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
+ for (i = qlen-1; i > -1; --i) {
+ kputc(oq[i], linebuf);
+ }
+ } else {
+ kputs((char*)oq, linebuf);
+ }
+ } else {
+ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
+ for (i = qlen-1; i > -1; --i) {
+ kputc(33 + qual[i], linebuf);
+ }
+ } else {
+ for (i = 0; i < qlen; ++i) {
+ kputc(33 + qual[i], linebuf);
+ }
+ }
+ }
+ } else {
+ for (i = 0; i < qlen; ++i) {
+ kputc(33 + state->def_qual, linebuf);
+ }
+ }
+ kputc('\n', linebuf);
+ }
+ return true;
+}
+
+// return true if valid
+static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
+{
// Parse args
- char* fnse = NULL;
- bool has12 = true, use_oq = false;
+ bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t));
+ opts->has12 = true;
+ opts->filetype = FASTQ;
+ opts->def_qual = 1;
+
int c;
- while ((c = getopt(argc, argv, "nOs:")) > 0) {
+ sam_global_args_init(&opts->ga);
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ { NULL, 0, NULL, 0 }
+ };
+ while ((c = getopt_long(argc, argv, "0:1:2:f:F:nOs:tv:", lopts, NULL)) > 0) {
switch (c) {
- case 'n': has12 = false; break;
- case 'O': use_oq = true; break;
- case 's': fnse = optarg; break;
- default: bam2fq_usage(pysamerr); return 1;
+ case '0': opts->fnr[0] = optarg; break;
+ case '1': opts->fnr[1] = optarg; break;
+ case '2': opts->fnr[2] = optarg; break;
+ case 'f': opts->flag_on |= strtol(optarg, 0, 0); break;
+ case 'F': opts->flag_off |= strtol(optarg, 0, 0); break;
+ case 'n': opts->has12 = false; break;
+ case 'O': opts->use_oq = true; break;
+ case 's': opts->fnse = optarg; break;
+ case 't': opts->copy_tags = true; break;
+ case 'v': opts->def_qual = atoi(optarg); break;
+ case '?': bam2fq_usage(pysamerr, argv[0]); free(opts); return false;
+ default:
+ if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) {
+ bam2fq_usage(pysamerr, argv[0]); free(opts); return false;
+ }
+ break;
}
}
+ if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false;
+
+ if (opts->def_qual < 0 || 93 < opts->def_qual) {
+ fprintf(pysamerr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual);
+ bam2fq_usage(pysamerr, argv[0]);
+ free(opts);
+ return false;
+ }
+
+ const char* type_str = argv[0];
+ if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) {
+ opts->filetype = FASTQ;
+ } else if (strcasecmp("fasta", type_str) == 0) {
+ opts->filetype = FASTA;
+ } else {
+ print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str);
+ bam2fq_usage(pysamerr, argv[0]);
+ free(opts);
+ return false;
+ }
+
if ((argc - (optind)) == 0) {
- bam2fq_usage(stdout);
- return 0;
+ bam2fq_usage(stdout, argv[0]);
+ free(opts);
+ return false;
}
if ((argc - (optind)) != 1) {
fprintf(pysamerr, "Too many arguments.\n");
- bam2fq_usage(pysamerr);
- return 1;
+ bam2fq_usage(pysamerr, argv[0]);
+ free(opts);
+ return false;
}
+ opts->fn_input = argv[optind];
+ *opts_out = opts;
+ return true;
+}
- fp = sam_open(argv[optind], "r");
- if (fp == NULL) {
- print_error_errno("Cannot read file \"%s\"", argv[optind]);
- return 1;
+static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
+{
+ bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
+ state->flag_on = opts->flag_on;
+ state->flag_off = opts->flag_off;
+ state->has12 = opts->has12;
+ state->use_oq = opts->use_oq;
+ state->copy_tags = opts->copy_tags;
+ state->filetype = opts->filetype;
+ state->def_qual = opts->def_qual;
+
+ state->fp = sam_open(opts->fn_input, "r");
+ if (state->fp == NULL) {
+ print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input);
+ free(state);
+ return false;
}
- if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS,
- SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL)) {
+ uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
+ if (opts->use_oq) rf |= SAM_AUX;
+ if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
- return 1;
+ free(state);
+ return false;
}
- if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) {
+ if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) {
fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n");
- return 1;
+ free(state);
+ return false;
}
- fpse = NULL;
- if (fnse) {
- fpse = fopen(fnse,"w");
- if (fpse == NULL) {
- print_error_errno("Cannot write to singleton file \"%s\"", fnse);
- return 1;
+ if (opts->fnse) {
+ state->fpse = fopen(opts->fnse,"w");
+ if (state->fpse == NULL) {
+ print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse);
+ free(state);
+ return false;
}
}
- h = sam_hdr_read(fp);
- b = bam_init1();
- buf = NULL;
- max_buf = 0;
-
- int64_t n_singletons = 0, n_reads = 0;
- char* previous = NULL;
- kstring_t linebuf = { 0, 0, NULL };
- kputsn("", 0, &linebuf);
-
- while (sam_read1(fp, h, b) >= 0) {
- if (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY)) continue; // skip secondary and supplementary alignments
- ++n_reads;
-
- int i;
- int32_t qlen = b->core.l_qseq;
- assert(qlen >= 0);
- uint8_t* seq;
- uint8_t* qual = bam_get_qual(b);
- const uint8_t *oq = NULL;
- if (use_oq) oq = bam_aux_get(b, "OQ");
- bool has_qual = (qual[0] != 0xff || (use_oq && oq)); // test if there is quality
-
- // If there was a previous readname
- if ( fpse && previous ) {
- if (!strcmp(bam_get_qname(b), previous ) ) {
- fputs(linebuf.s, stdout); // Write previous read
- free(previous);
- previous = NULL;
- } else { // Doesn't match it's a singleton
- ++n_singletons;
- fputs(linebuf.s, fpse); // Write previous read to singletons
- free(previous);
- previous = strdup(bam_get_qname(b));
+ int i;
+ for (i = 0; i < 3; ++i) {
+ if (opts->fnr[i]) {
+ state->fpr[i] = fopen(opts->fnr[i], "w");
+ if (state->fpr[i] == NULL) {
+ print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", i, opts->fnr[i]);
+ free(state);
+ return false;
}
} else {
- fputs(linebuf.s, stdout); // Write pending read
- if (fpse) previous = strdup(bam_get_qname(b));
+ state->fpr[i] = stdout;
}
+ }
- linebuf.l = 0;
- // Write read name
- kputc(!has_qual? '>' : '@', &linebuf);
- kputs(bam_get_qname(b), &linebuf);
- // Add the /1 /2 if requested
- if (has12) {
- if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) kputs("/1\n", &linebuf);
- else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) kputs("/2\n", &linebuf);
- else kputc('\n', &linebuf);
- } else {
- kputc('\n', &linebuf);
- }
+ state->h = sam_hdr_read(state->fp);
+ if (state->h == NULL) {
+ fprintf(pysamerr, "Failed to read header for \"%s\"\n", opts->fn_input);
+ free(state);
+ return false;
+ }
- if (max_buf < qlen + 1) {
- max_buf = qlen + 1;
- kroundup32(max_buf);
- buf = realloc(buf, max_buf);
- if (buf == NULL) {
- fprintf(pysamerr, "Out of memory");
- return 1;
- }
- }
- buf[qlen] = '\0';
- seq = bam_get_seq(b);
- for (i = 0; i < qlen; ++i)
- buf[i] = bam_seqi(seq, i);
- if (b->core.flag & BAM_FREVERSE) { // reverse complement
- for (i = 0; i < qlen>>1; ++i) {
- int8_t t = seq_comp_table[buf[qlen - 1 - i]];
- buf[qlen - 1 - i] = seq_comp_table[buf[i]];
- buf[i] = t;
+ *state_out = state;
+ return true;
+}
+
+static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* status)
+{
+ bool valid = true;
+ bam_hdr_destroy(state->h);
+ check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status);
+ if (state->fpse && fclose(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; }
+ int i;
+ for (i = 0; i < 3; ++i) {
+ if (state->fpr[i] != stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; }
+ }
+ free(state);
+ return valid;
+}
+
+static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
+{
+ return (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments
+ || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
+ || (b->core.flag&(state->flag_off)) != 0);
+
+}
+
+static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state)
+{
+ bam1_t* b = bam_init1();
+ char *current_qname = NULL;
+ int64_t n_reads = 0, n_singletons = 0; // Statistics
+ kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}};
+ int score[3];
+ int at_eof;
+ if (b == NULL ) {
+ perror("[bam2fq_mainloop_singletontrack] Malloc error for bam record buffer.");
+ return false;
+ }
+
+ bool valid = true;
+ while (true) {
+ at_eof = sam_read1(state->fp, state->h, b) < 0;
+
+ if (!at_eof && filter_it_out(b, state)) continue;
+ if (!at_eof) ++n_reads;
+
+ if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) {
+ if (current_qname) {
+ if (score[1] > 0 && score[2] > 0) {
+ // print linebuf[1] to fpr[1], linebuf[2] to fpr[2]
+ if (fputs(linebuf[1].s, state->fpr[1]) == EOF) { valid = false; break; }
+ if (fputs(linebuf[2].s, state->fpr[2]) == EOF) { valid = false; break; }
+ } else if (score[1] > 0 || score[2] > 0) {
+ // print whichever one exists to fpse
+ if (score[1] > 0) {
+ if (fputs(linebuf[1].s, state->fpse) == EOF) { valid = false; break; }
+ } else {
+ if (fputs(linebuf[2].s, state->fpse) == EOF) { valid = false; break; }
+ }
+ ++n_singletons;
+ }
+ if (score[0]) { // TODO: check this
+ // print linebuf[0] to fpr[0]
+ if (fputs(linebuf[0].s, state->fpr[0]) == EOF) { valid = false; break; }
+ }
}
- if (qlen&1) buf[i] = seq_comp_table[buf[i]];
+
+ if (at_eof) break;
+
+ free(current_qname);
+ current_qname = strdup(bam_get_qname(b));
+ score[0] = score[1] = score[2] = 0;
}
- for (i = 0; i < qlen; ++i)
- buf[i] = seq_nt16_str[buf[i]];
- kputs((char*)buf, &linebuf);
- kputc('\n', &linebuf);
- if (has_qual) {
- // Write quality
- kputs("+\n", &linebuf);
- if (use_oq && oq) memcpy(buf, oq + 1, qlen);
- else {
- for (i = 0; i < qlen; ++i)
- buf[i] = 33 + qual[i];
- }
- if (b->core.flag & BAM_FREVERSE) { // reverse
- for (i = 0; i < qlen>>1; ++i) {
- int8_t t = buf[qlen - 1 - i];
- buf[qlen - 1 - i] = buf[i];
- buf[i] = t;
- }
+ // Prefer a copy of the read that has base qualities
+ int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1;
+ if (b_score > score[which_readpart(b)]) {
+ if(!bam1_to_fq(b, &linebuf[which_readpart(b)], state)) {
+ fprintf(pysamerr, "[%s] Error converting read to FASTA/Q\n", __func__);
+ return false;
}
- kputs((char*)buf, &linebuf);
- kputc('\n', &linebuf);
+ score[which_readpart(b)] = b_score;
}
}
+ if (!valid)
+ {
+ perror("[bam2fq_mainloop_singletontrack] Error writing to FASTx files.");
+ }
+ bam_destroy1(b);
+ free(current_qname);
+ free(linebuf[0].s);
+ free(linebuf[1].s);
+ free(linebuf[2].s);
+ fprintf(pysamerr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons);
+ fprintf(pysamerr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
- if (fpse) {
- if ( previous ) { // Nothing left to match it's a singleton
- ++n_singletons;
- fputs(linebuf.s, fpse); // Write previous read to singletons
- } else {
- fputs(linebuf.s, stdout); // Write previous read
- }
+ return valid;
+}
- fprintf(pysamerr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons);
- fclose(fpse);
- } else {
- fputs(linebuf.s, stdout); // Write previous read
+static bool bam2fq_mainloop(bam2fq_state_t *state)
+{
+ // process a name collated BAM into fastq
+ bam1_t* b = bam_init1();
+ if (b == NULL) {
+ perror(NULL);
+ return false;
+ }
+ int64_t n_reads = 0; // Statistics
+ kstring_t linebuf = { 0, 0, NULL }; // Buffer
+ while (sam_read1(state->fp, state->h, b) >= 0) {
+ if (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments
+ || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
+ || (b->core.flag&(state->flag_off)) != 0) continue;
+ ++n_reads;
+
+ if (!bam1_to_fq(b, &linebuf, state)) return false;
+ fputs(linebuf.s, state->fpr[which_readpart(b)]);
}
free(linebuf.s);
- free(previous);
+ bam_destroy1(b);
fprintf(pysamerr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
+ return true;
+}
+
+int main_bam2fq(int argc, char *argv[])
+{
+ int status = EXIT_SUCCESS;
+ bam2fq_opts_t* opts = NULL;
+ bam2fq_state_t* state = NULL;
+
+ bool valid = parse_opts(argc, argv, &opts);
+ if (!valid || opts == NULL) return valid ? EXIT_SUCCESS : EXIT_FAILURE;
+
+ if (!init_state(opts, &state)) return EXIT_FAILURE;
+
+ if (state->fpse) {
+ if (!bam2fq_mainloop_singletontrack(state)) status = EXIT_FAILURE;
+ } else {
+ if (!bam2fq_mainloop(state)) status = EXIT_FAILURE;
+ }
+
+ if (!destroy_state(opts, state, &status)) return EXIT_FAILURE;
+ sam_global_args_free(&opts->ga);
+ free(opts);
- free(buf);
- bam_destroy1(b);
- bam_hdr_destroy(h);
- check_sam_close(fp, argv[optind], "file", &status);
return status;
}
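
The bam1_to_fq rewrite above emits each read back in its original orientation: when BAM_FREVERSE is set, every packed 4-bit base code is complemented through seq_comp_table and printed via the "=ACMGRSVTWYHKDBN" lookup in reverse order, and base qualities are written as Phred+33 characters (quality 30 becomes '?'). A self-contained sketch of that decoding; the two tables are copied from the patch and the packed layout follows bam_seqi()'s two-bases-per-byte convention, but the record itself is invented:

    #include <stdio.h>
    #include <stdint.h>

    static const char nt16[] = "=ACMGRSVTWYHKDBN";   /* 4-bit code -> base character */
    static const int8_t comp[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
                                     1, 9, 5, 13, 3, 11, 7, 15 }; /* seq_comp_table */

    /* two bases per byte, high nibble first, the layout bam_seqi() decodes */
    static int get4(const uint8_t *s, int i) { return (s[i >> 1] >> ((~i & 1) << 2)) & 0xf; }

    int main(void)
    {
        const uint8_t packed[] = { 0x11, 0x24 };     /* A A C G */
        const uint8_t qual[]   = { 30, 31, 32, 33 }; /* Phred scores */
        int i, qlen = 4, is_reverse = 1;             /* pretend BAM_FREVERSE is set */

        for (i = 0; i < qlen; ++i) {                 /* sequence, reverse-complemented */
            int j = is_reverse ? qlen - 1 - i : i;
            int b = is_reverse ? comp[get4(packed, j)] : get4(packed, j);
            putchar(nt16[b]);
        }
        putchar('\n');                               /* prints CGTT */
        for (i = 0; i < qlen; ++i)                   /* qualities, Phred+33, same order */
            putchar(33 + qual[is_reverse ? qlen - 1 - i : i]);
        putchar('\n');                               /* prints BA@? */
        return 0;
    }
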
diff --git a/samtools/samtools.h b/samtools/samtools.h
index 3161822..1e72654 100644
--- a/samtools/samtools.h
+++ b/samtools/samtools.h
@@ -1,6 +1,6 @@
/* samtools.h -- utility routines.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2015 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -27,7 +27,13 @@ DEALINGS IN THE SOFTWARE. */
const char *samtools_version(void);
-void print_error(const char *format, ...);
-void print_error_errno(const char *format, ...);
+#if defined __GNUC__ && __GNUC__ >= 2
+#define CHECK_PRINTF(fmt,args) __attribute__ ((format (printf, fmt, args)))
+#else
+#define CHECK_PRINTF(fmt,args)
+#endif
+
+void print_error(const char *subcommand, const char *format, ...) CHECK_PRINTF(2, 3);
+void print_error_errno(const char *subcommand, const char *format, ...) CHECK_PRINTF(2, 3);
#endif
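
The CHECK_PRINTF addition above lets GCC and Clang type-check the variadic arguments of print_error()/print_error_errno() against their format strings (-Wformat). The same idea in a standalone example; report() is a hypothetical helper, not a samtools function:

    #include <stdio.h>
    #include <stdarg.h>

    #if defined __GNUC__ && __GNUC__ >= 2
    #define CHECK_PRINTF(fmt,args) __attribute__ ((format (printf, fmt, args)))
    #else
    #define CHECK_PRINTF(fmt,args)
    #endif

    /* argument 2 is the format string, variadic arguments start at 3 */
    static void report(const char *tag, const char *fmt, ...) CHECK_PRINTF(2, 3);

    static void report(const char *tag, const char *fmt, ...)
    {
        va_list ap;
        fprintf(stderr, "[%s] ", tag);
        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
        fputc('\n', stderr);
    }

    int main(void)
    {
        report("demo", "%d reads", 42);       /* fine */
        /* report("demo", "%s reads", 42);       would trigger a -Wformat warning */
        return 0;
    }
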
diff --git a/samtools/stats.c b/samtools/stats.c
index fe43e71..512df1d 100644
--- a/samtools/stats.c
+++ b/samtools/stats.c
@@ -1,8 +1,9 @@
/* stats.c -- This is the former bamcheck integrated into samtools/htslib.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2015 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
+ Author: Sam Nicholls <sam at samnicholls.net>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -54,7 +55,9 @@ DEALINGS IN THE SOFTWARE. */
#include <htslib/khash_str2int.h>
#include "samtools.h"
#include <htslib/khash.h>
+#include <htslib/kstring.h>
#include "stats_isize.h"
+#include "sam_opts.h"
#define BWA_MIN_RDLEN 35
// From the spec
@@ -67,6 +70,7 @@ DEALINGS IN THE SOFTWARE. */
#define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
#define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
#define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
+#define IS_ORIGINAL(bam) (((bam)->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY)) == 0)
// The GC-depth graph works as follows: split the reference sequence into
// segments and calculate GC content and depth in each bin. Then sort
@@ -98,21 +102,52 @@ regions_t;
typedef struct
{
- // Parameters
+ uint64_t a;
+ uint64_t c;
+ uint64_t g;
+ uint64_t t;
+ uint64_t n;
+ uint64_t other;
+}
+acgtno_count_t;
+
+typedef struct
+{
+ // Auxiliary data
+ int flag_require, flag_filter;
+ faidx_t *fai; // Reference sequence for GC-depth graph
+ int argc; // Command line arguments to be printed on the output
+ char **argv;
+ int gcd_bin_size; // The size of GC-depth bin
+ int nisize; // The maximum insert size that the allocated array can hold - 0 indicates no limit
int trim_qual; // bwa trim quality
+ float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
+ int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
+ samFile* sam;
+ bam_hdr_t* sam_header;
+
+ // Filters
+ int filter_readlen;
+
+ // Misc
+ char *split_tag; // Tag on which to perform stats splitting
+ char *split_prefix; // Path or string prefix for filenames created when splitting
+}
+stats_info_t;
+typedef struct
+{
// Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
// insert size histogram holder
int nquals; // The number of quality bins
int nbases; // The maximum sequence length the allocated array can hold
- int nisize; // The maximum insert size that the allocated array can hold - 0 indicates no limit
int ngc; // The size of gc_1st and gc_2nd
int nindels; // The maximum indel length for indel distribution
// Arrays for the histogram data
uint64_t *quals_1st, *quals_2nd;
uint64_t *gc_1st, *gc_2nd;
- uint64_t *acgt_cycles;
+ acgtno_count_t *acgtno_cycles;
uint64_t *read_lengths;
uint64_t *insertions, *deletions;
uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
@@ -121,7 +156,6 @@ typedef struct
// The extremes encountered
int max_len; // Maximum read length
int max_qual; // Maximum quality
- float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
int is_sorted;
// Summary numbers
@@ -150,14 +184,12 @@ typedef struct
// GC-depth related data
uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
gc_depth_t *gcd; // The GC-depth bins holder
- int gcd_bin_size; // The size of GC-depth bin
int32_t tid, gcd_pos; // Position of the current bin
int32_t pos; // Position of the last read
// Coverage distribution related data
int ncov; // The number of coverage bins
uint64_t *cov; // The coverage frequencies
- int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
round_buffer_t cov_rbuf; // Pileup round buffer
// Mismatches by read cycle
@@ -167,24 +199,22 @@ typedef struct
int32_t nrseq_buf; // The used part of the buffer
uint64_t *mpc_buf; // Mismatches per cycle
- // Filters
- int filter_readlen;
-
// Target regions
int nregions, reg_from,reg_to;
regions_t *regions;
// Auxiliary data
- int flag_require, flag_filter;
double sum_qual; // For calculating average quality value
- samFile* sam;
- bam_hdr_t* sam_header;
void *rg_hash; // Read groups to include, the array is null-terminated
- faidx_t *fai; // Reference sequence for GC-depth graph
- int argc; // Command line arguments to be printed on the output
- char **argv;
+
+ // Split
+ char* split_name;
+
+ stats_info_t* info; // Pointer to options and settings struct
+
}
stats_t;
+KHASH_MAP_INIT_STR(c2stats, stats_t*)
static void error(const char *format, ...);
int is_in_regions(bam1_t *bam_line, stats_t *stats);
@@ -233,7 +263,7 @@ void round_buffer_flush(stats_t *stats, int64_t pos)
{
if ( !stats->cov_rbuf.buffer[ibuf] )
continue;
- idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
+ idp = coverage_idx(stats->info->cov_min,stats->info->cov_max,stats->ncov,stats->info->cov_step,stats->cov_rbuf.buffer[ibuf]);
stats->cov[idp]++;
stats->cov_rbuf.buffer[ibuf] = 0;
}
@@ -243,7 +273,7 @@ void round_buffer_flush(stats_t *stats, int64_t pos)
{
if ( !stats->cov_rbuf.buffer[ibuf] )
continue;
- idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
+ idp = coverage_idx(stats->info->cov_min,stats->info->cov_max,stats->ncov,stats->info->cov_step,stats->cov_rbuf.buffer[ibuf]);
stats->cov[idp]++;
stats->cov_rbuf.buffer[ibuf] = 0;
}
@@ -315,7 +345,7 @@ void count_indels(stats_t *stats,bam1_t *bam_line)
int idx = is_fwd ? icycle : read_len-icycle-ncig;
if ( idx<0 )
error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
- if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
+ if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
if ( is_1st )
stats->ins_cycles_1st[idx]++;
else
@@ -395,10 +425,10 @@ void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len)
// chunk of refseq in memory. Not very frequent and not noticable in the stats.
if ( cig==BAM_CREF_SKIP || cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) continue;
if ( cig!=BAM_CMATCH && cig!=BAM_CEQUAL && cig!=BAM_CDIFF ) // not relying on precalculated diffs
- error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
+ error("TODO: cigar %d, %s:%d %s\n", cig,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
if ( ncig+iref > stats->nrseq_buf )
- error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam_get_qname(bam_line),stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1);
+ error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam_get_qname(bam_line),stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1);
int im;
for (im=0; im<ncig; im++)
@@ -422,11 +452,11 @@ void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len)
{
uint8_t qual = quals[iread] + 1;
if ( qual>=stats->nquals )
- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
+ error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
int idx = is_fwd ? icycle : read_len-icycle-1;
if ( idx>stats->max_len )
- error("mpc: %d>%d (%s %d %s)\n",idx,stats->max_len,stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
+ error("mpc: %d>%d (%s %d %s)\n",idx,stats->max_len,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
idx = idx*stats->nquals + qual;
if ( idx>=stats->nquals*stats->nbases )
@@ -444,8 +474,8 @@ void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len)
void read_ref_seq(stats_t *stats, int32_t tid, int32_t pos)
{
int i, fai_ref_len;
- char *fai_ref = faidx_fetch_seq(stats->fai, stats->sam_header->target_name[tid], pos, pos+stats->mrseq_buf-1, &fai_ref_len);
- if ( fai_ref_len<0 ) error("Failed to fetch the sequence \"%s\"\n", stats->sam_header->target_name[tid]);
+ char *fai_ref = faidx_fetch_seq(stats->info->fai, stats->info->sam_header->target_name[tid], pos, pos+stats->mrseq_buf-1, &fai_ref_len);
+ if ( fai_ref_len<0 ) error("Failed to fetch the sequence \"%s\"\n", stats->info->sam_header->target_name[tid]);
uint8_t *ptr = stats->rseq_buf;
for (i=0; i<fai_ref_len; i++)
@@ -502,7 +532,7 @@ float fai_gc_content(stats_t *stats, int pos, int len)
void realloc_rseq_buffer(stats_t *stats)
{
int n = stats->nbases*10;
- if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size;
+ if ( stats->info->gcd_bin_size > n ) n = stats->info->gcd_bin_size;
if ( stats->mrseq_buf<n )
{
stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n);
@@ -538,10 +568,10 @@ void realloc_buffers(stats_t *stats, int seq_len)
memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
}
- stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
- if ( !stats->acgt_cycles )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
- memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
+ stats->acgtno_cycles = realloc(stats->acgtno_cycles, n*sizeof(acgtno_count_t));
+ if ( !stats->acgtno_cycles )
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t));
+ memset(stats->acgtno_cycles + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t));
stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
if ( !stats->read_lengths )
@@ -611,65 +641,53 @@ void update_checksum(bam1_t *bam_line, stats_t *stats)
stats->checksum.quals += crc32(0L, qual, (seq_len+1)/2);
}
-void collect_stats(bam1_t *bam_line, stats_t *stats)
+// These stats should only be calculated for the original reads, ignoring
+// supplementary artificial reads; otherwise we'd accidentally double count.
+void collect_orig_read_stats(bam1_t *bam_line, stats_t *stats, int* gc_count_out)
{
- if ( stats->rg_hash )
- {
- const uint8_t *rg = bam_aux_get(bam_line, "RG");
- if ( !rg ) return; // certain read groups were requested but this record has none
- if ( !khash_str2int_has_key(stats->rg_hash, (const char*)(rg + 1)) ) return;
- }
- if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
- {
- stats->nreads_filtered++;
- return;
- }
- if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
- {
- stats->nreads_filtered++;
- return;
- }
- if ( !is_in_regions(bam_line,stats) )
- return;
- if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen )
- return;
+ int seq_len = bam_line->core.l_qseq;
+ stats->total_len += seq_len; // This ignores clipping so only count primary
if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++;
- if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
if ( bam_line->core.flag & BAM_FPAIRED ) stats->nreads_paired_tech++;
- update_checksum(bam_line, stats);
-
- int seq_len = bam_line->core.l_qseq;
- if ( !seq_len ) return;
-
- int read_len = unclipped_length(bam_line);
- if ( read_len >= stats->nbases )
- realloc_buffers(stats,read_len);
- if ( stats->max_len<read_len )
- stats->max_len = read_len;
-
- stats->read_lengths[read_len]++;
-
// Count GC and ACGT per cycle. Note that cycle is approximate, clipping is ignored
- uint8_t base, *seq = bam_get_seq(bam_line);
- int gc_count = 0;
- int i;
- int reverse = IS_REVERSE(bam_line);
+ uint8_t *seq = bam_get_seq(bam_line);
+ int i, read_cycle, gc_count = 0, reverse = IS_REVERSE(bam_line);
for (i=0; i<seq_len; i++)
{
- // Conversion from uint8_t coding to ACGT
+ // Read cycle for current index
+ read_cycle = (reverse ? seq_len-i-1 : i);
+
+ // Conversion from uint8_t coding:
// -12-4---8------5
// =ACMGRSVTWYHKDBN
- // 01 2 3
- base = bam_seqi(seq,i);
- if ( base==0 ) break; // not ready for "=" sequences
- base /= 2;
- if ( base==1 || base==2 ) gc_count++;
- else if ( base>2 ) base=3;
- if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
- error("FIXME: acgt_cycles\n");
- stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
+ switch (bam_seqi(seq, i)) {
+ case 1:
+ stats->acgtno_cycles[ read_cycle ].a++;
+ break;
+ case 2:
+ stats->acgtno_cycles[ read_cycle ].c++;
+ gc_count++;
+ break;
+ case 4:
+ stats->acgtno_cycles[ read_cycle ].g++;
+ gc_count++;
+ break;
+ case 8:
+ stats->acgtno_cycles[ read_cycle ].t++;
+ break;
+ case 15:
+ stats->acgtno_cycles[ read_cycle ].n++;
+ break;
+ default:
+ /*
+ * count "=" sequences in "other" along
+ * with MRSVWYHKDB ambiguity codes
+ */
+ stats->acgtno_cycles[ read_cycle ].other++;
+ break;
+ }
}
int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
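The two index computations above spread a read's GC fraction over the ngc-bin GC histogram (ngc is 200 by default), and the loop in the next hunk then increments every gc_1st/gc_2nd bin in [gc_idx_min, gc_idx_max). A minimal sketch of the same integer arithmetic, assuming ngc = 200, seq_len = 100 and gc_count = 45 (which selects bins 89 and 90):

    /* sketch only -- reproduces the binning arithmetic above */
    static void gc_bin_range(int gc_count, int seq_len, int ngc,
                             int *idx_min, int *idx_max)
    {
        *idx_min = gc_count * (ngc - 1) / seq_len;       /* 45*199/100 = 89 */
        *idx_max = (gc_count + 1) * (ngc - 1) / seq_len; /* 46*199/100 = 91 */
    }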
@@ -694,15 +712,15 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
for (i=gc_idx_min; i<gc_idx_max; i++)
stats->gc_1st[i]++;
}
- if ( stats->trim_qual>0 )
- stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
+ if ( stats->info->trim_qual>0 )
+ stats->nbases_trimmed += bwa_trim_read(stats->info->trim_qual, bam_quals, seq_len, reverse);
// Quality histogram and average quality. Clipping is neglected.
for (i=0; i<seq_len; i++)
{
uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
if ( qual>=stats->nquals )
- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
+ error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
if ( qual>stats->max_qual )
stats->max_qual = qual;
@@ -712,14 +730,15 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
// Look at the flags and increment appropriate counters (mapped, paired, etc)
if ( IS_UNMAPPED(bam_line) )
+ {
stats->nreads_unmapped++;
+ }
else
{
+ stats->nbases_mapped += seq_len; // This ignores clipping so only count primary
+
if ( !bam_line->core.qual )
stats->nreads_mq0++;
-
- count_indels(stats,bam_line);
-
if ( !IS_PAIRED_AND_MAPPED(bam_line) )
stats->nreads_single_mapped++;
else
@@ -730,153 +749,214 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
if ( bam_line->core.tid!=bam_line->core.mtid )
stats->nreads_anomalous++;
+ }
+ }
+ *gc_count_out = gc_count;
+}
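For reference, the switch in collect_orig_read_stats() above works on the 4-bit base codes returned by bam_seqi(): htslib's seq_nt16_str table ("=ACMGRSVTWYHKDBN") maps code 1 to A, 2 to C, 4 to G, 8 to T and 15 to N, while "=" and the IUPAC ambiguity codes land in the new "other" counter. A small, self-contained sketch (illustrative only, not part of the patch) of decoding a record's bases with that table:

    #include <stdio.h>
    #include <htslib/hts.h>   /* seq_nt16_str */
    #include <htslib/sam.h>   /* bam1_t, bam_get_seq, bam_seqi */

    /* Print the bases of one alignment using the same 4-bit encoding
     * that the per-cycle ACGTNO counters above are keyed on. */
    static void print_bases(bam1_t *b)
    {
        const uint8_t *seq = bam_get_seq(b);
        int i;
        for (i = 0; i < b->core.l_qseq; i++)
            putchar(seq_nt16_str[bam_seqi(seq, i)]);
        putchar('\n');
    }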
+
+void collect_stats(bam1_t *bam_line, stats_t *stats)
+{
+ if ( stats->rg_hash )
+ {
+ const uint8_t *rg = bam_aux_get(bam_line, "RG");
+ if ( !rg ) return; // certain read groups were requested but this record has none
+ if ( !khash_str2int_has_key(stats->rg_hash, (const char*)(rg + 1)) ) return;
+ }
+ if ( stats->info->flag_require && (bam_line->core.flag & stats->info->flag_require)!=stats->info->flag_require )
+ {
+ stats->nreads_filtered++;
+ return;
+ }
+ if ( stats->info->flag_filter && (bam_line->core.flag & stats->info->flag_filter) )
+ {
+ stats->nreads_filtered++;
+ return;
+ }
+ if ( !is_in_regions(bam_line,stats) )
+ return;
+ if ( stats->info->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->info->filter_readlen )
+ return;
+
+ update_checksum(bam_line, stats);
+
+ // Secondary reads don't count for most stats purposes
+ if ( bam_line->core.flag & BAM_FSECONDARY )
+ {
+ stats->nreads_secondary++;
+ return;
+ }
+
+ // If the line has no sequence, we cannot continue
+ int seq_len = bam_line->core.l_qseq;
+ if ( !seq_len ) return;
- // The insert size is tricky, because for long inserts the libraries are
- // prepared differently and the pairs point in other direction. BWA does
- // not set the paired flag for them. Similar thing is true also for 454
- // reads. Mates mapped to different chromosomes have isize==0.
- int32_t isize = bam_line->core.isize;
- if ( isize<0 ) isize = -isize;
- if ( stats->nisize > 0 && isize >= stats->nisize )
- isize = stats->nisize-1;
- if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
+ if ( IS_DUP(bam_line) )
+ {
+ stats->total_len_dup += seq_len;
+ stats->nreads_dup++;
+ }
+
+ int read_len = unclipped_length(bam_line);
+ if ( read_len >= stats->nbases )
+ realloc_buffers(stats,read_len);
+ // Update max_len observed
+ if ( stats->max_len<read_len )
+ stats->max_len = read_len;
+ int i;
+ int gc_count = 0;
+
+ // These stats should only be calculated for the original reads, ignoring supplementary artificial reads,
+ // otherwise we'll accidentally double count
+ if ( IS_ORIGINAL(bam_line) )
+ {
+ stats->read_lengths[read_len]++;
+ collect_orig_read_stats(bam_line, stats, &gc_count);
+ }
+
+ // Look at the flags and increment appropriate counters (mapped, paired, etc)
+ if ( IS_UNMAPPED(bam_line) ) return;
+
+ count_indels(stats, bam_line);
+
+ if ( IS_PAIRED_AND_MAPPED(bam_line) )
+ {
+ // The insert size is tricky, because for long inserts the libraries are
+ // prepared differently and the pairs point in other direction. BWA does
+ // not set the paired flag for them. Similar thing is true also for 454
+ // reads. Mates mapped to different chromosomes have isize==0.
+ int32_t isize = bam_line->core.isize;
+ if ( isize<0 ) isize = -isize;
+ if ( stats->info->nisize > 0 && isize >= stats->info->nisize )
+ isize = stats->info->nisize-1;
+ if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
+ {
+ int pos_fst = bam_line->core.mpos - bam_line->core.pos;
+ int is_fst = IS_READ1(bam_line) ? 1 : -1;
+ int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
+ int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
+
+ if ( is_fwd*is_mfwd>0 )
+ stats->isize->inc_other(stats->isize->data, isize);
+ else if ( is_fst*pos_fst>0 )
{
- int pos_fst = bam_line->core.mpos - bam_line->core.pos;
- int is_fst = IS_READ1(bam_line) ? 1 : -1;
- int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
- int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
-
- if ( is_fwd*is_mfwd>0 )
- stats->isize->inc_other(stats->isize->data, isize);
- else if ( is_fst*pos_fst>0 )
- {
- if ( is_fst*is_fwd>0 )
- stats->isize->inc_inward(stats->isize->data, isize);
- else
- stats->isize->inc_outward(stats->isize->data, isize);
- }
- else if ( is_fst*pos_fst<0 )
- {
- if ( is_fst*is_fwd>0 )
- stats->isize->inc_outward(stats->isize->data, isize);
- else
- stats->isize->inc_inward(stats->isize->data, isize);
- }
+ if ( is_fst*is_fwd>0 )
+ stats->isize->inc_inward(stats->isize->data, isize);
+ else
+ stats->isize->inc_outward(stats->isize->data, isize);
+ }
+ else if ( is_fst*pos_fst<0 )
+ {
+ if ( is_fst*is_fwd>0 )
+ stats->isize->inc_outward(stats->isize->data, isize);
+ else
+ stats->isize->inc_inward(stats->isize->data, isize);
}
}
+ }
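The sign arithmetic above classifies each paired-and-mapped read without enumerating flag combinations: mates on the same strand are "other"; otherwise the pair counts as inward when the leftmost mate is on the forward strand (facing its partner) and outward when it is on the reverse strand. As a worked trace with illustrative values: read1 forward at position 100 with its mate reverse at position 300 gives pos_fst = 200, is_fst = 1, is_fwd = 1, is_mfwd = -1, so is_fwd*is_mfwd < 0 (not "other"), is_fst*pos_fst > 0 and is_fst*is_fwd > 0 select inc_inward, i.e. the usual FR orientation. A compact sketch of the same decision, assuming the caller supplies the four signs (hypothetical helper, not in the patch):

    /* 0 = inward, 1 = outward, 2 = other, -1 = not counted */
    static int pair_orientation(int pos_fst, int is_fst, int is_fwd, int is_mfwd)
    {
        if (is_fwd * is_mfwd > 0) return 2;        /* mates on the same strand */
        if (is_fst * pos_fst > 0)                  /* read1 is the leftmost mate */
            return (is_fst * is_fwd > 0) ? 0 : 1;
        if (is_fst * pos_fst < 0)                  /* read1 is the rightmost mate */
            return (is_fst * is_fwd > 0) ? 1 : 0;
        return -1;                                 /* mates start at the same position */
    }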
- // Number of mismatches
- uint8_t *nm = bam_aux_get(bam_line,"NM");
- if (nm)
- stats->nmismatches += bam_aux2i(nm);
+ // Number of mismatches
+ uint8_t *nm = bam_aux_get(bam_line,"NM");
+ if (nm)
+ stats->nmismatches += bam_aux2i(nm);
- // Number of mapped bases from cigar
- if ( bam_line->core.n_cigar == 0)
- error("FIXME: mapped read with no cigar?\n");
- int readlen=seq_len;
- if ( stats->regions )
+ // Number of mapped bases from cigar
+ if ( bam_line->core.n_cigar == 0)
+ error("FIXME: mapped read with no cigar?\n");
+ int readlen=seq_len;
+ if ( stats->regions )
+ {
+ // Count only on-target bases
+ int iref = bam_line->core.pos + 1;
+ for (i=0; i<bam_line->core.n_cigar; i++)
{
- // Count only on-target bases
- int iref = bam_line->core.pos + 1;
- for (i=0; i<bam_line->core.n_cigar; i++)
+ int cig = bam_cigar_op(bam_get_cigar(bam_line)[i]);
+ int ncig = bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
+ if ( !ncig ) continue; // curiously, this can happen: 0D
+ if ( cig==BAM_CDEL ) readlen += ncig;
+ else if ( cig==BAM_CMATCH )
+ {
+ if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
+ else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
+ if ( ncig<0 ) ncig = 0;
+ stats->nbases_mapped_cigar += ncig;
+ iref += bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
+ }
+ else if ( cig==BAM_CINS )
{
- int cig = bam_cigar_op(bam_get_cigar(bam_line)[i]);
- int ncig = bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
- if ( !ncig ) continue; // curiously, this can happen: 0D
- if ( cig==BAM_CDEL ) readlen += ncig;
- else if ( cig==BAM_CMATCH )
- {
- if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
- else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
- if ( ncig<0 ) ncig = 0;
+ iref += ncig;
+ if ( iref>=stats->reg_from && iref<=stats->reg_to )
stats->nbases_mapped_cigar += ncig;
- iref += bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
- }
- else if ( cig==BAM_CINS )
- {
- iref += ncig;
- if ( iref>=stats->reg_from && iref<=stats->reg_to )
- stats->nbases_mapped_cigar += ncig;
- }
}
}
- else
+ }
+ else
+ {
+ // Count the whole read
+ for (i=0; i<bam_line->core.n_cigar; i++)
{
- // Count the whole read
- for (i=0; i<bam_line->core.n_cigar; i++)
- {
- if ( bam_cigar_op(bam_get_cigar(bam_line)[i])==BAM_CMATCH || bam_cigar_op(bam_get_cigar(bam_line)[i])==BAM_CINS )
- stats->nbases_mapped_cigar += bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
- if ( bam_cigar_op(bam_get_cigar(bam_line)[i])==BAM_CDEL )
- readlen += bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
- }
+ if ( bam_cigar_op(bam_get_cigar(bam_line)[i])==BAM_CMATCH || bam_cigar_op(bam_get_cigar(bam_line)[i])==BAM_CINS )
+ stats->nbases_mapped_cigar += bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
+ if ( bam_cigar_op(bam_get_cigar(bam_line)[i])==BAM_CDEL )
+ readlen += bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
}
- stats->nbases_mapped += seq_len;
+ }
- if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
- stats->is_sorted = 0;
- stats->pos = bam_line->core.pos;
+ if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
+ stats->is_sorted = 0;
+ stats->pos = bam_line->core.pos;
- if ( stats->is_sorted )
- {
- if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
- round_buffer_flush(stats,-1);
+ if ( stats->is_sorted )
+ {
+ if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
+ round_buffer_flush(stats, -1);
- // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
- // are not splitted which results in up to seq_len-1 overlaps. The default bin size is
- // 20kbp, so the effect is negligible.
- if ( stats->fai )
+ // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
+ // are not split, which results in up to seq_len-1 overlaps. The default bin size is
+ // 20kbp, so the effect is negligible.
+ if ( stats->info->fai )
+ {
+ int inc_ref = 0, inc_gcd = 0;
+ // First pass or new chromosome
+ if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
+ // Read goes beyond the end of the rseq buffer
+ else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
+ // Read overlaps the next gcd bin
+ else if ( stats->gcd_pos+stats->info->gcd_bin_size < bam_line->core.pos+readlen )
{
- int inc_ref = 0, inc_gcd = 0;
- // First pass or new chromosome
- if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
- // Read goes beyond the end of the rseq buffer
- else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
- // Read overlaps the next gcd bin
- else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen )
- {
- inc_gcd = 1;
- if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1;
- }
- if ( inc_gcd )
- {
- stats->igcd++;
- if ( stats->igcd >= stats->ngcd )
- realloc_gcd_buffer(stats, readlen);
- if ( inc_ref )
- read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
- stats->gcd_pos = bam_line->core.pos;
- stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
- }
-
- count_mismatches_per_cycle(stats,bam_line,read_len);
+ inc_gcd = 1;
+ if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->info->gcd_bin_size ) inc_ref = 1;
}
- // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
- else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
+ if ( inc_gcd )
{
- // First pass or a new chromosome
- stats->tid = bam_line->core.tid;
- stats->gcd_pos = bam_line->core.pos;
stats->igcd++;
if ( stats->igcd >= stats->ngcd )
realloc_gcd_buffer(stats, readlen);
+ if ( inc_ref )
+ read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
+ stats->gcd_pos = bam_line->core.pos;
+ stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->info->gcd_bin_size);
}
- stats->gcd[ stats->igcd ].depth++;
- // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
- if ( !stats->fai )
- stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
-
- // Coverage distribution graph
- round_buffer_flush(stats,bam_line->core.pos);
- round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
- }
- }
- stats->total_len += seq_len;
- if ( IS_DUP(bam_line) )
- {
- stats->total_len_dup += seq_len;
- stats->nreads_dup++;
+ count_mismatches_per_cycle(stats,bam_line,read_len);
+ }
+ // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
+ else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->info->gcd_bin_size )
+ {
+ // First pass or a new chromosome
+ stats->tid = bam_line->core.tid;
+ stats->gcd_pos = bam_line->core.pos;
+ stats->igcd++;
+ if ( stats->igcd >= stats->ngcd )
+ realloc_gcd_buffer(stats, readlen);
+ }
+ stats->gcd[ stats->igcd ].depth++;
+ // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
+ if ( !stats->info->fai )
+ stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
+
+ // Coverage distribution graph
+ round_buffer_flush(stats,bam_line->core.pos);
+ round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
}
}
@@ -908,7 +988,7 @@ float gcd_percentile(gc_depth_t *gcd, int N, int p)
return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
}
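The return statement above is plain linear interpolation between the two depth values that straddle the requested percentile (the earlier part of the function, elided here, computes the bin index k and the fractional offset d). For example, with gcd[k-1].depth = 10, gcd[k].depth = 20 and d = 0.25 the reported depth is 10 + 0.25*(20-10) = 12.5.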
-void output_stats(stats_t *stats, int sparse)
+void output_stats(FILE *to, stats_t *stats, int sparse)
{
// Calculate average insert size and standard deviation (from the main bulk data only)
int isize, ibulk=0;
@@ -932,7 +1012,7 @@ void output_stats(stats_t *stats, int sparse)
bulk += stats->isize->inward(stats->isize->data, isize) + stats->isize->outward(stats->isize->data, isize) + stats->isize->other(stats->isize->data, isize);
avg_isize += isize * (stats->isize->inward(stats->isize->data, isize) + stats->isize->outward(stats->isize->data, isize) + stats->isize->other(stats->isize->data, isize));
- if ( bulk/nisize > stats->isize_main_bulk )
+ if ( bulk/nisize > stats->info->isize_main_bulk )
{
ibulk = isize+1;
nisize = bulk;
@@ -945,164 +1025,170 @@ void output_stats(stats_t *stats, int sparse)
sd_isize = sqrt(sd_isize);
- printf("# This file was produced by samtools stats (%s+htslib-%s) and can be plotted using plot-bamstats\n", samtools_version(), hts_version());
- printf("# The command line was: %s",stats->argv[0]);
+ fprintf(to, "# This file was produced by samtools stats (%s+htslib-%s) and can be plotted using plot-bamstats\n", samtools_version(), hts_version());
+ if( stats->split_name != NULL ){
+ fprintf(to, "# This file contains statistics only for reads with tag: %s=%s\n", stats->info->split_tag, stats->split_name);
+ }
+ else{
+ fprintf(to, "# This file contains statistics for all reads.\n");
+ }
+ fprintf(to, "# The command line was: %s",stats->info->argv[0]);
int i;
- for (i=1; i<stats->argc; i++)
- printf(" %s",stats->argv[i]);
- printf("\n");
- printf("# CHK, Checksum\t[2]Read Names\t[3]Sequences\t[4]Qualities\n");
- printf("# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n");
- printf("CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals);
- printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
- printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); // not counting excluded seqs (and none of the below)
- printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
- printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
- printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
- printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
- printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
- printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired_and_mapped+stats->nreads_single_mapped));
- printf("SN\treads mapped and paired:\t%ld\t# paired-end technology bit set + both mates mapped\n", (long)stats->nreads_paired_and_mapped);
- printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
- printf("SN\treads properly paired:\t%ld\t# proper-pair bit set\n", (long)stats->nreads_properly_paired);
- printf("SN\treads paired:\t%ld\t# paired-end technology bit set\n", (long)stats->nreads_paired_tech);
- printf("SN\treads duplicated:\t%ld\t# PCR or optical duplicate bit set\n", (long)stats->nreads_dup);
- printf("SN\treads MQ0:\t%ld\t# mapped and MQ=0\n", (long)stats->nreads_mq0);
- printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
- printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
- printf("SN\ttotal length:\t%ld\t# ignores clipping\n", (long)stats->total_len);
- printf("SN\tbases mapped:\t%ld\t# ignores clipping\n", (long)stats->nbases_mapped); // the length of the whole read goes here, including soft-clips etc.
- printf("SN\tbases mapped (cigar):\t%ld\t# more accurate\n", (long)stats->nbases_mapped_cigar); // only matched and inserted bases are counted here
- printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
- printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
- printf("SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches);
- printf("SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0);
+ for (i=1; i<stats->info->argc; i++)
+ fprintf(to, " %s", stats->info->argv[i]);
+ fprintf(to, "\n");
+ fprintf(to, "# CHK, Checksum\t[2]Read Names\t[3]Sequences\t[4]Qualities\n");
+ fprintf(to, "# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n");
+ fprintf(to, "CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals);
+ fprintf(to, "# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
+ fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); // not counting excluded seqs (and none of the below)
+ fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
+ fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
+ fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
+ fprintf(to, "SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
+ fprintf(to, "SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
+ fprintf(to, "SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired_and_mapped+stats->nreads_single_mapped));
+ fprintf(to, "SN\treads mapped and paired:\t%ld\t# paired-end technology bit set + both mates mapped\n", (long)stats->nreads_paired_and_mapped);
+ fprintf(to, "SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
+ fprintf(to, "SN\treads properly paired:\t%ld\t# proper-pair bit set\n", (long)stats->nreads_properly_paired);
+ fprintf(to, "SN\treads paired:\t%ld\t# paired-end technology bit set\n", (long)stats->nreads_paired_tech);
+ fprintf(to, "SN\treads duplicated:\t%ld\t# PCR or optical duplicate bit set\n", (long)stats->nreads_dup);
+ fprintf(to, "SN\treads MQ0:\t%ld\t# mapped and MQ=0\n", (long)stats->nreads_mq0);
+ fprintf(to, "SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
+ fprintf(to, "SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
+ fprintf(to, "SN\ttotal length:\t%ld\t# ignores clipping\n", (long)stats->total_len);
+ fprintf(to, "SN\tbases mapped:\t%ld\t# ignores clipping\n", (long)stats->nbases_mapped); // the length of the whole read goes here, including soft-clips etc.
+ fprintf(to, "SN\tbases mapped (cigar):\t%ld\t# more accurate\n", (long)stats->nbases_mapped_cigar); // only matched and inserted bases are counted here
+ fprintf(to, "SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
+ fprintf(to, "SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
+ fprintf(to, "SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches);
+ fprintf(to, "SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0);
float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
- printf("SN\taverage length:\t%.0f\n", avg_read_length);
- printf("SN\tmaximum length:\t%d\n", stats->max_len);
- printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
- printf("SN\tinsert size average:\t%.1f\n", avg_isize);
- printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
- printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
- printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
- printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
- printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
+ fprintf(to, "SN\taverage length:\t%.0f\n", avg_read_length);
+ fprintf(to, "SN\tmaximum length:\t%d\n", stats->max_len);
+ fprintf(to, "SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
+ fprintf(to, "SN\tinsert size average:\t%.1f\n", avg_isize);
+ fprintf(to, "SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
+ fprintf(to, "SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
+ fprintf(to, "SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
+ fprintf(to, "SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
+ fprintf(to, "SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
int ibase,iqual;
if ( stats->max_len<stats->nbases ) stats->max_len++;
if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
- printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
- printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
+ fprintf(to, "# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
+ fprintf(to, "# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
for (ibase=0; ibase<stats->max_len; ibase++)
{
- printf("FFQ\t%d",ibase+1);
+ fprintf(to, "FFQ\t%d",ibase+1);
for (iqual=0; iqual<=stats->max_qual; iqual++)
{
- printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
+ fprintf(to, "\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
}
- printf("\n");
+ fprintf(to, "\n");
}
- printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
- printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
+ fprintf(to, "# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
+ fprintf(to, "# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
for (ibase=0; ibase<stats->max_len; ibase++)
{
- printf("LFQ\t%d",ibase+1);
+ fprintf(to, "LFQ\t%d",ibase+1);
for (iqual=0; iqual<=stats->max_qual; iqual++)
{
- printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
+ fprintf(to, "\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
}
- printf("\n");
+ fprintf(to, "\n");
}
if ( stats->mpc_buf )
{
- printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
- printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
- printf("# is the number of N's and the rest is the number of mismatches\n");
+ fprintf(to, "# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
+ fprintf(to, "# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
+ fprintf(to, "# is the number of N's and the rest is the number of mismatches\n");
for (ibase=0; ibase<stats->max_len; ibase++)
{
- printf("MPC\t%d",ibase+1);
+ fprintf(to, "MPC\t%d",ibase+1);
for (iqual=0; iqual<=stats->max_qual; iqual++)
{
- printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
+ fprintf(to, "\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
}
- printf("\n");
+ fprintf(to, "\n");
}
}
- printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
+ fprintf(to, "# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
int ibase_prev = 0;
for (ibase=0; ibase<stats->ngc; ibase++)
{
if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
- printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
+ fprintf(to, "GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
ibase_prev = ibase;
}
- printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
+ fprintf(to, "# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
ibase_prev = 0;
for (ibase=0; ibase<stats->ngc; ibase++)
{
if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
- printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
+ fprintf(to, "GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
ibase_prev = ibase;
}
- printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
+ fprintf(to, "# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N and O counts as a percentage of all A/C/G/T bases [%%]\n");
for (ibase=0; ibase<stats->max_len; ibase++)
{
- uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
- uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
- if ( ! sum ) continue;
- printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
+ acgtno_count_t *acgtno_count = &(stats->acgtno_cycles[ibase]);
+ uint64_t acgt_sum = acgtno_count->a + acgtno_count->c + acgtno_count->g + acgtno_count->t;
+ if ( ! acgt_sum ) continue;
+ fprintf(to, "GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1, 100.*acgtno_count->a/acgt_sum, 100.*acgtno_count->c/acgt_sum, 100.*acgtno_count->g/acgt_sum, 100.*acgtno_count->t/acgt_sum, 100.*acgtno_count->n/acgt_sum, 100.*acgtno_count->other/acgt_sum);
}
- printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
+ fprintf(to, "# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
for (isize=0; isize<ibulk; isize++) {
long in = (long)(stats->isize->inward(stats->isize->data, isize));
long out = (long)(stats->isize->outward(stats->isize->data, isize));
long other = (long)(stats->isize->other(stats->isize->data, isize));
if (!sparse || in + out + other > 0) {
- printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, in+out+other,
+ fprintf(to, "IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, in+out+other,
in , out, other);
}
}
- printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
+ fprintf(to, "# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
int ilen;
for (ilen=0; ilen<stats->max_len; ilen++)
{
if ( stats->read_lengths[ilen]>0 )
- printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
+ fprintf(to, "RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
}
- printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
+ fprintf(to, "# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
for (ilen=0; ilen<stats->nindels; ilen++)
{
if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
- printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
+ fprintf(to, "ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
}
- printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
+ fprintf(to, "# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
for (ilen=0; ilen<=stats->nbases; ilen++)
{
// For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
// the index of the cycle of the first inserted base (also 1-based)
if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
- printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
+ fprintf(to, "IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
}
- printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
+ fprintf(to, "# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
if ( stats->cov[0] )
- printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
+ fprintf(to, "COV\t[<%d]\t%d\t%ld\n",stats->info->cov_min,stats->info->cov_min-1, (long)stats->cov[0]);
int icov;
for (icov=1; icov<stats->ncov-1; icov++)
if ( stats->cov[icov] )
- printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
+ fprintf(to, "COV\t[%d-%d]\t%d\t%ld\n",stats->info->cov_min + (icov-1)*stats->info->cov_step, stats->info->cov_min + icov*stats->info->cov_step-1,stats->info->cov_min + icov*stats->info->cov_step-1, (long)stats->cov[icov]);
if ( stats->cov[stats->ncov-1] )
- printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
+ fprintf(to, "COV\t[%d<]\t%d\t%ld\n",stats->info->cov_min + (stats->ncov-2)*stats->info->cov_step-1,stats->info->cov_min + (stats->ncov-2)*stats->info->cov_step-1, (long)stats->cov[stats->ncov-1]);
// Calculate average GC content, then sort by GC and depth
- printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
+ fprintf(to, "# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
uint32_t igcd;
for (igcd=0; igcd<stats->igcd; igcd++)
{
- if ( stats->fai )
+ if ( stats->info->fai )
stats->gcd[igcd].gc = rint(100. * stats->gcd[igcd].gc);
else
if ( stats->gcd[igcd].depth )
@@ -1120,82 +1206,39 @@ void output_stats(stats_t *stats, int sparse)
nbins++;
itmp++;
}
- printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
- gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
- gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
- gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
- gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
- gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
+ fprintf(to, "GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
+ gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->info->gcd_bin_size,
+ gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->info->gcd_bin_size,
+ gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->info->gcd_bin_size,
+ gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->info->gcd_bin_size,
+ gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->info->gcd_bin_size
);
igcd += nbins;
}
}
-size_t mygetline(char **line, size_t *n, FILE *fp)
+void init_regions(stats_t *stats, const char *file)
{
- if (line == NULL || n == NULL || fp == NULL)
- {
- errno = EINVAL;
- return -1;
- }
- if (*n==0 || !*line)
- {
- *line = NULL;
- *n = 0;
- }
-
- size_t nread=0;
- int c;
- while ((c=getc(fp))!= EOF && c!='\n')
- {
- if ( ++nread>=*n )
- {
- *n += 255;
- *line = realloc(*line, sizeof(char)*(*n));
- }
- (*line)[nread-1] = c;
- }
- if ( nread>=*n )
- {
- *n += 255;
- *line = realloc(*line, sizeof(char)*(*n));
- }
- (*line)[nread] = 0;
- return nread>0 ? nread : -1;
-
-}
-
-void init_regions(stats_t *stats, char *file)
-{
-#if 0
- khiter_t iter;
- khash_t(kh_bam_tid) *header_hash;
-
- header_hash = (khash_t(kh_bam_tid)*)stats->sam_header->hash;
-
FILE *fp = fopen(file,"r");
if ( !fp ) error("%s: %s\n",file,strerror(errno));
- char *line = NULL;
- size_t len = 0;
- ssize_t nread;
+ kstring_t line = { 0, 0, NULL };
int warned = 0;
int prev_tid=-1, prev_pos=-1;
- while ((nread = mygetline(&line, &len, fp)) != -1)
+ while (line.l = 0, kgetline(&line, (kgets_func *)fgets, fp) >= 0)
{
- if ( line[0] == '#' ) continue;
+ if ( line.s[0] == '#' ) continue;
int i = 0;
- while ( i<nread && !isspace(line[i]) ) i++;
- if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
- line[i] = 0;
+ while ( i<line.l && !isspace(line.s[i]) ) i++;
+ if ( i>=line.l ) error("Could not parse the file: %s [%s]\n", file, line.s);
+ line.s[i] = '\0';
- iter = kh_get(kh_bam_tid, header_hash, line);
- int tid = kh_val(header_hash, iter);
- if ( iter == kh_end(header_hash) )
+ int tid = bam_name2id(stats->info->sam_header, line.s);
+ if ( tid < 0 )
{
if ( !warned )
- fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
+ fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line.s);
warned = 1;
continue;
}
@@ -1218,23 +1261,19 @@ void init_regions(stats_t *stats, char *file)
stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
}
- if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
+ if ( (sscanf(&line.s[i+1],"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]);
if ( prev_tid==-1 || prev_tid!=tid )
{
prev_tid = tid;
prev_pos = stats->regions[tid].pos[npos].from;
}
if ( prev_pos>stats->regions[tid].pos[npos].from )
- error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
+ error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line.s,stats->regions[tid].pos[npos].from,prev_pos);
stats->regions[tid].npos++;
}
- if (line) free(line);
+ free(line.s);
if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
fclose(fp);
-#else
- fprintf(stderr, "Samtools-htslib: init_regions() header parsing not yet implemented\n");
- abort();
-#endif
}
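From the parsing above ('#' comment lines skipped, a sequence name terminated by whitespace, two integers read with sscanf, and a per-chromosome ordering check), the -t/--target-regions file is tab- or space-delimited with one region per line, 1-based inclusive coordinates, sorted by start position within each sequence. An illustrative file (hypothetical names and coordinates):

    # sequence  start   end
    chr1        10500   20200
    chr1        35000   36000
    chr2        1       50000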
void destroy_regions(stats_t *stats)
@@ -1278,7 +1317,7 @@ int is_in_regions(bam1_t *bam_line, stats_t *stats)
return 1;
}
-void init_group_id(stats_t *stats, char *id)
+void init_group_id(stats_t *stats, const char *id)
{
#if 0
if ( !stats->sam_header->dict )
@@ -1327,11 +1366,14 @@ static void error(const char *format, ...)
printf(" -I, --id <string> Include only listed read group or sample name\n");
printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
+ printf(" -P, --split-prefix <str> Path or string prefix for filepaths output by -S (default is input filename)\n");
printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth and mismatches-per-cycle calculation).\n");
+ printf(" -s, --sam Ignored (input format is auto-detected).\n");
+ printf(" -S, --split <tag> Also write statistics to separate files split by tagged field.\n");
printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
- printf(" -s, --sam Input is SAM (usually auto-detected now).\n");
printf(" -x, --sparse Suppress outputting IS rows where there are no insertions.\n");
+ sam_global_opt_help(stdout, "-.--.");
printf("\n");
}
else
@@ -1341,13 +1383,17 @@ static void error(const char *format, ...)
vfprintf(stderr, format, ap);
va_end(ap);
}
- exit(-1);
+ exit(1);
+}
+
+void cleanup_stats_info(stats_info_t* info){
+ if (info->fai) fai_destroy(info->fai);
+ sam_close(info->sam);
+ free(info);
}
void cleanup_stats(stats_t* stats)
{
- sam_close(stats->sam);
- if (stats->fai) fai_destroy(stats->fai);
free(stats->cov_rbuf.buffer); free(stats->cov);
free(stats->quals_1st); free(stats->quals_2nd);
free(stats->gc_1st); free(stats->gc_2nd);
@@ -1356,7 +1402,7 @@ void cleanup_stats(stats_t* stats)
free(stats->gcd);
free(stats->rseq_buf);
free(stats->mpc_buf);
- free(stats->acgt_cycles);
+ free(stats->acgtno_cycles);
free(stats->read_lengths);
free(stats->insertions);
free(stats->deletions);
@@ -1366,43 +1412,189 @@ void cleanup_stats(stats_t* stats)
free(stats->del_cycles_2nd);
destroy_regions(stats);
if ( stats->rg_hash ) khash_str2int_destroy(stats->rg_hash);
+ free(stats->split_name);
free(stats);
}
-int main_stats(int argc, char *argv[])
+void output_split_stats(khash_t(c2stats) *split_hash, char* bam_fname, int sparse)
{
- char *targets = NULL;
- char *bam_fname = NULL;
- char *group_id = NULL;
- samFile* sam = NULL;
- char in_mode[5];
- int sparse = 0;
+ int i = 0;
+ kstring_t output_filename = { 0, 0, NULL };
+ stats_t *curr_stats = NULL;
+ for(i = kh_begin(split_hash); i != kh_end(split_hash); ++i){
+ if(!kh_exist(split_hash, i)) continue;
+ curr_stats = kh_value(split_hash, i);
+ round_buffer_flush(curr_stats, -1);
+
+ output_filename.l = 0;
+ if (curr_stats->info->split_prefix)
+ kputs(curr_stats->info->split_prefix, &output_filename);
+ else
+ kputs(bam_fname, &output_filename);
+ kputc('_', &output_filename);
+ kputs(curr_stats->split_name, &output_filename);
+ kputs(".bamstat", &output_filename);
+
+ FILE *to = fopen(output_filename.s, "w");
+ if(to == NULL){
+ error("Could not open '%s' for writing.\n", output_filename.s);
+ }
+ output_stats(to, curr_stats, sparse);
+ fclose(to);
+ }
+
+ free(output_filename.s);
+}
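In practice the new -S/--split mode writes one extra report per distinct value of the chosen tag, in addition to the combined report that still goes to stdout. The filename built above is "<prefix>_<value>.bamstat", where <prefix> is the -P/--split-prefix argument if given and the input filename otherwise; for example (hypothetical paths), `samtools stats -S RG -P /tmp/run1 aln.bam` would write /tmp/run1_<read-group>.bamstat for each read group seen, while the overall statistics are printed to stdout as before.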
+
+void destroy_split_stats(khash_t(c2stats) *split_hash)
+{
+ int i = 0;
+ stats_t *curr_stats = NULL;
+ for(i = kh_begin(split_hash); i != kh_end(split_hash); ++i){
+ if(!kh_exist(split_hash, i)) continue;
+ curr_stats = kh_value(split_hash, i);
+ cleanup_stats(curr_stats);
+ }
+ kh_destroy(c2stats, split_hash);
+}
+stats_info_t* stats_info_init(int argc, char *argv[])
+{
+ stats_info_t* info = calloc(1, sizeof(stats_info_t));
+ info->nisize = 8000;
+ info->isize_main_bulk = 0.99; // There are always outliers at the far end
+ info->gcd_bin_size = 20e3;
+ info->cov_min = 1;
+ info->cov_max = 1000;
+ info->cov_step = 1;
+ info->filter_readlen = -1;
+ info->argc = argc;
+ info->argv = argv;
+
+ return info;
+}
+
+int init_stat_info_fname(stats_info_t* info, const char* bam_fname, const htsFormat* in_fmt)
+{
+ // .. bam
+ samFile* sam;
+ if ((sam = sam_open_format(bam_fname, "r", in_fmt)) == 0) {
+ error("Failed to open: %s\n", bam_fname);
+ return 1;
+ }
+ info->sam = sam;
+ info->sam_header = sam_hdr_read(sam);
+ if (info->sam_header == NULL) {
+ error("Failed to read header for '%s'\n", bam_fname);
+ return 1;
+ }
+ return 0;
+}
+
+stats_t* stats_init()
+{
stats_t *stats = calloc(1,sizeof(stats_t));
stats->ngc = 200;
stats->nquals = 256;
stats->nbases = 300;
- stats->nisize = 8000;
stats->max_len = 30;
stats->max_qual = 40;
- stats->isize_main_bulk = 0.99; // There are always outliers at the far end
- stats->gcd_bin_size = 20e3;
stats->rseq_pos = -1;
stats->tid = stats->gcd_pos = -1;
stats->igcd = 0;
stats->is_sorted = 1;
- stats->cov_min = 1;
- stats->cov_max = 1000;
- stats->cov_step = 1;
- stats->argc = argc;
- stats->argv = argv;
- stats->filter_readlen = -1;
stats->nindels = stats->nbases;
+ stats->split_name = NULL;
+
+ return stats;
+}
+
+static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* group_id, const char* targets)
+{
+ // Give stats_t a pointer to the info struct
+ // This saves us having to pass the stats_info_t to every function
+ stats->info = info;
+
+ // Init structures
+ // .. coverage bins and round buffer
+ if ( info->cov_step > info->cov_max - info->cov_min + 1 )
+ {
+ info->cov_step = info->cov_max - info->cov_min;
+ if ( info->cov_step <= 0 )
+ info->cov_step = 1;
+ }
+ stats->ncov = 3 + (info->cov_max-info->cov_min) / info->cov_step;
+ info->cov_max = info->cov_min + ((info->cov_max-info->cov_min)/info->cov_step +1)*info->cov_step - 1;
+ stats->cov = calloc(sizeof(uint64_t),stats->ncov);
+ stats->cov_rbuf.size = stats->nbases*5;
+ stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
+
+ if ( group_id ) init_group_id(stats, group_id);
+ // .. arrays
+ stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
+ stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
+ stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
+ stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
+ stats->isize = init_isize_t(info->nisize);
+ stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
+ stats->mpc_buf = info->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
+ stats->acgtno_cycles = calloc(stats->nbases,sizeof(acgtno_count_t));
+ stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
+ stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
+ stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
+ stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
+ stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
+ stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
+ stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
+ realloc_rseq_buffer(stats);
+ if ( targets )
+ init_regions(stats, targets);
+}
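As a worked example of the coverage-bin setup above, with the defaults from stats_info_init() (cov_min = 1, cov_max = 1000, cov_step = 1): ncov = 3 + (1000-1)/1 = 1002, i.e. one underflow bin for depths below cov_min, 1000 regular bins, and one overflow bin; the recomputed cov_max = 1 + ((1000-1)/1 + 1)*1 - 1 = 1000 is unchanged for these defaults, but with a step that does not divide the range evenly cov_max is rounded up to a whole number of bins.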
+
+static stats_t* get_curr_split_stats(bam1_t* bam_line, khash_t(c2stats)* split_hash, stats_info_t* info, char* targets)
+{
+ stats_t *curr_stats = NULL;
+ const uint8_t *tag_val = bam_aux_get(bam_line, info->split_tag);
+ if(tag_val == 0){
+ error("Tag '%s' not found in bam_line.\n", info->split_tag);
+ }
+ char* split_name = strdup(bam_aux2Z(tag_val));
+
+ // New stats object, under split
+ khiter_t k = kh_get(c2stats, split_hash, split_name);
+ if(k == kh_end(split_hash)){
+ curr_stats = stats_init(); // mallocs new instance
+ init_stat_structs(curr_stats, info, NULL, targets);
+ curr_stats->split_name = split_name;
+
+ // Record index in hash
+ int ret = 0;
+ khiter_t iter = kh_put(c2stats, split_hash, split_name, &ret);
+ if( ret < 0 ){
+ error("Failed to insert key '%s' into split_hash", split_name);
+ }
+ kh_val(split_hash, iter) = curr_stats; // store pointer to stats
+ }
+ else{
+ curr_stats = kh_value(split_hash, k);
+ free(split_name); // don't need to hold on to this if it wasn't new
+ }
+ return curr_stats;
+}
+
+int main_stats(int argc, char *argv[])
+{
+ char *targets = NULL;
+ char *bam_fname = NULL;
+ char *group_id = NULL;
+ int sparse = 0;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
- strcpy(in_mode, "rb");
+ stats_info_t *info = stats_info_init(argc, argv);
static const struct option loptions[] =
{
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
{"help", no_argument, NULL, 'h'},
{"remove-dups", no_argument, NULL, 'd'},
{"sam", no_argument, NULL, 's'},
@@ -1418,35 +1610,43 @@ int main_stats(int argc, char *argv[])
{"id", required_argument, NULL, 'I'},
{"GC-depth", required_argument, NULL, 1},
{"sparse", no_argument, NULL, 'x'},
+ {"split", required_argument, NULL, 'S'},
+ {"split-prefix", required_argument, NULL, 'P'},
{NULL, 0, NULL, 0}
};
int opt;
- while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 )
+
+ while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:",loptions,NULL))>0 )
{
switch (opt)
{
- case 'f': stats->flag_require = bam_str2flag(optarg); break;
- case 'F': stats->flag_filter = bam_str2flag(optarg); break;
- case 'd': stats->flag_filter |= BAM_FDUP; break;
- case 's': strcpy(in_mode, "r"); break;
- case 'r': stats->fai = fai_load(optarg);
- if (stats->fai==0)
+ case 'f': info->flag_require = bam_str2flag(optarg); break;
+ case 'F': info->flag_filter = bam_str2flag(optarg); break;
+ case 'd': info->flag_filter |= BAM_FDUP; break;
+ case 's': break;
+ case 'r': info->fai = fai_load(optarg);
+ if (info->fai==NULL)
error("Could not load faidx: %s\n", optarg);
break;
- case 1 : stats->gcd_bin_size = atof(optarg); break;
- case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
+ case 1 : info->gcd_bin_size = atof(optarg); break;
+ case 'c': if ( sscanf(optarg,"%d,%d,%d",&info->cov_min,&info->cov_max,&info->cov_step)!= 3 )
error("Unable to parse -c %s\n", optarg);
break;
- case 'l': stats->filter_readlen = atoi(optarg); break;
- case 'i': stats->nisize = atoi(optarg); break;
- case 'm': stats->isize_main_bulk = atof(optarg); break;
- case 'q': stats->trim_qual = atoi(optarg); break;
+ case 'l': info->filter_readlen = atoi(optarg); break;
+ case 'i': info->nisize = atoi(optarg); break;
+ case 'm': info->isize_main_bulk = atof(optarg); break;
+ case 'q': info->trim_qual = atoi(optarg); break;
case 't': targets = optarg; break;
case 'I': group_id = optarg; break;
case 'x': sparse = 1; break;
+ case 'S': info->split_tag = optarg; break;
+ case 'P': info->split_prefix = optarg; break;
case '?':
case 'h': error(NULL);
- default: error("Unknown argument: %s\n", optarg);
+ default:
+ if (parse_sam_global_opt(opt, optarg, loptions, &ga) != 0)
+ error("Unknown argument: %s\n", optarg);
+ break;
}
}
if ( optind<argc )
@@ -1459,62 +1659,36 @@ int main_stats(int argc, char *argv[])
bam_fname = "-";
}
- // Init structures
- // .. coverage bins and round buffer
- if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
- {
- stats->cov_step = stats->cov_max - stats->cov_min;
- if ( stats->cov_step <= 0 )
- stats->cov_step = 1;
- }
- stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
- stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
- stats->cov = calloc(sizeof(uint64_t),stats->ncov);
- stats->cov_rbuf.size = stats->nbases*5;
- stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
- // .. bam
- if ((sam = sam_open(bam_fname, in_mode)) == 0)
- error("Failed to open: %s\n", bam_fname);
- stats->sam = sam;
- stats->sam_header = sam_hdr_read(sam);
- if ( group_id ) init_group_id(stats, group_id);
- bam1_t *bam_line = bam_init1();
- // .. arrays
- stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
- stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
- stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
- stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
- stats->isize = init_isize_t(stats->nisize);
- stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
- stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
- stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
- stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
- stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
- stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
- stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
- stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
- stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
- stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
- realloc_rseq_buffer(stats);
- if ( targets )
- init_regions(stats, targets);
+ if (init_stat_info_fname(info, bam_fname, &ga.in)) return 1;
+
+ stats_t *all_stats = stats_init();
+ stats_t *curr_stats = NULL;
+ init_stat_structs(all_stats, info, group_id, targets);
+ // Init
+ // .. hash
+ khash_t(c2stats)* split_hash = kh_init(c2stats);
// Collect statistics
+ bam1_t *bam_line = bam_init1();
if ( optind<argc )
{
// Collect stats in selected regions only
- hts_idx_t *bam_idx = bam_index_load(bam_fname);
+ hts_idx_t *bam_idx = sam_index_load(info->sam,bam_fname);
if (bam_idx == 0)
error("Random alignment retrieval only works for indexed BAM files.\n");
int i;
for (i=optind; i<argc; i++)
{
- reset_regions(stats);
- hts_itr_t* iter = bam_itr_querys(bam_idx, stats->sam_header, argv[i]);
- while (sam_itr_next(sam, iter, bam_line) >= 0) {
- collect_stats(bam_line,stats);
+ hts_itr_t* iter = bam_itr_querys(bam_idx, info->sam_header, argv[i]);
+ while (sam_itr_next(info->sam, iter, bam_line) >= 0) {
+ if (info->split_tag) {
+ curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets);
+ collect_stats(bam_line, curr_stats);
+ }
+ collect_stats(bam_line, all_stats);
}
+ reset_regions(all_stats);
bam_itr_destroy(iter);
}
hts_idx_destroy(bam_idx);
@@ -1522,16 +1696,33 @@ int main_stats(int argc, char *argv[])
else
{
// Stream through the entire BAM ignoring off-target regions if -t is given
- while (sam_read1(sam, stats->sam_header, bam_line) >= 0)
- collect_stats(bam_line,stats);
+ int ret;
+ while ((ret = sam_read1(info->sam, info->sam_header, bam_line)) >= 0) {
+ if (info->split_tag) {
+ curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets);
+ collect_stats(bam_line, curr_stats);
+ }
+ collect_stats(bam_line, all_stats);
+ }
+
+ if (ret < -1) {
+ fprintf(stderr, "Failure while decoding file\n");
+ return 1;
+ }
}
- round_buffer_flush(stats,-1);
- output_stats(stats, sparse);
+ round_buffer_flush(all_stats, -1);
+ output_stats(stdout, all_stats, sparse);
+ if (info->split_tag)
+ output_split_stats(split_hash, bam_fname, sparse);
+
bam_destroy1(bam_line);
- bam_hdr_destroy(stats->sam_header);
+ bam_hdr_destroy(info->sam_header);
+ sam_global_args_free(&ga);
- cleanup_stats(stats);
+ cleanup_stats(all_stats);
+ cleanup_stats_info(info);
+ destroy_split_stats(split_hash);
return 0;
}
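Two access paths survive the refactor: when region arguments follow the filename, the index is opened with sam_index_load() and each region is iterated with bam_itr_querys()/sam_itr_next(), which requires an indexed input file; otherwise the whole file is streamed with sam_read1(), whose return value is -1 at normal end of file and less than -1 on a decode error, which is what the new "ret < -1" check reports.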
diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c
index a7ea9e0..e30b2ad 100644
--- a/samtools/stats.c.pysam.c
+++ b/samtools/stats.c.pysam.c
@@ -2,9 +2,10 @@
/* stats.c -- This is the former bamcheck integrated into samtools/htslib.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2015 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
+ Author: Sam Nicholls <sam at samnicholls.net>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -56,7 +57,9 @@ DEALINGS IN THE SOFTWARE. */
#include <htslib/khash_str2int.h>
#include "samtools.h"
#include <htslib/khash.h>
+#include <htslib/kstring.h>
#include "stats_isize.h"
+#include "sam_opts.h"
#define BWA_MIN_RDLEN 35
// From the spec
@@ -69,6 +72,7 @@ DEALINGS IN THE SOFTWARE. */
#define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
#define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
#define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
+#define IS_ORIGINAL(bam) (((bam)->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY)) == 0)
// The GC-depth graph works as follows: split the reference sequence into
// segments and calculate GC content and depth in each bin. Then sort
@@ -100,21 +104,52 @@ regions_t;
typedef struct
{
- // Parameters
+ uint64_t a;
+ uint64_t c;
+ uint64_t g;
+ uint64_t t;
+ uint64_t n;
+ uint64_t other;
+}
+acgtno_count_t;
+
+typedef struct
+{
+ // Auxiliary data
+ int flag_require, flag_filter;
+ faidx_t *fai; // Reference sequence for GC-depth graph
+ int argc; // Command line arguments to be printed on the output
+ char **argv;
+ int gcd_bin_size; // The size of GC-depth bin
+ int nisize; // The maximum insert size that the allocated array can hold - 0 indicates no limit
int trim_qual; // bwa trim quality
+ float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
+ int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
+ samFile* sam;
+ bam_hdr_t* sam_header;
+
+ // Filters
+ int filter_readlen;
+
+ // Misc
+ char *split_tag; // Tag on which to perform stats splitting
+ char *split_prefix; // Path or string prefix for filenames created when splitting
+}
+stats_info_t;
+typedef struct
+{
// Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
// insert size histogram holder
int nquals; // The number of quality bins
int nbases; // The maximum sequence length the allocated array can hold
- int nisize; // The maximum insert size that the allocated array can hold - 0 indicates no limit
int ngc; // The size of gc_1st and gc_2nd
int nindels; // The maximum indel length for indel distribution
// Arrays for the histogram data
uint64_t *quals_1st, *quals_2nd;
uint64_t *gc_1st, *gc_2nd;
- uint64_t *acgt_cycles;
+ acgtno_count_t *acgtno_cycles;
uint64_t *read_lengths;
uint64_t *insertions, *deletions;
uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
@@ -123,7 +158,6 @@ typedef struct
// The extremes encountered
int max_len; // Maximum read length
int max_qual; // Maximum quality
- float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
int is_sorted;
// Summary numbers
@@ -152,14 +186,12 @@ typedef struct
// GC-depth related data
uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
gc_depth_t *gcd; // The GC-depth bins holder
- int gcd_bin_size; // The size of GC-depth bin
int32_t tid, gcd_pos; // Position of the current bin
int32_t pos; // Position of the last read
// Coverage distribution related data
int ncov; // The number of coverage bins
uint64_t *cov; // The coverage frequencies
- int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
round_buffer_t cov_rbuf; // Pileup round buffer
// Mismatches by read cycle
@@ -169,24 +201,22 @@ typedef struct
int32_t nrseq_buf; // The used part of the buffer
uint64_t *mpc_buf; // Mismatches per cycle
- // Filters
- int filter_readlen;
-
// Target regions
int nregions, reg_from,reg_to;
regions_t *regions;
// Auxiliary data
- int flag_require, flag_filter;
double sum_qual; // For calculating average quality value
- samFile* sam;
- bam_hdr_t* sam_header;
void *rg_hash; // Read groups to include, the array is null-terminated
- faidx_t *fai; // Reference sequence for GC-depth graph
- int argc; // Command line arguments to be printed on the output
- char **argv;
+
+ // Split
+ char* split_name;
+
+ stats_info_t* info; // Pointer to options and settings struct
+
}
stats_t;
+KHASH_MAP_INIT_STR(c2stats, stats_t*)
static void error(const char *format, ...);
int is_in_regions(bam1_t *bam_line, stats_t *stats);
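The new stats_info_t/stats_t split above separates shared settings (filter flags, reference, coverage bins, split tag) from the per-output accumulators; every stats_t keeps a pointer back to the shared struct via stats->info, and KHASH_MAP_INIT_STR(c2stats, stats_t*) generates a string-keyed hash that later maps each tag value to its own stats_t. A minimal sketch of that lookup-or-insert pattern follows; the demo_stats_t type and names are illustrative, not from the patch:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <htslib/khash.h>

    typedef struct { long nreads; } demo_stats_t;      /* stand-in for stats_t */

    /* Generates khash_t(demo) plus kh_init/kh_get/kh_put/... for char* keys. */
    KHASH_MAP_INIT_STR(demo, demo_stats_t*)

    static demo_stats_t *lookup_or_create(khash_t(demo) *h, const char *tag_value)
    {
        khiter_t k = kh_get(demo, h, tag_value);
        if (k == kh_end(h)) {                           /* key not seen yet */
            int ret;
            char *key = strdup(tag_value);              /* the hash stores the pointer, so copy it */
            k = kh_put(demo, h, key, &ret);
            kh_value(h, k) = calloc(1, sizeof(demo_stats_t));
        }
        return kh_value(h, k);
    }

    int main(void)
    {
        khash_t(demo) *h = kh_init(demo);
        lookup_or_create(h, "grp1")->nreads++;
        lookup_or_create(h, "grp1")->nreads++;
        printf("%ld\n", lookup_or_create(h, "grp1")->nreads);   /* prints 2 */
        kh_destroy(demo, h);                            /* key/value frees omitted for brevity */
        return 0;
    }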
@@ -235,7 +265,7 @@ void round_buffer_flush(stats_t *stats, int64_t pos)
{
if ( !stats->cov_rbuf.buffer[ibuf] )
continue;
- idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
+ idp = coverage_idx(stats->info->cov_min,stats->info->cov_max,stats->ncov,stats->info->cov_step,stats->cov_rbuf.buffer[ibuf]);
stats->cov[idp]++;
stats->cov_rbuf.buffer[ibuf] = 0;
}
@@ -245,7 +275,7 @@ void round_buffer_flush(stats_t *stats, int64_t pos)
{
if ( !stats->cov_rbuf.buffer[ibuf] )
continue;
- idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
+ idp = coverage_idx(stats->info->cov_min,stats->info->cov_max,stats->ncov,stats->info->cov_step,stats->cov_rbuf.buffer[ibuf]);
stats->cov[idp]++;
stats->cov_rbuf.buffer[ibuf] = 0;
}
@@ -317,7 +347,7 @@ void count_indels(stats_t *stats,bam1_t *bam_line)
int idx = is_fwd ? icycle : read_len-icycle-ncig;
if ( idx<0 )
error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
- if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
+ if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
if ( is_1st )
stats->ins_cycles_1st[idx]++;
else
@@ -397,10 +427,10 @@ void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len)
// chunk of refseq in memory. Not very frequent and not noticable in the stats.
if ( cig==BAM_CREF_SKIP || cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) continue;
if ( cig!=BAM_CMATCH && cig!=BAM_CEQUAL && cig!=BAM_CDIFF ) // not relying on precalculated diffs
- error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
+ error("TODO: cigar %d, %s:%d %s\n", cig,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
if ( ncig+iref > stats->nrseq_buf )
- error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam_get_qname(bam_line),stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1);
+ error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam_get_qname(bam_line),stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1);
int im;
for (im=0; im<ncig; im++)
@@ -424,11 +454,11 @@ void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len)
{
uint8_t qual = quals[iread] + 1;
if ( qual>=stats->nquals )
- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
+ error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
int idx = is_fwd ? icycle : read_len-icycle-1;
if ( idx>stats->max_len )
- error("mpc: %d>%d (%s %d %s)\n",idx,stats->max_len,stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
+ error("mpc: %d>%d (%s %d %s)\n",idx,stats->max_len,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
idx = idx*stats->nquals + qual;
if ( idx>=stats->nquals*stats->nbases )
@@ -446,8 +476,8 @@ void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len)
void read_ref_seq(stats_t *stats, int32_t tid, int32_t pos)
{
int i, fai_ref_len;
- char *fai_ref = faidx_fetch_seq(stats->fai, stats->sam_header->target_name[tid], pos, pos+stats->mrseq_buf-1, &fai_ref_len);
- if ( fai_ref_len<0 ) error("Failed to fetch the sequence \"%s\"\n", stats->sam_header->target_name[tid]);
+ char *fai_ref = faidx_fetch_seq(stats->info->fai, stats->info->sam_header->target_name[tid], pos, pos+stats->mrseq_buf-1, &fai_ref_len);
+ if ( fai_ref_len<0 ) error("Failed to fetch the sequence \"%s\"\n", stats->info->sam_header->target_name[tid]);
uint8_t *ptr = stats->rseq_buf;
for (i=0; i<fai_ref_len; i++)
@@ -504,7 +534,7 @@ float fai_gc_content(stats_t *stats, int pos, int len)
void realloc_rseq_buffer(stats_t *stats)
{
int n = stats->nbases*10;
- if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size;
+ if ( stats->info->gcd_bin_size > n ) n = stats->info->gcd_bin_size;
if ( stats->mrseq_buf<n )
{
stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n);
@@ -540,10 +570,10 @@ void realloc_buffers(stats_t *stats, int seq_len)
memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
}
- stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
- if ( !stats->acgt_cycles )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
- memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
+ stats->acgtno_cycles = realloc(stats->acgtno_cycles, n*sizeof(acgtno_count_t));
+ if ( !stats->acgtno_cycles )
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t));
+ memset(stats->acgtno_cycles + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t));
stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
if ( !stats->read_lengths )
@@ -613,65 +643,53 @@ void update_checksum(bam1_t *bam_line, stats_t *stats)
stats->checksum.quals += crc32(0L, qual, (seq_len+1)/2);
}
-void collect_stats(bam1_t *bam_line, stats_t *stats)
+// These stats should only be calculated for the original reads, ignoring
+// supplementary artificial reads; otherwise we'll accidentally double-count.
+void collect_orig_read_stats(bam1_t *bam_line, stats_t *stats, int* gc_count_out)
{
- if ( stats->rg_hash )
- {
- const uint8_t *rg = bam_aux_get(bam_line, "RG");
- if ( !rg ) return; // certain read groups were requested but this record has none
- if ( !khash_str2int_has_key(stats->rg_hash, (const char*)(rg + 1)) ) return;
- }
- if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
- {
- stats->nreads_filtered++;
- return;
- }
- if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
- {
- stats->nreads_filtered++;
- return;
- }
- if ( !is_in_regions(bam_line,stats) )
- return;
- if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen )
- return;
+ int seq_len = bam_line->core.l_qseq;
+ stats->total_len += seq_len; // This ignores clipping so only count primary
if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++;
- if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
if ( bam_line->core.flag & BAM_FPAIRED ) stats->nreads_paired_tech++;
- update_checksum(bam_line, stats);
-
- int seq_len = bam_line->core.l_qseq;
- if ( !seq_len ) return;
-
- int read_len = unclipped_length(bam_line);
- if ( read_len >= stats->nbases )
- realloc_buffers(stats,read_len);
- if ( stats->max_len<read_len )
- stats->max_len = read_len;
-
- stats->read_lengths[read_len]++;
-
// Count GC and ACGT per cycle. Note that cycle is approximate, clipping is ignored
- uint8_t base, *seq = bam_get_seq(bam_line);
- int gc_count = 0;
- int i;
- int reverse = IS_REVERSE(bam_line);
+ uint8_t *seq = bam_get_seq(bam_line);
+ int i, read_cycle, gc_count = 0, reverse = IS_REVERSE(bam_line);
for (i=0; i<seq_len; i++)
{
- // Conversion from uint8_t coding to ACGT
+ // Read cycle for current index
+ read_cycle = (reverse ? seq_len-i-1 : i);
+
+ // Conversion from uint8_t coding:
// -12-4---8------5
// =ACMGRSVTWYHKDBN
- // 01 2 3
- base = bam_seqi(seq,i);
- if ( base==0 ) break; // not ready for "=" sequences
- base /= 2;
- if ( base==1 || base==2 ) gc_count++;
- else if ( base>2 ) base=3;
- if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
- error("FIXME: acgt_cycles\n");
- stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
+ switch (bam_seqi(seq, i)) {
+ case 1:
+ stats->acgtno_cycles[ read_cycle ].a++;
+ break;
+ case 2:
+ stats->acgtno_cycles[ read_cycle ].c++;
+ gc_count++;
+ break;
+ case 4:
+ stats->acgtno_cycles[ read_cycle ].g++;
+ gc_count++;
+ break;
+ case 8:
+ stats->acgtno_cycles[ read_cycle ].t++;
+ break;
+ case 15:
+ stats->acgtno_cycles[ read_cycle ].n++;
+ break;
+ default:
+ /*
+ * count "=" sequences in "other" along
+ * with MRSVWYHKDB ambiguity codes
+ */
+ stats->acgtno_cycles[ read_cycle ].other++;
+ break;
+ }
}
int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
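The switch above replaces the old base/2 arithmetic with explicit 4-bit nt16 codes: bam_seqi() yields 1, 2, 4, 8 and 15 for A, C, G, T and N, everything else (including '=' bases and the remaining ambiguity codes) now lands in the new "other" counter, and the counter is indexed by sequencing cycle, which runs from the far end of the stored sequence for reverse-strand reads. A self-contained sketch of the same counting loop, assuming only the standard htslib encoding (the cycle_counts_t type and function name are illustrative):

    #include <stdint.h>
    #include <htslib/sam.h>

    typedef struct { uint64_t a, c, g, t, n, other; } cycle_counts_t;

    /* Count one read's bases per cycle; counts must hold at least
     * b->core.l_qseq entries. */
    static void count_bases_per_cycle(const bam1_t *b, cycle_counts_t *counts)
    {
        const uint8_t *seq = bam_get_seq(b);
        int i, len = b->core.l_qseq;
        int reverse = (b->core.flag & BAM_FREVERSE) != 0;
        for (i = 0; i < len; i++)
        {
            int cycle = reverse ? len - i - 1 : i;       /* sequencing cycle, not reference offset */
            switch (bam_seqi(seq, i))
            {
                case 1:  counts[cycle].a++; break;       /* A */
                case 2:  counts[cycle].c++; break;       /* C */
                case 4:  counts[cycle].g++; break;       /* G */
                case 8:  counts[cycle].t++; break;       /* T */
                case 15: counts[cycle].n++; break;       /* N */
                default: counts[cycle].other++; break;   /* '=' and ambiguity codes */
            }
        }
    }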
@@ -696,15 +714,15 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
for (i=gc_idx_min; i<gc_idx_max; i++)
stats->gc_1st[i]++;
}
- if ( stats->trim_qual>0 )
- stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
+ if ( stats->info->trim_qual>0 )
+ stats->nbases_trimmed += bwa_trim_read(stats->info->trim_qual, bam_quals, seq_len, reverse);
// Quality histogram and average quality. Clipping is neglected.
for (i=0; i<seq_len; i++)
{
uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
if ( qual>=stats->nquals )
- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
+ error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
if ( qual>stats->max_qual )
stats->max_qual = qual;
@@ -714,14 +732,15 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
// Look at the flags and increment appropriate counters (mapped, paired, etc)
if ( IS_UNMAPPED(bam_line) )
+ {
stats->nreads_unmapped++;
+ }
else
{
+ stats->nbases_mapped += seq_len; // This ignores clipping so only count primary
+
if ( !bam_line->core.qual )
stats->nreads_mq0++;
-
- count_indels(stats,bam_line);
-
if ( !IS_PAIRED_AND_MAPPED(bam_line) )
stats->nreads_single_mapped++;
else
@@ -732,153 +751,214 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
if ( bam_line->core.tid!=bam_line->core.mtid )
stats->nreads_anomalous++;
+ }
+ }
+ *gc_count_out = gc_count;
+}
+
+void collect_stats(bam1_t *bam_line, stats_t *stats)
+{
+ if ( stats->rg_hash )
+ {
+ const uint8_t *rg = bam_aux_get(bam_line, "RG");
+ if ( !rg ) return; // certain read groups were requested but this record has none
+ if ( !khash_str2int_has_key(stats->rg_hash, (const char*)(rg + 1)) ) return;
+ }
+ if ( stats->info->flag_require && (bam_line->core.flag & stats->info->flag_require)!=stats->info->flag_require )
+ {
+ stats->nreads_filtered++;
+ return;
+ }
+ if ( stats->info->flag_filter && (bam_line->core.flag & stats->info->flag_filter) )
+ {
+ stats->nreads_filtered++;
+ return;
+ }
+ if ( !is_in_regions(bam_line,stats) )
+ return;
+ if ( stats->info->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->info->filter_readlen )
+ return;
+
+ update_checksum(bam_line, stats);
+
+ // Secondary reads don't count for most stats purposes
+ if ( bam_line->core.flag & BAM_FSECONDARY )
+ {
+ stats->nreads_secondary++;
+ return;
+ }
+
+ // If the line has no sequence we cannot continue
+ int seq_len = bam_line->core.l_qseq;
+ if ( !seq_len ) return;
- // The insert size is tricky, because for long inserts the libraries are
- // prepared differently and the pairs point in other direction. BWA does
- // not set the paired flag for them. Similar thing is true also for 454
- // reads. Mates mapped to different chromosomes have isize==0.
- int32_t isize = bam_line->core.isize;
- if ( isize<0 ) isize = -isize;
- if ( stats->nisize > 0 && isize >= stats->nisize )
- isize = stats->nisize-1;
- if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
+ if ( IS_DUP(bam_line) )
+ {
+ stats->total_len_dup += seq_len;
+ stats->nreads_dup++;
+ }
+
+ int read_len = unclipped_length(bam_line);
+ if ( read_len >= stats->nbases )
+ realloc_buffers(stats,read_len);
+ // Update max_len observed
+ if ( stats->max_len<read_len )
+ stats->max_len = read_len;
+ int i;
+ int gc_count = 0;
+
+ // These stats should only be calculated for the original reads, ignoring supplementary artificial reads;
+ // otherwise we'll accidentally double-count
+ if ( IS_ORIGINAL(bam_line) )
+ {
+ stats->read_lengths[read_len]++;
+ collect_orig_read_stats(bam_line, stats, &gc_count);
+ }
+
+ // Look at the flags and increment appropriate counters (mapped, paired, etc)
+ if ( IS_UNMAPPED(bam_line) ) return;
+
+ count_indels(stats, bam_line);
+
+ if ( IS_PAIRED_AND_MAPPED(bam_line) )
+ {
+ // The insert size is tricky, because for long inserts the libraries are
+ // prepared differently and the pairs point in other direction. BWA does
+ // not set the paired flag for them. Similar thing is true also for 454
+ // reads. Mates mapped to different chromosomes have isize==0.
+ int32_t isize = bam_line->core.isize;
+ if ( isize<0 ) isize = -isize;
+ if ( stats->info->nisize > 0 && isize >= stats->info->nisize )
+ isize = stats->info->nisize-1;
+ if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
+ {
+ int pos_fst = bam_line->core.mpos - bam_line->core.pos;
+ int is_fst = IS_READ1(bam_line) ? 1 : -1;
+ int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
+ int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
+
+ if ( is_fwd*is_mfwd>0 )
+ stats->isize->inc_other(stats->isize->data, isize);
+ else if ( is_fst*pos_fst>0 )
{
- int pos_fst = bam_line->core.mpos - bam_line->core.pos;
- int is_fst = IS_READ1(bam_line) ? 1 : -1;
- int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
- int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
-
- if ( is_fwd*is_mfwd>0 )
- stats->isize->inc_other(stats->isize->data, isize);
- else if ( is_fst*pos_fst>0 )
- {
- if ( is_fst*is_fwd>0 )
- stats->isize->inc_inward(stats->isize->data, isize);
- else
- stats->isize->inc_outward(stats->isize->data, isize);
- }
- else if ( is_fst*pos_fst<0 )
- {
- if ( is_fst*is_fwd>0 )
- stats->isize->inc_outward(stats->isize->data, isize);
- else
- stats->isize->inc_inward(stats->isize->data, isize);
- }
+ if ( is_fst*is_fwd>0 )
+ stats->isize->inc_inward(stats->isize->data, isize);
+ else
+ stats->isize->inc_outward(stats->isize->data, isize);
+ }
+ else if ( is_fst*pos_fst<0 )
+ {
+ if ( is_fst*is_fwd>0 )
+ stats->isize->inc_outward(stats->isize->data, isize);
+ else
+ stats->isize->inc_inward(stats->isize->data, isize);
}
}
+ }
- // Number of mismatches
- uint8_t *nm = bam_aux_get(bam_line,"NM");
- if (nm)
- stats->nmismatches += bam_aux2i(nm);
+ // Number of mismatches
+ uint8_t *nm = bam_aux_get(bam_line,"NM");
+ if (nm)
+ stats->nmismatches += bam_aux2i(nm);
- // Number of mapped bases from cigar
- if ( bam_line->core.n_cigar == 0)
- error("FIXME: mapped read with no cigar?\n");
- int readlen=seq_len;
- if ( stats->regions )
+ // Number of mapped bases from cigar
+ if ( bam_line->core.n_cigar == 0)
+ error("FIXME: mapped read with no cigar?\n");
+ int readlen=seq_len;
+ if ( stats->regions )
+ {
+ // Count only on-target bases
+ int iref = bam_line->core.pos + 1;
+ for (i=0; i<bam_line->core.n_cigar; i++)
{
- // Count only on-target bases
- int iref = bam_line->core.pos + 1;
- for (i=0; i<bam_line->core.n_cigar; i++)
+ int cig = bam_cigar_op(bam_get_cigar(bam_line)[i]);
+ int ncig = bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
+ if ( !ncig ) continue; // curiously, this can happen: 0D
+ if ( cig==BAM_CDEL ) readlen += ncig;
+ else if ( cig==BAM_CMATCH )
+ {
+ if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
+ else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
+ if ( ncig<0 ) ncig = 0;
+ stats->nbases_mapped_cigar += ncig;
+ iref += bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
+ }
+ else if ( cig==BAM_CINS )
{
- int cig = bam_cigar_op(bam_get_cigar(bam_line)[i]);
- int ncig = bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
- if ( !ncig ) continue; // curiously, this can happen: 0D
- if ( cig==BAM_CDEL ) readlen += ncig;
- else if ( cig==BAM_CMATCH )
- {
- if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
- else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
- if ( ncig<0 ) ncig = 0;
+ iref += ncig;
+ if ( iref>=stats->reg_from && iref<=stats->reg_to )
stats->nbases_mapped_cigar += ncig;
- iref += bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
- }
- else if ( cig==BAM_CINS )
- {
- iref += ncig;
- if ( iref>=stats->reg_from && iref<=stats->reg_to )
- stats->nbases_mapped_cigar += ncig;
- }
}
}
- else
+ }
+ else
+ {
+ // Count the whole read
+ for (i=0; i<bam_line->core.n_cigar; i++)
{
- // Count the whole read
- for (i=0; i<bam_line->core.n_cigar; i++)
- {
- if ( bam_cigar_op(bam_get_cigar(bam_line)[i])==BAM_CMATCH || bam_cigar_op(bam_get_cigar(bam_line)[i])==BAM_CINS )
- stats->nbases_mapped_cigar += bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
- if ( bam_cigar_op(bam_get_cigar(bam_line)[i])==BAM_CDEL )
- readlen += bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
- }
+ if ( bam_cigar_op(bam_get_cigar(bam_line)[i])==BAM_CMATCH || bam_cigar_op(bam_get_cigar(bam_line)[i])==BAM_CINS )
+ stats->nbases_mapped_cigar += bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
+ if ( bam_cigar_op(bam_get_cigar(bam_line)[i])==BAM_CDEL )
+ readlen += bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
}
- stats->nbases_mapped += seq_len;
+ }
- if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
- stats->is_sorted = 0;
- stats->pos = bam_line->core.pos;
+ if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
+ stats->is_sorted = 0;
+ stats->pos = bam_line->core.pos;
- if ( stats->is_sorted )
- {
- if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
- round_buffer_flush(stats,-1);
+ if ( stats->is_sorted )
+ {
+ if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
+ round_buffer_flush(stats, -1);
- // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
- // are not splitted which results in up to seq_len-1 overlaps. The default bin size is
- // 20kbp, so the effect is negligible.
- if ( stats->fai )
+ // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
+ // are not splitted which results in up to seq_len-1 overlaps. The default bin size is
+ // 20kbp, so the effect is negligible.
+ if ( stats->info->fai )
+ {
+ int inc_ref = 0, inc_gcd = 0;
+ // First pass or new chromosome
+ if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
+ // Read goes beyond the end of the rseq buffer
+ else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
+ // Read overlaps the next gcd bin
+ else if ( stats->gcd_pos+stats->info->gcd_bin_size < bam_line->core.pos+readlen )
{
- int inc_ref = 0, inc_gcd = 0;
- // First pass or new chromosome
- if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
- // Read goes beyond the end of the rseq buffer
- else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
- // Read overlaps the next gcd bin
- else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen )
- {
- inc_gcd = 1;
- if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1;
- }
- if ( inc_gcd )
- {
- stats->igcd++;
- if ( stats->igcd >= stats->ngcd )
- realloc_gcd_buffer(stats, readlen);
- if ( inc_ref )
- read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
- stats->gcd_pos = bam_line->core.pos;
- stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
- }
-
- count_mismatches_per_cycle(stats,bam_line,read_len);
+ inc_gcd = 1;
+ if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->info->gcd_bin_size ) inc_ref = 1;
}
- // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
- else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
+ if ( inc_gcd )
{
- // First pass or a new chromosome
- stats->tid = bam_line->core.tid;
- stats->gcd_pos = bam_line->core.pos;
stats->igcd++;
if ( stats->igcd >= stats->ngcd )
realloc_gcd_buffer(stats, readlen);
+ if ( inc_ref )
+ read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
+ stats->gcd_pos = bam_line->core.pos;
+ stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->info->gcd_bin_size);
}
- stats->gcd[ stats->igcd ].depth++;
- // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
- if ( !stats->fai )
- stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
-
- // Coverage distribution graph
- round_buffer_flush(stats,bam_line->core.pos);
- round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
- }
- }
- stats->total_len += seq_len;
- if ( IS_DUP(bam_line) )
- {
- stats->total_len_dup += seq_len;
- stats->nreads_dup++;
+ count_mismatches_per_cycle(stats,bam_line,read_len);
+ }
+ // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
+ else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->info->gcd_bin_size )
+ {
+ // First pass or a new chromosome
+ stats->tid = bam_line->core.tid;
+ stats->gcd_pos = bam_line->core.pos;
+ stats->igcd++;
+ if ( stats->igcd >= stats->ngcd )
+ realloc_gcd_buffer(stats, readlen);
+ }
+ stats->gcd[ stats->igcd ].depth++;
+ // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
+ if ( !stats->info->fai )
+ stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
+
+ // Coverage distribution graph
+ round_buffer_flush(stats,bam_line->core.pos);
+ round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
}
}
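The insert-size block is now guarded by IS_PAIRED_AND_MAPPED() instead of being buried in the unmapped/else branch, but the orientation arithmetic is unchanged: mates on the same strand count as "other"; otherwise the sign of the mate offset, combined with which mate this is and its strand, decides inward versus outward. A compact restatement of that branch logic, as a sketch (the enum and helper below are illustrative, not part of the patch):

    #include <htslib/sam.h>

    enum pair_orient { ORIENT_INWARD, ORIENT_OUTWARD, ORIENT_OTHER };

    static enum pair_orient classify_pair(const bam1_t *b)
    {
        int pos_fst = b->core.mpos - b->core.pos;                /* mate downstream (+) or upstream (-) */
        int is_fst  = (b->core.flag & BAM_FREAD1)    ?  1 : -1;
        int is_fwd  = (b->core.flag & BAM_FREVERSE)  ? -1 :  1;
        int is_mfwd = (b->core.flag & BAM_FMREVERSE) ? -1 :  1;

        if (is_fwd * is_mfwd > 0) return ORIENT_OTHER;           /* mates on the same strand */
        if (is_fst * pos_fst > 0)
            return (is_fst * is_fwd > 0) ? ORIENT_INWARD : ORIENT_OUTWARD;
        if (is_fst * pos_fst < 0)
            return (is_fst * is_fwd > 0) ? ORIENT_OUTWARD : ORIENT_INWARD;
        return ORIENT_OTHER;   /* pos_fst == 0: the code above simply does not count this case */
    }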
@@ -910,7 +990,7 @@ float gcd_percentile(gc_depth_t *gcd, int N, int p)
return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
}
-void output_stats(stats_t *stats, int sparse)
+void output_stats(FILE *to, stats_t *stats, int sparse)
{
// Calculate average insert size and standard deviation (from the main bulk data only)
int isize, ibulk=0;
@@ -934,7 +1014,7 @@ void output_stats(stats_t *stats, int sparse)
bulk += stats->isize->inward(stats->isize->data, isize) + stats->isize->outward(stats->isize->data, isize) + stats->isize->other(stats->isize->data, isize);
avg_isize += isize * (stats->isize->inward(stats->isize->data, isize) + stats->isize->outward(stats->isize->data, isize) + stats->isize->other(stats->isize->data, isize));
- if ( bulk/nisize > stats->isize_main_bulk )
+ if ( bulk/nisize > stats->info->isize_main_bulk )
{
ibulk = isize+1;
nisize = bulk;
@@ -947,164 +1027,170 @@ void output_stats(stats_t *stats, int sparse)
sd_isize = sqrt(sd_isize);
- printf("# This file was produced by samtools stats (%s+htslib-%s) and can be plotted using plot-bamstats\n", samtools_version(), hts_version());
- printf("# The command line was: %s",stats->argv[0]);
+ fprintf(to, "# This file was produced by samtools stats (%s+htslib-%s) and can be plotted using plot-bamstats\n", samtools_version(), hts_version());
+ if( stats->split_name != NULL ){
+ fprintf(to, "# This file contains statistics only for reads with tag: %s=%s\n", stats->info->split_tag, stats->split_name);
+ }
+ else{
+ fprintf(to, "# This file contains statistics for all reads.\n");
+ }
+ fprintf(to, "# The command line was: %s",stats->info->argv[0]);
int i;
- for (i=1; i<stats->argc; i++)
- printf(" %s",stats->argv[i]);
- printf("\n");
- printf("# CHK, Checksum\t[2]Read Names\t[3]Sequences\t[4]Qualities\n");
- printf("# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n");
- printf("CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals);
- printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
- printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); // not counting excluded seqs (and none of the below)
- printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
- printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
- printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
- printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
- printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
- printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired_and_mapped+stats->nreads_single_mapped));
- printf("SN\treads mapped and paired:\t%ld\t# paired-end technology bit set + both mates mapped\n", (long)stats->nreads_paired_and_mapped);
- printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
- printf("SN\treads properly paired:\t%ld\t# proper-pair bit set\n", (long)stats->nreads_properly_paired);
- printf("SN\treads paired:\t%ld\t# paired-end technology bit set\n", (long)stats->nreads_paired_tech);
- printf("SN\treads duplicated:\t%ld\t# PCR or optical duplicate bit set\n", (long)stats->nreads_dup);
- printf("SN\treads MQ0:\t%ld\t# mapped and MQ=0\n", (long)stats->nreads_mq0);
- printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
- printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
- printf("SN\ttotal length:\t%ld\t# ignores clipping\n", (long)stats->total_len);
- printf("SN\tbases mapped:\t%ld\t# ignores clipping\n", (long)stats->nbases_mapped); // the length of the whole read goes here, including soft-clips etc.
- printf("SN\tbases mapped (cigar):\t%ld\t# more accurate\n", (long)stats->nbases_mapped_cigar); // only matched and inserted bases are counted here
- printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
- printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
- printf("SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches);
- printf("SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0);
+ for (i=1; i<stats->info->argc; i++)
+ fprintf(to, " %s", stats->info->argv[i]);
+ fprintf(to, "\n");
+ fprintf(to, "# CHK, Checksum\t[2]Read Names\t[3]Sequences\t[4]Qualities\n");
+ fprintf(to, "# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n");
+ fprintf(to, "CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals);
+ fprintf(to, "# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
+ fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); // not counting excluded seqs (and none of the below)
+ fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
+ fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
+ fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
+ fprintf(to, "SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
+ fprintf(to, "SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
+ fprintf(to, "SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired_and_mapped+stats->nreads_single_mapped));
+ fprintf(to, "SN\treads mapped and paired:\t%ld\t# paired-end technology bit set + both mates mapped\n", (long)stats->nreads_paired_and_mapped);
+ fprintf(to, "SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
+ fprintf(to, "SN\treads properly paired:\t%ld\t# proper-pair bit set\n", (long)stats->nreads_properly_paired);
+ fprintf(to, "SN\treads paired:\t%ld\t# paired-end technology bit set\n", (long)stats->nreads_paired_tech);
+ fprintf(to, "SN\treads duplicated:\t%ld\t# PCR or optical duplicate bit set\n", (long)stats->nreads_dup);
+ fprintf(to, "SN\treads MQ0:\t%ld\t# mapped and MQ=0\n", (long)stats->nreads_mq0);
+ fprintf(to, "SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
+ fprintf(to, "SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
+ fprintf(to, "SN\ttotal length:\t%ld\t# ignores clipping\n", (long)stats->total_len);
+ fprintf(to, "SN\tbases mapped:\t%ld\t# ignores clipping\n", (long)stats->nbases_mapped); // the length of the whole read goes here, including soft-clips etc.
+ fprintf(to, "SN\tbases mapped (cigar):\t%ld\t# more accurate\n", (long)stats->nbases_mapped_cigar); // only matched and inserted bases are counted here
+ fprintf(to, "SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
+ fprintf(to, "SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
+ fprintf(to, "SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches);
+ fprintf(to, "SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0);
float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
- printf("SN\taverage length:\t%.0f\n", avg_read_length);
- printf("SN\tmaximum length:\t%d\n", stats->max_len);
- printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
- printf("SN\tinsert size average:\t%.1f\n", avg_isize);
- printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
- printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
- printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
- printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
- printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
+ fprintf(to, "SN\taverage length:\t%.0f\n", avg_read_length);
+ fprintf(to, "SN\tmaximum length:\t%d\n", stats->max_len);
+ fprintf(to, "SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
+ fprintf(to, "SN\tinsert size average:\t%.1f\n", avg_isize);
+ fprintf(to, "SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
+ fprintf(to, "SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
+ fprintf(to, "SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
+ fprintf(to, "SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
+ fprintf(to, "SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
int ibase,iqual;
if ( stats->max_len<stats->nbases ) stats->max_len++;
if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
- printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
- printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
+ fprintf(to, "# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
+ fprintf(to, "# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
for (ibase=0; ibase<stats->max_len; ibase++)
{
- printf("FFQ\t%d",ibase+1);
+ fprintf(to, "FFQ\t%d",ibase+1);
for (iqual=0; iqual<=stats->max_qual; iqual++)
{
- printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
+ fprintf(to, "\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
}
- printf("\n");
+ fprintf(to, "\n");
}
- printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
- printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
+ fprintf(to, "# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
+ fprintf(to, "# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
for (ibase=0; ibase<stats->max_len; ibase++)
{
- printf("LFQ\t%d",ibase+1);
+ fprintf(to, "LFQ\t%d",ibase+1);
for (iqual=0; iqual<=stats->max_qual; iqual++)
{
- printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
+ fprintf(to, "\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
}
- printf("\n");
+ fprintf(to, "\n");
}
if ( stats->mpc_buf )
{
- printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
- printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
- printf("# is the number of N's and the rest is the number of mismatches\n");
+ fprintf(to, "# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
+ fprintf(to, "# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
+ fprintf(to, "# is the number of N's and the rest is the number of mismatches\n");
for (ibase=0; ibase<stats->max_len; ibase++)
{
- printf("MPC\t%d",ibase+1);
+ fprintf(to, "MPC\t%d",ibase+1);
for (iqual=0; iqual<=stats->max_qual; iqual++)
{
- printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
+ fprintf(to, "\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
}
- printf("\n");
+ fprintf(to, "\n");
}
}
- printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
+ fprintf(to, "# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
int ibase_prev = 0;
for (ibase=0; ibase<stats->ngc; ibase++)
{
if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
- printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
+ fprintf(to, "GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
ibase_prev = ibase;
}
- printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
+ fprintf(to, "# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
ibase_prev = 0;
for (ibase=0; ibase<stats->ngc; ibase++)
{
if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
- printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
+ fprintf(to, "GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
ibase_prev = ibase;
}
- printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
+ fprintf(to, "# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N and O counts as a percentage of all A/C/G/T bases [%%]\n");
for (ibase=0; ibase<stats->max_len; ibase++)
{
- uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
- uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
- if ( ! sum ) continue;
- printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
+ acgtno_count_t *acgtno_count = &(stats->acgtno_cycles[ibase]);
+ uint64_t acgt_sum = acgtno_count->a + acgtno_count->c + acgtno_count->g + acgtno_count->t;
+ if ( ! acgt_sum ) continue;
+ fprintf(to, "GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1, 100.*acgtno_count->a/acgt_sum, 100.*acgtno_count->c/acgt_sum, 100.*acgtno_count->g/acgt_sum, 100.*acgtno_count->t/acgt_sum, 100.*acgtno_count->n/acgt_sum, 100.*acgtno_count->other/acgt_sum);
}
- printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
+ fprintf(to, "# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
for (isize=0; isize<ibulk; isize++) {
long in = (long)(stats->isize->inward(stats->isize->data, isize));
long out = (long)(stats->isize->outward(stats->isize->data, isize));
long other = (long)(stats->isize->other(stats->isize->data, isize));
if (!sparse || in + out + other > 0) {
- printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, in+out+other,
+ fprintf(to, "IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, in+out+other,
in , out, other);
}
}
- printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
+ fprintf(to, "# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
int ilen;
for (ilen=0; ilen<stats->max_len; ilen++)
{
if ( stats->read_lengths[ilen]>0 )
- printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
+ fprintf(to, "RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
}
- printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
+ fprintf(to, "# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
for (ilen=0; ilen<stats->nindels; ilen++)
{
if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
- printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
+ fprintf(to, "ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
}
- printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
+ fprintf(to, "# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
for (ilen=0; ilen<=stats->nbases; ilen++)
{
// For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
// the index of the cycle of the first inserted base (also 1-based)
if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
- printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
+ fprintf(to, "IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
}
- printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
+ fprintf(to, "# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
if ( stats->cov[0] )
- printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
+ fprintf(to, "COV\t[<%d]\t%d\t%ld\n",stats->info->cov_min,stats->info->cov_min-1, (long)stats->cov[0]);
int icov;
for (icov=1; icov<stats->ncov-1; icov++)
if ( stats->cov[icov] )
- printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
+ fprintf(to, "COV\t[%d-%d]\t%d\t%ld\n",stats->info->cov_min + (icov-1)*stats->info->cov_step, stats->info->cov_min + icov*stats->info->cov_step-1,stats->info->cov_min + icov*stats->info->cov_step-1, (long)stats->cov[icov]);
if ( stats->cov[stats->ncov-1] )
- printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
+ fprintf(to, "COV\t[%d<]\t%d\t%ld\n",stats->info->cov_min + (stats->ncov-2)*stats->info->cov_step-1,stats->info->cov_min + (stats->ncov-2)*stats->info->cov_step-1, (long)stats->cov[stats->ncov-1]);
// Calculate average GC content, then sort by GC and depth
- printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
+ fprintf(to, "# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
uint32_t igcd;
for (igcd=0; igcd<stats->igcd; igcd++)
{
- if ( stats->fai )
+ if ( stats->info->fai )
stats->gcd[igcd].gc = rint(100. * stats->gcd[igcd].gc);
else
if ( stats->gcd[igcd].depth )
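With the six per-cycle counters, each GCC row gains two columns. All six percentages are computed against the A+C+G+T total for that cycle (acgt_sum above), so as a rough worked example, a cycle (say, cycle 7) with a=40, c=10, g=10, t=38, n=2 and other=0 gives acgt_sum=98 and a row of approximately

    GCC  7  40.82  10.20  10.20  38.78  2.04  0.00

where the first four columns still sum to 100% and the N/other columns are reported on top of that, as the revised header comment states.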
@@ -1122,82 +1208,39 @@ void output_stats(stats_t *stats, int sparse)
nbins++;
itmp++;
}
- printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
- gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
- gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
- gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
- gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
- gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
+ fprintf(to, "GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
+ gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->info->gcd_bin_size,
+ gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->info->gcd_bin_size,
+ gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->info->gcd_bin_size,
+ gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->info->gcd_bin_size,
+ gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->info->gcd_bin_size
);
igcd += nbins;
}
}
-size_t mygetline(char **line, size_t *n, FILE *fp)
+void init_regions(stats_t *stats, const char *file)
{
- if (line == NULL || n == NULL || fp == NULL)
- {
- errno = EINVAL;
- return -1;
- }
- if (*n==0 || !*line)
- {
- *line = NULL;
- *n = 0;
- }
-
- size_t nread=0;
- int c;
- while ((c=getc(fp))!= EOF && c!='\n')
- {
- if ( ++nread>=*n )
- {
- *n += 255;
- *line = realloc(*line, sizeof(char)*(*n));
- }
- (*line)[nread-1] = c;
- }
- if ( nread>=*n )
- {
- *n += 255;
- *line = realloc(*line, sizeof(char)*(*n));
- }
- (*line)[nread] = 0;
- return nread>0 ? nread : -1;
-
-}
-
-void init_regions(stats_t *stats, char *file)
-{
-#if 0
- khiter_t iter;
- khash_t(kh_bam_tid) *header_hash;
-
- header_hash = (khash_t(kh_bam_tid)*)stats->sam_header->hash;
-
FILE *fp = fopen(file,"r");
if ( !fp ) error("%s: %s\n",file,strerror(errno));
- char *line = NULL;
- size_t len = 0;
- ssize_t nread;
+ kstring_t line = { 0, 0, NULL };
int warned = 0;
int prev_tid=-1, prev_pos=-1;
- while ((nread = mygetline(&line, &len, fp)) != -1)
+ while (line.l = 0, kgetline(&line, (kgets_func *)fgets, fp) >= 0)
{
- if ( line[0] == '#' ) continue;
+ if ( line.s[0] == '#' ) continue;
int i = 0;
- while ( i<nread && !isspace(line[i]) ) i++;
- if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
- line[i] = 0;
+ while ( i<line.l && !isspace(line.s[i]) ) i++;
+ if ( i>=line.l ) error("Could not parse the file: %s [%s]\n", file, line.s);
+ line.s[i] = '\0';
- iter = kh_get(kh_bam_tid, header_hash, line);
- int tid = kh_val(header_hash, iter);
- if ( iter == kh_end(header_hash) )
+ int tid = bam_name2id(stats->info->sam_header, line.s);
+ if ( tid < 0 )
{
if ( !warned )
- fprintf(pysamerr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
+ fprintf(pysamerr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line.s);
warned = 1;
continue;
}
@@ -1220,23 +1263,19 @@ void init_regions(stats_t *stats, char *file)
stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
}
- if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
+ if ( (sscanf(&line.s[i+1],"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]);
if ( prev_tid==-1 || prev_tid!=tid )
{
prev_tid = tid;
prev_pos = stats->regions[tid].pos[npos].from;
}
if ( prev_pos>stats->regions[tid].pos[npos].from )
- error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
+ error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line.s,stats->regions[tid].pos[npos].from,prev_pos);
stats->regions[tid].npos++;
}
- if (line) free(line);
+ free(line.s);
if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
fclose(fp);
-#else
- fprintf(pysamerr, "Samtools-htslib: init_regions() header parsing not yet implemented\n");
- abort();
-#endif
}
void destroy_regions(stats_t *stats)
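init_regions() now parses the target file with htslib's kstring/kgetline helpers and resolves chromosome names through bam_name2id() instead of the removed mygetline() and the previously disabled header-hash lookup. A stripped-down sketch of the same line-reading pattern, assuming only <htslib/kstring.h> (the function name and the printing are illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <ctype.h>
    #include <htslib/kstring.h>

    static int read_regions(const char *fname)
    {
        FILE *fp = fopen(fname, "r");
        if (!fp) { perror(fname); return -1; }

        kstring_t line = { 0, 0, NULL };
        /* kgetline() appends one line (minus the newline) to the kstring,
         * growing it as needed; resetting line.l reuses the same buffer. */
        while (line.l = 0, kgetline(&line, (kgets_func *)fgets, fp) >= 0)
        {
            if (line.l == 0 || line.s[0] == '#') continue;       /* blank or comment line */
            size_t i = 0;
            while (i < line.l && !isspace((unsigned char)line.s[i])) i++;
            if (i >= line.l) continue;                           /* no coordinates on this line */
            line.s[i] = '\0';
            int from, to;
            if (sscanf(line.s + i + 1, "%d %d", &from, &to) != 2) continue;
            printf("chrom=%s from=%d to=%d\n", line.s, from, to);
        }
        free(line.s);
        fclose(fp);
        return 0;
    }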
@@ -1280,7 +1319,7 @@ int is_in_regions(bam1_t *bam_line, stats_t *stats)
return 1;
}
-void init_group_id(stats_t *stats, char *id)
+void init_group_id(stats_t *stats, const char *id)
{
#if 0
if ( !stats->sam_header->dict )
@@ -1329,11 +1368,14 @@ static void error(const char *format, ...)
printf(" -I, --id <string> Include only listed read group or sample name\n");
printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
+ printf(" -P, --split-prefix <str> Path or string prefix for filepaths output by -S (default is input filename)\n");
printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth and mismatches-per-cycle calculation).\n");
+ printf(" -s, --sam Ignored (input format is auto-detected).\n");
+ printf(" -S, --split <tag> Also write statistics to separate files split by tagged field.\n");
printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
- printf(" -s, --sam Input is SAM (usually auto-detected now).\n");
printf(" -x, --sparse Suppress outputting IS rows where there are no insertions.\n");
+ sam_global_opt_help(stdout, "-.--.");
printf("\n");
}
else
@@ -1343,13 +1385,17 @@ static void error(const char *format, ...)
vfprintf(pysamerr, format, ap);
va_end(ap);
}
- exit(-1);
+ exit(1);
+}
+
+void cleanup_stats_info(stats_info_t* info){
+ if (info->fai) fai_destroy(info->fai);
+ sam_close(info->sam);
+ free(info);
}
void cleanup_stats(stats_t* stats)
{
- sam_close(stats->sam);
- if (stats->fai) fai_destroy(stats->fai);
free(stats->cov_rbuf.buffer); free(stats->cov);
free(stats->quals_1st); free(stats->quals_2nd);
free(stats->gc_1st); free(stats->gc_2nd);
@@ -1358,7 +1404,7 @@ void cleanup_stats(stats_t* stats)
free(stats->gcd);
free(stats->rseq_buf);
free(stats->mpc_buf);
- free(stats->acgt_cycles);
+ free(stats->acgtno_cycles);
free(stats->read_lengths);
free(stats->insertions);
free(stats->deletions);
@@ -1368,43 +1414,189 @@ void cleanup_stats(stats_t* stats)
free(stats->del_cycles_2nd);
destroy_regions(stats);
if ( stats->rg_hash ) khash_str2int_destroy(stats->rg_hash);
+ free(stats->split_name);
free(stats);
}
-int main_stats(int argc, char *argv[])
+void output_split_stats(khash_t(c2stats) *split_hash, char* bam_fname, int sparse)
{
- char *targets = NULL;
- char *bam_fname = NULL;
- char *group_id = NULL;
- samFile* sam = NULL;
- char in_mode[5];
- int sparse = 0;
+ int i = 0;
+ kstring_t output_filename = { 0, 0, NULL };
+ stats_t *curr_stats = NULL;
+ for(i = kh_begin(split_hash); i != kh_end(split_hash); ++i){
+ if(!kh_exist(split_hash, i)) continue;
+ curr_stats = kh_value(split_hash, i);
+ round_buffer_flush(curr_stats, -1);
+
+ output_filename.l = 0;
+ if (curr_stats->info->split_prefix)
+ kputs(curr_stats->info->split_prefix, &output_filename);
+ else
+ kputs(bam_fname, &output_filename);
+ kputc('_', &output_filename);
+ kputs(curr_stats->split_name, &output_filename);
+ kputs(".bamstat", &output_filename);
+
+ FILE *to = fopen(output_filename.s, "w");
+ if(to == NULL){
+ error("Could not open '%s' for writing.\n", output_filename.s);
+ }
+ output_stats(to, curr_stats, sparse);
+ fclose(to);
+ }
+
+ free(output_filename.s);
+}
+
+void destroy_split_stats(khash_t(c2stats) *split_hash)
+{
+ int i = 0;
+ stats_t *curr_stats = NULL;
+ for(i = kh_begin(split_hash); i != kh_end(split_hash); ++i){
+ if(!kh_exist(split_hash, i)) continue;
+ curr_stats = kh_value(split_hash, i);
+ cleanup_stats(curr_stats);
+ }
+ kh_destroy(c2stats, split_hash);
+}
+stats_info_t* stats_info_init(int argc, char *argv[])
+{
+ stats_info_t* info = calloc(1, sizeof(stats_info_t));
+ info->nisize = 8000;
+ info->isize_main_bulk = 0.99; // There are always outliers at the far end
+ info->gcd_bin_size = 20e3;
+ info->cov_min = 1;
+ info->cov_max = 1000;
+ info->cov_step = 1;
+ info->filter_readlen = -1;
+ info->argc = argc;
+ info->argv = argv;
+
+ return info;
+}
+
+int init_stat_info_fname(stats_info_t* info, const char* bam_fname, const htsFormat* in_fmt)
+{
+ // .. bam
+ samFile* sam;
+ if ((sam = sam_open_format(bam_fname, "r", in_fmt)) == 0) {
+ error("Failed to open: %s\n", bam_fname);
+ return 1;
+ }
+ info->sam = sam;
+ info->sam_header = sam_hdr_read(sam);
+ if (info->sam_header == NULL) {
+ error("Failed to read header for '%s'\n", bam_fname);
+ return 1;
+ }
+ return 0;
+}
+
+stats_t* stats_init()
+{
stats_t *stats = calloc(1,sizeof(stats_t));
stats->ngc = 200;
stats->nquals = 256;
stats->nbases = 300;
- stats->nisize = 8000;
stats->max_len = 30;
stats->max_qual = 40;
- stats->isize_main_bulk = 0.99; // There are always outliers at the far end
- stats->gcd_bin_size = 20e3;
stats->rseq_pos = -1;
stats->tid = stats->gcd_pos = -1;
stats->igcd = 0;
stats->is_sorted = 1;
- stats->cov_min = 1;
- stats->cov_max = 1000;
- stats->cov_step = 1;
- stats->argc = argc;
- stats->argv = argv;
- stats->filter_readlen = -1;
stats->nindels = stats->nbases;
+ stats->split_name = NULL;
+
+ return stats;
+}
+
+static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* group_id, const char* targets)
+{
+ // Give stats_t a pointer to the info struct
+ // This saves us having to pass the stats_info_t to every function
+ stats->info = info;
+
+ // Init structures
+ // .. coverage bins and round buffer
+ if ( info->cov_step > info->cov_max - info->cov_min + 1 )
+ {
+ info->cov_step = info->cov_max - info->cov_min;
+ if ( info->cov_step <= 0 )
+ info->cov_step = 1;
+ }
+ stats->ncov = 3 + (info->cov_max-info->cov_min) / info->cov_step;
+ info->cov_max = info->cov_min + ((info->cov_max-info->cov_min)/info->cov_step +1)*info->cov_step - 1;
+ stats->cov = calloc(sizeof(uint64_t),stats->ncov);
+ stats->cov_rbuf.size = stats->nbases*5;
+ stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
+
+ if ( group_id ) init_group_id(stats, group_id);
+ // .. arrays
+ stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
+ stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
+ stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
+ stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
+ stats->isize = init_isize_t(info->nisize);
+ stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
+ stats->mpc_buf = info->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
+ stats->acgtno_cycles = calloc(stats->nbases,sizeof(acgtno_count_t));
+ stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
+ stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
+ stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
+ stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
+ stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
+ stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
+ stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
+ realloc_rseq_buffer(stats);
+ if ( targets )
+ init_regions(stats, targets);
+}
+
+static stats_t* get_curr_split_stats(bam1_t* bam_line, khash_t(c2stats)* split_hash, stats_info_t* info, char* targets)
+{
+ stats_t *curr_stats = NULL;
+ const uint8_t *tag_val = bam_aux_get(bam_line, info->split_tag);
+ if(tag_val == 0){
+ error("Tag '%s' not found in bam_line.\n", info->split_tag);
+ }
+ char* split_name = strdup(bam_aux2Z(tag_val));
+
+ // New stats object, under split
+ khiter_t k = kh_get(c2stats, split_hash, split_name);
+ if(k == kh_end(split_hash)){
+ curr_stats = stats_init(); // mallocs new instance
+ init_stat_structs(curr_stats, info, NULL, targets);
+ curr_stats->split_name = split_name;
+
+ // Record index in hash
+ int ret = 0;
+ khiter_t iter = kh_put(c2stats, split_hash, split_name, &ret);
+ if( ret < 0 ){
+ error("Failed to insert key '%s' into split_hash", split_name);
+ }
+ kh_val(split_hash, iter) = curr_stats; // store pointer to stats
+ }
+ else{
+ curr_stats = kh_value(split_hash, k);
+ free(split_name); // don't need to hold on to this if it wasn't new
+ }
+ return curr_stats;
+}
+
+int main_stats(int argc, char *argv[])
+{
+ char *targets = NULL;
+ char *bam_fname = NULL;
+ char *group_id = NULL;
+ int sparse = 0;
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
- strcpy(in_mode, "rb");
+ stats_info_t *info = stats_info_init(argc, argv);
static const struct option loptions[] =
{
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
{"help", no_argument, NULL, 'h'},
{"remove-dups", no_argument, NULL, 'd'},
{"sam", no_argument, NULL, 's'},
@@ -1420,35 +1612,43 @@ int main_stats(int argc, char *argv[])
{"id", required_argument, NULL, 'I'},
{"GC-depth", required_argument, NULL, 1},
{"sparse", no_argument, NULL, 'x'},
+ {"split", required_argument, NULL, 'S'},
+ {"split-prefix", required_argument, NULL, 'P'},
{NULL, 0, NULL, 0}
};
int opt;
- while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 )
+
+ while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:",loptions,NULL))>0 )
{
switch (opt)
{
- case 'f': stats->flag_require = bam_str2flag(optarg); break;
- case 'F': stats->flag_filter = bam_str2flag(optarg); break;
- case 'd': stats->flag_filter |= BAM_FDUP; break;
- case 's': strcpy(in_mode, "r"); break;
- case 'r': stats->fai = fai_load(optarg);
- if (stats->fai==0)
+ case 'f': info->flag_require = bam_str2flag(optarg); break;
+ case 'F': info->flag_filter = bam_str2flag(optarg); break;
+ case 'd': info->flag_filter |= BAM_FDUP; break;
+ case 's': break;
+ case 'r': info->fai = fai_load(optarg);
+ if (info->fai==NULL)
error("Could not load faidx: %s\n", optarg);
break;
- case 1 : stats->gcd_bin_size = atof(optarg); break;
- case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
+ case 1 : info->gcd_bin_size = atof(optarg); break;
+ case 'c': if ( sscanf(optarg,"%d,%d,%d",&info->cov_min,&info->cov_max,&info->cov_step)!= 3 )
error("Unable to parse -c %s\n", optarg);
break;
- case 'l': stats->filter_readlen = atoi(optarg); break;
- case 'i': stats->nisize = atoi(optarg); break;
- case 'm': stats->isize_main_bulk = atof(optarg); break;
- case 'q': stats->trim_qual = atoi(optarg); break;
+ case 'l': info->filter_readlen = atoi(optarg); break;
+ case 'i': info->nisize = atoi(optarg); break;
+ case 'm': info->isize_main_bulk = atof(optarg); break;
+ case 'q': info->trim_qual = atoi(optarg); break;
case 't': targets = optarg; break;
case 'I': group_id = optarg; break;
case 'x': sparse = 1; break;
+ case 'S': info->split_tag = optarg; break;
+ case 'P': info->split_prefix = optarg; break;
case '?':
case 'h': error(NULL);
- default: error("Unknown argument: %s\n", optarg);
+ default:
+ if (parse_sam_global_opt(opt, optarg, loptions, &ga) != 0)
+ error("Unknown argument: %s\n", optarg);
+ break;
}
}
if ( optind<argc )
@@ -1461,62 +1661,36 @@ int main_stats(int argc, char *argv[])
bam_fname = "-";
}
- // Init structures
- // .. coverage bins and round buffer
- if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
- {
- stats->cov_step = stats->cov_max - stats->cov_min;
- if ( stats->cov_step <= 0 )
- stats->cov_step = 1;
- }
- stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
- stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
- stats->cov = calloc(sizeof(uint64_t),stats->ncov);
- stats->cov_rbuf.size = stats->nbases*5;
- stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
- // .. bam
- if ((sam = sam_open(bam_fname, in_mode)) == 0)
- error("Failed to open: %s\n", bam_fname);
- stats->sam = sam;
- stats->sam_header = sam_hdr_read(sam);
- if ( group_id ) init_group_id(stats, group_id);
- bam1_t *bam_line = bam_init1();
- // .. arrays
- stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
- stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
- stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
- stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
- stats->isize = init_isize_t(stats->nisize);
- stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
- stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
- stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
- stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
- stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
- stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
- stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
- stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
- stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
- stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
- realloc_rseq_buffer(stats);
- if ( targets )
- init_regions(stats, targets);
+ if (init_stat_info_fname(info, bam_fname, &ga.in)) return 1;
+
+ stats_t *all_stats = stats_init();
+ stats_t *curr_stats = NULL;
+ init_stat_structs(all_stats, info, group_id, targets);
+ // Init
+ // .. hash
+ khash_t(c2stats)* split_hash = kh_init(c2stats);
// Collect statistics
+ bam1_t *bam_line = bam_init1();
if ( optind<argc )
{
// Collect stats in selected regions only
- hts_idx_t *bam_idx = bam_index_load(bam_fname);
+ hts_idx_t *bam_idx = sam_index_load(info->sam,bam_fname);
if (bam_idx == 0)
error("Random alignment retrieval only works for indexed BAM files.\n");
int i;
for (i=optind; i<argc; i++)
{
- reset_regions(stats);
- hts_itr_t* iter = bam_itr_querys(bam_idx, stats->sam_header, argv[i]);
- while (sam_itr_next(sam, iter, bam_line) >= 0) {
- collect_stats(bam_line,stats);
+ hts_itr_t* iter = bam_itr_querys(bam_idx, info->sam_header, argv[i]);
+ while (sam_itr_next(info->sam, iter, bam_line) >= 0) {
+ if (info->split_tag) {
+ curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets);
+ collect_stats(bam_line, curr_stats);
+ }
+ collect_stats(bam_line, all_stats);
}
+ reset_regions(all_stats);
bam_itr_destroy(iter);
}
hts_idx_destroy(bam_idx);
@@ -1524,16 +1698,33 @@ int main_stats(int argc, char *argv[])
else
{
// Stream through the entire BAM ignoring off-target regions if -t is given
- while (sam_read1(sam, stats->sam_header, bam_line) >= 0)
- collect_stats(bam_line,stats);
+ int ret;
+ while ((ret = sam_read1(info->sam, info->sam_header, bam_line)) >= 0) {
+ if (info->split_tag) {
+ curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets);
+ collect_stats(bam_line, curr_stats);
+ }
+ collect_stats(bam_line, all_stats);
+ }
+
+ if (ret < -1) {
+ fprintf(pysamerr, "Failure while decoding file\n");
+ return 1;
+ }
}
- round_buffer_flush(stats,-1);
- output_stats(stats, sparse);
+ round_buffer_flush(all_stats, -1);
+ output_stats(stdout, all_stats, sparse);
+ if (info->split_tag)
+ output_split_stats(split_hash, bam_fname, sparse);
+
bam_destroy1(bam_line);
- bam_hdr_destroy(stats->sam_header);
+ bam_hdr_destroy(info->sam_header);
+ sam_global_args_free(&ga);
- cleanup_stats(stats);
+ cleanup_stats(all_stats);
+ cleanup_stats_info(info);
+ destroy_split_stats(split_hash);
return 0;
}
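
The split-stats support added above keeps one stats_t per value of the tag named by -S/--split, looked up through a klib hash keyed on the tag string (see get_curr_split_stats). Below is a minimal, self-contained sketch of that kh_get/kh_put ownership pattern; stats_t, stats_init() and the htslib/khash.h include path are placeholder stand-ins for illustration, not the real samtools definitions.

    /* Sketch only: the lookup-or-create pattern used by get_curr_split_stats
     * above.  stats_t and stats_init() are placeholder stand-ins; khash.h is
     * the klib header shipped with htslib. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "htslib/khash.h"

    typedef struct { long n_reads; } stats_t;            /* placeholder stats  */
    static stats_t *stats_init(void) { return calloc(1, sizeof(stats_t)); }

    KHASH_MAP_INIT_STR(c2stats, stats_t*)                /* char* -> stats_t*  */

    static stats_t *lookup_or_create(khash_t(c2stats) *h, char *key)
    {
        khiter_t k = kh_get(c2stats, h, key);
        if (k == kh_end(h)) {                            /* first time seen    */
            int ret = 0;
            k = kh_put(c2stats, h, key, &ret);           /* hash now owns key  */
            if (ret < 0) return NULL;
            kh_value(h, k) = stats_init();
        } else {
            free(key);                                   /* duplicate string   */
        }
        return kh_value(h, k);
    }

    int main(void)
    {
        khash_t(c2stats) *h = kh_init(c2stats);
        const char *tags[] = { "grp1", "grp2", "grp1" };
        int i;
        for (i = 0; i < 3; i++) {
            stats_t *s = lookup_or_create(h, strdup(tags[i]));
            if (s) s->n_reads++;
        }
        khiter_t k;
        for (k = kh_begin(h); k != kh_end(h); ++k)
            if (kh_exist(h, k))
                printf("%s\t%ld\n", kh_key(h, k), kh_value(h, k)->n_reads);
        kh_destroy(c2stats, h);   /* keys/values deliberately leaked in this toy */
        return 0;
    }
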
diff --git a/samtools/test/merge/test_bam_translate.c b/samtools/test/merge/test_bam_translate.c
index 8ef0f6a..854779b 100644
--- a/samtools/test/merge/test_bam_translate.c
+++ b/samtools/test/merge/test_bam_translate.c
@@ -354,8 +354,7 @@ int main(int argc, char**argv)
bam1_t* b;
// Setup stderr redirect
- size_t len = 0;
- char* res = NULL;
+ kstring_t res = { 0, 0, NULL };
FILE* orig_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save stderr
char* tempfname = (optind < argc)? argv[optind] : "test_bam_translate.tmp";
FILE* check = NULL;
@@ -383,8 +382,9 @@ int main(int argc, char**argv)
// check result
check = fopen(tempfname, "r");
- if ( (getline(&res, &len, check) == -1 ) &&
- (feof(check) || (res && !strcmp("",res))) ) {
+ res.l = 0;
+ if (kgetline(&res, (kgets_func *)fgets, check) < 0 &&
+ (feof(check) || res.l == 0) ) {
++success;
} else {
++failure;
@@ -420,8 +420,9 @@ int main(int argc, char**argv)
// check result
check = fopen(tempfname, "r");
- if ( (getline(&res, &len, check) == -1 ) &&
- (feof(check) || (res && !strcmp("",res))) ) {
+ res.l = 0;
+ if (kgetline(&res, (kgets_func *)fgets, check) < 0 &&
+ (feof(check) || res.l == 0) ) {
++success;
} else {
++failure;
@@ -457,8 +458,9 @@ int main(int argc, char**argv)
// check result
check = fopen(tempfname, "r");
- if ( (getline(&res, &len, check) == -1 ) &&
- (feof(check) || (res && !strcmp("",res)))) {
+ res.l = 0;
+ if (kgetline(&res, (kgets_func *)fgets, check) < 0 &&
+ (feof(check) || res.l == 0)) {
++success;
} else {
++failure;
@@ -493,8 +495,9 @@ int main(int argc, char**argv)
}
// check result
check = fopen(tempfname, "r");
- if ( (getline(&res, &len, check) != -1 ) &&
- res && !strcmp("[bam_translate] RG tag \"rg4hello\" on read \"123456789\" encountered with no corresponding entry in header, tag lost\n",res)) {
+ res.l = 0;
+ if (kgetline(&res, (kgets_func *)fgets, check) >= 0 &&
+ strcmp("[bam_translate] RG tag \"rg4hello\" on read \"123456789\" encountered with no corresponding entry in header, tag lost. Unknown tags are only reported once per input file for each tag ID.",res.s) == 0) {
++success;
} else {
++failure;
@@ -529,8 +532,9 @@ int main(int argc, char**argv)
// check result
check = fopen(tempfname, "r");
- if ( (getline(&res, &len, check) != -1 ) &&
- res && !strcmp("[bam_translate] PG tag \"pg5hello\" on read \"123456789\" encountered with no corresponding entry in header, tag lost\n",res)) {
+ res.l = 0;
+ if (kgetline(&res, (kgets_func *)fgets, check) >= 0 &&
+ strcmp("[bam_translate] PG tag \"pg5hello\" on read \"123456789\" encountered with no corresponding entry in header, tag lost. Unknown tags are only reported once per input file for each tag ID.",res.s) == 0) {
++success;
} else {
++failure;
@@ -566,8 +570,9 @@ int main(int argc, char**argv)
// check result
check = fopen(tempfname, "r");
- if ( (getline(&res, &len, check) == -1 ) &&
- (feof(check) || (res && !strcmp("",res))) ) {
+ res.l = 0;
+ if (kgetline(&res, (kgets_func *)fgets, check) < 0 &&
+ (feof(check) || res.l == 0) ) {
++success;
} else {
++failure;
@@ -581,7 +586,7 @@ int main(int argc, char**argv)
if (verbose) printf("END test 6\n");
// Cleanup
- free(res);
+ free(res.s);
remove(tempfname);
if (failure > 0)
fprintf(orig_stderr, "%d failures %d successes\n", failure, success);
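
The test-harness hunks above swap POSIX getline() for htslib's kgetline() on a kstring_t, so the checks no longer depend on getline being available. A minimal sketch of that reading loop, assuming only the htslib kstring API (the input file name is made up):

    /* Sketch: line-by-line reading with kgetline(), mirroring the kstring_t
     * usage introduced above.  "example.txt" is a made-up input file. */
    #include <stdio.h>
    #include <stdlib.h>
    #include "htslib/kstring.h"

    int main(void)
    {
        kstring_t line = { 0, 0, NULL };             /* l, m, s              */
        FILE *fp = fopen("example.txt", "r");
        if (!fp) { perror("example.txt"); return 1; }

        for (;;) {
            line.l = 0;                              /* reuse the buffer     */
            /* kgetline() appends one line, minus the newline, and returns a
             * negative value at EOF. */
            if (kgetline(&line, (kgets_func *)fgets, fp) < 0) break;
            printf("got %zu bytes: %s\n", line.l, line.s ? line.s : "");
        }

        fclose(fp);
        free(line.s);                                /* kstring owns s       */
        return 0;
    }
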
diff --git a/samtools/test/merge/test_bam_translate.c.pysam.c b/samtools/test/merge/test_bam_translate.c.pysam.c
index da2380e..d11fbf8 100644
--- a/samtools/test/merge/test_bam_translate.c.pysam.c
+++ b/samtools/test/merge/test_bam_translate.c.pysam.c
@@ -356,8 +356,7 @@ int main(int argc, char**argv)
bam1_t* b;
// Setup pysamerr redirect
- size_t len = 0;
- char* res = NULL;
+ kstring_t res = { 0, 0, NULL };
FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr
char* tempfname = (optind < argc)? argv[optind] : "test_bam_translate.tmp";
FILE* check = NULL;
@@ -385,8 +384,9 @@ int main(int argc, char**argv)
// check result
check = fopen(tempfname, "r");
- if ( (getline(&res, &len, check) == -1 ) &&
- (feof(check) || (res && !strcmp("",res))) ) {
+ res.l = 0;
+ if (kgetline(&res, (kgets_func *)fgets, check) < 0 &&
+ (feof(check) || res.l == 0) ) {
++success;
} else {
++failure;
@@ -422,8 +422,9 @@ int main(int argc, char**argv)
// check result
check = fopen(tempfname, "r");
- if ( (getline(&res, &len, check) == -1 ) &&
- (feof(check) || (res && !strcmp("",res))) ) {
+ res.l = 0;
+ if (kgetline(&res, (kgets_func *)fgets, check) < 0 &&
+ (feof(check) || res.l == 0) ) {
++success;
} else {
++failure;
@@ -459,8 +460,9 @@ int main(int argc, char**argv)
// check result
check = fopen(tempfname, "r");
- if ( (getline(&res, &len, check) == -1 ) &&
- (feof(check) || (res && !strcmp("",res)))) {
+ res.l = 0;
+ if (kgetline(&res, (kgets_func *)fgets, check) < 0 &&
+ (feof(check) || res.l == 0)) {
++success;
} else {
++failure;
@@ -495,8 +497,9 @@ int main(int argc, char**argv)
}
// check result
check = fopen(tempfname, "r");
- if ( (getline(&res, &len, check) != -1 ) &&
- res && !strcmp("[bam_translate] RG tag \"rg4hello\" on read \"123456789\" encountered with no corresponding entry in header, tag lost\n",res)) {
+ res.l = 0;
+ if (kgetline(&res, (kgets_func *)fgets, check) >= 0 &&
+ strcmp("[bam_translate] RG tag \"rg4hello\" on read \"123456789\" encountered with no corresponding entry in header, tag lost. Unknown tags are only reported once per input file for each tag ID.",res.s) == 0) {
++success;
} else {
++failure;
@@ -531,8 +534,9 @@ int main(int argc, char**argv)
// check result
check = fopen(tempfname, "r");
- if ( (getline(&res, &len, check) != -1 ) &&
- res && !strcmp("[bam_translate] PG tag \"pg5hello\" on read \"123456789\" encountered with no corresponding entry in header, tag lost\n",res)) {
+ res.l = 0;
+ if (kgetline(&res, (kgets_func *)fgets, check) >= 0 &&
+ strcmp("[bam_translate] PG tag \"pg5hello\" on read \"123456789\" encountered with no corresponding entry in header, tag lost. Unknown tags are only reported once per input file for each tag ID.",res.s) == 0) {
++success;
} else {
++failure;
@@ -568,8 +572,9 @@ int main(int argc, char**argv)
// check result
check = fopen(tempfname, "r");
- if ( (getline(&res, &len, check) == -1 ) &&
- (feof(check) || (res && !strcmp("",res))) ) {
+ res.l = 0;
+ if (kgetline(&res, (kgets_func *)fgets, check) < 0 &&
+ (feof(check) || res.l == 0) ) {
++success;
} else {
++failure;
@@ -583,7 +588,7 @@ int main(int argc, char**argv)
if (verbose) printf("END test 6\n");
// Cleanup
- free(res);
+ free(res.s);
remove(tempfname);
if (failure > 0)
fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success);
diff --git a/samtools/test/merge/test_pretty_header.c b/samtools/test/merge/test_pretty_header.c
deleted file mode 100644
index c5c5f9e..0000000
--- a/samtools/test/merge/test_pretty_header.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/* test/merge/test_pretty_header.c -- header test harness.
-
- Copyright (C) 2013 Genome Research Ltd.
-
- Author: Martin O. Pollard <mp15 at sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include "../../bam_sort.c"
-
-void setup_test_1(char** input) {
- *input = strdup(
- "@HD\n"
- "@SQ\n"
- "@RG\n"
- "@PG\n"
- "@CO\n");
-}
-
-bool check_test_1(char* input) {
- // Check input is unchanged
-
- // Check output
-
- return true;
-}
-
-
-int main(int argc, char**argv)
-{
- const int NUM_TESTS = 1;
- int verbose = 0;
- int success = 0;
- int failure = 0;
- int getopt_char;
- while ((getopt_char = getopt(argc, argv, "v")) != -1) {
- switch (getopt_char) {
- case 'v':
- ++verbose;
- break;
- default:
- break;
- }
- }
-
- if (verbose) printf("BEGIN test 1\n");
- // setup
- char* input;
- setup_test_1(&input);
- // test
- if (verbose > 1) {
- printf("input:\n%s",input);
- }
- if (verbose) printf("RUN test 1\n");
- pretty_header(&input, strlen(input));
- if (verbose) printf("END RUN test 1\n");
- if (verbose > 1) {
- printf("input:\n%s",input);
- }
- if (check_test_1(input)) { ++success; } else { ++failure; }
- // teardown
- free(input);
- if (verbose) printf("END test 1\n");
-
- if (success == NUM_TESTS) {
- return 0;
- } else {
- fprintf(stderr, "%d failures %d successes\n", failure, success);
- return 1;
- }
-}
diff --git a/samtools/test/merge/test_pretty_header.c.pysam.c b/samtools/test/merge/test_pretty_header.c.pysam.c
deleted file mode 100644
index 851271b..0000000
--- a/samtools/test/merge/test_pretty_header.c.pysam.c
+++ /dev/null
@@ -1,89 +0,0 @@
-#include "pysam.h"
-
-/* test/merge/test_pretty_header.c -- header test harness.
-
- Copyright (C) 2013 Genome Research Ltd.
-
- Author: Martin O. Pollard <mp15 at sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include "../../bam_sort.c"
-
-void setup_test_1(char** input) {
- *input = strdup(
- "@HD\n"
- "@SQ\n"
- "@RG\n"
- "@PG\n"
- "@CO\n");
-}
-
-bool check_test_1(char* input) {
- // Check input is unchanged
-
- // Check output
-
- return true;
-}
-
-
-int main(int argc, char**argv)
-{
- const int NUM_TESTS = 1;
- int verbose = 0;
- int success = 0;
- int failure = 0;
- int getopt_char;
- while ((getopt_char = getopt(argc, argv, "v")) != -1) {
- switch (getopt_char) {
- case 'v':
- ++verbose;
- break;
- default:
- break;
- }
- }
-
- if (verbose) printf("BEGIN test 1\n");
- // setup
- char* input;
- setup_test_1(&input);
- // test
- if (verbose > 1) {
- printf("input:\n%s",input);
- }
- if (verbose) printf("RUN test 1\n");
- pretty_header(&input, strlen(input));
- if (verbose) printf("END RUN test 1\n");
- if (verbose > 1) {
- printf("input:\n%s",input);
- }
- if (check_test_1(input)) { ++success; } else { ++failure; }
- // teardown
- free(input);
- if (verbose) printf("END test 1\n");
-
- if (success == NUM_TESTS) {
- return 0;
- } else {
- fprintf(pysamerr, "%d failures %d successes\n", failure, success);
- return 1;
- }
-}
diff --git a/samtools/test/merge/test_trans_tbl_init.c b/samtools/test/merge/test_trans_tbl_init.c
index 64b9786..b1164a3 100644
--- a/samtools/test/merge/test_trans_tbl_init.c
+++ b/samtools/test/merge/test_trans_tbl_init.c
@@ -23,6 +23,13 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
#include "../../bam_sort.c"
+#include <assert.h>
+#include <regex.h>
+
+typedef struct refseq_info {
+ const char *name;
+ uint32_t len;
+} refseq_info_t;
void dump_header(bam_hdr_t* hdr) {
printf("->n_targets:(%d)\n", hdr->n_targets);
@@ -37,36 +44,90 @@ void dump_header(bam_hdr_t* hdr) {
printf(")\n");
}
+static int populate_merged_header(bam_hdr_t *hdr, merged_header_t *merged_hdr) {
+ trans_tbl_t dummy;
+ int res;
+ res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, NULL);
+ trans_tbl_destroy(&dummy);
+ return res;
+}
+
+/*
+ * Populate merged_hdr with data from bam0_header_text and bam0_refseqs.
+ * Return bam_hdr_t based on the content in bam1_header_text and bam1_refseqs.
+ */
+
+bam_hdr_t * setup_test(const char *bam0_header_text,
+ const refseq_info_t *bam0_refseqs,
+ int32_t bam0_n_refseqs,
+ const char *bam1_header_text,
+ const refseq_info_t *bam1_refseqs,
+ int32_t bam1_n_refseqs,
+ merged_header_t *merged_hdr) {
+ bam_hdr_t* bam0 = NULL;
+ bam_hdr_t* bam1 = NULL;
+ int32_t i;
+
+ bam0 = bam_hdr_init();
+ bam0->text = strdup(bam0_header_text);
+ if (!bam0->text) goto fail;
+ bam0->l_text = strlen(bam0_header_text);
+ bam0->n_targets = 1;
+ bam0->target_name = (char**)calloc(bam0_n_refseqs, sizeof(char*));
+ bam0->target_len = (uint32_t*)calloc(bam0_n_refseqs, sizeof(uint32_t));
+ for (i = 0; i < bam0_n_refseqs; i++) {
+ bam0->target_name[i] = strdup(bam0_refseqs[i].name);
+ if (!bam0->target_name[i]) goto fail;
+ bam0->target_len[i] = bam0_refseqs[i].len;
+ }
+
+ if (populate_merged_header(bam0, merged_hdr)) goto fail;
+
+ bam1 = bam_hdr_init();
+ if (!bam1) goto fail;
+ bam1->text = strdup(bam1_header_text);
+ if (!bam1->text) goto fail;
+ bam1->l_text = strlen(bam1_header_text);
+ bam1->n_targets = bam1_n_refseqs;
+ bam1->target_name = (char**)calloc(bam1_n_refseqs, sizeof(char*));
+ bam1->target_len = (uint32_t*)calloc(bam1_n_refseqs, sizeof(uint32_t));
+ for (i = 0; i < bam1_n_refseqs; i++) {
+ bam1->target_name[i] = strdup(bam1_refseqs[i].name);
+ if (!bam1->target_name[i]) goto fail;
+ bam1->target_len[i] = bam1_refseqs[i].len;
+ }
+
+ bam_hdr_destroy(bam0);
+ return bam1;
+
+ fail:
+ bam_hdr_destroy(bam1);
+ bam_hdr_destroy(bam0);
+ return NULL;
+}
+
+#define NELE(x) (sizeof((x)) / sizeof((x)[0]))
+
+static const char init_text[] =
+ "@HD\tVN:1.4\tSO:unknown\n"
+ "@SQ\tSN:fish\tLN:133\tSP:frog";
+
+static const refseq_info_t init_refs[1] = {
+ { "fish", 133 }
+};
+
static const char test_1_trans_text[] =
"@HD\tVN:1.4\tSO:unknown\n"
"@SQ\tSN:fish\tLN:133\n";
-void setup_test_1(bam_hdr_t** translate_in, bam_hdr_t** out_in) {
- bam_hdr_t* out;
- bam_hdr_t* translate;
+static const refseq_info_t test_1_refs[1] = {
+ { "fish", 133 }
+};
- translate = bam_hdr_init();
- translate->text = strdup(test_1_trans_text);
- translate->l_text = strlen(test_1_trans_text);
- translate->n_targets = 1;
- translate->target_name = (char**)calloc(1, sizeof(char*));
- translate->target_len = (uint32_t*)calloc(1, sizeof(uint32_t));
- translate->target_name[0] = strdup("fish");
- translate->target_len[0] = 133;
- out = bam_hdr_init();
- const char out_text[] =
- "@HD\tVN:1.4\tSO:unknown\n"
- "@SQ\tSN:fish\tLN:133\tSP:frog";
- out->text = strdup(out_text);
- out->l_text = strlen(out_text);
- out->n_targets = 1;
- out->target_name = (char**)calloc(1, sizeof(char*));
- out->target_len = (uint32_t*)calloc(1, sizeof(uint32_t));
- out->target_name[0] = strdup("fish");
- out->target_len[0] = 133;
-
- *translate_in = translate;
- *out_in = out;
+bam_hdr_t * setup_test_1(merged_header_t *merged_hdr) {
+ return setup_test(init_text, init_refs, NELE(init_refs),
+ test_1_trans_text, test_1_refs, NELE(test_1_refs),
+ merged_hdr);
}
bool check_test_1(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
@@ -100,34 +161,15 @@ static const char test_2_trans_text[] =
"@SQ\tSN:donkey\tLN:133\n"
"@SQ\tSN:fish\tLN:133";
-void setup_test_2(bam_hdr_t** translate_in, bam_hdr_t** out_in) {
- bam_hdr_t* out;
- bam_hdr_t* translate;
+static const refseq_info_t test_2_refs[2] = {
+ { "donkey", 133 },
+ { "fish", 133 }
+};
- translate = bam_hdr_init();
- translate->text = strdup(test_2_trans_text);
- translate->l_text = strlen(test_2_trans_text);
- translate->n_targets = 2;
- translate->target_name = (char**)calloc(translate->n_targets, sizeof(char*));
- translate->target_len = (uint32_t*)calloc(translate->n_targets, sizeof(uint32_t));
- translate->target_name[0] = strdup("donkey");
- translate->target_len[0] = 133;
- translate->target_name[1] = strdup("fish");
- translate->target_len[1] = 133;
- out = bam_hdr_init();
- const char* out_text =
- "@HD\tVN:1.4\tSO:unknown\n"
- "@SQ\tSN:fish\tLN:133\tSP:frog";
- out->text = strdup(out_text);
- out->l_text = strlen(out_text);
- out->n_targets = 1;
- out->target_name = (char**)calloc(1, sizeof(char*));
- out->target_len = (uint32_t*)calloc(1, sizeof(uint32_t));
- out->target_name[0] = strdup("fish");
- out->target_len[0] = 133;
-
- *translate_in = translate;
- *out_in = out;
+bam_hdr_t * setup_test_2(merged_header_t *merged_hdr) {
+ return setup_test(init_text, init_refs, NELE(init_refs),
+ test_2_trans_text, test_2_refs, NELE(test_2_refs),
+ merged_hdr);
}
bool check_test_2(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
@@ -163,34 +205,15 @@ static const char test_3_trans_text[] =
"@SQ\tSN:fish\tLN:133\n"
"@RG\tID:fish\tPU:trans\n";
-void setup_test_3(bam_hdr_t** translate_in, bam_hdr_t** out_in) {
- bam_hdr_t* out;
- bam_hdr_t* translate;
+static const refseq_info_t test_3_refs[2] = {
+ { "donkey", 133 },
+ { "fish", 133 }
+};
- translate = bam_hdr_init();
- translate->text = strdup(test_3_trans_text);
- translate->l_text = strlen(test_3_trans_text);
- translate->n_targets = 2;
- translate->target_name = (char**)calloc(translate->n_targets, sizeof(char*));
- translate->target_len = (uint32_t*)calloc(translate->n_targets, sizeof(uint32_t));
- translate->target_name[0] = strdup("donkey");
- translate->target_len[0] = 133;
- translate->target_name[1] = strdup("fish");
- translate->target_len[1] = 133;
- out = bam_hdr_init();
- const char* out_text =
- "@HD\tVN:1.4\tSO:unknown\n"
- "@SQ\tSN:fish\tLN:133\tSP:frog";
- out->text = strdup(out_text);
- out->l_text = strlen(out_text);
- out->n_targets = 1;
- out->target_name = (char**)calloc(1, sizeof(char*));
- out->target_len = (uint32_t*)calloc(1, sizeof(uint32_t));
- out->target_name[0] = strdup("fish");
- out->target_len[0] = 133;
-
- *translate_in = translate;
- *out_in = out;
+bam_hdr_t * setup_test_3(merged_header_t *merged_hdr) {
+ return setup_test(init_text, init_refs, NELE(init_refs),
+ test_3_trans_text, test_3_refs, NELE(test_3_refs),
+ merged_hdr);
}
bool check_test_3(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
@@ -209,35 +232,20 @@ static const char test_4_trans_text[] =
"@SQ\tSN:fish\tLN:133\n"
"@RG\tID:fish\tPU:trans\n";
-void setup_test_4(bam_hdr_t** translate_in, bam_hdr_t** out_in) {
- bam_hdr_t* out;
- bam_hdr_t* translate;
+static const refseq_info_t test_4_refs[2] = {
+ { "donkey", 133 },
+ { "fish", 133 }
+};
- translate = bam_hdr_init();
- translate->text = strdup(test_4_trans_text);
- translate->l_text = strlen(test_4_trans_text);
- translate->n_targets = 2;
- translate->target_name = (char**)calloc(translate->n_targets, sizeof(char*));
- translate->target_len = (uint32_t*)calloc(translate->n_targets, sizeof(uint32_t));
- translate->target_name[0] = strdup("donkey");
- translate->target_len[0] = 133;
- translate->target_name[1] = strdup("fish");
- translate->target_len[1] = 133;
- out = bam_hdr_init();
- const char* out_text =
+bam_hdr_t * setup_test_4(merged_header_t *merged_hdr) {
+ const char* t4_init_text =
"@HD\tVN:1.4\tSO:unknown\n"
"@SQ\tSN:fish\tLN:133\tSP:frog\n"
"@RG\tID:fish\tPU:out\n";
- out->text = strdup(out_text);
- out->l_text = strlen(out_text);
- out->n_targets = 1;
- out->target_name = (char**)calloc(1, sizeof(char*));
- out->target_len = (uint32_t*)calloc(1, sizeof(uint32_t));
- out->target_name[0] = strdup("fish");
- out->target_len[0] = 133;
-
- *translate_in = translate;
- *out_in = out;
+
+ return setup_test(t4_init_text, init_refs, NELE(init_refs),
+ test_4_trans_text, test_4_refs, NELE(test_4_refs),
+ merged_hdr);
}
bool check_test_4(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
@@ -258,37 +266,22 @@ static const char test_5_trans_text[] =
"@PG\tXX:dummy\tID:fish\tDS:trans\n"
"@PG\tPP:fish\tID:hook\tDS:trans\n";
-void setup_test_5(bam_hdr_t** translate_in, bam_hdr_t** out_in) {
- bam_hdr_t* out;
- bam_hdr_t* translate;
+static const refseq_info_t test_5_refs[2] = {
+ { "donkey", 133 },
+ { "fish", 133 }
+};
- translate = bam_hdr_init();
- translate->text = strdup(test_5_trans_text);
- translate->l_text = strlen(test_5_trans_text);
- translate->n_targets = 2;
- translate->target_name = (char**)calloc(translate->n_targets, sizeof(char*));
- translate->target_len = (uint32_t*)calloc(translate->n_targets, sizeof(uint32_t));
- translate->target_name[0] = strdup("donkey");
- translate->target_len[0] = 133;
- translate->target_name[1] = strdup("fish");
- translate->target_len[1] = 133;
- out = bam_hdr_init();
- const char* out_text =
+bam_hdr_t * setup_test_5(merged_header_t *merged_hdr) {
+ const char* t5_init_text =
"@HD\tVN:1.4\tSO:unknown\n"
"@SQ\tSN:fish\tLN:133\tSP:frog\n"
"@RG\tID:fish\tPU:out\n"
"@PG\tXX:dummyx\tID:fish\tDS:out\n"
"@PG\tPP:fish\tID:hook\tDS:out\n";
- out->text = strdup(out_text);
- out->l_text = strlen(out_text);
- out->n_targets = 1;
- out->target_name = (char**)calloc(1, sizeof(char*));
- out->target_len = (uint32_t*)calloc(1, sizeof(uint32_t));
- out->target_name[0] = strdup("fish");
- out->target_len[0] = 133;
-
- *translate_in = translate;
- *out_in = out;
+
+ return setup_test(t5_init_text, init_refs, NELE(init_refs),
+ test_5_trans_text, test_5_refs, NELE(test_5_refs),
+ merged_hdr);
}
bool check_test_5(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
@@ -301,10 +294,38 @@ bool check_test_5(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
return true;
}
+static const char test_6_trans_text[] =
+"@HD\tVN:1.4\tSO:unknown\n"
+"@SQ\tSN:donkey\tLN:133\n"
+"@SQ\tSN:fish\tLN:133\n"
+"@RG\tID:fish\tPU:trans\n"
+"@PG\tXX:dummy\tID:fish\tDS:trans\n"
+"@PG\tPP:fish\tID:hook\tDS:trans\n";
+
+static const refseq_info_t test_6_refs[2] = {
+ { "donkey", 133 },
+ { "fish", 133 }
+};
+
+bam_hdr_t * setup_test_6(merged_header_t *merged_hdr) {
+ return setup_test(init_text, init_refs, NELE(init_refs),
+ test_6_trans_text, test_6_refs, NELE(test_6_refs),
+ merged_hdr);
+}
+
+bool check_test_6(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
+ // Check input is unchanged
+ if (
+ strncmp(test_6_trans_text, translate->text, translate->l_text)
+ || translate->l_text != strlen(test_5_trans_text)
+ || translate->n_targets != 2
+ ) return false;
+ return true;
+}
int main(int argc, char**argv)
{
- const int NUM_TESTS = 5;
+ const int NUM_TESTS = 6;
int verbose = 0;
int success = 0;
int failure = 0;
@@ -329,16 +350,18 @@ int main(int argc, char**argv)
if (verbose) printf("BEGIN test 1\n");
// setup
trans_tbl_t tbl_1;
- setup_test_1(&translate,&out);
+ merged_header_t *merged_hdr = init_merged_header();
+ translate = setup_test_1(merged_hdr);
+ assert(translate);
// test
if (verbose > 1) {
printf("translate\n");
dump_header(translate);
- printf("out\n");
- dump_header(out);
}
if (verbose) printf("RUN test 1\n");
- trans_tbl_init(out, translate, &tbl_1, false, false);
+ trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, NULL);
+ out = finish_merged_header(merged_hdr);
+ free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 1\n");
if (verbose > 1) {
printf("translate\n");
@@ -346,7 +369,14 @@ int main(int argc, char**argv)
printf("out\n");
dump_header(out);
}
- if (check_test_1(translate, out, &tbl_1)) { ++success; } else { ++failure; }
+ if (check_test_1(translate, out, &tbl_1)) {
+ if (verbose) printf("Test 1 : PASS\n");
+ ++success;
+ } else {
+ if (verbose) printf("Test 1 : FAIL\n");
+ fprintf(stderr, "Test 1 : FAIL\n");
+ ++failure;
+ }
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
@@ -357,15 +387,18 @@ int main(int argc, char**argv)
if (verbose) printf("BEGIN test 2\n");
// reinit
trans_tbl_t tbl_2;
- setup_test_2(&translate,&out);
+
+ merged_hdr = init_merged_header();
+ translate = setup_test_2(merged_hdr);
+ assert(translate);
if (verbose > 1) {
printf("translate\n");
dump_header(translate);
- printf("out\n");
- dump_header(out);
}
if (verbose) printf("RUN test 2\n");
- trans_tbl_init(out, translate, &tbl_2, false, false);
+ trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, NULL);
+ out = finish_merged_header(merged_hdr);
+ free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 2\n");
if (verbose > 1) {
printf("translate\n");
@@ -373,7 +406,14 @@ int main(int argc, char**argv)
printf("out\n");
dump_header(out);
}
- if (check_test_2(translate, out, &tbl_2)) { ++success; } else { ++failure; }
+ if (check_test_2(translate, out, &tbl_2)) {
+ if (verbose) printf("Test 2 : PASS\n");
+ ++success;
+ } else {
+ if (verbose) printf("Test 2 : FAIL\n");
+ fprintf(stderr, "Test 2 : FAIL\n");
+ ++failure;
+ }
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
@@ -384,15 +424,17 @@ int main(int argc, char**argv)
if (verbose) printf("BEGIN test 3\n");
// reinit
trans_tbl_t tbl_3;
- setup_test_3(&translate,&out);
+ merged_hdr = init_merged_header();
+ translate = setup_test_3(merged_hdr);
+ assert(translate);
if (verbose > 1) {
printf("translate\n");
dump_header(translate);
- printf("out\n");
- dump_header(out);
- }
+ }
if (verbose) printf("RUN test 3\n");
- trans_tbl_init(out, translate, &tbl_3, false, false);
+ trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, NULL);
+ out = finish_merged_header(merged_hdr);
+ free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 3\n");
if (verbose > 1) {
printf("translate\n");
@@ -400,7 +442,14 @@ int main(int argc, char**argv)
printf("out\n");
dump_header(out);
}
- if (check_test_3(translate, out, &tbl_3)) { ++success; } else { ++failure; }
+ if (check_test_3(translate, out, &tbl_3)) {
+ if (verbose) printf("Test 3 : PASS\n");
+ ++success;
+ } else {
+ if (verbose) printf("Test 3 : FAIL\n");
+ fprintf(stderr, "Test 3 : FAIL\n");
+ ++failure;
+ }
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
@@ -411,15 +460,17 @@ int main(int argc, char**argv)
if (verbose) printf("BEGIN test 4\n");
// reinit
trans_tbl_t tbl_4;
- setup_test_4(&translate,&out);
+ merged_hdr = init_merged_header();
+ translate = setup_test_4(merged_hdr);
+ assert(translate);
if (verbose > 1) {
printf("translate\n");
dump_header(translate);
- printf("out\n");
- dump_header(out);
}
if (verbose) printf("RUN test 4\n");
- trans_tbl_init(out, translate, &tbl_4, false, false);
+ trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, NULL);
+ out = finish_merged_header(merged_hdr);
+ free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 4\n");
if (verbose > 1) {
printf("translate\n");
@@ -427,7 +478,14 @@ int main(int argc, char**argv)
printf("out\n");
dump_header(out);
}
- if (check_test_4(translate, out, &tbl_4)) { ++success; } else { ++failure; }
+ if (check_test_4(translate, out, &tbl_4)) {
+ if (verbose) printf("Test 4 : PASS\n");
+ ++success;
+ } else {
+ if (verbose) printf("Test 4 : FAIL\n");
+ fprintf(stderr, "Test 4 : FAIL\n");
+ ++failure;
+ }
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
@@ -438,16 +496,18 @@ int main(int argc, char**argv)
if (verbose) printf("BEGIN test 5\n");
// reinit
trans_tbl_t tbl_5;
- setup_test_5(&translate,&out);
+ merged_hdr = init_merged_header();
+ translate = setup_test_5(merged_hdr);
+ assert(translate);
if (verbose > 1) {
printf("translate\n");
dump_header(translate);
- printf("out\n");
- dump_header(out);
}
if (verbose) printf("RUN test 5\n");
- trans_tbl_init(out, translate, &tbl_5, false, false);
+ trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, NULL);
+ out = finish_merged_header(merged_hdr);
+ free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 5\n");
if (verbose > 1) {
printf("translate\n");
@@ -455,13 +515,56 @@ int main(int argc, char**argv)
printf("out\n");
dump_header(out);
}
- if (check_test_5(translate, out, &tbl_5)) { ++success; } else { ++failure; }
+ if (check_test_5(translate, out, &tbl_5)) {
+ if (verbose) printf("Test 5 : PASS\n");
+ ++success;
+ } else {
+ if (verbose) printf("Test 5 : FAIL\n");
+ fprintf(stderr, "Test 5 : FAIL\n");
+ ++failure;
+ }
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
trans_tbl_destroy(&tbl_5);
if (verbose) printf("END test 5\n");
+ // test
+ if (verbose) printf("BEGIN test 6\n");
+ // reinit
+ trans_tbl_t tbl_6;
+ merged_hdr = init_merged_header();
+ translate = setup_test_6(merged_hdr);
+ assert(translate);
+ if (verbose > 1) {
+ printf("translate\n");
+ dump_header(translate);
+ }
+ if (verbose) printf("RUN test 6\n");
+ trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, "filename");
+ out = finish_merged_header(merged_hdr);
+ free_merged_header(merged_hdr);
+ if (verbose) printf("END RUN test 6\n");
+ if (verbose > 1) {
+ printf("translate\n");
+ dump_header(translate);
+ printf("out\n");
+ dump_header(out);
+ }
+ if (check_test_6(translate, out, &tbl_6)) {
+ if (verbose) printf("Test 6 : PASS\n");
+ ++success;
+ } else {
+ if (verbose) printf("Test 6 : FAIL\n");
+ fprintf(stderr, "Test 6 : FAIL\n");
+ ++failure;
+ }
+ // teardown
+ bam_hdr_destroy(translate);
+ bam_hdr_destroy(out);
+ trans_tbl_destroy(&tbl_6);
+ if (verbose) printf("END test 6\n");
+
if (success == NUM_TESTS) {
return 0;
} else {
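
The test rewrite above follows the new merge-header API in bam_sort.c: each input header is folded into a merged_header_t via trans_tbl_init(), and finish_merged_header() then produces the combined bam_hdr_t. A condensed sketch of that calling sequence follows, using the same internal functions the test pulls in from bam_sort.c; merge_two_headers, hdr_a and hdr_b are hypothetical names, and the flags plus trailing NULL simply match what tests 1-5 above pass.

    /* Sketch of the merged-header workflow exercised by the updated tests.
     * Like the tests it relies on internals included from bam_sort.c;
     * hdr_a/hdr_b stand for two bam_hdr_t inputs loaded elsewhere. */
    #include "../../bam_sort.c"

    static bam_hdr_t *merge_two_headers(bam_hdr_t *hdr_a, bam_hdr_t *hdr_b)
    {
        merged_header_t *merged_hdr = init_merged_header();
        trans_tbl_t tbl_a, tbl_b;
        bam_hdr_t *out = NULL;

        /* Fold each input header in; every call also fills a translation
         * table for remapping that input's records.  Arguments are passed
         * exactly as in tests 1-5 above (0 means success). */
        if (trans_tbl_init(merged_hdr, hdr_a, &tbl_a, false, false, NULL) != 0)
            goto done;
        if (trans_tbl_init(merged_hdr, hdr_b, &tbl_b, false, false, NULL) != 0) {
            trans_tbl_destroy(&tbl_a);
            goto done;
        }

        out = finish_merged_header(merged_hdr);      /* combined bam_hdr_t   */
        trans_tbl_destroy(&tbl_a);
        trans_tbl_destroy(&tbl_b);
    done:
        free_merged_header(merged_hdr);
        return out;                                  /* NULL on failure      */
    }
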
diff --git a/samtools/test/merge/test_trans_tbl_init.c.pysam.c b/samtools/test/merge/test_trans_tbl_init.c.pysam.c
index 594bf2c..0f54989 100644
--- a/samtools/test/merge/test_trans_tbl_init.c.pysam.c
+++ b/samtools/test/merge/test_trans_tbl_init.c.pysam.c
@@ -25,6 +25,13 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
#include "../../bam_sort.c"
+#include <assert.h>
+#include <regex.h>
+
+typedef struct refseq_info {
+ const char *name;
+ uint32_t len;
+} refseq_info_t;
void dump_header(bam_hdr_t* hdr) {
printf("->n_targets:(%d)\n", hdr->n_targets);
@@ -39,36 +46,90 @@ void dump_header(bam_hdr_t* hdr) {
printf(")\n");
}
+static int populate_merged_header(bam_hdr_t *hdr, merged_header_t *merged_hdr) {
+ trans_tbl_t dummy;
+ int res;
+ res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, NULL);
+ trans_tbl_destroy(&dummy);
+ return res;
+}
+
+/*
+ * Populate merged_hdr with data from bam0_header_text and bam0_refseqs.
+ * Return bam_hdr_t based on the content in bam1_header_text and bam1_refseqs.
+ */
+
+bam_hdr_t * setup_test(const char *bam0_header_text,
+ const refseq_info_t *bam0_refseqs,
+ int32_t bam0_n_refseqs,
+ const char *bam1_header_text,
+ const refseq_info_t *bam1_refseqs,
+ int32_t bam1_n_refseqs,
+ merged_header_t *merged_hdr) {
+ bam_hdr_t* bam0 = NULL;
+ bam_hdr_t* bam1 = NULL;
+ int32_t i;
+
+ bam0 = bam_hdr_init();
+ bam0->text = strdup(bam0_header_text);
+ if (!bam0->text) goto fail;
+ bam0->l_text = strlen(bam0_header_text);
+ bam0->n_targets = 1;
+ bam0->target_name = (char**)calloc(bam0_n_refseqs, sizeof(char*));
+ bam0->target_len = (uint32_t*)calloc(bam0_n_refseqs, sizeof(uint32_t));
+ for (i = 0; i < bam0_n_refseqs; i++) {
+ bam0->target_name[i] = strdup(bam0_refseqs[i].name);
+ if (!bam0->target_name[i]) goto fail;
+ bam0->target_len[i] = bam0_refseqs[i].len;
+ }
+
+ if (populate_merged_header(bam0, merged_hdr)) goto fail;
+
+ bam1 = bam_hdr_init();
+ if (!bam1) goto fail;
+ bam1->text = strdup(bam1_header_text);
+ if (!bam1->text) goto fail;
+ bam1->l_text = strlen(bam1_header_text);
+ bam1->n_targets = bam1_n_refseqs;
+ bam1->target_name = (char**)calloc(bam1_n_refseqs, sizeof(char*));
+ bam1->target_len = (uint32_t*)calloc(bam1_n_refseqs, sizeof(uint32_t));
+ for (i = 0; i < bam1_n_refseqs; i++) {
+ bam1->target_name[i] = strdup(bam1_refseqs[i].name);
+ if (!bam1->target_name[i]) goto fail;
+ bam1->target_len[i] = bam1_refseqs[i].len;
+ }
+
+ bam_hdr_destroy(bam0);
+ return bam1;
+
+ fail:
+ bam_hdr_destroy(bam1);
+ bam_hdr_destroy(bam0);
+ return NULL;
+}
+
+#define NELE(x) (sizeof((x)) / sizeof((x)[0]))
+
+static const char init_text[] =
+ "@HD\tVN:1.4\tSO:unknown\n"
+ "@SQ\tSN:fish\tLN:133\tSP:frog";
+
+static const refseq_info_t init_refs[1] = {
+ { "fish", 133 }
+};
+
static const char test_1_trans_text[] =
"@HD\tVN:1.4\tSO:unknown\n"
"@SQ\tSN:fish\tLN:133\n";
-void setup_test_1(bam_hdr_t** translate_in, bam_hdr_t** out_in) {
- bam_hdr_t* out;
- bam_hdr_t* translate;
+static const refseq_info_t test_1_refs[1] = {
+ { "fish", 133 }
+};
- translate = bam_hdr_init();
- translate->text = strdup(test_1_trans_text);
- translate->l_text = strlen(test_1_trans_text);
- translate->n_targets = 1;
- translate->target_name = (char**)calloc(1, sizeof(char*));
- translate->target_len = (uint32_t*)calloc(1, sizeof(uint32_t));
- translate->target_name[0] = strdup("fish");
- translate->target_len[0] = 133;
- out = bam_hdr_init();
- const char out_text[] =
- "@HD\tVN:1.4\tSO:unknown\n"
- "@SQ\tSN:fish\tLN:133\tSP:frog";
- out->text = strdup(out_text);
- out->l_text = strlen(out_text);
- out->n_targets = 1;
- out->target_name = (char**)calloc(1, sizeof(char*));
- out->target_len = (uint32_t*)calloc(1, sizeof(uint32_t));
- out->target_name[0] = strdup("fish");
- out->target_len[0] = 133;
-
- *translate_in = translate;
- *out_in = out;
+bam_hdr_t * setup_test_1(merged_header_t *merged_hdr) {
+ return setup_test(init_text, init_refs, NELE(init_refs),
+ test_1_trans_text, test_1_refs, NELE(test_1_refs),
+ merged_hdr);
}
bool check_test_1(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
@@ -102,34 +163,15 @@ static const char test_2_trans_text[] =
"@SQ\tSN:donkey\tLN:133\n"
"@SQ\tSN:fish\tLN:133";
-void setup_test_2(bam_hdr_t** translate_in, bam_hdr_t** out_in) {
- bam_hdr_t* out;
- bam_hdr_t* translate;
+static const refseq_info_t test_2_refs[2] = {
+ { "donkey", 133 },
+ { "fish", 133 }
+};
- translate = bam_hdr_init();
- translate->text = strdup(test_2_trans_text);
- translate->l_text = strlen(test_2_trans_text);
- translate->n_targets = 2;
- translate->target_name = (char**)calloc(translate->n_targets, sizeof(char*));
- translate->target_len = (uint32_t*)calloc(translate->n_targets, sizeof(uint32_t));
- translate->target_name[0] = strdup("donkey");
- translate->target_len[0] = 133;
- translate->target_name[1] = strdup("fish");
- translate->target_len[1] = 133;
- out = bam_hdr_init();
- const char* out_text =
- "@HD\tVN:1.4\tSO:unknown\n"
- "@SQ\tSN:fish\tLN:133\tSP:frog";
- out->text = strdup(out_text);
- out->l_text = strlen(out_text);
- out->n_targets = 1;
- out->target_name = (char**)calloc(1, sizeof(char*));
- out->target_len = (uint32_t*)calloc(1, sizeof(uint32_t));
- out->target_name[0] = strdup("fish");
- out->target_len[0] = 133;
-
- *translate_in = translate;
- *out_in = out;
+bam_hdr_t * setup_test_2(merged_header_t *merged_hdr) {
+ return setup_test(init_text, init_refs, NELE(init_refs),
+ test_2_trans_text, test_2_refs, NELE(test_2_refs),
+ merged_hdr);
}
bool check_test_2(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
@@ -165,34 +207,15 @@ static const char test_3_trans_text[] =
"@SQ\tSN:fish\tLN:133\n"
"@RG\tID:fish\tPU:trans\n";
-void setup_test_3(bam_hdr_t** translate_in, bam_hdr_t** out_in) {
- bam_hdr_t* out;
- bam_hdr_t* translate;
+static const refseq_info_t test_3_refs[2] = {
+ { "donkey", 133 },
+ { "fish", 133 }
+};
- translate = bam_hdr_init();
- translate->text = strdup(test_3_trans_text);
- translate->l_text = strlen(test_3_trans_text);
- translate->n_targets = 2;
- translate->target_name = (char**)calloc(translate->n_targets, sizeof(char*));
- translate->target_len = (uint32_t*)calloc(translate->n_targets, sizeof(uint32_t));
- translate->target_name[0] = strdup("donkey");
- translate->target_len[0] = 133;
- translate->target_name[1] = strdup("fish");
- translate->target_len[1] = 133;
- out = bam_hdr_init();
- const char* out_text =
- "@HD\tVN:1.4\tSO:unknown\n"
- "@SQ\tSN:fish\tLN:133\tSP:frog";
- out->text = strdup(out_text);
- out->l_text = strlen(out_text);
- out->n_targets = 1;
- out->target_name = (char**)calloc(1, sizeof(char*));
- out->target_len = (uint32_t*)calloc(1, sizeof(uint32_t));
- out->target_name[0] = strdup("fish");
- out->target_len[0] = 133;
-
- *translate_in = translate;
- *out_in = out;
+bam_hdr_t * setup_test_3(merged_header_t *merged_hdr) {
+ return setup_test(init_text, init_refs, NELE(init_refs),
+ test_3_trans_text, test_3_refs, NELE(test_3_refs),
+ merged_hdr);
}
bool check_test_3(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
@@ -211,35 +234,20 @@ static const char test_4_trans_text[] =
"@SQ\tSN:fish\tLN:133\n"
"@RG\tID:fish\tPU:trans\n";
-void setup_test_4(bam_hdr_t** translate_in, bam_hdr_t** out_in) {
- bam_hdr_t* out;
- bam_hdr_t* translate;
+static const refseq_info_t test_4_refs[2] = {
+ { "donkey", 133 },
+ { "fish", 133 }
+};
- translate = bam_hdr_init();
- translate->text = strdup(test_4_trans_text);
- translate->l_text = strlen(test_4_trans_text);
- translate->n_targets = 2;
- translate->target_name = (char**)calloc(translate->n_targets, sizeof(char*));
- translate->target_len = (uint32_t*)calloc(translate->n_targets, sizeof(uint32_t));
- translate->target_name[0] = strdup("donkey");
- translate->target_len[0] = 133;
- translate->target_name[1] = strdup("fish");
- translate->target_len[1] = 133;
- out = bam_hdr_init();
- const char* out_text =
+bam_hdr_t * setup_test_4(merged_header_t *merged_hdr) {
+ const char* t4_init_text =
"@HD\tVN:1.4\tSO:unknown\n"
"@SQ\tSN:fish\tLN:133\tSP:frog\n"
"@RG\tID:fish\tPU:out\n";
- out->text = strdup(out_text);
- out->l_text = strlen(out_text);
- out->n_targets = 1;
- out->target_name = (char**)calloc(1, sizeof(char*));
- out->target_len = (uint32_t*)calloc(1, sizeof(uint32_t));
- out->target_name[0] = strdup("fish");
- out->target_len[0] = 133;
-
- *translate_in = translate;
- *out_in = out;
+
+ return setup_test(t4_init_text, init_refs, NELE(init_refs),
+ test_4_trans_text, test_4_refs, NELE(test_4_refs),
+ merged_hdr);
}
bool check_test_4(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
@@ -260,37 +268,22 @@ static const char test_5_trans_text[] =
"@PG\tXX:dummy\tID:fish\tDS:trans\n"
"@PG\tPP:fish\tID:hook\tDS:trans\n";
-void setup_test_5(bam_hdr_t** translate_in, bam_hdr_t** out_in) {
- bam_hdr_t* out;
- bam_hdr_t* translate;
+static const refseq_info_t test_5_refs[2] = {
+ { "donkey", 133 },
+ { "fish", 133 }
+};
- translate = bam_hdr_init();
- translate->text = strdup(test_5_trans_text);
- translate->l_text = strlen(test_5_trans_text);
- translate->n_targets = 2;
- translate->target_name = (char**)calloc(translate->n_targets, sizeof(char*));
- translate->target_len = (uint32_t*)calloc(translate->n_targets, sizeof(uint32_t));
- translate->target_name[0] = strdup("donkey");
- translate->target_len[0] = 133;
- translate->target_name[1] = strdup("fish");
- translate->target_len[1] = 133;
- out = bam_hdr_init();
- const char* out_text =
+bam_hdr_t * setup_test_5(merged_header_t *merged_hdr) {
+ const char* t5_init_text =
"@HD\tVN:1.4\tSO:unknown\n"
"@SQ\tSN:fish\tLN:133\tSP:frog\n"
"@RG\tID:fish\tPU:out\n"
"@PG\tXX:dummyx\tID:fish\tDS:out\n"
"@PG\tPP:fish\tID:hook\tDS:out\n";
- out->text = strdup(out_text);
- out->l_text = strlen(out_text);
- out->n_targets = 1;
- out->target_name = (char**)calloc(1, sizeof(char*));
- out->target_len = (uint32_t*)calloc(1, sizeof(uint32_t));
- out->target_name[0] = strdup("fish");
- out->target_len[0] = 133;
-
- *translate_in = translate;
- *out_in = out;
+
+ return setup_test(t5_init_text, init_refs, NELE(init_refs),
+ test_5_trans_text, test_5_refs, NELE(test_5_refs),
+ merged_hdr);
}
bool check_test_5(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
@@ -303,10 +296,38 @@ bool check_test_5(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
return true;
}
+static const char test_6_trans_text[] =
+"@HD\tVN:1.4\tSO:unknown\n"
+"@SQ\tSN:donkey\tLN:133\n"
+"@SQ\tSN:fish\tLN:133\n"
+"@RG\tID:fish\tPU:trans\n"
+"@PG\tXX:dummy\tID:fish\tDS:trans\n"
+"@PG\tPP:fish\tID:hook\tDS:trans\n";
+
+static const refseq_info_t test_6_refs[2] = {
+ { "donkey", 133 },
+ { "fish", 133 }
+};
+
+bam_hdr_t * setup_test_6(merged_header_t *merged_hdr) {
+ return setup_test(init_text, init_refs, NELE(init_refs),
+ test_6_trans_text, test_6_refs, NELE(test_6_refs),
+ merged_hdr);
+}
+
+bool check_test_6(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) {
+ // Check input is unchanged
+ if (
+ strncmp(test_6_trans_text, translate->text, translate->l_text)
+ || translate->l_text != strlen(test_5_trans_text)
+ || translate->n_targets != 2
+ ) return false;
+ return true;
+}
int main(int argc, char**argv)
{
- const int NUM_TESTS = 5;
+ const int NUM_TESTS = 6;
int verbose = 0;
int success = 0;
int failure = 0;
@@ -331,16 +352,18 @@ int main(int argc, char**argv)
if (verbose) printf("BEGIN test 1\n");
// setup
trans_tbl_t tbl_1;
- setup_test_1(&translate,&out);
+ merged_header_t *merged_hdr = init_merged_header();
+ translate = setup_test_1(merged_hdr);
+ assert(translate);
// test
if (verbose > 1) {
printf("translate\n");
dump_header(translate);
- printf("out\n");
- dump_header(out);
}
if (verbose) printf("RUN test 1\n");
- trans_tbl_init(out, translate, &tbl_1, false, false);
+ trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, NULL);
+ out = finish_merged_header(merged_hdr);
+ free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 1\n");
if (verbose > 1) {
printf("translate\n");
@@ -348,7 +371,14 @@ int main(int argc, char**argv)
printf("out\n");
dump_header(out);
}
- if (check_test_1(translate, out, &tbl_1)) { ++success; } else { ++failure; }
+ if (check_test_1(translate, out, &tbl_1)) {
+ if (verbose) printf("Test 1 : PASS\n");
+ ++success;
+ } else {
+ if (verbose) printf("Test 1 : FAIL\n");
+ fprintf(pysamerr, "Test 1 : FAIL\n");
+ ++failure;
+ }
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
@@ -359,15 +389,18 @@ int main(int argc, char**argv)
if (verbose) printf("BEGIN test 2\n");
// reinit
trans_tbl_t tbl_2;
- setup_test_2(&translate,&out);
+
+ merged_hdr = init_merged_header();
+ translate = setup_test_2(merged_hdr);
+ assert(translate);
if (verbose > 1) {
printf("translate\n");
dump_header(translate);
- printf("out\n");
- dump_header(out);
}
if (verbose) printf("RUN test 2\n");
- trans_tbl_init(out, translate, &tbl_2, false, false);
+ trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, NULL);
+ out = finish_merged_header(merged_hdr);
+ free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 2\n");
if (verbose > 1) {
printf("translate\n");
@@ -375,7 +408,14 @@ int main(int argc, char**argv)
printf("out\n");
dump_header(out);
}
- if (check_test_2(translate, out, &tbl_2)) { ++success; } else { ++failure; }
+ if (check_test_2(translate, out, &tbl_2)) {
+ if (verbose) printf("Test 2 : PASS\n");
+ ++success;
+ } else {
+ if (verbose) printf("Test 2 : FAIL\n");
+ fprintf(pysamerr, "Test 2 : FAIL\n");
+ ++failure;
+ }
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
@@ -386,15 +426,17 @@ int main(int argc, char**argv)
if (verbose) printf("BEGIN test 3\n");
// reinit
trans_tbl_t tbl_3;
- setup_test_3(&translate,&out);
+ merged_hdr = init_merged_header();
+ translate = setup_test_3(merged_hdr);
+ assert(translate);
if (verbose > 1) {
printf("translate\n");
dump_header(translate);
- printf("out\n");
- dump_header(out);
- }
+ }
if (verbose) printf("RUN test 3\n");
- trans_tbl_init(out, translate, &tbl_3, false, false);
+ trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, NULL);
+ out = finish_merged_header(merged_hdr);
+ free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 3\n");
if (verbose > 1) {
printf("translate\n");
@@ -402,7 +444,14 @@ int main(int argc, char**argv)
printf("out\n");
dump_header(out);
}
- if (check_test_3(translate, out, &tbl_3)) { ++success; } else { ++failure; }
+ if (check_test_3(translate, out, &tbl_3)) {
+ if (verbose) printf("Test 3 : PASS\n");
+ ++success;
+ } else {
+ if (verbose) printf("Test 3 : FAIL\n");
+ fprintf(pysamerr, "Test 3 : FAIL\n");
+ ++failure;
+ }
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
@@ -413,15 +462,17 @@ int main(int argc, char**argv)
if (verbose) printf("BEGIN test 4\n");
// reinit
trans_tbl_t tbl_4;
- setup_test_4(&translate,&out);
+ merged_hdr = init_merged_header();
+ translate = setup_test_4(merged_hdr);
+ assert(translate);
if (verbose > 1) {
printf("translate\n");
dump_header(translate);
- printf("out\n");
- dump_header(out);
}
if (verbose) printf("RUN test 4\n");
- trans_tbl_init(out, translate, &tbl_4, false, false);
+ trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, NULL);
+ out = finish_merged_header(merged_hdr);
+ free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 4\n");
if (verbose > 1) {
printf("translate\n");
@@ -429,7 +480,14 @@ int main(int argc, char**argv)
printf("out\n");
dump_header(out);
}
- if (check_test_4(translate, out, &tbl_4)) { ++success; } else { ++failure; }
+ if (check_test_4(translate, out, &tbl_4)) {
+ if (verbose) printf("Test 4 : PASS\n");
+ ++success;
+ } else {
+ if (verbose) printf("Test 4 : FAIL\n");
+ fprintf(pysamerr, "Test 4 : FAIL\n");
+ ++failure;
+ }
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
@@ -440,16 +498,18 @@ int main(int argc, char**argv)
if (verbose) printf("BEGIN test 5\n");
// reinit
trans_tbl_t tbl_5;
- setup_test_5(&translate,&out);
+ merged_hdr = init_merged_header();
+ translate = setup_test_5(merged_hdr);
+ assert(translate);
if (verbose > 1) {
printf("translate\n");
dump_header(translate);
- printf("out\n");
- dump_header(out);
}
if (verbose) printf("RUN test 5\n");
- trans_tbl_init(out, translate, &tbl_5, false, false);
+ trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, NULL);
+ out = finish_merged_header(merged_hdr);
+ free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 5\n");
if (verbose > 1) {
printf("translate\n");
@@ -457,13 +517,56 @@ int main(int argc, char**argv)
printf("out\n");
dump_header(out);
}
- if (check_test_5(translate, out, &tbl_5)) { ++success; } else { ++failure; }
+ if (check_test_5(translate, out, &tbl_5)) {
+ if (verbose) printf("Test 5 : PASS\n");
+ ++success;
+ } else {
+ if (verbose) printf("Test 5 : FAIL\n");
+ fprintf(pysamerr, "Test 5 : FAIL\n");
+ ++failure;
+ }
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
trans_tbl_destroy(&tbl_5);
if (verbose) printf("END test 5\n");
+ // test
+ if (verbose) printf("BEGIN test 6\n");
+ // reinit
+ trans_tbl_t tbl_6;
+ merged_hdr = init_merged_header();
+ translate = setup_test_6(merged_hdr);
+ assert(translate);
+ if (verbose > 1) {
+ printf("translate\n");
+ dump_header(translate);
+ }
+ if (verbose) printf("RUN test 6\n");
+ trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, "filename");
+ out = finish_merged_header(merged_hdr);
+ free_merged_header(merged_hdr);
+ if (verbose) printf("END RUN test 6\n");
+ if (verbose > 1) {
+ printf("translate\n");
+ dump_header(translate);
+ printf("out\n");
+ dump_header(out);
+ }
+ if (check_test_6(translate, out, &tbl_6)) {
+ if (verbose) printf("Test 6 : PASS\n");
+ ++success;
+ } else {
+ if (verbose) printf("Test 6 : FAIL\n");
+ fprintf(pysamerr, "Test 6 : FAIL\n");
+ ++failure;
+ }
+ // teardown
+ bam_hdr_destroy(translate);
+ bam_hdr_destroy(out);
+ trans_tbl_destroy(&tbl_6);
+ if (verbose) printf("END test 6\n");
+
if (success == NUM_TESTS) {
return 0;
} else {
diff --git a/samtools/test/split/test_count_rg.c b/samtools/test/split/test_count_rg.c
index db3cb15..97512a8 100644
--- a/samtools/test/split/test_count_rg.c
+++ b/samtools/test/split/test_count_rg.c
@@ -63,8 +63,7 @@ int main(int argc, char**argv)
// Setup stderr redirect
- size_t len = 0;
- char* res = NULL;
+ kstring_t res = { 0, 0, NULL };
FILE* orig_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save stderr
char* tempfname = (optind < argc)? argv[optind] : "test_count_rg.tmp";
FILE* check = NULL;
@@ -95,8 +94,8 @@ int main(int argc, char**argv)
// check result
check = fopen(tempfname, "r");
if (result_1 && count == 1 && !strcmp(output[0], "fish")
- && (getline(&res, &len, check) == -1)
- && (feof(check) || (res && !strcmp("",res)))) {
+ && kgetline(&res, (kgets_func *)fgets, check) < 0
+ && (feof(check) || res.l == 0)) {
++success;
} else {
++failure;
@@ -114,7 +113,7 @@ int main(int argc, char**argv)
if (verbose) printf("END test 1\n");
// Cleanup
- free(res);
+ free(res.s);
remove(tempfname);
if (failure > 0)
fprintf(orig_stderr, "%d failures %d successes\n", failure, success);
diff --git a/samtools/test/split/test_count_rg.c.pysam.c b/samtools/test/split/test_count_rg.c.pysam.c
index cf56408..eda8abb 100644
--- a/samtools/test/split/test_count_rg.c.pysam.c
+++ b/samtools/test/split/test_count_rg.c.pysam.c
@@ -65,8 +65,7 @@ int main(int argc, char**argv)
// Setup pysamerr redirect
- size_t len = 0;
- char* res = NULL;
+ kstring_t res = { 0, 0, NULL };
FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr
char* tempfname = (optind < argc)? argv[optind] : "test_count_rg.tmp";
FILE* check = NULL;
@@ -97,8 +96,8 @@ int main(int argc, char**argv)
// check result
check = fopen(tempfname, "r");
if (result_1 && count == 1 && !strcmp(output[0], "fish")
- && (getline(&res, &len, check) == -1)
- && (feof(check) || (res && !strcmp("",res)))) {
+ && kgetline(&res, (kgets_func *)fgets, check) < 0
+ && (feof(check) || res.l == 0)) {
++success;
} else {
++failure;
@@ -116,7 +115,7 @@ int main(int argc, char**argv)
if (verbose) printf("END test 1\n");
// Cleanup
- free(res);
+ free(res.s);
remove(tempfname);
if (failure > 0)
fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success);
diff --git a/samtools/test/split/test_expand_format_string.c b/samtools/test/split/test_expand_format_string.c
index 000d303..ede7586 100644
--- a/samtools/test/split/test_expand_format_string.c
+++ b/samtools/test/split/test_expand_format_string.c
@@ -63,8 +63,7 @@ int main(int argc, char**argv)
// Setup stderr redirect
- size_t len = 0;
- char* res = NULL;
+ kstring_t res = { 0, 0, NULL };
FILE* orig_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save stderr
char* tempfname = (optind < argc)? argv[optind] : "test_expand_format_string.tmp";
FILE* check = NULL;
@@ -85,7 +84,7 @@ int main(int argc, char**argv)
// test
xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe
- char* output_1 = expand_format_string(format_string_1, basename_1, rg_id_1, rg_idx_1);
+ char* output_1 = expand_format_string(format_string_1, basename_1, rg_id_1, rg_idx_1, NULL);
fclose(stderr);
if (verbose) printf("END RUN test 1\n");
@@ -97,11 +96,11 @@ int main(int argc, char**argv)
}
// check result
- len = 0;
+ res.l = 0;
check = fopen(tempfname, "r");
if (output_1 != NULL && !strcmp(output_1, "basename_4.bam")
- && (getline(&res, &len, check) == -1)
- && (feof(check) || (res && !strcmp("",res)))) {
+ && kgetline(&res, (kgets_func *)fgets, check) < 0
+ && (feof(check) || res.l == 0)) {
++success;
} else {
++failure;
@@ -114,7 +113,7 @@ int main(int argc, char**argv)
if (verbose) printf("END test 1\n");
// Cleanup test harness
- free(res);
+ free(res.s);
remove(tempfname);
if (failure > 0)
fprintf(orig_stderr, "%d failures %d successes\n", failure, success);
diff --git a/samtools/test/split/test_expand_format_string.c.pysam.c b/samtools/test/split/test_expand_format_string.c.pysam.c
index d666af0..94e7732 100644
--- a/samtools/test/split/test_expand_format_string.c.pysam.c
+++ b/samtools/test/split/test_expand_format_string.c.pysam.c
@@ -65,8 +65,7 @@ int main(int argc, char**argv)
// Setup pysamerr redirect
- size_t len = 0;
- char* res = NULL;
+ kstring_t res = { 0, 0, NULL };
FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr
char* tempfname = (optind < argc)? argv[optind] : "test_expand_format_string.tmp";
FILE* check = NULL;
@@ -87,7 +86,7 @@ int main(int argc, char**argv)
// test
xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe
- char* output_1 = expand_format_string(format_string_1, basename_1, rg_id_1, rg_idx_1);
+ char* output_1 = expand_format_string(format_string_1, basename_1, rg_id_1, rg_idx_1, NULL);
fclose(pysamerr);
if (verbose) printf("END RUN test 1\n");
@@ -99,11 +98,11 @@ int main(int argc, char**argv)
}
// check result
- len = 0;
+ res.l = 0;
check = fopen(tempfname, "r");
if (output_1 != NULL && !strcmp(output_1, "basename_4.bam")
- && (getline(&res, &len, check) == -1)
- && (feof(check) || (res && !strcmp("",res)))) {
+ && kgetline(&res, (kgets_func *)fgets, check) < 0
+ && (feof(check) || res.l == 0)) {
++success;
} else {
++failure;
@@ -116,7 +115,7 @@ int main(int argc, char**argv)
if (verbose) printf("END test 1\n");
// Cleanup test harness
- free(res);
+ free(res.s);
remove(tempfname);
if (failure > 0)
fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success);
diff --git a/samtools/test/split/test_filter_header_rg.c b/samtools/test/split/test_filter_header_rg.c
index ab92016..f4e1266 100644
--- a/samtools/test/split/test_filter_header_rg.c
+++ b/samtools/test/split/test_filter_header_rg.c
@@ -96,8 +96,7 @@ int main(int argc, char**argv)
// Setup stderr redirect
- size_t len = 0;
- char* res = NULL;
+ kstring_t res = { 0, 0, NULL };
FILE* orig_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save stderr
char* tempfname = (optind < argc)? argv[optind] : "test_count_rg.tmp";
FILE* check = NULL;
@@ -125,11 +124,12 @@ int main(int argc, char**argv)
}
// check result
+ res.l = 0;
check = fopen(tempfname, "r");
if ( result_1
&& check_test_1(hdr1)
- && (getline(&res, &len, check) == -1)
- && (feof(check) || (res && !strcmp("",res)))) {
+ && kgetline(&res, (kgets_func *)fgets, check) < 0
+ && (feof(check) || res.l == 0)) {
++success;
} else {
++failure;
@@ -163,11 +163,12 @@ int main(int argc, char**argv)
}
// check result
+ res.l = 0;
check = fopen(tempfname, "r");
if ( result_2
&& check_test_2(hdr2)
- && (getline(&res, &len, check) == -1)
- && (feof(check) || (res && !strcmp("",res)))) {
+ && kgetline(&res, (kgets_func *)fgets, check) < 0
+ && (feof(check) || res.l == 0)) {
++success;
} else {
++failure;
@@ -181,7 +182,7 @@ int main(int argc, char**argv)
// Cleanup
- free(res);
+ free(res.s);
remove(tempfname);
if (failure > 0)
fprintf(orig_stderr, "%d failures %d successes\n", failure, success);
diff --git a/samtools/test/split/test_filter_header_rg.c.pysam.c b/samtools/test/split/test_filter_header_rg.c.pysam.c
index 496d21a..4a5b6d5 100644
--- a/samtools/test/split/test_filter_header_rg.c.pysam.c
+++ b/samtools/test/split/test_filter_header_rg.c.pysam.c
@@ -98,8 +98,7 @@ int main(int argc, char**argv)
// Setup pysamerr redirect
- size_t len = 0;
- char* res = NULL;
+ kstring_t res = { 0, 0, NULL };
FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr
char* tempfname = (optind < argc)? argv[optind] : "test_count_rg.tmp";
FILE* check = NULL;
@@ -127,11 +126,12 @@ int main(int argc, char**argv)
}
// check result
+ res.l = 0;
check = fopen(tempfname, "r");
if ( result_1
&& check_test_1(hdr1)
- && (getline(&res, &len, check) == -1)
- && (feof(check) || (res && !strcmp("",res)))) {
+ && kgetline(&res, (kgets_func *)fgets, check) < 0
+ && (feof(check) || res.l == 0)) {
++success;
} else {
++failure;
@@ -165,11 +165,12 @@ int main(int argc, char**argv)
}
// check result
+ res.l = 0;
check = fopen(tempfname, "r");
if ( result_2
&& check_test_2(hdr2)
- && (getline(&res, &len, check) == -1)
- && (feof(check) || (res && !strcmp("",res)))) {
+ && kgetline(&res, (kgets_func *)fgets, check) < 0
+ && (feof(check) || res.l == 0)) {
++success;
} else {
++failure;
@@ -183,7 +184,7 @@ int main(int argc, char**argv)
// Cleanup
- free(res);
+ free(res.s);
remove(tempfname);
if (failure > 0)
fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success);
diff --git a/samtools/test/split/test_parse_args.c b/samtools/test/split/test_parse_args.c
index 5b8818e..66c7c88 100644
--- a/samtools/test/split/test_parse_args.c
+++ b/samtools/test/split/test_parse_args.c
@@ -38,7 +38,7 @@ bool check_test_1(const parsed_opts_t* opts) {
if ( opts->merged_input_name != NULL
|| opts->unaccounted_header_name != NULL
|| opts->unaccounted_name != NULL
- || strcmp(opts->output_format_string,"%*_%#.bam")
+ || strcmp(opts->output_format_string,"%*_%#.%.")
|| opts->verbose == true )
return false;
return true;
@@ -57,7 +57,7 @@ bool check_test_2(const parsed_opts_t* opts) {
|| strcmp(opts->merged_input_name, "merged.bam")
|| opts->unaccounted_header_name != NULL
|| opts->unaccounted_name != NULL
- || strcmp(opts->output_format_string,"%*_%#.bam")
+ || strcmp(opts->output_format_string,"%*_%#.%.")
|| opts->verbose == true )
return false;
return true;
@@ -87,10 +87,8 @@ int main(int argc, char**argv)
}
// Setup stdout and stderr redirect
- size_t len_stdout = 0;
- char* res_stdout = NULL;
- size_t len_stderr = 0;
- char* res_stderr = NULL;
+ kstring_t res_stdout = { 0, 0, NULL };
+ kstring_t res_stderr = { 0, 0, NULL };
FILE* orig_stdout = fdopen(dup(STDOUT_FILENO), "a"); // Save stderr
FILE* orig_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save stderr
char* tempfname_stdout = (optind < argc)? argv[optind] : "test_parse_args.tmp.o";
@@ -124,14 +122,15 @@ int main(int argc, char**argv)
}
// check result
+ res_stdout.l = res_stderr.l = 0;
check_stdout = fopen(tempfname_stdout, "r");
check_stderr = fopen(tempfname_stderr, "r");
if ( !result_1
- && (getline(&res_stdout, &len_stdout, check_stdout) != -1)
+ && kgetline(&res_stdout, (kgets_func *)fgets, check_stdout) >= 0
&& !feof(check_stdout)
- && (res_stdout && strcmp("",res_stdout))
- && (getline(&res_stderr, &len_stderr, check_stderr) == -1)
- && (feof(check_stderr) || (res_stderr && !strcmp("",res_stderr)))) {
+ && res_stdout.l > 0
+ && kgetline(&res_stderr, (kgets_func *)fgets, check_stderr) < 0
+ && (feof(check_stderr) || res_stderr.l == 0)) {
++success;
} else {
++failure;
@@ -174,14 +173,15 @@ int main(int argc, char**argv)
}
// check result
+ res_stdout.l = res_stderr.l = 0;
check_stdout = fopen(tempfname_stdout, "r");
check_stderr = fopen(tempfname_stderr, "r");
if ( result_2
&& check_test_2(result_2)
- && (getline(&res_stdout, &len_stdout, check_stdout) == -1)
- && (feof(check_stdout) || (res_stdout && !strcmp("",res_stdout)))
- && (getline(&res_stderr, &len_stderr, check_stderr) == -1)
- && (feof(check_stderr) || (res_stderr && !strcmp("",res_stderr)))) {
+ && kgetline(&res_stdout, (kgets_func *)fgets, check_stdout) < 0
+ && (feof(check_stdout) || res_stdout.l == 0)
+ && kgetline(&res_stderr, (kgets_func *)fgets, check_stderr) < 0
+ && (feof(check_stderr) || res_stderr.l == 0)) {
++success;
} else {
++failure;
@@ -202,8 +202,8 @@ int main(int argc, char**argv)
// Cleanup
- free(res_stdout);
- free(res_stderr);
+ free(res_stdout.s);
+ free(res_stderr.s);
remove(tempfname_stdout);
remove(tempfname_stderr);
fclose(orig_stdout);
diff --git a/samtools/test/split/test_parse_args.c.pysam.c b/samtools/test/split/test_parse_args.c.pysam.c
index 807e764..608ec7c 100644
--- a/samtools/test/split/test_parse_args.c.pysam.c
+++ b/samtools/test/split/test_parse_args.c.pysam.c
@@ -40,7 +40,7 @@ bool check_test_1(const parsed_opts_t* opts) {
if ( opts->merged_input_name != NULL
|| opts->unaccounted_header_name != NULL
|| opts->unaccounted_name != NULL
- || strcmp(opts->output_format_string,"%*_%#.bam")
+ || strcmp(opts->output_format_string,"%*_%#.%.")
|| opts->verbose == true )
return false;
return true;
@@ -59,7 +59,7 @@ bool check_test_2(const parsed_opts_t* opts) {
|| strcmp(opts->merged_input_name, "merged.bam")
|| opts->unaccounted_header_name != NULL
|| opts->unaccounted_name != NULL
- || strcmp(opts->output_format_string,"%*_%#.bam")
+ || strcmp(opts->output_format_string,"%*_%#.%.")
|| opts->verbose == true )
return false;
return true;
@@ -89,10 +89,8 @@ int main(int argc, char**argv)
}
// Setup stdout and pysamerr redirect
- size_t len_stdout = 0;
- char* res_stdout = NULL;
- size_t len_pysamerr = 0;
- char* res_pysamerr = NULL;
+ kstring_t res_stdout = { 0, 0, NULL };
+ kstring_t res_pysamerr = { 0, 0, NULL };
FILE* orig_stdout = fdopen(dup(STDOUT_FILENO), "a"); // Save pysamerr
FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr
char* tempfname_stdout = (optind < argc)? argv[optind] : "test_parse_args.tmp.o";
@@ -126,14 +124,15 @@ int main(int argc, char**argv)
}
// check result
+ res_stdout.l = res_pysamerr.l = 0;
check_stdout = fopen(tempfname_stdout, "r");
check_pysamerr = fopen(tempfname_pysamerr, "r");
if ( !result_1
- && (getline(&res_stdout, &len_stdout, check_stdout) != -1)
+ && kgetline(&res_stdout, (kgets_func *)fgets, check_stdout) >= 0
&& !feof(check_stdout)
- && (res_stdout && strcmp("",res_stdout))
- && (getline(&res_pysamerr, &len_pysamerr, check_pysamerr) == -1)
- && (feof(check_pysamerr) || (res_pysamerr && !strcmp("",res_pysamerr)))) {
+ && res_stdout.l > 0
+ && kgetline(&res_pysamerr, (kgets_func *)fgets, check_pysamerr) < 0
+ && (feof(check_pysamerr) || res_pysamerr.l == 0)) {
++success;
} else {
++failure;
@@ -176,14 +175,15 @@ int main(int argc, char**argv)
}
// check result
+ res_stdout.l = res_pysamerr.l = 0;
check_stdout = fopen(tempfname_stdout, "r");
check_pysamerr = fopen(tempfname_pysamerr, "r");
if ( result_2
&& check_test_2(result_2)
- && (getline(&res_stdout, &len_stdout, check_stdout) == -1)
- && (feof(check_stdout) || (res_stdout && !strcmp("",res_stdout)))
- && (getline(&res_pysamerr, &len_pysamerr, check_pysamerr) == -1)
- && (feof(check_pysamerr) || (res_pysamerr && !strcmp("",res_pysamerr)))) {
+ && kgetline(&res_stdout, (kgets_func *)fgets, check_stdout) < 0
+ && (feof(check_stdout) || res_stdout.l == 0)
+ && kgetline(&res_pysamerr, (kgets_func *)fgets, check_pysamerr) < 0
+ && (feof(check_pysamerr) || res_pysamerr.l == 0)) {
++success;
} else {
++failure;
@@ -204,8 +204,8 @@ int main(int argc, char**argv)
// Cleanup
- free(res_stdout);
- free(res_pysamerr);
+ free(res_stdout.s);
+ free(res_pysamerr.s);
remove(tempfname_stdout);
remove(tempfname_pysamerr);
fclose(orig_stdout);
diff --git a/samtools/version.h b/samtools/version.h
index 64eb542..abe052c 100644
--- a/samtools/version.h
+++ b/samtools/version.h
@@ -1 +1 @@
-#define SAMTOOLS_VERSION "1.2"
+#define SAMTOOLS_VERSION "1.3"
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 652736c..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,6 +0,0 @@
-[bdist_rpm]
-doc_files = README doc/*.html ChangeLog
-vendor = TDB
-packager = TDB <email at email.com>
-distribution-name = Red Hat Linux
-requires = python
diff --git a/setup.py b/setup.py
index 8009437..7b59b69 100644
--- a/setup.py
+++ b/setup.py
@@ -1,16 +1,19 @@
#! /usr/bin/python
-'''The SAM/BAM/CRAM format is a way to store efficiently large numbers
-of alignments, such as those routinely are created by next-generation
-sequencing methods.
+'''pysam - a python module for reading, manipulating and writing
+genomic data sets.
+
+pysam is a lightweight wrapper of the htslib C-API and provides
+facilities to read and write SAM/BAM/VCF/BCF/BED/GFF/GTF/FASTA/FASTQ
+files as well as access to the command line functionality of the
+samtools and bcftools packages. The module supports compression and
+random access through indexing.
This module provides a low-level wrapper around the htslib C-API as
-using cython and a high-level API for convenient access to the data in
-SAM/BAM formatted files. Also included is an interface to the samtools
-command line utilities and the tabix C-API for reading compressed and
-indexed tabular data.
+using cython and a high-level API for convenient access to the data
+within standard genomic file formats.
-The current version wraps htslib-1.2.1 and samtools-1.2.
+The current version wraps htslib-1.3, samtools-1.3 and bcftools-1.3.
See:
http://www.htslib.org
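The rewritten docstring above presents pysam as both a file-format API and a wrapper around the samtools/bcftools command-line tools. A minimal usage sketch of those two faces (the BAM path is a placeholder modelled on the ex1.bam file used by the test suite further down):

    import pysam
    import pysam.samtools

    # high-level API: iterate over an indexed BAM file
    with pysam.AlignmentFile("pysam_data/ex1.bam", "rb") as bam:
        for read in bam.fetch("chr1", 100, 200):
            print(read.query_name, read.reference_start)

    # command-line functionality: samtools commands are exposed as
    # functions that return their captured output
    print(pysam.samtools.view("-H", "pysam_data/ex1.bam"))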
@@ -19,18 +22,59 @@ http://pysam.readthedocs.org/en/stable
'''
-import os
-import sys
+import collections
import glob
-import shutil
-import hashlib
-import re
-import fnmatch
+import os
import platform
+import re
+import subprocess
+import sys
+from contextlib import contextmanager
+from setuptools import Extension, setup
+
+IS_PYTHON3 = sys.version_info.major >= 3
+
+
+@contextmanager
+def changedir(path):
+ save_dir = os.getcwd()
+ os.chdir(path)
+ try:
+ yield
+ finally:
+ os.chdir(save_dir)
+
+
+def configure_library(library_dir, env_options=None, options=[]):
+
+ configure_script = os.path.join(library_dir, "configure")
+
+ if not os.path.exists(configure_script):
+ raise ValueError(
+ "configure script {} does not exist".format(configure_script))
+
+ def run_configure(option):
+ try:
+ retcode = subprocess.call(
+ " ".join(("./configure", option)),
+ shell=True)
+ if retcode != 0:
+ return False
+ else:
+ return True
+ except OSError as e:
+ return False
+
+ with changedir(library_dir):
+ if env_options is not None:
+ if run_configure(env_options):
+ return env_options
-name = "pysam"
+ for option in options:
+ if run_configure(option):
+ return option
+ return None
-IS_PYTHON3 = sys.version_info[0] >= 3
# How to link against HTSLIB
# separate: use included htslib and include in each extension
@@ -43,39 +87,75 @@ IS_PYTHON3 = sys.version_info[0] >= 3
# pysam.
# external: use shared libhts.so compiled outside of
# pysam
-HTSLIB_MODE = "separate"
-HTSLIB_LIBRARY_DIR = os.environ.get('HTSLIB_LIBRARY_DIR', None)
-HTSLIB_INCLUDE_DIR = os.environ.get('HTSLIB_INCLUDE_DIR', None)
+HTSLIB_MODE = "shared"
+HTSLIB_LIBRARY_DIR = os.environ.get("HTSLIB_LIBRARY_DIR", None)
+HTSLIB_INCLUDE_DIR = os.environ.get("HTSLIB_INCLUDE_DIR", None)
+HTSLIB_CONFIGURE_OPTIONS = os.environ.get("HTSLIB_CONFIGURE_OPTIONS", None)
+
+# Check if cython is available
+#
+# If cython is available, the pysam will be built using cython from
+# the .pyx files. If no cython is available, the C-files included in the
+# distribution will be used.
+try:
+ from cy_build import CyExtension as Extension, cy_build_ext as build_ext
+ source_pattern = "pysam/c%s.pyx"
+ cmdclass = {'build_ext': build_ext}
+ HTSLIB_MODE = "shared"
+except ImportError:
+ # no Cython available - use existing C code
+ cmdclass = {}
+ source_pattern = "pysam/c%s.c"
+ # Set mode to separate, as "shared" not fully tested yet.
+ HTSLIB_MODE = "separate"
# collect pysam version
sys.path.insert(0, "pysam")
import version
version = version.__version__
-# exclude sources that contains a main function
-samtools_exclude = ("bamtk.c",
- "razip.c",
- "bgzip.c",
- "main.c",
- "calDepth.c",
- "bam2bed.c",
- "wgsim.c",
- "md5fa.c",
- "maq2sam.c",
- "bamcheck.c",
- "chk_indel.c",
- "vcf-miniview.c",
- "htslib-1.2.1", # do not import twice
- "hfile_irods.c", # requires irods library
- )
-
-htslib_exclude = ('htslib/tabix.c',
- 'htslib/bgzip.c',
- 'htslib/htsfile.c',
- 'htslib/hfile_irods.c')
-
-# destination directories for import of samtools and tabix
-samtools_dest = os.path.abspath("samtools")
+# exclude sources that contain a main function
+EXCLUDE = {
+ "samtools": (
+ "razip.c", "bgzip.c", "main.c",
+ "calDepth.c", "bam2bed.c", "wgsim.c",
+ "md5fa.c", "md5sum-lite.c", "maq2sam.c",
+ "bamcheck.c", "chk_indel.c", "vcf-miniview.c",
+ "htslib-1.3", # do not import twice
+ "hfile_irods.c", # requires irods library
+ ),
+ "bcftools": (
+ "test", "plugins", "peakfit.c",
+ "peakfit.h",
+ # needs to be renamed, name conflict with samtools reheader
+ "reheader.c",
+ "polysomy.c"),
+ "htslib": (
+ 'htslib/tabix.c', 'htslib/bgzip.c',
+ 'htslib/htsfile.c', 'htslib/hfile_irods.c'),
+}
+
+print ("# pysam: htslib mode is {}".format(HTSLIB_MODE))
+
+htslib_configure_options = None
+
+if HTSLIB_MODE in ['shared', 'separate']:
+ htslib_configure_options = configure_library(
+ "htslib",
+ HTSLIB_CONFIGURE_OPTIONS,
+ ["--enable-libcurl"])
+
+ HTSLIB_SOURCE = "builtin"
+ print ("# pysam: htslib configure options: {}".format(
+ str(htslib_configure_options)))
+
+ if htslib_configure_options is None:
+ # create empty config.h file
+ with open("htslib/config.h", "w") as outf:
+ outf.write(
+ "/* empty config.h created by pysam */\n")
+ outf.write(
+ "/* conservative compilation options */")
if HTSLIB_LIBRARY_DIR:
# linking against a shared, externally installed htslib version, no
@@ -85,7 +165,9 @@ if HTSLIB_LIBRARY_DIR:
chtslib_sources = []
htslib_library_dirs = [HTSLIB_LIBRARY_DIR]
htslib_include_dirs = [HTSLIB_INCLUDE_DIR]
- htslib_libraries = ['hts']
+ internal_htslib_libraries = []
+ external_htslib_libraries = ['z', 'hts']
+
elif HTSLIB_MODE == 'separate':
# add to each pysam component a separately compiled
# htslib
@@ -93,12 +175,15 @@ elif HTSLIB_MODE == 'separate':
x for x in
glob.glob(os.path.join("htslib", "*.c")) +
glob.glob(os.path.join("htslib", "cram", "*.c"))
- if x not in htslib_exclude]
+ if x not in EXCLUDE["htslib"]]
shared_htslib_sources = htslib_sources
htslib_library_dirs = []
htslib_include_dirs = ['htslib']
- htslib_libraries = []
+ internal_htslib_libraries = []
+ external_htslib_libraries = ['z']
+
elif HTSLIB_MODE == 'shared':
+
# link each pysam component against the same
# htslib built from sources included in the pysam
# package.
@@ -107,180 +192,108 @@ elif HTSLIB_MODE == 'shared':
x for x in
glob.glob(os.path.join("htslib", "*.c")) +
glob.glob(os.path.join("htslib", "cram", "*.c"))
- if x not in htslib_exclude]
- htslib_library_dirs = ['pysam']
+ if x not in EXCLUDE["htslib"]]
+ htslib_library_dirs = ['pysam', "."]
htslib_include_dirs = ['htslib']
- htslib_libraries = ['chtslib']
-else:
- raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE)
+ external_htslib_libraries = ['z']
+ if IS_PYTHON3:
+ import sysconfig
+ if sys.version_info.minor >= 5:
+ internal_htslib_libraries = ["chtslib.{}".format(
+ sysconfig.get_config_var('SOABI'))]
+ else:
+ if sys.platform == "darwin":
+ # On OSX, python 3.3 and 3.4 Libs have no platform tags.
+ internal_htslib_libraries = ["chtslib"]
+ else:
+ internal_htslib_libraries = ["chtslib.{}{}".format(
+ sys.implementation.cache_tag,
+ sys.abiflags)]
+ else:
+ internal_htslib_libraries = ["chtslib"]
-def locate(pattern, root=os.curdir):
- '''Locate all files matching supplied filename pattern in and below
- supplied root directory.
- '''
- for path, dirs, files in os.walk(os.path.abspath(root)):
- for filename in fnmatch.filter(files, pattern):
- yield os.path.join(path, filename)
-
-
-def _update_pysam_files(cf, destdir):
- '''update pysam files applying redirection of ouput'''
- for filename in cf:
- if not filename:
- continue
- dest = filename + ".pysam.c"
- with open(filename) as infile:
- with open(dest, "w") as outfile:
- outfile.write('#include "pysam.h"\n\n')
- outfile.write(
- re.sub("stderr", "pysamerr", "".join(infile.readlines())))
- with open(os.path.join(destdir, "pysam.h"), "w")as outfile:
- outfile.write("""#ifndef PYSAM_H
-#define PYSAM_H
-#include "stdio.h"
-extern FILE * pysamerr;
-#endif
-""")
-
-#################################################################
-# Importing samtools and htslib
-#
-# For htslib, simply copy the whole release tar-ball
-# into the directory "htslib" and recreate the file version.h
-#
-# rm -rf htslib
-# mv download/htslib htslib
-# git checkout -- htslib/version.h
-# Edit the file htslib/version.h to set the right version number.
-#
-# For samtools, type:
-# rm -rf samtools
-# python setup.py import download/samtools
-#
-if len(sys.argv) >= 2 and sys.argv[1] == "import":
- if len(sys.argv) < 3:
- raise ValueError("missing PATH to samtools source directory")
-
- destdir = samtools_dest
- srcdir = sys.argv[2]
- exclude = samtools_exclude
-
- srcdir = os.path.abspath(srcdir)
- if not os.path.exists(srcdir):
- raise IOError(
- "source directory `%s` does not exist." % srcdir)
-
- cfiles = locate("*.c", srcdir)
- hfiles = locate("*.h", srcdir)
-
- # remove unwanted files and htslib subdirectory.
- cfiles = [x for x in cfiles if os.path.basename(x) not in exclude
- and not re.search("htslib-", x)]
-
- hfiles = [x for x in hfiles if os.path.basename(x) not in exclude
- and not re.search("htslib-", x)]
-
- ncopied = 0
-
- def _compareAndCopy(src, srcdir, destdir, exclude):
-
- d, f = os.path.split(src)
- common_prefix = os.path.commonprefix((d, srcdir))
- subdir = re.sub(common_prefix, "", d)[1:]
- targetdir = os.path.join(destdir, subdir)
- if not os.path.exists(targetdir):
- os.makedirs(targetdir)
- old_file = os.path.join(targetdir, f)
- if os.path.exists(old_file):
- md5_old = hashlib.md5(
- "".join(open(old_file, "r").readlines())).digest()
- md5_new = hashlib.md5(
- "".join(open(src, "r").readlines())).digest()
- if md5_old != md5_new:
- raise ValueError(
- "incompatible files for %s and %s" %
- (old_file, src))
-
- shutil.copy(src, targetdir)
- return old_file
-
- for src_file in hfiles:
- _compareAndCopy(src_file, srcdir, destdir, exclude)
- ncopied += 1
-
- cf = []
- for src_file in cfiles:
- cf.append(_compareAndCopy(src_file,
- srcdir,
- destdir,
- exclude))
- ncopied += 1
-
- sys.stdout.write(
- "installed latest source code from %s: "
- "%i files copied\n" % (srcdir, ncopied))
- # redirect stderr to pysamerr and replace bam.h with a stub.
- sys.stdout.write("applying stderr redirection\n")
-
- _update_pysam_files(cf, destdir)
-
- sys.exit(0)
-
-
-if len(sys.argv) >= 2 and sys.argv[1] == "refresh":
- sys.stdout.write("refreshing latest source code from .c to .pysam.c")
- # redirect stderr to pysamerr and replace bam.h with a stub.
- sys.stdout.write("applying stderr redirection")
- for destdir in ('samtools', ):
- pysamcfiles = locate("*.pysam.c", destdir)
- for f in pysamcfiles:
- os.remove(f)
- cfiles = locate("*.c", destdir)
- _update_pysam_files(cfiles, destdir)
-
- sys.exit(0)
-
-
-###################
-# populate headers
-# mkdir pysam/include pysam/include/win32
-# touch pysam/include/__init__.py pysam/include/win32/__init__.py
-# cp samtools/*.h pysam/*.h pysam/include
-# cp samtools/win32/*.h pysam/include/win32
+else:
+ raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE)
-from setuptools import Extension, setup
-#######################################################
-parts = ["samtools", "htslib", "tabix",
- "faidx", "samfile", "utils",
- "alignmentfile", "tabixproxies",
- "vcf", "bcf"]
+# build config.py
+with open(os.path.join("pysam", "config.py"), "w") as outf:
+ outf.write('HTSLIB = "{}"\n'.format(HTSLIB_SOURCE))
+ config_values = collections.defaultdict(int)
+
+ if HTSLIB_SOURCE == "builtin":
+ with open(os.path.join("htslib", "config.h")) as inf:
+ for line in inf:
+ if line.startswith("#define"):
+ key, value = re.match(
+ "#define (\S+)\s+(\S+)", line).groups()
+ config_values[key] = int(value)
+ for key in ["ENABLE_PLUGINS",
+ "HAVE_COMMONCRYPTO",
+ "HAVE_GMTIME_R",
+ "HAVE_HMAC",
+ "HAVE_IRODS",
+ "HAVE_LIBCURL",
+ "HAVE_MMAP"]:
+ outf.write("{} = {}\n".format(key, config_values[key]))
+
+
+if HTSLIB_SOURCE == "builtin":
+ EXCLUDE_HTSLIB = ["htslib/hfile_libcurl.c"]
+ if htslib_configure_options is None:
+ print ("# pysam: could not configure htslib, choosing "
+ "conservative defaults")
+ htslib_sources = [x for x in htslib_sources
+ if x not in EXCLUDE_HTSLIB]
+ shared_htslib_sources = [x for x in shared_htslib_sources
+ if x not in EXCLUDE_HTSLIB]
+ elif "--disable-libcurl" in htslib_configure_options:
+ print ("# pysam: libcurl has been disabled")
+ htslib_sources = [x for x in htslib_sources
+ if x not in EXCLUDE_HTSLIB]
+ shared_htslib_sources = [x for x in shared_htslib_sources
+ if x not in EXCLUDE_HTSLIB]
+ elif "--enable-libcurl" in htslib_configure_options:
+ print ("# pysam: libcurl of builtin htslib has been enabled, "
+ "adding shared libcurl and libcrypto")
+ external_htslib_libraries.extend(["curl", "crypto"])
+
+parts = ["samtools",
+ "bcftools",
+ "htslib",
+ "tabix",
+ "faidx",
+ "samfile",
+ "utils",
+ "alignmentfile",
+ "tabixproxies",
+ "vcf",
+ "bcf"]
+
+# remove existing files to recompute
+# necessary to remain compatible with both python 2.7 and 3.3
+if IS_PYTHON3:
+ for part in parts:
+ try:
+ os.unlink("pysam/c%s.c" % part)
+ except:
+ pass
+
+# Exit if there are no pre-compiled files and no cython available
+fn = source_pattern % "htslib"
+if not os.path.exists(fn):
+ raise ValueError(
+ "no cython installed, but can not find {}."
+ "Make sure that cython is installed when building "
+ "from the repository"
+ .format(fn))
-try:
- from Cython.Distutils import build_ext
-except ImportError:
- # no Cython available - use existing C code
- cmdclass = {}
- source_pattern = "pysam/c%s.c"
-else:
- # remove existing files to recompute
- # necessary to be both compatible for python 2.7 and 3.3
- if IS_PYTHON3:
- for part in parts:
- try:
- os.unlink("pysam/c%s.c" % part)
- except:
- pass
- source_pattern = "pysam/c%s.pyx"
- cmdclass = {'build_ext': build_ext}
#######################################################
classifiers = """
-Development Status :: 2 - Alpha
+Development Status :: 3 - Beta
Operating System :: MacOS :: MacOS X
-Operating System :: Microsoft :: Windows :: Windows NT/2000
-Operating System :: OS Independent
Operating System :: POSIX
Operating System :: POSIX :: Linux
Operating System :: Unix
@@ -290,35 +303,27 @@ Topic :: Scientific/Engineering :: Bioinformatics
"""
#######################################################
-# Windows compatibility
+
+#######################################################
+# Windows compatibility - untested
if platform.system() == 'Windows':
include_os = ['win32']
os_c_files = ['win32/getopt.c']
+ extra_compile_args = []
else:
include_os = []
os_c_files = []
-
-#######################################################
-extra_compile_args = ["-Wno-error=declaration-after-statement",
- "-DSAMTOOLS=1"]
-define_macros = [('_FILE_OFFSET_BITS', '64'),
- ('_USE_KNETFILE', '')]
-
-csamtools = Extension(
- "pysam.csamtools",
- [source_pattern % "samtools",
- "pysam/pysam_util.c"] +
- glob.glob(os.path.join("samtools", "*.pysam.c")) +
- glob.glob(os.path.join("samtools", "*", "*.pysam.c")) +
- os_c_files +
- htslib_sources,
- library_dirs=htslib_library_dirs,
- include_dirs=["samtools", "pysam"] + include_os + htslib_include_dirs,
- libraries=["z"] + htslib_libraries,
- language="c",
- extra_compile_args=extra_compile_args,
- define_macros=define_macros
-)
+ # for python 3.4, see for example
+ # http://stackoverflow.com/questions/25587039/
+ # error-compiling-rpy2-on-python3-4-due-to-werror-
+ # declaration-after-statement
+ extra_compile_args = [
+ "-Wno-unused",
+ "-Wno-strict-prototypes",
+ "-Wno-sign-compare",
+ "-Wno-error=declaration-after-statement"]
+
+define_macros = []
chtslib = Extension(
"pysam.libchtslib",
@@ -327,8 +332,9 @@ chtslib = Extension(
shared_htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
- include_dirs=["pysam"] + include_os + htslib_include_dirs,
- libraries=["z"] + htslib_libraries,
+ runtime_library_dirs=htslib_library_dirs,
+ include_dirs=["pysam", "."] + include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries,
language="c",
extra_compile_args=extra_compile_args,
define_macros=define_macros
@@ -347,8 +353,8 @@ csamfile = Extension(
htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
- include_dirs=["pysam", "samtools"] + include_os + htslib_include_dirs,
- libraries=["z"] + htslib_libraries,
+ include_dirs=["pysam", "samtools", "."] + include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
extra_compile_args=extra_compile_args,
define_macros=define_macros
@@ -368,7 +374,7 @@ calignmentfile = Extension(
os_c_files,
library_dirs=htslib_library_dirs,
include_dirs=["pysam", "samtools"] + include_os + htslib_include_dirs,
- libraries=["z"] + htslib_libraries,
+ libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
extra_compile_args=extra_compile_args,
define_macros=define_macros
@@ -387,8 +393,8 @@ calignedsegment = Extension(
htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
- include_dirs=["pysam", "samtools"] + include_os + htslib_include_dirs,
- libraries=["z"] + htslib_libraries,
+ include_dirs=["pysam", "samtools", "."] + include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
extra_compile_args=extra_compile_args,
define_macros=define_macros
@@ -401,8 +407,8 @@ ctabix = Extension(
htslib_sources +
os_c_files,
library_dirs=["pysam"] + htslib_library_dirs,
- include_dirs=["pysam"] + include_os + htslib_include_dirs,
- libraries=["z"] + htslib_libraries,
+ include_dirs=["pysam", "."] + include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
extra_compile_args=extra_compile_args,
define_macros=define_macros
@@ -410,12 +416,17 @@ ctabix = Extension(
cutils = Extension(
"pysam.cutils",
- [source_pattern % "utils"] +
+ [source_pattern % "utils", "pysam/pysam_util.c"] +
+ glob.glob(os.path.join("samtools", "*.pysam.c")) +
+ # glob.glob(os.path.join("samtools", "*", "*.pysam.c")) +
+ glob.glob(os.path.join("bcftools", "*.pysam.c")) +
+ # glob.glob(os.path.join("bcftools", "*", "*.pysam.c")) +
htslib_sources +
os_c_files,
library_dirs=["pysam"] + htslib_library_dirs,
- include_dirs=["pysam"] + include_os + htslib_include_dirs,
- libraries=["z"] + htslib_libraries,
+ include_dirs=["samtools", "bcftools", "pysam", "."] +
+ include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
extra_compile_args=extra_compile_args,
define_macros=define_macros
@@ -427,8 +438,8 @@ cfaidx = Extension(
htslib_sources +
os_c_files,
library_dirs=["pysam"] + htslib_library_dirs,
- include_dirs=["pysam"] + include_os + htslib_include_dirs,
- libraries=["z"] + htslib_libraries,
+ include_dirs=["pysam", "."] + include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
extra_compile_args=extra_compile_args,
define_macros=define_macros
@@ -436,11 +447,11 @@ cfaidx = Extension(
ctabixproxies = Extension(
"pysam.ctabixproxies",
- [source_pattern % "tabixproxies"] +
+ [source_pattern % "tabixproxies"] +
os_c_files,
library_dirs=[],
include_dirs=include_os,
- libraries=["z"],
+ libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
extra_compile_args=extra_compile_args,
define_macros=define_macros
@@ -448,11 +459,11 @@ ctabixproxies = Extension(
cvcf = Extension(
"pysam.cvcf",
- [source_pattern % "vcf"] +
+ [source_pattern % "vcf"] +
os_c_files,
library_dirs=[],
- include_dirs=["htslib"] + include_os + htslib_include_dirs,
- libraries=["z"],
+ include_dirs=["htslib", "."] + include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
extra_compile_args=extra_compile_args,
define_macros=define_macros
@@ -460,19 +471,19 @@ cvcf = Extension(
cbcf = Extension(
"pysam.cbcf",
- [source_pattern % "bcf"] +
+ [source_pattern % "bcf"] +
htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
- include_dirs=["htslib"] + include_os + htslib_include_dirs,
- libraries=["z"] + htslib_libraries,
+ include_dirs=["htslib", "."] + include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
extra_compile_args=extra_compile_args,
define_macros=define_macros
)
metadata = {
- 'name': name,
+ 'name': "pysam",
'version': version,
'description': "pysam",
'long_description': __doc__,
@@ -486,11 +497,10 @@ metadata = {
'pysam.include.htslib',
'pysam.include.htslib.htslib',
'pysam.include.samtools',
- # 'pysam.include.samtools.bcftools',
+ 'pysam.include.bcftools',
'pysam.include.samtools.win32'],
'requires': ['cython (>=0.21)'],
- 'ext_modules': [csamtools,
- chtslib,
+ 'ext_modules': [chtslib,
csamfile,
calignmentfile,
calignedsegment,
@@ -503,7 +513,8 @@ metadata = {
'cmdclass': cmdclass,
'package_dir': {'pysam': 'pysam',
'pysam.include.htslib': 'htslib',
- 'pysam.include.samtools': 'samtools'},
+ 'pysam.include.samtools': 'samtools',
+ 'pysam.include.bcftools': 'bcftools'},
'package_data': {'': ['*.pxd', '*.h'], },
# do not pack in order to permit linking to csamtools.so
'zip_safe': False,
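Throughout the rewritten setup.py, the htslib linking mode and configure options are taken from environment variables (HTSLIB_LIBRARY_DIR, HTSLIB_INCLUDE_DIR, HTSLIB_CONFIGURE_OPTIONS). A sketch of driving the external-linking branch from a build script, with placeholder install paths:

    import os
    import subprocess
    import sys

    env = dict(os.environ)
    # placeholder paths pointing at an already-installed htslib
    env["HTSLIB_LIBRARY_DIR"] = "/opt/htslib/lib"
    env["HTSLIB_INCLUDE_DIR"] = "/opt/htslib/include"
    subprocess.check_call(
        [sys.executable, "setup.py", "build_ext", "--inplace"],
        env=env)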
diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py
index a42a6cb..5995faa 100644
--- a/tests/AlignedSegment_test.py
+++ b/tests/AlignedSegment_test.py
@@ -1,9 +1,12 @@
import os
import pysam
import unittest
-from TestUtils import checkFieldEqual
+import collections
import copy
+from TestUtils import checkFieldEqual
+
+
SAMTOOLS = "samtools"
WORKDIR = "pysam_test_work"
DATADIR = "pysam_data"
@@ -344,6 +347,26 @@ class TestAlignedSegment(ReadTest):
self.assertEqual(a.query_alignment_length, 20)
+class TestAlignedPairs(unittest.TestCase):
+ filename = os.path.join(DATADIR, "example_aligned_pairs.bam")
+
+ def testReferenceBases(self):
+ """reference bases should always be the same nucleotide
+ """
+ reference_bases = collections.defaultdict(list)
+ with pysam.AlignmentFile(self.filename) as inf:
+ for c in inf.pileup():
+ for r in c.pileups:
+ for read, ref, base in r.alignment.get_aligned_pairs(
+ with_seq=True):
+ if ref is None:
+ continue
+ reference_bases[ref].append(base.upper())
+
+ for x, y in reference_bases.items():
+ self.assertEqual(len(set(y)), 1)
+
+
class TestTags(ReadTest):
def testMissingTag(self):
@@ -463,7 +486,7 @@ class TestTags(ReadTest):
after = entry.get_tags()
self.assertEqual(after, before)
- def testMDTag(self):
+ def testMDTagMatchOnly(self):
a = self.buildRead()
# Substitutions only
@@ -488,23 +511,31 @@ class TestTags(ReadTest):
"ctgAAAAAcgt",
a.get_reference_sequence())
- # insertions are silent
+ def testMDTagInsertions(self):
+ a = self.buildRead()
+
+ # insertions are silent in the reference sequence
a.cigarstring = "5M1I5M"
a.query_sequence = "A" * 5 + "C" + "A" * 5
- a.set_tag('MD', "11")
+ a.set_tag('MD', "10")
self.assertEqual(
- a.query_sequence,
- a.get_reference_sequence())
+ a.get_reference_sequence(),
+ "A" * 10)
a.cigarstring = "1I10M"
+ a.query_sequence = "C" * 1 + "A" * 10
self.assertEqual(
- a.query_sequence,
- a.get_reference_sequence())
+ a.get_reference_sequence(),
+ "A" * 10)
a.cigarstring = "10M1I"
+ a.query_sequence = "A" * 10 + "C" * 1
self.assertEqual(
- a.query_sequence,
- a.get_reference_sequence())
+ a.get_reference_sequence(),
+ "A" * 10)
+
+ def testMDTagDeletions(self):
+ a = self.buildRead()
a.cigarstring = "5M1D5M"
a.query_sequence = "A" * 10
@@ -513,21 +544,24 @@ class TestTags(ReadTest):
"A" * 5 + "C" + "A" * 5,
a.get_reference_sequence())
- a.cigarstring = "5M1D5M"
+ a.cigarstring = "5M3D5M"
a.query_sequence = "A" * 10
a.set_tag('MD', "5^CCC5")
self.assertEqual(
"A" * 5 + "C" * 3 + "A" * 5,
a.get_reference_sequence())
+ def testMDTagSoftClipping(self):
+ a = self.buildRead()
+
# softclipping
a.cigarstring = "5S5M1D5M5S"
a.query_sequence = "G" * 5 + "A" * 10 + "G" * 5
- a.set_tag('MD', "10")
+ a.set_tag('MD', "5^C5")
self.assertEqual(
- "A" * 10,
+ "A" * 5 + "C" + "A" * 5,
a.get_reference_sequence())
-
+
# all together
a.cigarstring = "5S5M1D5M1I5M5S"
a.query_sequence = "G" * 5 + "A" * 16 + "G" * 5
@@ -536,13 +570,39 @@ class TestTags(ReadTest):
"AAcAATAAAAAAAAAA",
a.get_reference_sequence())
- # all together
- a.cigarstring = "5S5M1D2I5M5S"
+ def testMDTagComplex(self):
+ a = self.buildRead()
+
+ a.cigarstring = "5S5M1I2D5M5S"
a.query_sequence = "G" * 5 + "A" * 11 + "G" * 5
a.set_tag('MD', "2C2^TC5")
self.assertEqual(
"AAcAATCAAAAA",
a.get_reference_sequence())
+
+ a.cigarstring = "5S5M2D1I5M5S"
+ a.query_sequence = "G" * 5 + "A" * 11 + "G" * 5
+ a.set_tag('MD', "2C2^TC5")
+ self.assertEqual(
+ "AAcAATCAAAAA",
+ a.get_reference_sequence())
+
+ # insertion in reference overlapping deletion in reference
+ # read: AACCCCA---AAA
+ # ref: AA----AGGGAAA
+ a.cigarstring = "2M4I1M3D3M"
+ a.set_tag("MD", "3^GGG3")
+ a.query_sequence = "AACCCCAAAA"
+ self.assertEqual(
+ "AAAGGGAAA",
+ a.get_reference_sequence())
+
+ a.cigarstring = "5M2D2I2M"
+ a.set_tag("MD", "4C^TT2")
+ a.query_sequence = "A" * 9
+ self.assertEqual(
+ "AAAAcTTAA",
+ a.get_reference_sequence())
class TestCopy(ReadTest):
@@ -576,7 +636,7 @@ class TestAsString(unittest.TestCase):
def testAsString(self):
with open(os.path.join(DATADIR, "ex2.sam")) as samf:
- reference = [x for x in samf if not x.startswith("@")]
+ reference = [x[:-1] for x in samf if not x.startswith("@")]
with pysam.AlignmentFile(
os.path.join(DATADIR, "ex2.bam"), "r") as pysamf:
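The reorganised MD-tag tests above all follow one recipe: build a read, set query_sequence, cigarstring and the MD tag, then compare get_reference_sequence() with the expected reference. A condensed sketch of that recipe (it mirrors the tests' buildRead() helper and is not part of the suite):

    import pysam

    a = pysam.AlignedSegment()
    a.query_name = "read_12345"        # placeholder name
    a.query_sequence = "A" * 10
    a.cigarstring = "5M1D5M"           # one reference base is missing from the read
    a.set_tag("MD", "5^C5")            # the missing reference base is a C
    print(a.get_reference_sequence())  # as asserted in the tests: AAAAACAAAAA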
diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py
index 30fed5b..c03e234 100644
--- a/tests/AlignmentFile_test.py
+++ b/tests/AlignmentFile_test.py
@@ -5,7 +5,6 @@ Execute in the :file:`tests` directory as it requires the Makefile
and data files located there.
'''
-import pysam
import unittest
import os
import shutil
@@ -13,14 +12,20 @@ import sys
import collections
import subprocess
import logging
-from functools import partial
-from TestUtils import checkBinaryEqual, checkURL, checkSamtoolsViewEqual, checkFieldEqual
import array
+if sys.version_info.major >= 3:
+ from io import StringIO
+else:
+ from StringIO import StringIO
+
+from functools import partial
+
+import pysam
+import pysam.samtools
+from TestUtils import checkBinaryEqual, checkURL, \
+ checkSamtoolsViewEqual, checkFieldEqual, force_str
-IS_PYTHON3 = sys.version_info[0] >= 3
-SAMTOOLS = "samtools"
-WORKDIR = "pysam_test_work"
DATADIR = "pysam_data"
@@ -166,12 +171,15 @@ class BasicTestBAMFromFetch(unittest.TestCase):
self.reads[3].query_sequence, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"))
def testARqual(self):
- self.assertEqual(pysam.qualities_to_qualitystring(self.reads[0].query_qualities), "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<",
- "quality string mismatch in read 1: %s != %s" % (pysam.qualities_to_qualitystring(self.reads[0].query_qualities), "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"))
- self.assertEqual(pysam.qualities_to_qualitystring(self.reads[1].query_qualities), "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<", "quality string mismatch in read 2: %s != %s" % (
- pysam.qualities_to_qualitystring(self.reads[1].query_qualities), "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<"))
- self.assertEqual(pysam.qualities_to_qualitystring(self.reads[3].query_qualities), "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<",
- "quality string mismatch in read 3: %s != %s" % (pysam.qualities_to_qualitystring(self.reads[3].query_qualities), "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"))
+ self.assertEqual(
+ pysam.qualities_to_qualitystring(self.reads[0].query_qualities),
+ "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
+ self.assertEqual(
+ pysam.qualities_to_qualitystring(self.reads[1].query_qualities),
+ "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<")
+ self.assertEqual(
+ pysam.qualities_to_qualitystring(self.reads[3].query_qualities),
+ "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
def testARquery(self):
self.assertEqual(
@@ -361,11 +369,27 @@ class BasicTestCRAMFromFile(BasicTestCRAMFromFetch):
def setUp(self):
f = open(os.path.join(DATADIR, "ex3.cram"))
- self.samfile = pysam.AlignmentFile(
- f, "rc")
+ self.samfile = pysam.AlignmentFile(f, "rc")
self.reads = [r for r in self.samfile]
+class BasicTestSAMFromStringIO(BasicTestBAMFromFetch):
+
+ def testRaises(self):
+ statement = "samtools view -h {}".format(
+ os.path.join(DATADIR, "ex3.bam"))
+ stdout = subprocess.check_output(statement.split(" "))
+ bam = StringIO()
+ if sys.version_info.major >= 3:
+ bam.write(stdout.decode('ascii'))
+ else:
+ bam.write(stdout)
+ bam.seek(0)
+ self.assertRaises(NotImplementedError,
+ pysam.AlignmentFile, bam)
+ # self.reads = [r for r in samfile]
+
+
##################################################
#
# Test of basic File I/O
@@ -405,6 +429,17 @@ class TestIO(unittest.TestCase):
infile = pysam.AlignmentFile(
os.path.join(DATADIR, input_filename),
input_mode)
+
+ if "b" in input_mode:
+ self.assertTrue(infile.is_bam)
+ self.assertFalse(infile.is_cram)
+ elif "c" in input_mode:
+ self.assertFalse(infile.is_bam)
+ self.assertTrue(infile.is_cram)
+ else:
+ self.assertFalse(infile.is_cram)
+ self.assertFalse(infile.is_bam)
+
if use_template:
outfile = pysam.AlignmentFile(
output_filename,
@@ -686,17 +721,6 @@ class TestIO(unittest.TestCase):
# write on closed file
self.assertEqual(0, samfile.write(None))
- def testAutoDetection(self):
- '''test if autodetection works.'''
-
- # TODO
- # samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex3.sam"))
- # self.assertRaises(ValueError, samfile.fetch, 'chr1')
- # samfile.close()
-
- samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex3.bam"))
- samfile.fetch('chr1')
- samfile.close()
# TOOD
# def testReadingFromSamFileWithoutHeader(self):
@@ -768,6 +792,62 @@ class TestIO(unittest.TestCase):
self.assertEqual(samfile.unmapped, 0)
self.assertEqual(samfile.nocoordinate, 0)
+ def testEmptyWithHeaderBAM(self):
+ self.assertRaises(
+ ValueError,
+ pysam.Samfile,
+ os.path.join(DATADIR, "example_empty_with_header.bam"),
+ "rb")
+
+ samfile = pysam.Samfile(
+ os.path.join(DATADIR, "example_empty_with_header.bam"),
+ "rb",
+ check_sq=False)
+ self.assertEqual(samfile.mapped, 0)
+ self.assertEqual(samfile.unmapped, 0)
+ self.assertEqual(samfile.nocoordinate, 0)
+ self.assertEqual([], list(samfile.fetch()))
+
+ def testOpenFromFilename(self):
+
+ samfile = pysam.AlignmentFile(
+ filename=os.path.join(DATADIR, "ex1.bam"),
+ mode="rb")
+ self.assertEqual(len(list(samfile.fetch())), 3270)
+
+
+class TestAutoDetect(unittest.TestCase):
+
+ def testSAM(self):
+ """test SAM autodetection."""
+
+ with pysam.AlignmentFile(
+ os.path.join(DATADIR, "ex3.sam")) as inf:
+ self.assertFalse(inf.is_bam)
+ self.assertFalse(inf.is_cram)
+
+ self.assertRaises(ValueError, inf.fetch, 'chr1')
+
+ def testBAM(self):
+ """test BAM autodetection."""
+
+ with pysam.AlignmentFile(
+ os.path.join(DATADIR, "ex3.bam")) as inf:
+ self.assertTrue(inf.is_bam)
+ self.assertFalse(inf.is_cram)
+ self.assertEqual(len(list(inf.fetch('chr1'))), 1)
+ self.assertEqual(len(list(inf.fetch('chr2'))), 3)
+
+ def testCRAM(self):
+ """test CRAM autodetection."""
+
+ with pysam.AlignmentFile(
+ os.path.join(DATADIR, "ex3.cram")) as inf:
+ self.assertFalse(inf.is_bam)
+ self.assertTrue(inf.is_cram)
+ self.assertEqual(len(list(inf.fetch('chr1'))), 1)
+ self.assertEqual(len(list(inf.fetch('chr2'))), 3)
+
##################################################
#
@@ -787,9 +867,11 @@ class TestIteratorRowBAM(unittest.TestCase):
def checkRange(self, rnge):
'''compare results from iterator with those from samtools.'''
ps = list(self.samfile.fetch(region=rnge))
- sa = list(pysam.view(self.filename,
- rnge,
- raw=True))
+ sa = force_str(
+ pysam.samtools.view(
+ self.filename,
+ rnge,
+ raw=True)).splitlines(True)
self.assertEqual(
len(ps), len(sa),
"unequal number of results for range %s: %i != %i" %
@@ -846,15 +928,15 @@ class TestIteratorRowAllBAM(unittest.TestCase):
def testIterate(self):
'''compare results from iterator with those from samtools.'''
ps = list(self.samfile.fetch())
- sa = list(pysam.view(self.filename,
- raw=True))
+ sa = pysam.samtools.view(self.filename,
+ raw=True).splitlines()
self.assertEqual(
len(ps), len(sa),
"unequal number of results: %i != %i" %
(len(ps), len(sa)))
# check if the same reads are returned
for line, pair in enumerate(list(zip(ps, sa))):
- data = pair[1].split("\t")
+ data = force_str(pair[1]).split("\t")
self.assertEqual(
pair[0].query_name,
data[0],
@@ -1271,7 +1353,7 @@ class TestHeaderCRAM(TestHeaderSAM):
self.assertTrue(ak in b, "key '%s' not in '%s' " % (ak, b))
_strip(b[ak])
- self.assertEqual(sorted(av), sorted(b[ak]))
+ self.assertEqual(av, b[ak])
class TestHeaderFromRefs(unittest.TestCase):
@@ -1482,7 +1564,9 @@ class TestWrongFormat(unittest.TestCase):
def testOpenBamAsSam(self):
# test fails, needs to be implemented.
# sam.fetch() fails on reading, not on opening
- # self.assertRaises( ValueError, pysam.AlignmentFile, 'ex1.bam', 'r' )
+ #self.assertRaises(ValueError, pysam.AlignmentFile,
+ # os.path.join(DATADIR, 'ex1.bam'),
+ # 'r')
pass
def testOpenFastaAsSam(self):
@@ -1597,8 +1681,10 @@ class TestDeNovoConstruction(unittest.TestCase):
outfile.write(x)
outfile.close()
- self.assertTrue(checkBinaryEqual(tmpfilename, self.bamfile),
- "mismatch when construction BAM file, see %s %s" % (tmpfilename, self.bamfile))
+ self.assertTrue(
+ checkBinaryEqual(tmpfilename, self.bamfile),
+ "mismatch when construction BAM file, see %s %s" %
+ (tmpfilename, self.bamfile))
os.unlink(tmpfilename)
@@ -1807,7 +1893,7 @@ class TestRemoteFileFTP(unittest.TestCase):
if not checkURL(self.url):
return
- result = pysam.view(self.url, self.region)
+ result = pysam.samtools.view(self.url, self.region)
self.assertEqual(len(result), 36)
def testFTPFetch(self):
@@ -1833,8 +1919,8 @@ class TestRemoteFileHTTP(unittest.TestCase):
samfile_local = pysam.AlignmentFile(self.local, "rb")
ref = list(samfile_local.fetch(region=self.region))
- result = pysam.view(self.url, self.region)
- self.assertEqual(len(result), len(ref))
+ result = pysam.samtools.view(self.url, self.region)
+ self.assertEqual(len(result.splitlines()), len(ref))
def testFetch(self):
if not checkURL(self.url):
@@ -1926,25 +2012,31 @@ class TestPileup(unittest.TestCase):
def checkEqual(self, references, iterator):
for x, column in enumerate(iterator):
+ v = references[x][:-1].split("\t")
+ self.assertEqual(
+ len(v), 6,
+ "expected 6 values, got {}".format(v))
(contig, pos, reference_base,
read_bases, read_qualities, alignment_mapping_qualities) \
- = references[x][:-1].split("\t")
+ = v
self.assertEqual(int(pos) - 1, column.reference_pos)
def testSamtoolsStepper(self):
- refs = pysam.mpileup(
- "-f", self.fastafilename,
- self.samfilename)
+ refs = force_str(
+ pysam.samtools.mpileup(
+ "-f", self.fastafilename,
+ self.samfilename)).splitlines(True)
iterator = self.samfile.pileup(
stepper="samtools",
fastafile=self.fastafile)
self.checkEqual(refs, iterator)
def testAllStepper(self):
- refs = pysam.mpileup(
- "-f", self.fastafilename,
- "-A", "-B",
- self.samfilename)
+ refs = force_str(
+ pysam.samtools.mpileup(
+ "-f", self.fastafilename,
+ "-A", "-B",
+ self.samfilename)).splitlines(True)
iterator = self.samfile.pileup(
stepper="all",
@@ -1976,7 +2068,7 @@ class TestCountCoverage(unittest.TestCase):
read.flag = read.flag | 0x400
samfile.write(read)
samfile.close()
- pysam.index("test_count_coverage_read_all.bam")
+ pysam.samtools.index("test_count_coverage_read_all.bam")
def count_coverage_python(self, bam, chrom, start, stop,
read_callback,
@@ -2109,7 +2201,7 @@ class TestCountCoverage(unittest.TestCase):
read.flag = read.flag | 0x400
samfile.write(read)
samfile.close()
- pysam.index("test_count_coverage_nofilter.bam")
+ pysam.samtools.index("test_count_coverage_nofilter.bam")
samfile = pysam.AlignmentFile("test_count_coverage_nofilter.bam")
chr = 'chr1'
start = 0
@@ -2130,6 +2222,26 @@ class TestCountCoverage(unittest.TestCase):
self.assertEqual(fast_counts[3], manual_counts[3])
+class TestPileupQueryPosition(unittest.TestCase):
+
+ filename = "test_query_position.bam"
+
+ def testPileup(self):
+ last = {}
+ with pysam.AlignmentFile(os.path.join(DATADIR, self.filename)) as inf:
+ for col in inf.pileup():
+ for r in col.pileups:
+ # print r.alignment.query_name
+ # print r.query_position, r.query_position_or_next, r.is_del
+ if r.is_del:
+ self.assertEqual(r.query_position, None)
+ self.assertEqual(r.query_position_or_next,
+ last[r.alignment.query_name] + 1)
+ else:
+ self.assertNotEqual(r.query_position, None)
+ last[r.alignment.query_name] = r.query_position
+
+
class TestLogging(unittest.TestCase):
'''test around bug issue 42,
@@ -2258,26 +2370,71 @@ class TestAlignmentFileUtilityFunctions(unittest.TestCase):
self.assertEqual(samfile.nocoordinate, 0)
+class TestMappedUnmapped(unittest.TestCase):
+ filename = "test_mapped_unmapped.bam"
+
+ def testMapped(self):
+
+ with pysam.AlignmentFile(os.path.join(DATADIR,
+ self.filename)) as inf:
+ unmapped_flag = 0
+ unmapped_nopos = 0
+ mapped_flag = 0
+ for x in inf.fetch(until_eof=True):
+ if x.is_unmapped:
+ if x.reference_id < 0:
+ unmapped_nopos += 1
+ else:
+ unmapped_flag += 1
+ else:
+ mapped_flag += 1
+
+ self.assertEqual(inf.mapped, mapped_flag)
+ self.assertEqual(inf.unmapped, unmapped_flag + unmapped_nopos)
+
+ inf.reset()
+ self.assertEqual(inf.count(),
+ inf.mapped + unmapped_flag)
+
+ inf.reset()
+ self.assertEqual(inf.count(until_eof=True),
+ inf.mapped + unmapped_flag + unmapped_nopos)
+
+ inf.reset()
+ self.assertEqual(inf.count(read_callback="all"),
+ inf.mapped)
+
+ inf.reset()
+ self.assertEqual(inf.count(until_eof=True, read_callback="all"),
+ inf.mapped)
+
+
+
class TestSamtoolsProxy(unittest.TestCase):
'''tests for sanity checking access to samtools functions.'''
def testIndex(self):
- self.assertRaises(IOError, pysam.index, "missing_file")
+ self.assertRaises(IOError, pysam.samtools.index, "missing_file")
def testView(self):
# note that view still echos "open: No such file or directory"
- self.assertRaises(pysam.SamtoolsError, pysam.view, "missing_file")
+ self.assertRaises(pysam.SamtoolsError,
+ pysam.samtools.view,
+ "missing_file")
def testSort(self):
- self.assertRaises(pysam.SamtoolsError, pysam.sort, "missing_file")
+ self.assertRaises(pysam.SamtoolsError,
+ pysam.samtools.sort,
+ "missing_file")
class TestAlignmentFileIndex(unittest.TestCase):
def testIndex(self):
- samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
+ samfile = pysam.AlignmentFile(
+ os.path.join(DATADIR, "ex1.bam"),
+ "rb")
index = pysam.IndexedReads(samfile)
index.build()
reads = collections.defaultdict(int)
@@ -2292,6 +2449,31 @@ class TestAlignmentFileIndex(unittest.TestCase):
self.assertEqual(x.query_name, qname)
+class TestExplicitIndex(unittest.TestCase):
+
+ def testExplicitIndexBAM(self):
+ samfile = pysam.AlignmentFile(
+ os.path.join(DATADIR, "explicit_index.bam"),
+ "rb",
+ filepath_index=os.path.join(DATADIR, 'ex1.bam.bai'))
+
+ samfile.fetch("chr1")
+
+ def testExplicitIndexCRAM(self):
+ samfile = pysam.AlignmentFile(
+ os.path.join(DATADIR, "explicit_index.cram"),
+ "rc",
+ filepath_index=os.path.join(DATADIR, 'ex1.cram.crai'))
+
+ def testRemoteExplicitIndexBAM(self):
+ samfile = pysam.AlignmentFile(
+ "http://genserv.anat.ox.ac.uk/downloads/pysam/test/noindex.bam",
+ "rb",
+ filepath_index=os.path.join(DATADIR, 'ex1.bam.bai'))
+
+ samfile.fetch("chr1")
+
+
class TestVerbosity(unittest.TestCase):
'''test if setting/getting of verbosity works.'''
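Several hunks above replace pysam.view(...) with pysam.samtools.view(...) and split the returned text explicitly: the samtools commands now live under pysam.samtools and return their captured output as one block rather than a list of lines. A sketch of the new calling pattern as the tests use it (force_str comes from tests/TestUtils.py):

    import pysam.samtools
    from TestUtils import force_str

    out = pysam.samtools.view("-h", "pysam_data/ex1.bam")   # path as used by the tests
    sam_lines = force_str(out).splitlines(True)             # one entry per header/alignment line
    print(len(sam_lines))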
diff --git a/tests/SamFile_test.py b/tests/SamFile_test.py
index 79b11a5..1fc88f3 100644
--- a/tests/SamFile_test.py
+++ b/tests/SamFile_test.py
@@ -6,6 +6,7 @@ and data files located there.
'''
import pysam
+import pysam.samtools
import unittest
import os
import shutil
@@ -14,12 +15,8 @@ import collections
import subprocess
import logging
import array
-from TestUtils import checkBinaryEqual, checkURL
+from TestUtils import checkBinaryEqual, checkURL, force_str
-IS_PYTHON3 = sys.version_info[0] >= 3
-
-SAMTOOLS = "samtools"
-WORKDIR = "pysam_test_work"
DATADIR = "pysam_data"
@@ -755,23 +752,34 @@ class TestIteratorRow(unittest.TestCase):
def checkRange(self, rnge):
'''compare results from iterator with those from samtools.'''
ps = list(self.samfile.fetch(region=rnge))
- sa = list(pysam.view(os.path.join(DATADIR, "ex1.bam"),
- rnge,
- raw=True))
- self.assertEqual(len(ps), len(
- sa), "unequal number of results for range %s: %i != %i" % (rnge, len(ps), len(sa)))
+ sa = force_str(
+ pysam.samtools.view(
+ os.path.join(DATADIR, "ex1.bam"),
+ rnge,
+ raw=True)).splitlines(True)
+ self.assertEqual(
+ len(ps), len(sa),
+ "unequal number of results for range %s: %i != %i" %
+ (rnge, len(ps), len(sa)))
# check if the same reads are returned and in the same order
for line, (a, b) in enumerate(list(zip(ps, sa))):
d = b.split("\t")
self.assertEqual(
- a.qname, d[0], "line %i: read id mismatch: %s != %s" % (line, a.rname, d[0]))
- self.assertEqual(a.pos, int(d[3]) - 1, "line %i: read position mismatch: %s != %s, \n%s\n%s\n" %
- (line, a.pos, int(d[3]) - 1,
- str(a), str(d)))
+ a.qname, d[0],
+ "line %i: read id mismatch: %s != %s" %
+ (line, a.rname, d[0]))
+ self.assertEqual(
+ a.pos, int(d[3]) - 1,
+ "line %i: read position mismatch: %s != %s, "
+ "\n%s\n%s\n" %
+ (line, a.pos, int(d[3]) - 1,
+ str(a), str(d)))
qual = d[10]
- self.assertEqual(a.qual, qual, "line %i: quality mismatch: %s != %s, \n%s\n%s\n" %
- (line, a.qual, qual,
- str(a), str(d)))
+ self.assertEqual(
+ a.qual, qual,
+ "line %i: quality mismatch: %s != %s, \n%s\n%s\n" %
+ (line, a.qual, qual,
+ str(a), str(d)))
def testIteratePerContig(self):
'''check random access per contig'''
@@ -798,8 +806,11 @@ class TestIteratorRowAll(unittest.TestCase):
def testIterate(self):
'''compare results from iterator with those from samtools.'''
ps = list(self.samfile.fetch())
- sa = list(pysam.view(os.path.join(DATADIR, "ex1.bam"),
- raw=True))
+ sa = force_str(
+ pysam.samtools.view(
+ os.path.join(DATADIR, "ex1.bam"),
+ raw=True)).splitlines(True)
+
self.assertEqual(
len(ps), len(sa), "unequal number of results: %i != %i" % (len(ps), len(sa)))
# check if the same reads are returned
@@ -853,7 +864,8 @@ class TestIteratorColumn(unittest.TestCase):
def testIterateRanges(self):
'''check random access per range'''
- for contig, length in zip(self.samfile.references, self.samfile.lengths):
+ for contig, length in zip(
+ self.samfile.references, self.samfile.lengths):
for start in range(1, length, 90):
# this includes empty ranges
self.checkRange(contig, start, start + 90)
@@ -1661,7 +1673,7 @@ class TestRemoteFileFTP(unittest.TestCase):
if not checkURL(self.url):
return
- result = pysam.view(self.url, self.region)
+ result = pysam.samtools.view(self.url, self.region)
self.assertEqual(len(result), 36)
def testFTPFetch(self):
@@ -1687,7 +1699,8 @@ class TestRemoteFileHTTP(unittest.TestCase):
samfile_local = pysam.Samfile(self.local, "rb")
ref = list(samfile_local.fetch(region=self.region))
- result = pysam.view(self.url, self.region)
+ result = pysam.samtools.view(
+ self.url, self.region).splitlines(True)
self.assertEqual(len(result), len(ref))
def testFetch(self):
@@ -1786,20 +1799,22 @@ class TestPileup(unittest.TestCase):
self.assertEqual(int(pos) - 1, column.pos)
def testSamtoolsStepper(self):
- refs = pysam.mpileup(
- "-f", self.fastafilename,
- self.samfilename)
+ refs = force_str(
+ pysam.samtools.mpileup(
+ "-f", self.fastafilename,
+ self.samfilename)).splitlines(True)
iterator = self.samfile.pileup(
stepper="samtools",
fastafile=self.fastafile)
self.checkEqual(refs, iterator)
def testAllStepper(self):
- refs = pysam.mpileup(
- "-f", self.fastafilename,
- "-A", "-B",
- self.samfilename)
-
+ refs = force_str(
+ pysam.samtools.mpileup(
+ "-f", self.fastafilename,
+ "-A", "-B",
+ self.samfilename)).splitlines(True)
+
iterator = self.samfile.pileup(
stepper="all",
fastafile=self.fastafile)
@@ -1937,14 +1952,14 @@ class TestSamtoolsProxy(unittest.TestCase):
'''tests for sanity checking access to samtools functions.'''
def testIndex(self):
- self.assertRaises(IOError, pysam.index, "missing_file")
+ self.assertRaises(IOError, pysam.samtools.index, "missing_file")
def testView(self):
# note that view still echos "open: No such file or directory"
- self.assertRaises(pysam.SamtoolsError, pysam.view, "missing_file")
+ self.assertRaises(pysam.SamtoolsError, pysam.samtools.view, "missing_file")
def testSort(self):
- self.assertRaises(pysam.SamtoolsError, pysam.sort, "missing_file")
+ self.assertRaises(pysam.SamtoolsError, pysam.samtools.sort, "missing_file")
class TestSamfileIndex(unittest.TestCase):
diff --git a/tests/TestUtils.py b/tests/TestUtils.py
index 5cc048a..efb2333 100644
--- a/tests/TestUtils.py
+++ b/tests/TestUtils.py
@@ -2,7 +2,9 @@ import sys
import os
import pysam
import difflib
-
+import gzip
+import inspect
+import tempfile
IS_PYTHON3 = sys.version_info[0] >= 3
@@ -14,6 +16,21 @@ else:
from urllib2 import urlopen
+if IS_PYTHON3:
+ def force_str(s):
+ return s.decode('ascii')
+else:
+ def force_str(s):
+ return s
+
+
+def openfile(fn):
+ if fn.endswith(".gz"):
+ return gzip.open(fn)
+ else:
+ return open(fn)
+
+
def checkBinaryEqual(filename1, filename2):
'''return true if the two files are binary equal.
'''
@@ -53,8 +70,8 @@ def checkSamtoolsViewEqual(filename1, filename2,
if not without_header:
args.append("-h")
- lines1 = pysam.view(*(args + [filename1]))
- lines2 = pysam.view(*(args + [filename2]))
+ lines1 = pysam.samtools.view(*(args + [filename1]))
+ lines2 = pysam.samtools.view(*(args + [filename2]))
if len(lines1) != len(lines2):
return False
@@ -120,3 +137,35 @@ def checkFieldEqual(cls, read1, read2, exclude=[]):
cls.assertEqual(getattr(read1, n), getattr(read2, n),
"attribute mismatch for %s: %s != %s" %
(n, getattr(read1, n), getattr(read2, n)))
+
+
+def check_lines_equal(cls, a, b, sort=False, filter_f=None):
+ """check if contents of two files are equal comparing line-wise.
+
+ sort: bool
+ sort contents of both files before comparing.
+ filter_f:
+ remove lines in both a and b where the expression is True
+ """
+
+ aa = openfile(a).readlines()
+ bb = openfile(b).readlines()
+
+ if filter_f is not None:
+ aa = [x for x in aa if not filter_f]
+ bb = [x for x in bb if not filter_f]
+ if sort:
+ cls.assertEqual(sorted(aa), sorted(bb))
+ else:
+ cls.assertEqual(aa, bb)
+
+
+def get_temp_filename(suffix=""):
+ caller_name = inspect.getouterframes(inspect.currentframe(), 2)[1][3]
+ f = tempfile.NamedTemporaryFile(
+ prefix="tmp_{}_".format(caller_name),
+ suffix=suffix,
+ delete=False,
+ dir=".")
+ f.close()
+ return f.name
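As a quick orientation for the helpers added above, here is a minimal usage sketch (illustrative only, not part of the patch; it assumes it is run from the tests/ directory so that TestUtils.py is importable):

    import unittest
    from TestUtils import get_temp_filename, check_lines_equal

    class ExampleUsage(unittest.TestCase):
        def test_roundtrip(self):
            # create a uniquely named temporary file in the current directory
            fn = get_temp_filename(suffix=".txt")
            with open(fn, "w") as outf:
                outf.write("##comment\ndata\n")
            # compare the file with itself, ignoring comment lines;
            # filter_f removes lines for which it returns True
            check_lines_equal(self, fn, fn, sort=True,
                              filter_f=lambda x: x.startswith("##"))

    if __name__ == "__main__":
        unittest.main()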
diff --git a/tests/VariantFile_test.py b/tests/VariantFile_test.py
new file mode 100644
index 0000000..a7e54ac
--- /dev/null
+++ b/tests/VariantFile_test.py
@@ -0,0 +1,418 @@
+import os
+import unittest
+import pysam
+import gzip
+from TestUtils import get_temp_filename, check_lines_equal
+
+DATADIR="cbcf_data"
+from tabix_test import loadAndConvert
+
+
+def read_header(filename):
+
+ data = []
+ if filename.endswith(".gz"):
+ for line in gzip.open(filename):
+ line = line.decode("ascii")
+ if line.startswith("#"):
+ data.append(line)
+ else:
+ with open(filename) as f:
+ for line in f:
+ if line.startswith("#"):
+ data.append(line)
+ return data
+
+
+class TestMissingGenotypes(unittest.TestCase):
+
+ filename = "missing_genotypes.vcf"
+
+ def setUp(self):
+ self.compare = loadAndConvert(
+ os.path.join(DATADIR, self.filename),
+ encode=False)
+
+ def check(self, filename):
+ """see issue 203 - check for segmentation fault"""
+ fn = os.path.join(DATADIR, filename)
+ self.assertEqual(True, os.path.exists(fn))
+ v = pysam.VariantFile(fn)
+ for site in v:
+ for ss,rec in site.samples.items():
+ a, b = ss, rec
+
+ v = pysam.VariantFile(fn)
+ for x, site in enumerate(v):
+ for ss,rec in site.samples.items():
+ a, b = ss, rec.alleles
+ a, b = ss, rec.allele_indices
+
+ def testVCF(self):
+ self.check(self.filename)
+
+ def testVCFGZ(self):
+ self.check(self.filename + ".gz")
+
+
+class TestOpening(unittest.TestCase):
+
+ def testMissingFile(self):
+ self.assertRaises(IOError, pysam.VariantFile,
+ "missing_file.vcf")
+
+ def testMissingFileVCFGZ(self):
+ self.assertRaises(IOError, pysam.VariantFile,
+ "missing_file.vcf.gz")
+
+ def testEmptyFileVCF(self):
+ with open("tmp_testEmptyFile.vcf", "w"):
+ pass
+
+ self.assertRaises(ValueError, pysam.VariantFile,
+ "tmp_testEmptyFile.vcf")
+
+ os.unlink("tmp_testEmptyFile.vcf")
+
+ def testEmptyFileVCFGZWithIndex(self):
+ with open("tmp_testEmptyFile.vcf", "w"):
+ pass
+
+ pysam.tabix_index("tmp_testEmptyFile.vcf",
+ preset="vcf",
+ force=True)
+
+ self.assertRaises(ValueError, pysam.VariantFile,
+ "tmp_testEmptyFile.vcf.gz")
+
+ os.unlink("tmp_testEmptyFile.vcf.gz")
+ os.unlink("tmp_testEmptyFile.vcf.gz.tbi")
+
+ def testEmptyFileVCFGZWithoutIndex(self):
+ with open("tmp_testEmptyFileWithoutIndex.vcf", "w"):
+ pass
+
+ pysam.tabix_compress("tmp_testEmptyFileWithoutIndex.vcf",
+ "tmp_testEmptyFileWithoutIndex.vcf.gz",
+ force=True)
+
+ self.assertRaises(ValueError, pysam.VariantFile,
+ "tmp_testEmptyFileWithoutIndex.vcf.gz")
+
+ os.unlink("tmp_testEmptyFileWithoutIndex.vcf")
+ os.unlink("tmp_testEmptyFileWithoutIndex.vcf.gz")
+
+ def testEmptyFileVCFOnlyHeader(self):
+ with pysam.VariantFile(os.path.join(
+ DATADIR,
+ "example_vcf42_only_header.vcf")) as inf:
+ self.assertEqual(len(list(inf.fetch())), 0)
+
+ def testEmptyFileVCFGZOnlyHeader(self):
+ with pysam.VariantFile(os.path.join(
+ DATADIR,
+ "example_vcf42_only_header.vcf")) as inf:
+ self.assertEqual(len(list(inf.fetch())), 0)
+
+ def testDetectVCF(self):
+ with pysam.VariantFile(os.path.join(DATADIR,
+ "example_vcf40.vcf")) as inf:
+ self.assertEqual(inf.category, 'VARIANTS')
+ self.assertEqual(inf.format, 'VCF')
+ self.assertEqual(inf.compression, 'NONE')
+ self.assertFalse(inf.is_remote)
+ self.assertFalse(inf.is_stream)
+ self.assertEqual(len(list(inf.fetch())), 5)
+
+ def testDetectVCFGZ(self):
+ with pysam.VariantFile(os.path.join(DATADIR,
+ "example_vcf40.vcf.gz")) as inf:
+ self.assertEqual(inf.category, 'VARIANTS')
+ self.assertEqual(inf.format, 'VCF')
+ self.assertEqual(inf.compression, 'BGZF')
+ self.assertFalse(inf.is_remote)
+ self.assertFalse(inf.is_stream)
+ self.assertEqual(len(list(inf.fetch())), 5)
+
+ def testDetectBCF(self):
+ with pysam.VariantFile(os.path.join(DATADIR,
+ "example_vcf40.bcf")) as inf:
+ self.assertEqual(inf.category, 'VARIANTS')
+ self.assertEqual(inf.format, 'BCF')
+ self.assertEqual(inf.compression, 'BGZF')
+ self.assertFalse(inf.is_remote)
+ self.assertFalse(inf.is_stream)
+ self.assertEqual(len(list(inf.fetch())), 5)
+
+
+class TestHeader(unittest.TestCase):
+
+ filename = "example_vcf40.vcf"
+
+ def testStr(self):
+
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+
+ ref = read_header(fn)
+ comp = str(v.header).splitlines(True)
+
+ self.assertEqual(sorted(ref),
+ sorted(comp))
+
+ def testIterator(self):
+
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+
+ ref = read_header(fn)
+ # remove last header line starting with #CHROM
+ ref.pop()
+ ref = sorted(ref)
+ comp = sorted([str(x) for x in v.header.records])
+
+ self.assertEqual(len(ref), len(comp))
+
+ for x, y in zip(ref, comp):
+ self.assertEqual(x[:-1], str(y))
+
+
+# These tests need to be separate and start from newly opened files. This
+# is because htslib's parser is lazy and the pysam API needs to trigger
+# appropriate parsing when accessing each type of data. Failure to do so
+# will result in crashes or the return of incorrect data. Thus this test
+# suite tests both the triggering of the lazy parser and the results of
+# the parser.
+class TestParsing(unittest.TestCase):
+
+ filename = "example_vcf40.vcf.gz"
+
+ def testChrom(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ chrom = [rec.chrom for rec in v]
+ self.assertEqual(chrom, ['M', '17', '20', '20', '20'])
+
+ def testPos(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ pos = [rec.pos for rec in v]
+ self.assertEqual(pos, [1230237, 14370, 17330, 1110696, 1234567])
+
+ def testStart(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ start = [rec.start for rec in v]
+ self.assertEqual(start, [1230236, 14369, 17329, 1110695, 1234566])
+
+ def testStop(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ stop = [rec.stop for rec in v]
+ self.assertEqual(stop, [1230237, 14370, 17330, 1110696, 1234570])
+
+ def testId(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ ids = [rec.id for rec in v]
+ self.assertEqual(ids, [None, 'rs6054257', None, 'rs6040355', 'microsat1'])
+
+ def testRef(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ ref = [rec.ref for rec in v]
+ self.assertEqual(ref, ['T', 'G', 'T', 'A', 'GTCT'])
+
+ def testAlt(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ alts = [rec.alts for rec in v]
+ self.assertEqual(alts, [None, ('A',), ('A',), ('G', 'T'), ('G', 'GTACT')])
+
+ def testAlleles(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ alleles = [rec.alleles for rec in v]
+ self.assertEqual(alleles, [('T',), ('G', 'A'), ('T', 'A'), ('A', 'G', 'T'), ('GTCT', 'G', 'GTACT')])
+
+ def testQual(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ qual = [rec.qual for rec in v]
+ self.assertEqual(qual, [47.0, 29.0, 3.0, 67.0, 50.0])
+
+ def testFilter(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ filter = [rec.filter.keys() for rec in v]
+ self.assertEqual(filter, [['PASS'], ['PASS'], ['q10'], ['PASS'], ['PASS']])
+
+ def testInfo(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ info = [rec.info.items() for rec in v]
+ self.assertEqual(info, [[('NS', 3), ('DP', 13), ('AA', 'T')],
+ [('NS', 3), ('DP', 14), ('AF', (0.5,)), ('DB', True), ('H2', True)],
+ [('NS', 3), ('DP', 11), ('AF', (0.017000000923871994,))],
+ [('NS', 2), ('DP', 10), ('AF', (0.3330000042915344, 0.6669999957084656)),
+ ('AA', 'T'), ('DB', True)],
+ [('NS', 3), ('DP', 9), ('AA', 'G')]])
+
+ def testFormat(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ format = [rec.format.keys() for rec in v]
+ self.assertEqual(format, [['GT', 'GQ', 'DP', 'HQ'],
+ ['GT', 'GQ', 'DP', 'HQ'],
+ ['GT', 'GQ', 'DP', 'HQ'],
+ ['GT', 'GQ', 'DP', 'HQ'],
+ ['GT', 'GQ', 'DP']])
+
+ def testSampleAlleles(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ alleles = [s.alleles for rec in v for s in rec.samples.values()]
+ self.assertEqual(alleles, [('T', 'T'), ('T', 'T'), ('T', 'T'),
+ ('G', 'G'), ('A', 'G'), ('A', 'A'),
+ ('T', 'T'), ('T', 'A'), ('T', 'T'),
+ ('G', 'T'), ('T', 'G'), ('T', 'T'),
+ ('GTCT', 'G'), ('GTCT', 'GTACT'),
+ ('G', 'G')])
+
+ def testSampleFormats(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ format = [s.items() for rec in v for s in rec.samples.values()]
+ self.assertEqual(format, [[('GT', (0, 0)), ('GQ', 54), ('DP', 7), ('HQ', (56, 60))],
+ [('GT', (0, 0)), ('GQ', 48), ('DP', 4), ('HQ', (51, 51))],
+ [('GT', (0, 0)), ('GQ', 61), ('DP', 2), ('HQ', (None,))],
+ [('GT', (0, 0)), ('GQ', 48), ('DP', 1), ('HQ', (51, 51))],
+ [('GT', (1, 0)), ('GQ', 48), ('DP', 8), ('HQ', (51, 51))],
+ [('GT', (1, 1)), ('GQ', 43), ('DP', 5), ('HQ', (None, None))],
+ [('GT', (0, 0)), ('GQ', 49), ('DP', 3), ('HQ', (58, 50))],
+ [('GT', (0, 1)), ('GQ', 3), ('DP', 5), ('HQ', (65, 3))],
+ [('GT', (0, 0)), ('GQ', 41), ('DP', 3), ('HQ', (None,))],
+ [('GT', (1, 2)), ('GQ', 21), ('DP', 6), ('HQ', (23, 27))],
+ [('GT', (2, 1)), ('GQ', 2), ('DP', 0), ('HQ', (18, 2))],
+ [('GT', (2, 2)), ('GQ', 35), ('DP', 4), ('HQ', (None,))],
+ [('GT', (0, 1)), ('GQ', 35), ('DP', 4)],
+ [('GT', (0, 2)), ('GQ', 17), ('DP', 2)],
+ [('GT', (1, 1)), ('GQ', 40), ('DP', 3)]])
+
+ def testSampleAlleleIndices(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(fn)
+ indices = [s.allele_indices for rec in v for s in rec.samples.values()]
+ self.assertEqual(indices, [(0, 0), (0, 0), (0, 0), (0, 0), (1, 0),
+ (1, 1), (0, 0), (0, 1), (0, 0), (1, 2),
+ (2, 1), (2, 2), (0, 1), (0, 2), (1, 1)])
+
+
+class TestIndexFilename(unittest.TestCase):
+
+ filenames = [('example_vcf40.vcf.gz', 'example_vcf40.vcf.gz.tbi'),
+ ('example_vcf40.vcf.gz', 'example_vcf40.vcf.gz.csi'),
+ ('example_vcf40.bcf', 'example_vcf40.bcf.csi')]
+
+ def testOpen(self):
+ for fn, idx_fn in self.filenames:
+ fn = os.path.join(DATADIR, fn)
+ idx_fn = os.path.join(DATADIR, idx_fn)
+
+ v = pysam.VariantFile(fn, index_filename=idx_fn)
+
+ self.assertEqual(len(list(v.fetch('20'))), 3)
+
+
+class TestConstructionVCFWithContigs(unittest.TestCase):
+ """construct VariantFile from scratch."""
+
+ filename = "example_vcf42_withcontigs.vcf"
+
+ def complete_check(self, fn_in, fn_out):
+
+ check_lines_equal(
+ self, fn_in, fn_out, sort=True,
+ filter_f=lambda x: x.startswith("##contig"))
+ os.unlink(fn_out)
+
+ def testConstructionWithRecords(self):
+
+ fn_in = os.path.join(DATADIR, self.filename)
+ fn_out = get_temp_filename(suffix=".vcf")
+ vcf_in = pysam.VariantFile(fn_in)
+
+ header = pysam.VariantHeader()
+
+ for record in vcf_in.header.records:
+ header.add_record(record)
+
+ fn = str("tmp_VariantFileTest_testConstructionWithRecords") + ".vcf"
+ vcf_out = pysam.VariantFile(fn, "w", header=header)
+ for record in vcf_in:
+ # currently segfaults here:
+ # vcf_out.write(record)
+ pass
+ return
+
+ vcf_out.close()
+ self.complete_check(fn_in, fn_out)
+
+ def testConstructionFromCopy(self):
+
+ fn_in = os.path.join(DATADIR, self.filename)
+ fn_out = get_temp_filename(suffix=".vcf")
+ vcf_in = pysam.VariantFile(fn_in)
+
+ vcf_out = pysam.VariantFile(fn_out, "w", header=vcf_in.header)
+ for record in vcf_in:
+ vcf_out.write(record)
+
+ vcf_out.close()
+
+ self.complete_check(fn_in, fn_out)
+
+ def testConstructionWithLines(self):
+
+ fn_in = os.path.join(DATADIR, self.filename)
+ fn_out = get_temp_filename(suffix=".vcf")
+ vcf_in = pysam.VariantFile(fn_in)
+
+ header = pysam.VariantHeader()
+ for sample in vcf_in.header.samples:
+ header.add_sample(sample)
+
+ for hr in vcf_in.header.records:
+ header.add_line(str(hr))
+
+ vcf_out = pysam.VariantFile(fn_out, "w", header=header)
+
+ for record in vcf_in:
+ vcf_out.write(record)
+
+ vcf_out.close()
+ vcf_in.close()
+
+ self.complete_check(fn_in, fn_out)
+
+# Currently segfaults for VCFs without contigs
+# class TestConstructionVCFWithoutContigs(TestConstructionVCFWithContigs):
+# """construct VariantFile from scratch."""
+# filename = "example_vcf40.vcf"
+
+
+class TestConstructionVCFGZWithContigs(TestConstructionVCFWithContigs):
+ """construct VariantFile from scratch."""
+
+ filename = "example_vcf42_withcontigs.vcf.gz"
+
+
+class TestConstructionVCFGZWithoutContigs(TestConstructionVCFWithContigs):
+ """construct VariantFile from scratch."""
+
+ filename = "example_vcf42.vcf.gz"
+
+
+
+if __name__ == "__main__":
+ unittest.main()
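To make the lazy-parsing caveat noted above TestParsing concrete, this is the access pattern those tests exercise, written out as a stand-alone sketch (illustrative only, not part of the patch; it assumes it is run from tests/ with the cbcf_data fixtures built):

    import os
    import pysam

    fn = os.path.join("cbcf_data", "example_vcf40.vcf.gz")

    # open a fresh VariantFile for each pass: htslib parses records lazily,
    # so the tests above deliberately avoid reusing a consumed iterator
    with pysam.VariantFile(fn) as vf:
        chroms = [rec.chrom for rec in vf]

    with pysam.VariantFile(fn) as vf:
        genotypes = [s.allele_indices
                     for rec in vf for s in rec.samples.values()]

    print(chroms)
    print(genotypes[:3])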
diff --git a/tests/cbcf_data/Makefile b/tests/cbcf_data/Makefile
new file mode 100644
index 0000000..796c3a6
--- /dev/null
+++ b/tests/cbcf_data/Makefile
@@ -0,0 +1,23 @@
+ALL_VCF=$(wildcard *.vcf)
+VCF=$(filter-out example_empty.vcf,$(ALL_VCF))
+
+VCFGZ=$(VCF:%.vcf=%.vcf.gz)
+BCF=$(VCF:%.vcf=%.bcf)
+
+all: $(VCFGZ) $(BCF)
+
+%.vcf.gz: %.vcf
+ bgzip < $< > $@
+ tabix -p vcf $@ # create tbi index
+ bcftools index $@ # create csi index
+
+%.bcf: %.vcf.gz
+ bcftools view -O b $< -o $@
+ bcftools index $@
+
+example_empty.bcf: example_empty.vcf.gz
+ touch $@
+
+clean:
+ rm -f *.gz *.tbi *.csi *.bcf
+
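The Makefile above assumes bgzip, tabix and bcftools are available on the PATH. For the compression and .tbi indexing steps there is a rough Python-side equivalent using pysam's own helpers, shown here as a sketch only (the BCF conversion and .csi indexing still need the bcftools binary):

    import pysam

    # roughly "bgzip < example_vcf40.vcf > example_vcf40.vcf.gz" followed by
    # "tabix -p vcf example_vcf40.vcf.gz"; note that, unlike the Makefile,
    # this replaces the plain-text .vcf with the bgzipped copy
    pysam.tabix_index("example_vcf40.vcf", preset="vcf", force=True)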
diff --git a/KNOWN_BUGS b/tests/cbcf_data/example_empty.vcf
similarity index 100%
rename from KNOWN_BUGS
rename to tests/cbcf_data/example_empty.vcf
diff --git a/tests/cbcf_data/example_vcf40.vcf b/tests/cbcf_data/example_vcf40.vcf
new file mode 100644
index 0000000..8dd9f20
--- /dev/null
+++ b/tests/cbcf_data/example_vcf40.vcf
@@ -0,0 +1,24 @@
+##fileformat=VCFv4.0
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=1000GenomesPilot-NCBI36
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##FILTER=<ID=PASS,Description="All filters passed">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
+M 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
+17 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
+20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
+20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
+20 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
diff --git a/tests/cbcf_data/example_vcf42.vcf b/tests/cbcf_data/example_vcf42.vcf
new file mode 100644
index 0000000..c6c7030
--- /dev/null
+++ b/tests/cbcf_data/example_vcf42.vcf
@@ -0,0 +1,24 @@
+##fileformat=VCFv4.2
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=1000GenomesPilot-NCBI36
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##FILTER=<ID=PASS,Description="All filters passed">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
+M 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
+17 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
+20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
+20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
+20 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
diff --git a/tests/cbcf_data/example_vcf42_only_header.vcf b/tests/cbcf_data/example_vcf42_only_header.vcf
new file mode 100644
index 0000000..ee0f8fa
--- /dev/null
+++ b/tests/cbcf_data/example_vcf42_only_header.vcf
@@ -0,0 +1,19 @@
+##fileformat=VCFv4.2
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=1000GenomesPilot-NCBI36
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##FILTER=<ID=PASS,Description="All filters passed">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
diff --git a/tests/cbcf_data/example_vcf42_withcontigs.vcf b/tests/cbcf_data/example_vcf42_withcontigs.vcf
new file mode 100644
index 0000000..1d298eb
--- /dev/null
+++ b/tests/cbcf_data/example_vcf42_withcontigs.vcf
@@ -0,0 +1,27 @@
+##fileformat=VCFv4.2
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=1000GenomesPilot-NCBI36
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##FILTER=<ID=PASS,Description="All filters passed">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+##contig=<ID=M>
+##contig=<ID=17>
+##contig=<ID=20>
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
+M 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2:.
+17 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
+20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.
+20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.
+20 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
diff --git a/tests/cbcf_data/missing_genotypes.vcf b/tests/cbcf_data/missing_genotypes.vcf
new file mode 100644
index 0000000..a6f0dbc
--- /dev/null
+++ b/tests/cbcf_data/missing_genotypes.vcf
@@ -0,0 +1,6 @@
+##fileformat=VCFv4.1
+##contig=<ID=chr1,length=100>
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2
+chr1 1 . A G 999 . . GT 0/0 1/1
+chr1 50 . T C 999 . . GT ./. 0/1
diff --git a/tests/faidx_test.py b/tests/faidx_test.py
index ee448c3..f3e6cc4 100644
--- a/tests/faidx_test.py
+++ b/tests/faidx_test.py
@@ -1,6 +1,9 @@
import pysam
import unittest
import os
+import gzip
+
+from TestUtils import checkURL
DATADIR = "pysam_data"
@@ -115,6 +118,17 @@ class TestFastxFileFastq(unittest.TestCase):
else:
self.checkLast(first)
+ def testManager(self):
+ with self.filetype(os.path.join(DATADIR, self.filename),
+ persist=self.persist) as inf:
+ first = inf.__next__()
+ self.checkFirst(first)
+ for last in inf:
+ pass
+ self.checkLast(last)
+
+ self.assertEqual(inf.closed, True)
+
# Test for backwards compatibility
class TestFastqFileFastq(TestFastxFileFastq):
@@ -130,5 +144,42 @@ class TestFastxFileFasta(TestFastxFileFastq):
class TestFastxFileFastqStream(TestFastxFileFastq):
persist = False
+
+class TestFastxFileWithEmptySequence(unittest.TestCase):
+ """see issue 204:
+
+ iteration over fastq file with empty sequence stops prematurely
+ """
+
+ filetype = pysam.FastxFile
+ filename = "faidx_empty_seq.fq.gz"
+
+ def testIteration(self):
+ fn = os.path.join(DATADIR, self.filename)
+
+ with gzip.open(fn) as inf:
+ ref_num = len(list(inf)) / 4
+
+ f = self.filetype(fn)
+ l = len(list(f))
+ self.assertEqual(ref_num, l)
+
+
+class TestRemoteFileFTP(unittest.TestCase):
+ '''test remote access.
+ '''
+
+ url = "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa"
+
+
+ def testFTPView(self):
+ if not checkURL(self.url):
+ return
+ f = pysam.Fastafile(self.url)
+ self.assertEqual(
+ len(f.fetch("chr1", 0, 1000)),
+ 1000)
+
+
if __name__ == "__main__":
unittest.main()
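As a side note on the FastxFile interface exercised above, this is the basic iteration pattern, including how records with an empty sequence (see issue 204) show up; a sketch only, assuming it is run from tests/ with the pysam_data fixtures built:

    import os
    import pysam

    fn = os.path.join("pysam_data", "faidx_empty_seq.fq.gz")

    with pysam.FastxFile(fn) as fq:
        for entry in fq:
            # records with an empty sequence are still yielded;
            # entry.sequence is then simply the empty string
            print(entry.name, len(entry.sequence), entry.quality)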
diff --git a/tests/pysam_data/Makefile b/tests/pysam_data/Makefile
index 6166fd2..aed77b5 100644
--- a/tests/pysam_data/Makefile
+++ b/tests/pysam_data/Makefile
@@ -14,8 +14,10 @@ all: ex1.pileup.gz \
$(CRAM) $(CRAI) \
example_bai.bam \
rg_with_tab.bam \
- ex2_truncated.bam ex2_truncated.bam.bai \
- empty.bam empty.bam.bai
+ ex2_truncated.bam \
+ empty.bam empty.bam.bai \
+ explicit_index.bam explicit_index.cram \
+ faidx_empty_seq.fq.gz
# ex2.sam - as ex1.sam, but with header
ex2.sam.gz: ex1.bam ex1.bam.bai
@@ -66,7 +68,17 @@ example_bai.bam: ex1.bam
samtools index $@
mv $@.bai example_bai.bai
+explicit_index.bam: ex1.bam
+ cp ex1.bam $@
+
+explicit_index.cram: ex1.cram
+ cp ex1.cram $@
+
clean:
- rm -fr *.bam *.bai *.fai *.pileup* \
- *~ calDepth *.dSYM pysam_*.sam \
- ex2.sam ex2.sam.gz ex1.sam
+ rm -fr *.bam *.bai *.fai *.pileup* *.cram \
+ *~ calDepth *.dSYM pysam_*.sam \
+ ex2.sam ex2.sam.gz ex1.sam \
+ *.fq.gz
+
+%.fq.gz: %.fq
+ gzip < $< > $@
diff --git a/tests/pysam_data/ex1.vcf.gz b/tests/pysam_data/ex1.vcf.gz
new file mode 100644
index 0000000..64a453d
Binary files /dev/null and b/tests/pysam_data/ex1.vcf.gz differ
diff --git a/tests/pysam_data/ex1.vcf.gz.tbi b/tests/pysam_data/ex1.vcf.gz.tbi
new file mode 100644
index 0000000..03179c3
Binary files /dev/null and b/tests/pysam_data/ex1.vcf.gz.tbi differ
diff --git a/tests/pysam_data/example_aligned_pairs.sam b/tests/pysam_data/example_aligned_pairs.sam
new file mode 100644
index 0000000..e6f9830
--- /dev/null
+++ b/tests/pysam_data/example_aligned_pairs.sam
@@ -0,0 +1,81 @@
+@HD VN:1.0 GO:none SO:coordinate
+@SQ SN:chrM LN:16571
+@SQ SN:chr1 LN:249250621
+@SQ SN:chr2 LN:243199373
+@SQ SN:chr3 LN:198022430
+@SQ SN:chr4 LN:191154276
+@SQ SN:chr5 LN:180915260
+@SQ SN:chr6 LN:171115067
+@SQ SN:chr7 LN:159138663
+@SQ SN:chr8 LN:146364022
+@SQ SN:chr9 LN:141213431
+@SQ SN:chr10 LN:135534747
+@SQ SN:chr11 LN:135006516
+@SQ SN:chr12 LN:133851895
+@SQ SN:chr13 LN:115169878
+@SQ SN:chr14 LN:107349540
+@SQ SN:chr15 LN:102531392
+@SQ SN:chr16 LN:90354753
+@SQ SN:chr17 LN:81195210
+@SQ SN:chr18 LN:78077248
+@SQ SN:chr19 LN:59128983
+@SQ SN:chr20 LN:63025520
+@SQ SN:chr21 LN:48129895
+@SQ SN:chr22 LN:51304566
+@SQ SN:chrX LN:155270560
+@SQ SN:chrY LN:59373566
+@RG ID:FC1_NA12892_03 PL:ILLUMINA SM:FC1_NA12892_03
+@PG ID:bwa PN:bwa VN:0.7.7-isis-1.0.0
+@PG ID:GATK IndelRealigner VN:1.6-23-gf0210b3 CL:knownAlleles=[] targetIntervals=/data/scratch/workspace/RunFolder/Data/Intensities/BaseCalls/Alignment/FC1-NA12892-03_S1_chrM.intervals LODThresholdForCleaning=5.0 consensusDeterminationModel=USE_READS entropyThreshold=0.15 maxReadsInMemory=150000 maxIsizeForMovement=3000 maxPositionalMoveAllowed=200 maxConsensuses=30 maxReadsForConsensuses=120 maxReadsForRealignment=20000 noOriginalAlignmentTags=false nWayOut=null generate_nWayOut_md5s=fa [...]
+D0004:230:H08B1ADXX:1:1204:18619:71946 163 chr22 16084109 27 149M = 16084228 268 GTGCAGGCATGGAGATTCTGGGGTGAATCTGCTGAGTTTAAAAGCTTCCTTTGGAGATGCCCCTGGCCCCCTCCACCTGTCAAGAAGAGGCCATCCTATCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGC BBBFFBFFFFFFFFFIFIFIFFI0BBFIFIIIIIFFFFIFIIIFFBIFIIIIFI<BFFFIIIIFFFFFFFFFFFFFFBBFBBFBFBBBBBFFFFBBBFFFFFFFBBBBBBBFFFFBBBFFFFFB<<7B<BBBBBFBFBFFFFBFFBBBB MD:Z:149 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:27 AS:i:149 XS:i:149
+D0004:230:H08B1ADXX:2:2208:19052:44947 99 chr22 16084116 27 149M = 16084172 205 CATGGAGATTCTGGGGTGAATCTGCTGAGTTTAAAAGCTTCCTTTGGAGATGCCCCTGGCCCCCTCCACCTGTCAAGAAGAGGCCATCCTATCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATG BBBFFFFFFFFFFIIIBFFFIIIIIIIFIFFFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIFFFFFFFFFFFFFFFFFFFBFFFFFFFFFFFFFFFFFFBFBFFFFFFFFFFFFFFFFFFFFFFFFBFFBFFFFBBFFBFFFFFFFB MD:Z:149 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:27 AS:i:149 XS:i:149
+D0004:230:H08B1ADXX:2:2206:17383:92394 163 chr22 16084134 27 148M = 16084223 236 AATCTGCTGAGTTTAAAGGCTTCCTTTGGAGATGCCCCTGGCCCCCTCCACCTGTCAAGAAGAGGCCATCCTATCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAAC BBBFFFBFFFFBFBFFI00BFBFFFFFIBBFF<FFIIIBFBFFIIIFIIII<BBFBFFFFIFIIFFFFBBBFFBBBF<BBBBBBBBB<BB<7<BBB<BBBBBBBBBBBBBB<BBBBBBB<0<<0<BBBBBBB<BBBBBBBBBB<BBBB MD:Z:17A130 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:27 AS:i:143 XS:i:143
+D0004:230:H08B1ADXX:2:1116:18441:31081 163 chr22 16084162 45 147M = 16084175 162 TAGATGCCCCTGGCCCTCTCCACCTGTCAAGAAGAGGCCATCCTATCTGCCTGCCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATTGCCCAGATGCCTATCTGTGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCT '7'<B0'0<'7<<7B0'0<<BBBB<BB<<'7<<B<<BB<7<<<B<<<B'7<'<'07<<0777<BBB<BBB7<B<BB<<<<7770<<<<<<<70'07<<<'077<<<007<<'0<77<<<<<<<<<<<<<707<<<<<<7<7777<<7 MD:Z:0G15C36T39G13G3A35 RG:Z:FC1_NA12892_03 NM:i:6 MQ:i:48 AS:i:121 XS:i:117
+D0004:230:H08B1ADXX:1:2108:10243:19235 99 chr22 16084168 48 150M = 16084274 254 CCCCTGGCCCCCTCCACCTGTCAAGAAGAGGCCATCCTATCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCA BBBFFFFFFFFFFIIIIIFIIIIIFIIFFFFFIBFIIIIIIIIIIIIIIIIIIIFFFFIIIFFFFFFFFFFFFFFFBFFFFFFFFFFBBFFFFFFFFFFFBBBBFFFFFFFFFFBBFBFFFFFFFFFBFFBFFFF0BBBBFFBBBBBBF7 MD:Z:150 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:48 AS:i:150 XS:i:145
+D0004:230:H08B1ADXX:2:2208:19052:44947 147 chr22 16084172 27 150M = 16084116 -205 TGGCCCCCTCCACCTGTCAAGAAGAGGCCATCCTATCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAG 7BFFFFFFFBB<FFFFFFFFFFFFBFFFBFFFFFBFFFFFFFFFFFFBBFFFFFFFBFFFFFFFFBBFBFFFFFFBFFFFFFFFFFFFFFFFFFFFFFFFIFFIFFFFBFFFIIFIFIIIIIIIIIIIIIIIFIIIIFFFFFFFFFFBBB MD:Z:150 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:27 AS:i:150 XS:i:145
+D0004:230:H08B1ADXX:1:2203:19752:25713 99 chr22 16084173 27 150M = 16084341 317 GGCCCCCTCCACCTGTCAAGAAGAGGCCATCCTATCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGG BBBFFFFFFFFFFIFIFFIIIIFIIIIIFFFFIIIIIIIIIIIFFFFFBFIFFFFIIIIIIFFFFFFFFFFFFBFFFFFFBFBFFFFFFBBBBFFFFFFFFFFFFBFFBBFFFFFBFFFFFBBBBBFFFB7BBBFBBBBFFFFFFFFFB< MD:Z:150 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:27 AS:i:150 XS:i:145
+D0004:230:H08B1ADXX:2:1116:18441:31081 83 chr22 16084175 48 150M = 16084162 -162 CCCCCTCCACCTGTCAAGAAGAGGCCATCCTATCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGAC BFBFFBB<0BBFFFFFFFFFFFFBFFFFFFFFBFFFBFFBBBFFFFFFFFFFFBBFFFFFFFFBB<FFFFFFFFFFFBBBFFFFFBFFFFFFFBFFFFFIFFFFFFFFFIIIIIFFIIIIIIIIFIIIFBFIFFFIIFFFFFFFFFFBBB MD:Z:150 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:45 AS:i:150 XS:i:145
+D0004:230:H08B1ADXX:2:2209:11952:24810 163 chr22 16084177 48 148M = 16084258 230 CCCTCCACCTGTCAAGAAGAGGCCATCCTATCTGCCTGTCAAGAAGAGGACATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGAC BBBBBBFFFFFFFFFBBFBFIIIBFFFBFBFFIIFFIFFIIIFFFF<FBBFFFFFFF<BBBBFFFFFFFFFFBFFFFFBBBBBBFFBBBB<B7<BB<BBBB7BBBBFBFFFFFFBB7BBBBBBB<77<<7<7<<<<7BBBBBBBB<BB MD:Z:49C98 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:48 AS:i:143 XS:i:138
+D0004:230:H08B1ADXX:2:2203:3439:13375 99 chr22 16084178 48 147M = 16084223 192 CCTCCACCTGTCAAGAAGAGGCCATCCTATCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGAC BBBFFFFFFFFFFFIBFIIIIIIIIFFIFFFBFFFIIFFIIIIIFFFFFIFFFFFFFIIIIIIIIBFBBFFFFFFFBBBB<BBBFBBFFFFFFFBFFFBBFFFFFFBFBFBBBBBFFFBBFBFFB<BB<FFFBBFFFFFFFFFBBBB MD:Z:147 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:48 AS:i:147 XS:i:142
+D0004:230:H08B1ADXX:1:2115:11464:79259 121 chr22 16084186 8 150M = 16084186 0 TGTCAAGAAGAGGCCATCCTATCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACC BFBBFFFFFFFFBFBFFBBFFFFFFFFFFFFFFFFFFFFFFBFFFFFFBFFBBBBFFFFFBFFBFFFFFFFBBFFFFFFFFFFFFFFFFFFFFB<FFFIIIIIIIIIIIIFFIFFFIIIIIIIFIIIIIIIFIIIIIFFFFFFFFFF<BB MD:Z:150 RG:Z:FC1_NA12892_03 NM:i:0 AS:i:150 XS:i:145
+D0004:230:H08B1ADXX:2:2105:4479:40683 99 chr22 16084192 48 148M = 16084244 201 GAAGAGGCCATCCTATCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTG BBBFFFFFFFFFFIIIIIIIIIIFIIIIIIFIIFIFIFFIIIIIIIIIIIIIFFIIIIIIIIIIIFFFFFFFFFFFBFFFFFFFFBFFFFFFFFFFFFFFFFFFFFFFFFFBBBBFBBFFBFFBFFFFFFFFFFFBFFFFFBBBBFFF MD:Z:148 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:48 AS:i:148 XS:i:143
+D0004:230:H08B1ADXX:1:1207:14702:88254 163 chr22 16084204 27 150M = 16084382 327 CTATCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTT BBBFFFBFBFFFFB<FIIIBF0BB7FFBFFB7FFFFIIFIIIIFF<FBFF<<B<BFBBBBBBFBB<BF<<B<'07<<BB<B<<BBBB<BBBBB<<BBB<<77007<B<0<<<<<BB<<<'7<<<<B<<<BBB<BBBBBBBBBBBBBBBBB MD:Z:150 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:27 AS:i:150 XS:i:145
+D0004:230:H08B1ADXX:1:2103:12875:91705 99 chr22 16084204 59 150M = 16084223 166 CTGTCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACATTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTT BBBFFB<FFFFFFIIIIFFFFFFFFFBFFFFFFFFIIBFFIIFIFIIIIIIIIIFIFIIIIIIIFFFFFBFFFFFFFFFFFFBBBBBFFFBBB<BBFBF7<7<B<BFF<BBFFFFFBBFFFFBBFFFFFFFFBFFBFFFBFBBFBBBFBF MD:Z:2A36G110 RG:Z:FC1_NA12892_03 NM:i:2 MQ:i:48 AS:i:142 XS:i:133
+D0004:230:H08B1ADXX:2:2201:14140:86034 163 chr22 16084204 48 150M = 16084281 226 CTATCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGCCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTT BBBFFF<BBFFF<<BBBF7<FBFF<<FFFBFBFBBFFIFFBFFB7B<FBF<70<<BBFF<BFFFB<BBBBBBF<BBBBBBBBB<<<BB<7777<B70070<<'7''0<<B'07B77BBBBBB<'<B<0<<<BBBBB<B<'<<BBB<<BBB MD:Z:99T50 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:48 AS:i:145 XS:i:140
+D0004:230:H08B1ADXX:1:1106:14482:98700 99 chr22 16084207 57 7S141M = 16084232 174 ACCCCTGTCTGCCTGTCAAGAAGAGGCCATCCTGGGCAGCACATTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGG <B7BFFFBFFBFFFIFI0BFFFFBB<FFBFIBFBFBFBFBFIIIIIFFIFFIIFFFBFFFFBFFF<BFFFFFFB<<77<7BBBBBBBBBBBBB700<777<77<BBBBBBBBB<BB<BB<BBB<BBBBBBBBBBBB<BB7BBB<BBBB MD:Z:36G104 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:47 AS:i:136 XS:i:128
+D0004:230:H08B1ADXX:1:1115:10677:34277 1123 chr22 16084214 6 148M = 16084265 200 GTCAAGAAGAGGCCATCCTGGGCAGCACATTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCT BBBFFFFFFFFFFIIIIIIIIIIIIIIIIIIIFIIIIIIIIIIIIIIIFIIIIIIIIIIIIIIFFFFFFFFFFFFFFFFFFFFFFFFFF<BBFFFFFFFFFFFFFFFBBFFFFFFBFFFFBFFFFFBFFFFFFFFBBFBFFFFFFBFB MD:Z:29G118 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:0 AS:i:143 XS:i:138
+D0004:230:H08B1ADXX:1:1115:16500:65098 99 chr22 16084214 8 148M = 16084270 203 GTCAAGAAGAGGCCATCCTGGGCAGCACATTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCT BBBFFFFFFFFFFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIFFFFFFFFFFFFFFFFFFFFFBFBBFBBFFFFFFFFFFBBFFFFFFFFFFFFFFFFFFFFBFFFFFFFFFFFFFFFFFFFBFFFF MD:Z:29G118 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:0 AS:i:143 XS:i:138
+D0004:230:H08B1ADXX:1:2211:12618:19471 99 chr22 16084214 50 101M = 16084214 100 GTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGA BBBBBFFFFFFBFFFBFFFFIIIFFB7BFFFFFFIIIFIFFIBFFIIIFFFFFFFFFFFBFIIFFFBFFBBBFBBBBBBBBBB<7BBB<<BBBBBBF<B7B MD:Z:101 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:50 AS:i:101 XS:i:96
+D0004:230:H08B1ADXX:2:2101:9743:53144 99 chr22 16084214 6 148M = 16084265 200 GTCAAGAAGAGGCCATCCTGGGCAGCACATTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCT BBBFFFFFFFFFFIIIIIIIIIIIIIIIIIIIFIIIIIIIIFIIIIIIIIIIIIIIIIIIIIIFFFFFFFFFFFFFFFFFFFFFFFFFFBBFFFFFFFFFFFBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF MD:Z:29G118 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:0 AS:i:143 XS:i:138
+D0004:230:H08B1ADXX:2:2104:16950:36967 163 chr22 16084214 48 148M = 16084229 164 GTCAAGAAGAGGCCACCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCT BBBFFF<00BFFFIFIBF<<<F<FIFIIIFFFFFFFFIIIB<FFFFIFIIFB7BFFBFBF<<FBFBBFBBBBBBBBBBBB<7B7BBBBB7B<BBBBBBBBBBBBBBBBBBBB7<BBBBBBBBBBBBB<<BBBB7<BBBBBBBB0BBBB MD:Z:15T132 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:48 AS:i:143 XS:i:138
+D0004:230:H08B1ADXX:2:2201:15832:36778 99 chr22 16084214 48 148M = 16084276 210 GTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCT BBBFFFFFFFFFFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIFFFFFFFFFFFFFFFFFFFFFFFFFFBBFFFFFFFFFFBBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF MD:Z:148 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:48 AS:i:148 XS:i:143
+D0004:230:H08B1ADXX:1:2211:12618:19471 147 chr22 16084214 50 101M = 16084214 -100 GTCAAGAAGAGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGA B<B<BBBBFFBBBBFBFBBBBBB<B<FFFBFFFFFBBBB<BBB<BFFFFFFIFFFFF<FFFFBFFFBFBFFFBFBBFFFFIFFIFFFFFBFBFFFFF<BBB MD:Z:101 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:50 AS:i:101 XS:i:96
+D0004:230:H08B1ADXX:2:1114:8448:83144 99 chr22 16084222 50 93M = 16084230 92 GAGGCCACCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGA BBBFFFFFFBFFFB<F<BBFFI<<BB<BBFBFF7<'<BBFBF7FF7B<BBFB707BBFB<BBBFFBFBFBBBBB<B<<BB<<B777<BBB'0< MD:Z:7T85 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:48 AS:i:88 XS:i:83
+D0004:230:H08B1ADXX:1:2101:9549:86450 99 chr22 16084223 45 148M = 16084303 228 AGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCAC BBBFFFFFFFFFFFIFIIIIIFFB<FFIFFFFIFFFFIFFFFFFFFIFIIIIIFFIIIIIBFFFFFFFFF<BB<<B<BFBBBBBBBFBF<7<B<BFBBB0BBBFBB<BBBBFFFBBBBF<B<BBBFBBBFFFFBBBBB<<BBBFFBBF MD:Z:148 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:43 AS:i:148 XS:i:143
+D0004:230:H08B1ADXX:1:2103:12875:91705 147 chr22 16084223 48 148M = 16084204 -166 AGGCCATCCTGGGCAGCACATTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCAC BBBBBB<BBBBBBBBBBBBBBBBBBBBFFBBBBBBBBBBBBBBBBBBBBBBBBBBBB7BB<BBBBBBBBBBBBBBBBBBBBBBFBBFFFFIFFFFFFFFFFFFF<BFFBFBIIIFFFFFBFFIFFIFBFFBIIFFB<FFFBFFFFBBB MD:Z:20G127 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:59 AS:i:143 XS:i:138
+D0004:230:H08B1ADXX:2:2203:3439:13375 147 chr22 16084223 48 148M = 16084178 -192 AGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCAC FFBBFFBFBBBBFFFBFFBBFFBFFFFB<FBFBFFFFFBBBFBFBBBFFFBBBFBFBFFBBFBFBBBFFFFFBBFBFFFFFFFFFFFFIIIIFIIFFIIIFFIIIIFFFFFBFBFFFFFFFIFIIIIFBFFFBFFFFFFFBFFFFBBB MD:Z:148 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:48 AS:i:148 XS:i:143
+D0004:230:H08B1ADXX:2:2206:17383:92394 83 chr22 16084223 27 148M = 16084134 -236 AGGCCATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCAC BBBBBBBBBBBBBBBBBFB<BBB<BBBFFBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB<BBBBBBBB<BB<BBBBBBB<BFFB<BBFIIIIFFFFFBBBFIIIFFFBB0FIFFB777BFFF<FIFFFFFF<FFBBFBFBFFFFBBB MD:Z:148 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:27 AS:i:148 XS:i:143
+D0004:230:H08B1ADXX:1:1106:12402:15905 163 chr22 16084228 27 149M = 16084425 344 ATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTG BBBFFFFFFFFFFIIIIIIFIIIIIIFIIIIIIIIIIIIIIFFIFIIIIIIIIIIFIIIIIIFFFFFFFFFFFFFBBFFFFFFBBBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFFFFFFFFFFFFFFFFFFFFFFFFFF MD:Z:149 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:27 AS:i:149 XS:i:144
+D0004:230:H08B1ADXX:1:1114:17248:48776 1187 chr22 16084228 27 149M = 16084401 320 ATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGGGGGCAAACCTTCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTG BBBFFF<7F<BBBBBFB<<FF<FFFB0<BBBF7BB<<<B'BBF0B'7BFFF7F7<F'7<BBFFF<B<BBBB<7<B07<BBBBBBBBBB7BB7BB<BBBBB<<BB7<BBBBB<B<BBB<<BBBBBBBBBBBBBBBBBBBBB<BBBBBBBB MD:Z:45A10C92 RG:Z:FC1_NA12892_03 NM:i:2 MQ:i:27 AS:i:139 XS:i:134
+D0004:230:H08B1ADXX:1:1207:13940:97014 163 chr22 16084228 27 149M = 16084401 320 ATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTG BBBFFF77BFBFFFFBFFBFFFFFFFBFBFFFBBBFFFFFFBFFFFFFFFBFFFI<FFFFFFFFFFFFFB<<<'<BBBBBBB<<BBBBB<077B<BBBBBBBBBBBB<BBB0<B<BBB<BB<BBBBBBBBB<BBBBBBB<BBBBBBBBB MD:Z:149 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:27 AS:i:149 XS:i:144
+D0004:230:H08B1ADXX:1:2202:6988:58967 99 chr22 16084228 48 149M = 16084286 207 ATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTG BBBFFFFFFFFBFIIIFIIFFFIFIFBFFFIFFFIBBFFFFFBFBFBFBBFFFFFFBBBFFFFFFFFF<<<BBB<0<<7<BBBB<BBB7BBBB7<<<<BBBBBFBFBB7<BBBB<B<BB7B7BBBBBBFBF<<<<B<<BBB<7<B<B0< MD:Z:149 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:48 AS:i:149 XS:i:144
+D0004:230:H08B1ADXX:2:1109:12526:35372 99 chr22 16084228 48 149M = 16084262 180 ATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTG BBBFFFFFFFFFFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIFFFFFFFFFFFFFBBBFFFFFFFBFFFFFBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFFFFFFFFFFFFFF MD:Z:149 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:48 AS:i:149 XS:i:144
+D0004:230:H08B1ADXX:2:1114:10667:73443 163 chr22 16084228 48 149M = 16084229 150 ATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTG BBBFFFFFFFFFFIIIIFIFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIFFFIIFIIIIIIIFFFFFFFBBFB7<<BBFFFFFFBBFFFFFFFFFFFFBFFFFFFFFFFFFFFFBFFFFFFFFFFFFFFBFFBBFFFFFBFFFFFFFFF MD:Z:149 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:48 AS:i:149 XS:i:144
+D0004:230:H08B1ADXX:2:1212:17241:34552 99 chr22 16084228 48 149M = 16084230 151 ATCCTGGGCAGCACATTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTG BBBFFFFFFFFFFIIIIIIIIIIIIIIIIIIIIIIIIIIIIFIIIIIIIIIIIIIIIIIIIIFFFFFFFFFFFFFBBFBFFFFFFBBFFFFFFFFFFFFFFFFBBFFFFFFFFFFFFFFFFBBFFFFFFFFFFFFFFFFFFFFFFFFFF MD:Z:15G133 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:57 AS:i:144 XS:i:139
+D0004:230:H08B1ADXX:2:2103:6351:62666 163 chr22 16084228 48 143M = 16084228 142 ATCCTGGGCAGCACATTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCAC BBBFBFFFFFFFFFIFFIFIIIFIIFIBFFIFFFIFFFFIIIFIIIFFFFFFFIIIIIIFIFFFFFFFFBBBFBF<B<BFFFBBFBBFFFFFFFFFFFFBBBBBFBFBBFBFFFFFFFFBBBBBBBBFFFFBBBBFFFFFFFB MD:Z:15G127 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:48 AS:i:138 XS:i:133
+D0004:230:H08B1ADXX:2:2109:12183:35153 163 chr22 16084228 27 149M = 16084372 293 ATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTG BBBFFFFFFFFFFIIIIIIIIIIIIIIIIIIIIIIIIIIIIFIIIIIIIIIIIIIIIIIIIIFFFFFFFFBFBBF<BFFFFFFFBFFFFFFFFFFFFFFFFFBFFFFBFFFFFFFFFFFFFFBFFFFBFFFFFFFFFFFFFFBFFFFBF MD:Z:149 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:27 AS:i:149 XS:i:144
+D0004:230:H08B1ADXX:1:1204:18619:71946 83 chr22 16084228 27 150M = 16084109 -268 ATCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTGC 0BBFFFFFBFFFFFFFBFFFFFFFFFFFBBBFFFFFFFFFFFFFFFBBFFFBBFBBFBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFIFFIIIIFFBIIIIIIIIBFFIFFIIIIIIIIIIIIIIIIIFFFBFFFFFFBBB MD:Z:150 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:27 AS:i:150 XS:i:145
+D0004:230:H08B1ADXX:2:2103:6351:62666 83 chr22 16084228 48 143M = 16084228 -142 ATCCTGGGCAGCACATTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCAC <BBBBBB<<<00<BB<BBBBBBF<<7BB70<BBBBB<<BBB<BBBBBB<0<00<<<BBBBBBBBB<BBBBBBBBBBBFBFF<FBFFBFBFFFFBFFFFFBFFBFB7FFBBFBIFFBBBBB7BFIFFFFFFFFFFFFFFFFBBB MD:Z:15G127 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:48 AS:i:138 XS:i:133
+D0004:230:H08B1ADXX:2:1114:10667:73443 83 chr22 16084229 48 150M = 16084228 -150 TCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTGCA BFFFFFFBFFFFFFFFFFFFFFFBFFFFFBFFFFFFFFFFFFFFFFFFBFB<FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFIFIIIIIIIFFBIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIFFFFFFFFFFBBB MD:Z:150 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:48 AS:i:150 XS:i:145
+D0004:230:H08B1ADXX:2:2104:16950:36967 83 chr22 16084229 48 150M = 16084214 -164 CCCTGGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTGCA 07<00BBBBB<<B<BBBBBBBBB<BBBBBBBBBBBBBBB<B<7<<BBB7B<7BBBBBBBBBBBBBBBB7007<<7BBB7BB<BFBBBBFB<FBFBFBBBFFFBB7FFFFF<FFBBFFFFBBFFFFBBFFFBBFFF<FFFFFFFFFFFBB< MD:Z:0T149 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:48 AS:i:149 XS:i:144
+D0004:230:H08B1ADXX:2:1114:8448:83144 147 chr22 16084230 48 65S85M = 16084222 -92 CCACCGAGACCTACACGTCTGTATTCGTCGGCAGCGTAAGAAGTGTATGAGAGTCCGGAGACCACCCTGGGCAGCACTGTAGAGGCAAATGCCCCAGATGCCTAGCTGAGGAAAAACCTCCATGCCTGGAGGAGGAGGACGCCTCTGGGA '77777000'000'<7'<70''0'''7070<<<7000'000''BB<770000''7'007''0'77<7B<<BB<<B<7'<<<7'B<<<77<0'7'<BBBB<0<BBB<BB<<<'7BB<<7B<BBBBBBBBBB'BB<B<'0'<<00BBBB<<< MD:Z:12A13G19G0C25T11 RG:Z:FC1_NA12892_03 NM:i:5 MQ:i:50 AS:i:60 XS:i:55
+D0004:230:H08B1ADXX:2:1212:17241:34552 147 chr22 16084230 57 150M = 16084228 -151 CCTGGGCAGCACATTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTGCAG BBFFFFFBBFFFFFBBFFFFFFBFFFBFBBBFBFBFBFFFFFFFFFFBBBBFFBFFFFFFFFFFFFFFFFFFFFFBFBFFFFFFFFFFFFFFFFIIIIFIIFFFIIIFIFIIIFFFIIIIIIFIIIIIIIIIIIIIIFFFFFFFFFFBBB MD:Z:13G136 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:48 AS:i:145 XS:i:137
+D0004:230:H08B1ADXX:1:1106:14482:98700 147 chr22 16084232 47 150M = 16084207 -174 TGGGCAGCACATTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGCCCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTGCAGGC 0B00BBBFBBFBBBBBBBFFBBBBBBBBB<BB7<BBBBBBBBB<0B<07BBB7B<<<BBB<<BBBBBB7777BB<0BBBBBBBBB<BBBB7'B<FBFFBB70<BFB<FFFFFF7FBB<<B00BBBFB7B0<00BB<FFFFBBFFBFFB<< MD:Z:11G79A58 RG:Z:FC1_NA12892_03 NM:i:2 MQ:i:57 AS:i:140 XS:i:135
+D0004:230:H08B1ADXX:1:1211:1598:3784 99 chr22 16084233 22 150M = 16084251 166 GGGCAGCACATTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTGCAGGCA B<BFFFFFFBBBFBBBFFBFFFB7B<0BBFBBFFFFBFFFBBFFFFFFFFFIIFBBFFF<FBBFFBFFFB7<BB<BBBBBBBBBB<0<B77<70<BBB<BBBBBBB0<BBBBB<BBBBBBB0<<BBBBBBBBBBBBBB0BBBBBBBBBB7 MD:Z:10G139 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:22 AS:i:145 XS:i:140
+D0004:230:H08B1ADXX:1:1213:6486:68134 99 chr22 16084233 27 150M = 16084298 213 GGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCCCTCCAGCACCTCCTGCAGGCA BBBFFFFFFFFFFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIFFFFFFFFBBBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFFFFBFFFFFFFFFFFFFF'''00070''007007000'0<''' MD:Z:128T21 RG:Z:FC1_NA12892_03 NM:i:1 MQ:i:27 AS:i:145 XS:i:140
+D0004:230:H08B1ADXX:1:2113:18050:73868 99 chr22 16084233 48 148M = 16084233 147 GGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTGCAGG BBBFFBBFFFBFBBFIIBFBFB<BFFFBFFFIIIIFFFFIFFBFIIIIFFFFF<FFFFBBFBBBFBB<BB07<0<<BB<<B7<<B77BBBBBBBBB<<BBBBBBBBBBBBBBB<7BB00<BBBBBBBBBBB<BBBBBBBBBB<<<BB' MD:Z:148 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:47 AS:i:148 XS:i:143
+D0004:230:H08B1ADXX:2:1106:17498:33865 99 chr22 16084233 48 150M = 16084266 182 GGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTGCAGGCA BBBFFFFFFFFFFIFIIIIIFIIIIIIIIIIIIIFIIIIFFIIIIIIIIFIIIIIIIIIIIIFF<FFFFF<BBBFFFFBFBFFFFFFFFBBBFFFBFFFFFFFFFBBFFFFFFFFFFBBFBBBFFFFBFBBFBFFFFFFBBFFFFFBFFB MD:Z:150 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:48 AS:i:150 XS:i:145
+D0004:230:H08B1ADXX:2:2108:13049:6453 163 chr22 16084233 60 146M = 16084233 145 GGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTGCA BBBFFFFFFFFFFIIIIIIIIIIIIIIIIIIIIIIIIIIIFIIIIIIIIIIIIIIIIIIIIIFFFFFFFF<BBFFBFFFFFFFFFFFFFFBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF MD:Z:146 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:60 AS:i:146 XS:i:129
+D0004:230:H08B1ADXX:2:2113:10982:55029 163 chr22 16084233 27 150M = 16084316 230 AGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCTTCTGGGAGCAGGAGGACCTGCGGGAACGCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTGCAGGCA 0<7BBB<BFFFFFFF<B<FFFB0000B'<BF7BBFBB'BF<7BFF'B77BB'07B<7B<BB<B0<BB<BB<<<<''7B<'7770BB<<07<BB'0B''7<BB'0<<BBBBBBBBBBBBB7<BBBBB<BBBBBBBBBB'0<'07BBB077' MD:Z:0G73C21T5C47 RG:Z:FC1_NA12892_03 NM:i:4 MQ:i:27 AS:i:134 XS:i:129
+D0004:230:H08B1ADXX:1:2113:18050:73868 147 chr22 16084233 47 148M = 16084233 -147 GGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCATTTTCTTGCTCTCCAGCACCTCCAGCAGG 77BBBB<BBBB<0BB<BBFBBBBBBBBBBBBBBBB<BBBBBBBB<0''7BB70<770<0BBBB<BBB<0<0770BB<B7B<B<BBBFF<'<FFBFFB<<B7<FFBFFBBBBBFFFBFBBFFFFFFFFF0BFB0<<B<BBBBB<<<<<B MD:Z:118C23T5 RG:Z:FC1_NA12892_03 NM:i:2 MQ:i:48 AS:i:138 XS:i:133
+D0004:230:H08B1ADXX:2:2108:13049:6453 83 chr22 16084233 60 146M = 16084233 -145 GGGCAGCACAGTAGAGGCAAATGGCCCAGATGCCTAGCTGAGGGCAAACCTCCATGCCTGGAGGAGGAGGTCGCCTCTGGGAGCAGGAGGACCTGCTGGAACCCCTGCTCACAGGCTCCTTTTCTTGCTCTCCAGCACCTCCTGCA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFIFIIIIIIIIFBIIIIIIIIIIIIIFIIIIIIIIIIIIIIIIIIFFFFFFFFFFBBB MD:Z:146 RG:Z:FC1_NA12892_03 NM:i:0 MQ:i:60 AS:i:146 XS:i:129
diff --git a/tests/pysam_data/example_empty_with_header.sam b/tests/pysam_data/example_empty_with_header.sam
new file mode 100644
index 0000000..60f088e
--- /dev/null
+++ b/tests/pysam_data/example_empty_with_header.sam
@@ -0,0 +1 @@
+@HD VN:1.3 SO:coordinate
diff --git a/tests/pysam_data/faidx_empty_seq.fq b/tests/pysam_data/faidx_empty_seq.fq
new file mode 100644
index 0000000..572bf2f
--- /dev/null
+++ b/tests/pysam_data/faidx_empty_seq.fq
@@ -0,0 +1,40 @@
+@HISEQ2500-10:332:HCWLKBCXX:1:1101:6834:72051:TCGCTGTG
+TACCTCTAAAGGGGAGCGCTTTGGA
++
+IBBBBBEEEEEEEEEEEEBBBBBII
+@HISEQ2500-10:332:HCWLKBCXX:1:1101:6822:72067:TCGCTGTG
+TGCTCTAGAATTACCACAGTTATGA
++
+@A<;BB<<0B<ABB??EEB?<<;CH
+@HISEQ2500-10:332:HCWLKBCXX:1:1101:6892:72120:TCGCTGTG
+TGCCGCCCCCGCCGCTCCCGTCCACTCTCGGA
++
+HBBBBBEEEEEEEEEEEEEEEEEEEBBBBBII
+@HISEQ2500-10:332:HCWLKBCXX:1:1101:6833:72134:TCGCTGTG
+TAACGCAGAGTACTAACCACTAGA
++
+I?BBBBEEEEEEDEEEB?B??BHH
+@HISEQ2500-10:332:HCWLKBCXX:1:1101:6907:72225:TCGCTGTG
+TAACGCAGAGTACTAACCACTATACGATCACGGCG
++
+IBBBBBEEEEEEEEEEEEEEEEEEEEEEEBBBBBI
+@HISEQ2500-10:332:HCWLKBCXX:1:1101:7089:72002:TCGCTGTG
+TCGCGCGTGCAGCCCCGGACATCGA
++
+<<0;;0BDDBDEB0>A0B?A?<BH1
+@HISEQ2500-10:332:HCWLKBCXX:1:1101:7094:72048:TCGCTGTG
+
++
+
+@HISEQ2500-10:332:HCWLKBCXX:1:1101:7152:72066:TCGCTGTG
+TAACCAGGAATCCTAACCGCTGA
++
+HBBBBBEEEEEEEEDDBBBBBII
+@HISEQ2500-10:332:HCWLKBCXX:1:1101:7138:72162:TCGCTGTG
+TGAGGAGGGGGGAACGGGGGGCGGGA
++
+EA0BB?BEEEB0EEBEEEAB??BBIH
+@HISEQ2500-10:332:HCWLKBCXX:1:1101:7188:72166:TCGCTGTG
+TGGCGAGAATTCTACCACTGAACCACCAATGCGA
++
+HBBBBBEEEEEEEEEEEEDEEEEEEEEB?BB?HD
diff --git a/tests/pysam_data/test_mapped_unmapped.sam b/tests/pysam_data/test_mapped_unmapped.sam
new file mode 100644
index 0000000..c0b6230
--- /dev/null
+++ b/tests/pysam_data/test_mapped_unmapped.sam
@@ -0,0 +1,17 @@
+@HD VN:1.0
+@SQ SN:chr1 LN:100
+@CO Test counting of mapped/unmapped reads
+read1_mapped 0 chr1 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+read2_unmapped 4 chr1 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+read3_unmapped 20 chr1 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+pair1a_mapped 67 chr1 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+pair1b_mapped 131 chr1 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+pair2a_unmapped 71 chr1 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+pair2b_mapped 139 chr1 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+pair3a_unmapped 77 chr1 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+pair3b_unmapped 141 chr1 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+noseq2b_mapped 139 chr1 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+noseq1_unmapped 4 * 0 20 * = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+noseq2a_unmapped 71 * 0 20 * = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+pair3a_unmapped 77 * 0 20 * = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+pair3b_unmapped 141 * 0 20 * = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
diff --git a/tests/pysam_data/test_query_position.sam b/tests/pysam_data/test_query_position.sam
new file mode 100644
index 0000000..5e8d2d1
--- /dev/null
+++ b/tests/pysam_data/test_query_position.sam
@@ -0,0 +1,9 @@
+@HD VN:1.0
+@SQ SN:chr1 LN:100
+@SQ SN:chr2 LN:100
+@SQ SN:chr3 LN:100
+@SQ SN:chr4 LN:100
+read1 0 chr1 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+read2 0 chr2 21 30 10M2D25M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18
+read1 16 chr3 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1
+read2 16 chr4 21 30 10M2D25M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18
diff --git a/tests/samtools_test.py b/tests/samtools_test.py
index f48d23e..e5fd8b9 100644
--- a/tests/samtools_test.py
+++ b/tests/samtools_test.py
@@ -6,9 +6,12 @@ and data files located there.
'''
import pysam
+import pysam.samtools
+import pysam.bcftools
import unittest
import os
import re
+import glob
import sys
import subprocess
import shutil
@@ -16,12 +19,11 @@ from TestUtils import checkBinaryEqual
IS_PYTHON3 = sys.version_info[0] >= 3
-SAMTOOLS = "samtools"
WORKDIR = "pysam_test_work"
DATADIR = "pysam_data"
-def runSamtools(cmd):
+def run_command(cmd):
'''run a samtools command'''
try:
retcode = subprocess.call(cmd, shell=True,
@@ -32,19 +34,23 @@ def runSamtools(cmd):
print("Execution failed:", e)
-def getSamtoolsVersion():
- '''return samtools version'''
+def get_version(executable):
+ '''return samtools/bcftools version'''
- with subprocess.Popen(SAMTOOLS, shell=True,
+ with subprocess.Popen(executable, shell=True,
stderr=subprocess.PIPE).stderr as pipe:
lines = b"".join(pipe.readlines())
if IS_PYTHON3:
lines = lines.decode('ascii')
- return re.search("Version:\s+(\S+)", lines).groups()[0]
+ try:
+ x = re.search("Version:\s+(\S+)", lines).groups()[0]
+ except AttributeError:
+ raise ValueError("could not get version from %s" % lines)
+ return x
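A hedged sketch of how get_version might be put to use in this module (the skip_unless_samtools helper below is illustrative only and not part of the patch):

    import unittest

    def skip_unless_samtools(expected_version):
        # build on get_version() above: skip a test when the samtools
        # binary on the PATH reports a different version string
        installed = get_version("samtools")
        return unittest.skipUnless(
            installed == expected_version,
            "samtools version %s required, found %s" %
            (expected_version, installed))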
-class BinaryTest(unittest.TestCase):
+class SamtoolsTest(unittest.TestCase):
'''test samtools command line commands and compare
against pysam commands.
@@ -52,238 +58,64 @@ class BinaryTest(unittest.TestCase):
Tests fail, if the output is not binary identical.
'''
- first_time = True
-
- # a dictionary of commands to test
- # first entry: (samtools output file, samtools command)
- # second entry: (pysam output file, (pysam function, pysam options) )
- commands = \
- {
- "view":
- (
- ("ex1.view", "view ex1.bam > ex1.view"),
- ("pysam_ex1.view", (pysam.view, "ex1.bam")),
- ),
- "view2":
- (
- ("ex1.view", "view -bT ex1.fa -o ex1.view2 ex1.sam"),
- # note that -o ex1.view2 throws exception.
- ("pysam_ex1.view",
- (pysam.view, "-bT ex1.fa -oex1.view2 ex1.sam")),
- ),
- "sort":
- (
- ("ex1.sort.bam", "sort ex1.bam ex1.sort"),
- ("pysam_ex1.sort.bam", (pysam.sort, "ex1.bam pysam_ex1.sort")),
- ),
- "mpileup":
- (
- ("ex1.pileup", "mpileup ex1.bam > ex1.pileup"),
- ("pysam_ex1.mpileup", (pysam.mpileup, "ex1.bam")),
- ),
- "depth":
- (
- ("ex1.depth", "depth ex1.bam > ex1.depth"),
- ("pysam_ex1.depth", (pysam.depth, "ex1.bam")),
- ),
- "faidx":
- (
- ("ex1.fa.fai", "faidx ex1.fa"),
- ("pysam_ex1.fa.fai", (pysam.faidx, "ex1.fa")),
- ),
- "index":
- (
- ("ex1.bam.bai", "index ex1.bam"),
- ("pysam_ex1.bam.bai", (pysam.index, "pysam_ex1.bam")),
- ),
- "idxstats":
- (
- ("ex1.idxstats", "idxstats ex1.bam > ex1.idxstats"),
- ("pysam_ex1.idxstats", (pysam.idxstats, "pysam_ex1.bam")),
- ),
- "fixmate":
- (
- ("ex1.fixmate.bam", "fixmate ex1.bam ex1.fixmate.bam"),
- ("pysam_ex1.fixmate.bam",
- (pysam.fixmate, "pysam_ex1.bam pysam_ex1.fixmate.bam")),
- ),
- "flagstat":
- (
- ("ex1.flagstat", "flagstat ex1.bam > ex1.flagstat"),
- ("pysam_ex1.flagstat", (pysam.flagstat, "pysam_ex1.bam")),
- ),
- "calmd":
- (
- ("ex1.calmd.bam", "calmd ex1.bam ex1.fa > ex1.calmd.bam"),
- ("pysam_ex1.calmd.bam", (pysam.calmd, "pysam_ex1.bam ex1.fa")),
- ),
- "merge":
- (
- ("ex1.merge", "merge -f ex1.merge ex1.bam ex1.bam"),
- # -f option does not work - following command will
- # cause the subsequent command to fail
- ("pysam_ex1.merge",
- (pysam.merge, "pysam_ex1.merge pysam_ex1.bam pysam_ex1.bam")),
- ),
- "rmdup":
- (
- # use -s option, otherwise the following error in samtools 1.2:
- # Samtools-htslib-API: bam_get_library() not yet implemented
- ("ex1.rmdup.bam", "rmdup -s ex1.bam ex1.rmdup.bam"),
- ("pysam_ex1.rmdup.bam",
- (pysam.rmdup, "pysam_ex1.bam -s pysam_ex1.rmdup.bam")),
- ),
- "reheader":
- (
- ("ex1.reheader", "reheader ex1.bam ex1.bam > ex1.reheader"),
- ("pysam_ex1.reheader", (pysam.reheader, "ex1.bam ex1.bam")),
- ),
- "cat":
- (
- ("ex1.cat.bam", "cat -o ex1.cat.bam ex1.bam ex1.bam"),
- ("pysam_ex1.cat.bam",
- (pysam.cat, " -o pysam_ex1.cat.bam ex1.bam ex1.bam")),
- ),
- "targetcut":
- (
- ("ex1.targetcut", "targetcut ex1.bam > ex1.targetcut"),
- ("pysam_ex1.targetcut", (pysam.targetcut, "pysam_ex1.bam")),
- ),
- "phase":
- (
- ("ex1.phase", "phase ex1.bam > ex1.phase"),
- ("pysam_ex1.phase", (pysam.phase, "pysam_ex1.bam")),
- ),
- "import":
- (
- ("ex1.bam", "import ex1.fa.fai ex1.sam.gz ex1.bam"),
- ("pysam_ex1.bam",
- (pysam.samimport, "ex1.fa.fai ex1.sam.gz pysam_ex1.bam")),
- ),
- "bam2fq":
- (
- ("ex1.bam2fq", "bam2fq ex1.bam > ex1.bam2fq"),
- ("pysam_ex1.bam2fq", (pysam.bam2fq, "pysam_ex1.bam")),
- ),
- "pad2unpad":
- (
- ("ex2.unpad", "pad2unpad -T ex1.fa ex2.bam > ex2.unpad"),
- ("pysam_ex2.unpad", (pysam.pad2unpad, "-T ex1.fa ex2.bam")),
- ),
- "bamshuf":
- (
- ("ex1.bamshuf.bam", "bamshuf ex1.bam ex1.bamshuf"),
- ("pysam_ex1.bamshuf.bam",
- (pysam.bamshuf, "ex1.bam pysam_ex1.bamshuf")),
- ),
- "bedcov":
- (
- ("ex1.bedcov", "bedcov ex1.bed ex1.bam > ex1.bedcov"),
- ("pysam_ex1.bedcov", (pysam.bedcov, "ex1.bed ex1.bam")),
- ),
- }
-
- # some tests depend on others. The order specifies in which order
- # the samtools commands are executed.
- # The first three (faidx, import, index) need to be in that order,
- # the rest is arbitrary.
- order = ('faidx',
- 'import',
- 'index',
- 'view',
- 'view2',
- 'sort',
- 'mpileup',
- 'depth',
- 'idxstats',
- 'fixmate',
- 'flagstat',
- 'calmd',
- 'merge',
- 'rmdup',
- 'reheader',
- 'cat',
- 'bedcov',
- 'targetcut',
- 'phase',
- 'bam2fq',
- # Segmentation fault:
- # 'bamshuf',
- # File not binary identical
- # 'pad2unpad',
- )
-
- def setUp(self):
- '''setup tests.
-
- For setup, all commands will be run before the first test is
- executed. Individual tests will then just compare the output
- files.
-
- '''
- if BinaryTest.first_time:
-
- # remove previous files
- if os.path.exists(WORKDIR):
- shutil.rmtree(WORKDIR)
- pass
-
- # copy the source files to WORKDIR
- os.makedirs(WORKDIR)
-
- for f in ("ex1.fa", "ex1.sam.gz",
- "ex1.sam", "ex2.bam",
- "ex1.bed"):
- shutil.copy(os.path.join(DATADIR, f),
- os.path.join(WORKDIR, f))
-
- # cd to workdir
- savedir = os.getcwd()
- os.chdir(WORKDIR)
- for label in self.order:
- sys.stdout.write("preparing test {}".format(label))
- command = self.commands[label]
- # build samtools command and target and run
- samtools_target, samtools_command = command[0]
- runSamtools(" ".join((SAMTOOLS, samtools_command)))
- sys.stdout.write(" samtools ok")
- # get pysam command and run
- try:
- pysam_target, pysam_command = command[1]
- except ValueError as msg:
- raise ValueError("error while setting up %s=%s: %s" %
- (label, command, msg))
-
- pysam_method, pysam_options = pysam_command
-
- try:
- output = pysam_method(*pysam_options.split(" "),
- raw=True,
- catch_stdout=True)
- except pysam.SamtoolsError as msg:
- raise pysam.SamtoolsError(
- "error while executing %s: options=%s: msg=%s" %
- (label, pysam_options, msg))
-
- sys.stdout.write(" pysam ok\n")
-
- if ">" in samtools_command:
- with open(pysam_target, "wb") as outfile:
- if type(output) == list:
- if IS_PYTHON3:
- for line in output:
- outfile.write(line.encode('ascii'))
- else:
- for line in output:
- outfile.write(line)
- else:
- outfile.write(output)
-
- os.chdir(savedir)
- BinaryTest.first_time = False
-
- samtools_version = getSamtoolsVersion()
-
+ requisites = [
+ "ex1.fa", "ex1.fa.fai",
+ "ex1.sam.gz",
+ "ex1.bam", "ex1.bam.bai",
+ "ex1.sam", "ex2.bam",
+ "ex1.bed"]
+
+ # a list of statements to test
+ # should contain at least one %(out)s component indicating
+ # an output file.
+ statements = [
+ "view ex1.bam > %(out)s_ex1.view",
+ # ("view -bT ex1.fa -o %(out)s_ex1.view2 ex1.sam",
+ "sort ex1.bam -o %(out)s_ex1.sort.bam",
+ "mpileup ex1.bam > %(out)s_ex1.pileup",
+ "depth ex1.bam > %(out)s_ex1.depth",
+ # TODO: issues with file naming
+ # "faidx ex1.fa; %(out)s_ex1.fa.fai",
+ "index ex1.bam %(out)s_ex1.bam.fai",
+ "idxstats ex1.bam > %(out)s_ex1.idxstats",
+ "fixmate ex1.bam %(out)s_ex1.fixmate.bam",
+ "flagstat ex1.bam > %(out)s_ex1.flagstat",
+ "calmd ex1.bam ex1.fa > %(out)s_ex1.calmd.bam",
+ # use -s option, otherwise the following error in samtools 1.2:
+ # Samtools-htslib-API: bam_get_library() not yet implemented
+ # causes downstream problems
+ # TODO: The following cause subsequent commands to fail
+ # unknown option
+ # "rmdup -s ex1.bam %(out)s_ex1.rmdup.bam",
+ # "merge -f %(out)s_ex1.merge.bam ex1.bam ex1.bam",
+ "reheader ex1.sam ex1.bam > %(out)s_ex1.reheader",
+ "cat -o %(out)s_ex1.cat.bam ex1.bam ex1.bam",
+ "targetcut ex1.bam > %(out)s_ex1.targetcut",
+ "phase ex1.bam > %(out)s_ex1.phase",
+ "import ex1.fa.fai ex1.sam.gz %(out)s_ex1.bam",
+ "bam2fq ex1.bam > %(out)s_ex1.bam2fq",
+ # TODO: not the same
+ # "pad2unpad -T ex1.fa ex2.bam > %(out)s_ex2.unpad",
+ # TODO: command line option problem
+ # "bamshuf ex1.bam -O --output-fmt SAM > %(out)s_ex1.bamshuf.sam",
+ # "collate ex1.bam %(out)s_ex1.collate",
+ "bedcov ex1.bed ex1.bam > %(out)s_ex1.bedcov",
+ "stats ex1.bam > %(out)s_ex1.stats",
+ "dict ex1.bam > %(out)s_ex1.dict",
+ # TODO: not the same
+ # ("addreplacerg -r 'RG\tID:ga\tSM:hs' ex1.bam > %(out)s_ex1.addreplacerg",
+ ]
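For orientation, a small sketch of the %(out)s convention described in the comment above the list: the same statement template is rendered once for the command-line executable and once for pysam, mirroring the substitution done in check_statement() below.

import re

statement = "view ex1.bam > %(out)s_ex1.view"

# render the template for the samtools run and for the pysam run
print(statement % {"out": "samtools"})           # view ex1.bam > samtools_ex1.view
print(re.sub(r"%\(out\)s", "pysam", statement))  # view ex1.bam > pysam_ex1.view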
+
+ map_command = {
+ "import": "samimport"}
+
+ executable = "samtools"
+
+ module = pysam.samtools
+
+ def check_version(self):
+
+ samtools_version = get_version(self.executable)
def _r(s):
# patch - remove any of the alpha/beta suffixes, e.g., 0.1.12a ->
# 0.1.12
@@ -293,100 +125,177 @@ class BinaryTest(unittest.TestCase):
if _r(samtools_version) != _r(pysam.__samtools_version__):
raise ValueError(
- "versions of pysam/samtools and samtools differ: %s != %s" %
- (pysam.__samtools_version__,
+ "versions of pysam.%s and %s differ: %s != %s" %
+ (self.executable,
+ self.executable,
+ pysam.__samtools_version__,
samtools_version))
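The body of _r lies outside this hunk; as an assumption only, the suffix stripping described in the comment could be implemented along these lines:

import re

def _r(s):
    # drop a trailing alpha/beta suffix, e.g. "0.1.12a" -> "0.1.12"
    return re.sub(r"[a-z]+$", "", s.strip())

assert _r("0.1.12a") == "0.1.12"
assert _r("1.3.1") == "1.3.1"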
- def checkCommand(self, command):
- if command:
- samtools_target, pysam_target = self.commands[
- command][0][0], self.commands[command][1][0]
- samtools_target = os.path.join(WORKDIR, samtools_target)
- pysam_target = os.path.join(WORKDIR, pysam_target)
- self.assertTrue(
- checkBinaryEqual(samtools_target, pysam_target),
- "%s failed: files %s and %s are not the same" %
- (command, samtools_target, pysam_target))
-
- def testImport(self):
- self.checkCommand("import")
-
- def testIndex(self):
- self.checkCommand("index")
-
- def testSort(self):
- self.checkCommand("sort")
-
- def testMpileup(self):
- self.checkCommand("mpileup")
-
- def testCalmd(self):
- self.checkCommand("calmd")
-
- def testDepth(self):
- self.checkCommand("depth")
-
- def testIdxstats(self):
- self.checkCommand("idxstats")
-
- def testFixmate(self):
- self.checkCommand("fixmate")
-
- def testFlagstat(self):
- self.checkCommand("flagstat")
-
- def testMerge(self):
- self.checkCommand("merge")
-
- def testRmdup(self):
- self.checkCommand("rmdup")
-
- def testReheader(self):
- self.checkCommand("reheader")
-
- def testCat(self):
- self.checkCommand("cat")
+ def setUp(self):
+ '''setup tests.
- def testTargetcut(self):
- self.checkCommand("targetcut")
+ For setup, the required data files are copied into a working
+ directory. The test statements are then run and their outputs
+ compared in testStatements.
- def testPhase(self):
- self.checkCommand("phase")
+ '''
- def testBam2fq(self):
- self.checkCommand("bam2fq")
+ self.check_version()
- def testBedcov(self):
- self.checkCommand("bedcov")
+ if not os.path.exists(WORKDIR):
+ os.makedirs(WORKDIR)
- def testView(self):
- self.checkCommand("view")
+ for f in self.requisites:
+ shutil.copy(os.path.join(DATADIR, f),
+ os.path.join(WORKDIR, f))
+
+ self.savedir = os.getcwd()
+ os.chdir(WORKDIR)
+
+ return
+
+ def check_statement(self, statement):
+
+ parts = statement.split(" ")
+ r_samtools = {"out": self.executable}
+ r_pysam = {"out": "pysam"}
+
+ command = parts[0]
+ command = self.map_command.get(command, command)
+ # self.assertTrue(command in pysam.SAMTOOLS_DISPATCH)
+
+ targets = [x for x in parts if "%(out)s" in x]
+ samtools_targets = [x % r_samtools for x in targets]
+ pysam_targets = [x % r_pysam for x in targets]
+
+ pysam_method = getattr(self.module, command)
+ # run samtools
+ full_statement = re.sub("%\(out\)s", self.executable, statement)
+ run_command(" ".join((self.executable, full_statement)))
+ # sys.stdout.write("%s %s ok" % (command, self.executable))
+
+ # run pysam
+ if ">" in statement:
+ assert parts[-2] == ">"
+ parts = parts[:-2]
+
+ # avoid interpolation to preserve string quoting, tab chars, etc.
+ pysam_parts = [re.sub("%\(out\)s", "pysam", x) for x in parts[1:]]
+ output = pysam_method(*pysam_parts,
+ raw=True,
+ catch_stdout=True)
+
+ # sys.stdout.write(" pysam ok\n")
+
+ if ">" in statement:
+ with open(pysam_targets[-1], "wb") as outfile:
+ if output is not None:
+ outfile.write(output)
+
+ for samtools_target, pysam_target in zip(samtools_targets,
+ pysam_targets):
+ if os.path.isdir(samtools_target):
+ samtools_files = glob.glob(os.path.join(
+ samtools_target, "*"))
+ pysam_files = glob.glob(os.path.join(pysam_target, "*"))
+ self.assertEqual(len(samtools_files), len(pysam_files))
+ # need to be able to exclude files like README, etc.
+ continue
+ else:
+ samtools_files = [samtools_target]
+ pysam_files = [pysam_target]
+
+ for s, p in zip(samtools_files, pysam_files):
+ self.assertTrue(
+ checkBinaryEqual(s, p),
+ "%s failed: files %s and %s are not the same" %
+ (command, s, p))
+
+ def testStatements(self):
+ for statement in self.statements:
+ self.check_statement(statement)
+
+ def tearDown(self):
+ os.chdir(self.savedir)
+ if os.path.exists(WORKDIR):
+ shutil.rmtree(WORKDIR)
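The comment in check_statement() about avoiding %-interpolation matters because some statements contain literal percent signs; a minimal illustration, using a simplified variant of the bcftools query statement that is commented out further below:

import re

statement = "query -f '%CHROM' ex1.vcf.gz > %(out)s_ex1.query"

# %-interpolation chokes on the literal %C ...
try:
    statement % {"out": "pysam"}
except ValueError as exc:
    print("interpolation failed:", exc)

# ... while a targeted re.sub only replaces the %(out)s placeholder.
print(re.sub(r"%\(out\)s", "pysam", statement))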
- # def testBamshuf(self):
- # self.checkCommand("bamshuf")
- # def testPad2Unpad(self):
- # self.checkCommand("pad2unpad")
+class EmptyIndexTest(unittest.TestCase):
def testEmptyIndex(self):
- self.assertRaises(IOError, pysam.index, "exdoesntexist.bam")
-
- def __del__(self):
- if os.path.exists(WORKDIR):
- shutil.rmtree(WORKDIR)
+ self.assertRaises(IOError, pysam.samtools.index,
+ "exdoesntexist.bam")
class StdoutTest(unittest.TestCase):
'''test if stdout can be redirected.'''
def testWithRedirectedStdout(self):
- r = pysam.flagstat(os.path.join(DATADIR, "ex1.bam"))
+ r = pysam.samtools.flagstat(
+ os.path.join(DATADIR, "ex1.bam"))
self.assertTrue(len(r) > 0)
def testWithoutRedirectedStdout(self):
- r = pysam.flagstat(os.path.join(DATADIR, "ex1.bam"),
- catch_stdout=False)
- self.assertTrue(len(r) == 0)
+ r = pysam.samtools.flagstat(
+ os.path.join(DATADIR, "ex1.bam"),
+ catch_stdout=False)
+ self.assertEqual(r, None)
+
+
+class PysamTest(SamtoolsTest):
+ """check access to samtools command in the pysam
+ main package.
+
+ This is for backwards capability.
+ """
+
+ module = pysam
+
+
+class BcftoolsTest(SamtoolsTest):
+
+ requisites = [
+ "ex1.fa",
+ "ex1.vcf.gz",
+ "ex1.vcf.gz.tbi",
+ ]
+ # a list of statements to test
+ # should contain at least one %(out)s component indicating
+ # an output file.
+ statements = [
+ # "index -n ex1.vcf.gz > %(out)s_ex1.index",
+
+ "annotate -x ID ex1.vcf.gz > %(out)s_ex1.annotate",
+ "concat -a ex1.vcf.gz ex1.vcf.gz > %(out)s_ex1.concat",
+ "isec -p %(out)s_ex1.isec ex1.vcf.gz ex1.vcf.gz",
+ "merge --force-samples ex1.vcf.gz ex1.vcf.gz > %(out)s_ex1.norm",
+ "norm -m +both ex1.vcf.gz > %(out)s_ex1.norm",
+
+ # "plugin",
+ # "query -f '%CHROM\n' ex1.vcf.gz > %(out)s_ex1.query",
+ # "reheader -s A > %(out)s_ex1.reheader",
+ # "view ex1.vcf.gz > %(out)s_ex1.view",
+ # "call -m ex1.vcf.gz > %(out)s_ex1.call",
+ # bad file descriptor
+ # "consensus -f ex1.fa ex1.vcf.gz > %(out)s_ex1.consensus"
+ # need appropriate VCF file
+ # "cnv",
+ # segfault
+ # "filter -s A ex1.vcf.gz > %(out)s_ex1.filter",
+ # exit
+ # "gtcheck -s A ex1.vcf.gz > %(out)s_ex1.gtcheck",
+ "roh -s A ex1.vcf.gz > %(out)s_ex1.roh",
+ "stats ex1.vcf.gz > %(out)s_ex1.stats",
+ ]
+
+ map_command = {
+ "import": "samimport"}
+
+ executable = "bcftools"
+
+ module = pysam.bcftools
+
if __name__ == "__main__":
# build data files
diff --git a/tests/tabix_data/empty.bed.gz b/tests/tabix_data/empty.bed.gz
new file mode 100644
index 0000000..a776e8d
Binary files /dev/null and b/tests/tabix_data/empty.bed.gz differ
diff --git a/tests/tabix_data/empty.bed.gz.tbi b/tests/tabix_data/empty.bed.gz.tbi
new file mode 100644
index 0000000..891fe9f
Binary files /dev/null and b/tests/tabix_data/empty.bed.gz.tbi differ
diff --git a/tests/tabix_data/example_0v23.bed.gz b/tests/tabix_data/example_0v23.bed.gz
new file mode 100644
index 0000000..b67da76
Binary files /dev/null and b/tests/tabix_data/example_0v23.bed.gz differ
diff --git a/tests/tabix_data/example_0v23.bed.gz.tbi b/tests/tabix_data/example_0v23.bed.gz.tbi
new file mode 100644
index 0000000..1077092
Binary files /dev/null and b/tests/tabix_data/example_0v23.bed.gz.tbi differ
diff --git a/tests/tabix_data/example_0v23.vcf.gz b/tests/tabix_data/example_0v23.vcf.gz
new file mode 100644
index 0000000..277bd7b
Binary files /dev/null and b/tests/tabix_data/example_0v23.vcf.gz differ
diff --git a/tests/tabix_data/example_0v23.vcf.gz.tbi b/tests/tabix_data/example_0v23.vcf.gz.tbi
new file mode 100644
index 0000000..27cdb5e
Binary files /dev/null and b/tests/tabix_data/example_0v23.vcf.gz.tbi differ
diff --git a/tests/tabix_data/example_0v26.bed.gz b/tests/tabix_data/example_0v26.bed.gz
new file mode 100644
index 0000000..b67da76
Binary files /dev/null and b/tests/tabix_data/example_0v26.bed.gz differ
diff --git a/tests/tabix_data/example_0v26.bed.gz.tbi b/tests/tabix_data/example_0v26.bed.gz.tbi
new file mode 100644
index 0000000..1077092
Binary files /dev/null and b/tests/tabix_data/example_0v26.bed.gz.tbi differ
diff --git a/tests/tabix_data/example_0v26.vcf.gz b/tests/tabix_data/example_0v26.vcf.gz
new file mode 100644
index 0000000..277bd7b
Binary files /dev/null and b/tests/tabix_data/example_0v26.vcf.gz differ
diff --git a/tests/tabix_data/example_0v26.vcf.gz.tbi b/tests/tabix_data/example_0v26.vcf.gz.tbi
new file mode 100644
index 0000000..27cdb5e
Binary files /dev/null and b/tests/tabix_data/example_0v26.vcf.gz.tbi differ
diff --git a/tests/tabix_data/vcf/vcf_v42.vcf b/tests/tabix_data/vcf/vcf_v42.vcf
new file mode 100644
index 0000000..11dbc0a
--- /dev/null
+++ b/tests/tabix_data/vcf/vcf_v42.vcf
@@ -0,0 +1,25 @@
+##fileformat=VCFv4.2
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
+##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##FILTER=<ID=PASS,Description="All filters passed">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
+20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
+20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
+20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
+20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
+20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
diff --git a/tests/tabix_test.py b/tests/tabix_test.py
index 961f89a..f09ba8c 100644
--- a/tests/tabix_test.py
+++ b/tests/tabix_test.py
@@ -35,21 +35,22 @@ def myzip_open(infile, mode="r"):
return gzip.open(mode)
-def loadAndConvert(infile, encode=True):
- '''load data from infile and convert all fields to string.
+def loadAndConvert(filename, encode=True):
+ '''load data from filename and convert all fields to string.
- Infile can be either plain or compressed (ending in .gz).
+ Filename can be either plain or compressed (ending in .gz).
'''
data = []
- if infile.endswith(".gz"):
- for line in gzip.open(infile):
- line = line.decode("ascii")
- if line.startswith("#"):
- continue
- d = line.strip().split("\t")
- data.append(d)
+ if filename.endswith(".gz"):
+ with gzip.open(filename) as inf:
+ for line in inf:
+ line = line.decode("ascii")
+ if line.startswith("#"):
+ continue
+ d = line.strip().split("\t")
+ data.append(d)
else:
- with open(infile) as f:
+ with open(filename) as f:
for line in f:
if line.startswith("#"):
continue
@@ -69,10 +70,12 @@ def checkBinaryEqual(filename1, filename2):
if os.path.getsize(filename1) != os.path.getsize(filename2):
return False
- infile1 = open(filename1, "rb")
- infile2 = open(filename2, "rb")
-
- d1, d2 = infile1.read(), infile2.read()
+ with open(filename1, "rb") as infile:
+ d1 = infile.read()
+
+ with open(filename2, "rb") as infile:
+ d2 = infile.read()
+
found = False
for c1, c2 in zip(d1, d2):
if c1 != c2:
@@ -80,8 +83,6 @@ def checkBinaryEqual(filename1, filename2):
else:
found = True
- infile1.close()
- infile2.close()
return found
@@ -112,12 +113,10 @@ class TestCompression(unittest.TestCase):
def setUp(self):
- self.tmpfilename = "tmp_%i" % id(self)
- infile = gzip.open(self.filename, "rb")
- outfile = open(self.tmpfilename, "wb")
- outfile.write(infile.read())
- outfile.close()
- infile.close()
+ self.tmpfilename = "tmp_TestCompression_%i" % id(self)
+ with gzip.open(self.filename, "rb") as infile, \
+ open(self.tmpfilename, "wb") as outfile:
+ outfile.write(infile.read())
def testCompression(self):
'''see also issue 106'''
@@ -176,14 +175,13 @@ class IterationTest(unittest.TestCase):
def setUp(self):
lines = []
- inf = gzip.open(self.filename, "rb")
- for line in inf:
- line = line.decode('ascii')
- if line.startswith("#"):
- if not self.with_comments:
- continue
- lines.append(line)
- inf.close()
+ with gzip.open(self.filename, "rb") as inf:
+ for line in inf:
+ line = line.decode('ascii')
+ if line.startswith("#"):
+ if not self.with_comments:
+ continue
+ lines.append(line)
# creates index of contig, start, end, adds content without newline.
self.compare = [
@@ -252,11 +250,10 @@ class TestGZFile(IterationTest):
def setUp(self):
IterationTest.setUp(self)
-
- self.iter = pysam.GZIterator(self.filename)
+ self.gzfile = pysam.GZIterator(self.filename)
def testAll(self):
- result = list(self.iter)
+ result = list(self.gzfile)
ref = self.getSubset()
self.checkPairwise(result, ref)
@@ -350,7 +347,7 @@ class TestIterationWithoutComments(IterationTest):
# to be implemented
# self.assertRaises(IndexError, self.tabix.fetch, "chr1", 1000000, 2000000)
- # raise no error for invalid intervals
+ # raise no error for empty intervals
self.tabix.fetch("chr1", 100, 100)
def testGetContigs(self):
@@ -361,13 +358,12 @@ class TestIterationWithoutComments(IterationTest):
def testHeader(self):
ref = []
- inf = gzip.open(self.filename)
- for x in inf:
- x = x.decode("ascii")
- if not x.startswith("#"):
- break
- ref.append(x[:-1].encode('ascii'))
- inf.close()
+ with gzip.open(self.filename) as inf:
+ for x in inf:
+ x = x.decode("ascii")
+ if not x.startswith("#"):
+ break
+ ref.append(x[:-1].encode('ascii'))
header = list(self.tabix.header)
self.assertEqual(ref, header)
@@ -376,12 +372,15 @@ class TestIterationWithoutComments(IterationTest):
'''test repeated opening of the same file.'''
def func1():
# opens any tabix file
- inf = pysam.TabixFile(self.filename)
- return
-
- for i in range(10000):
+ with pysam.TabixFile(self.filename) as inf:
+ pass
+
+ for i in range(1000):
func1()
+ def tearDown(self):
+ self.tabix.close()
+
class TestIterationWithComments(TestIterationWithoutComments):
@@ -409,18 +408,21 @@ class TestParser(unittest.TestCase):
def testRead(self):
for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
- self.assertEqual(self.compare[x], list(r))
- self.assertEqual(len(self.compare[x]), len(r))
+ c = self.compare[x]
+ self.assertEqual(c, list(r))
+ self.assertEqual(len(c), len(r))
# test indexing
- for c in range(0, len(r)):
- self.assertEqual(self.compare[x][c], r[c])
+ for y in range(0, len(r)):
+ self.assertEqual(c[y], r[y])
# test slicing access
- for c in range(0, len(r) - 1):
- for cc in range(c + 1, len(r)):
- self.assertEqual(self.compare[x][c:cc],
- r[c:cc])
+ for y in range(0, len(r) - 1):
+ for cc in range(y + 1, len(r)):
+ self.assertEqual(c[y:cc],
+ r[y:cc])
+ self.assertEqual("\t".join(map(str, c)),
+ str(r))
def testWrite(self):
@@ -471,11 +473,9 @@ class TestParser(unittest.TestCase):
def testIteratorUncompressed(self):
'''test iteration from uncompressed file.'''
tmpfilename = 'tmp_testIteratorUncompressed'
- infile = gzip.open(self.filename, "rb")
- outfile = open(tmpfilename, "wb")
- outfile.write(infile.read())
- outfile.close()
- infile.close()
+ with gzip.open(self.filename, "rb") as infile, \
+ open(tmpfilename, "wb") as outfile:
+ outfile.write(infile.read())
with open(tmpfilename) as infile:
for x, r in enumerate(pysam.tabix_iterator(
@@ -504,7 +504,6 @@ class TestParser(unittest.TestCase):
b = copy.copy(a)
self.assertEqual(a, b)
-
class TestIterators(unittest.TestCase):
@@ -519,11 +518,9 @@ class TestIterators(unittest.TestCase):
self.tabix = pysam.TabixFile(self.filename)
self.compare = loadAndConvert(self.filename)
self.tmpfilename_uncompressed = 'tmp_TestIterators'
- infile = gzip.open(self.filename, "rb")
- outfile = open(self.tmpfilename_uncompressed, "wb")
- outfile.write(infile.read())
- outfile.close()
- infile.close()
+ with gzip.open(self.filename, "rb") as infile, \
+ open(self.tmpfilename_uncompressed, "wb") as outfile:
+ outfile.write(infile.read())
def open(self):
@@ -535,21 +532,20 @@ class TestIterators(unittest.TestCase):
def testIteration(self):
- infile = self.open()
-
- for x, r in enumerate(self.iterator(infile, self.parser())):
- self.assertEqual(self.compare[x], list(r))
- self.assertEqual(len(self.compare[x]), len(r))
+ with self.open() as infile:
+ for x, r in enumerate(self.iterator(infile, self.parser())):
+ self.assertEqual(self.compare[x], list(r))
+ self.assertEqual(len(self.compare[x]), len(r))
- # test indexing
- for c in range(0, len(r)):
- self.assertEqual(self.compare[x][c], r[c])
+ # test indexing
+ for c in range(0, len(r)):
+ self.assertEqual(self.compare[x][c], r[c])
- # test slicing access
- for c in range(0, len(r) - 1):
- for cc in range(c + 1, len(r)):
- self.assertEqual(self.compare[x][c:cc],
- r[c:cc])
+ # test slicing access
+ for c in range(0, len(r) - 1):
+ for cc in range(c + 1, len(r)):
+ self.assertEqual(self.compare[x][c:cc],
+ r[c:cc])
def testClosedFile(self):
'''test for error when iterating from closed file.'''
@@ -601,6 +597,8 @@ class TestGTF(TestParser):
if r.feature != 'gene':
self.assertTrue(r.transcript_id.startswith("ENST"))
self.assertEqual(c[0], r.contig)
+ self.assertEqual("\t".join(map(str, c)),
+ str(r))
class TestIterationMalformattedGTFFiles(unittest.TestCase):
@@ -613,19 +611,23 @@ class TestIterationMalformattedGTFFiles(unittest.TestCase):
def testGTFTooManyFields(self):
- iterator = self.iterator(
- gzip.open(os.path.join(DATADIR,
- "gtf_toomany_fields.gtf.gz")),
- parser=self.parser())
- self.assertRaises(ValueError, iterator.next)
+ with gzip.open(os.path.join(
+ DATADIR,
+ "gtf_toomany_fields.gtf.gz")) as infile:
+ iterator = self.iterator(
+ infile,
+ parser=self.parser())
+ self.assertRaises(ValueError, iterator.next)
def testGTFTooFewFields(self):
- iterator = self.iterator(
- gzip.open(os.path.join(DATADIR,
- "gtf_toofew_fields.gtf.gz")),
- parser=self.parser())
- self.assertRaises(ValueError, iterator.next)
+ with gzip.open(os.path.join(
+ DATADIR,
+ "gtf_toofew_fields.gtf.gz")) as infile:
+ iterator = self.iterator(
+ infile,
+ parser=self.parser())
+ self.assertRaises(ValueError, iterator.next)
class TestBed(unittest.TestCase):
@@ -646,6 +648,8 @@ class TestBed(unittest.TestCase):
self.assertEqual(int(c[1]), r.start)
self.assertEqual(int(c[2]), r.end)
self.assertEqual(list(c), list(r))
+ self.assertEqual("\t".join(map(str, c)),
+ str(r))
def testWrite(self):
@@ -666,6 +670,9 @@ class TestBed(unittest.TestCase):
self.assertEqual(int(c[2]) + 1, r.end)
self.assertEqual(str(int(c[2]) + 1), r[2])
+ def tearDown(self):
+ self.tabix.close()
+
class TestVCF(unittest.TestCase):
@@ -697,13 +704,15 @@ if IS_PYTHON3:
def testFromTabix(self):
# use ascii encoding - should raise error
- t = pysam.TabixFile(self.tmpfilename + ".gz", encoding="ascii")
- results = list(t.fetch(parser=pysam.asVCF()))
- self.assertRaises(UnicodeDecodeError, getattr, results[1], "id")
+ with pysam.TabixFile(
+ self.tmpfilename + ".gz", encoding="ascii") as t:
+ results = list(t.fetch(parser=pysam.asVCF()))
+ self.assertRaises(UnicodeDecodeError, getattr, results[1], "id")
- t = pysam.TabixFile(self.tmpfilename + ".gz", encoding="utf-8")
- results = list(t.fetch(parser=pysam.asVCF()))
- self.assertEqual(getattr(results[1], "id"), u"Rene\xe9")
+ with pysam.TabixFile(
+ self.tmpfilename + ".gz", encoding="utf-8") as t:
+ results = list(t.fetch(parser=pysam.asVCF()))
+ self.assertEqual(getattr(results[1], "id"), u"Rene\xe9")
def testFromVCF(self):
self.vcf = pysam.VCF()
@@ -751,6 +760,8 @@ class TestVCFFromTabix(TestVCF):
for y in range(len(c) - ncolumns):
self.assertEqual(c[ncolumns + y], r[y])
+ self.assertEqual("\t".join(map(str, c)),
+ str(r))
def testWrite(self):
@@ -793,6 +804,9 @@ class TestVCFFromTabix(TestVCF):
c[ncolumns + y] = "test_%i" % y
r[y] = "test_%i" % y
self.assertEqual(c[ncolumns + y], r[y])
+
+ def tearDown(self):
+ self.tabix.close()
class TestVCFFromVCF(TestVCF):
@@ -813,6 +827,15 @@ class TestVCFFromVCF(TestVCF):
fail_on_opening = ((24, "Error HEADING_NOT_SEPARATED_BY_TABS"),
)
+ fail_on_samples = []
+
+ check_samples = False
+ coordinate_offset = 1
+
+ # value returned for missing values
+ missing_value = "."
+ missing_quality = -1
+
def setUp(self):
TestVCF.setUp(self)
@@ -831,97 +854,180 @@ class TestVCFFromVCF(TestVCF):
else:
self.vcf.connect(self.tmpfilename + ".gz")
+ def get_iterator(self):
+
+ f = open(self.filename)
+ fn = os.path.basename(self.filename)
+
+ for x, msg in self.fail_on_opening:
+ if "%i.vcf" % x == fn:
+ self.assertRaises(ValueError, self.vcf.parse, f)
+ return
+
+ return self.vcf.parse(f)
+
+ def get_field_value(self, record, field):
+ return record[field]
+
+ def sample2value(self, r, v):
+ return r, v
+
+ def alt2value(self, r, v):
+ if r == ".":
+ return [], v
+ else:
+ return r.split(","), list(v)
+
+ def filter2value(self, r, v):
+ if r == "PASS":
+ return [], v
+ elif r == ".":
+ return [], v
+ else:
+ return r.split(";"), v
+
def testParsing(self):
+ itr = self.get_iterator()
+ if itr is None:
+ return
+
fn = os.path.basename(self.filename)
- with open(self.filename) as f:
- for x, msg in self.fail_on_opening:
- if "%i.vcf" % x == fn:
- self.assertRaises(ValueError, self.vcf.parse, f)
- return
- else:
- iter = self.vcf.parse(f)
+ for vcf_code, msg in self.fail_on_parsing:
+ if "%i.vcf" % vcf_code == fn:
+ self.assertRaises((ValueError,
+ AssertionError),
+ list, itr)
+ return
+ # python 2.7
+ # self.assertRaisesRegexp(
+ # ValueError, re.compile(msg), self.vcf.parse, f)
+
+ check_samples = self.check_samples
+ for vcf_code, msg in self.fail_on_samples:
+ if "%i.vcf" % vcf_code == fn:
+ check_samples = False
+
+ for x, r in enumerate(itr):
+ c = self.compare[x]
+ for y, field in enumerate(self.columns):
+ # it is ok to have a missing format column
+ if y == 8 and y == len(c):
+ continue
+
+ val = self.get_field_value(r, field)
+ if field == "pos":
+ self.assertEqual(int(c[y]) - self.coordinate_offset,
+ val)
+ elif field == "alt" or field == "alts":
+ cc, vv = self.alt2value(c[y], val)
+ self.assertEqual(
+ cc, vv,
+ "mismatch in field %s: expected %s, got %s" %
+ (field, cc, vv))
+
+ elif field == "filter":
+ cc, vv = self.filter2value(c[y], val)
+ self.assertEqual(
+ cc, vv,
+ "mismatch in field %s: expected %s, got %s" %
+ (field, cc, vv))
+
+ elif field == "info":
+ # tests for info field not implemented
+ pass
+
+ elif field == "qual" and c[y] == ".":
+ self.assertEqual(
+ self.missing_quality, val,
+ "mismatch in field %s: expected %s, got %s" %
+ (field, c[y], val))
+
+ elif field == "format":
+ # format field converted to list
+ self.assertEqual(
+ c[y].split(":"), list(val),
+ "mismatch in field %s: expected %s, got %s" %
+ (field, c[y], val))
+
+ elif type(val) in (int, float):
+ if c[y] == ".":
+ self.assertEqual(
+ None, val,
+ "mismatch in field %s: expected %s, got %s" %
+ (field, c[y], val))
+ else:
+ self.assertAlmostEqual(
+ float(c[y]), float(val), 2,
+ "mismatch in field %s: expected %s, got %s" %
+ (field, c[y], val))
+ else:
+ if c[y] == ".":
+ ref_val = self.missing_value
+ else:
+ ref_val = c[y]
+ self.assertEqual(
+ ref_val, val,
+ "mismatch in field %s: expected %s(%s), got %s(%s)" %
+ (field, ref_val, type(ref_val), val, type(val)))
+ # parse samples
+ if check_samples:
+ if len(c) == 8:
+ for x, s in enumerate(r.samples):
+ self.assertEqual(
+ [], r.samples[s].values(),
+ "mismatch in sample {}: "
+ "expected [], got {}, src={}, line={}".format(
+ s, r.samples[s].values(),
+ r.samples[s].items(), r))
+ else:
+ for x, s in enumerate(r.samples):
+ ref, comp = self.sample2value(
+ c[9 + x],
+ r.samples[s])
+ self.compare_samples(ref, comp, s, r)
+
+ def compare_samples(self, ref, comp, s, r):
+
+ if ref != comp:
+
+ # if GT is not the first format key, the record is not
+ # VCF conformant and not supported by cbcf.pyx
+ k = r.format.keys()
+ if "GT" in k and k[0] != "GT":
+ return
+
+ # perform an element-wise check to work around rounding differences
+ for a, b in zip(re.split("[:,;]", ref),
+ re.split("[:,;]", comp)):
+ is_float = True
+ try:
+ a = float(a)
+ b = float(b)
+ except ValueError:
+ is_float = False
+
+ if is_float:
+ self.assertAlmostEqual(
+ a, b, 2,
+ "mismatch in sample {}: "
+ "expected {}, got {}, src={}, line={}"
+ .format(
+ s, ref, comp,
+ r.samples[s].items(), r))
+ else:
+ self.assertEqual(
+ a, b,
+ "mismatch in sample {}: "
+ "expected {}, got {}, src={}, line={}"
+ .format(
+ s, ref, comp,
+ r.samples[s].items(), r))
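A short illustration of the element-wise split used in compare_samples() above, applied to a sample column taken from the VCF example included later in this patch:

import re

# sample strings are broken on ':', ',' and ';' so numeric fields can
# be compared with a rounding tolerance
print(re.split("[:,;]", "0|0:48:1:51,51"))  # ['0|0', '48', '1', '51', '51']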
- for x, msg in self.fail_on_parsing:
- if "%i.vcf" % x == fn:
- self.assertRaises(ValueError, list, iter)
- break
- # python 2.7
- # self.assertRaisesRegexp(
- # ValueError, re.compile(msg), self.vcf.parse, f)
- else:
- # do the actual parsing
- for x, r in enumerate(iter):
- c = self.compare[x]
- for y, field in enumerate(self.columns):
- # it is ok to have a missing format column
- if y == 8 and y == len(c):
- continue
-
- val = r[field]
- if field == "pos":
- self.assertEqual(int(c[y]) - 1, val)
- elif field == "alt":
- if c[y] == ".":
- # convert . to empty list
- self.assertEqual(
- [], val,
- "mismatch in field %s: expected %s, got %s" %
- (field, c[y], val))
- else:
- # convert to list
- self.assertEqual(
- c[y].split(","), val,
- "mismatch in field %s: expected %s, got %s" %
- (field, c[y], val))
-
- elif field == "filter":
- if c[y] == "PASS" or c[y] == ".":
- # convert PASS to empty list
- self.assertEqual(
- [], val,
- "mismatch in field %s: expected %s, got %s" %
- (field, c[y], val))
- else:
- # convert to list
- self.assertEqual(
- c[y].split(";"), val,
- "mismatch in field %s: expected %s, got %s" %
- (field, c[y], val))
-
- elif field == "info":
- # tests for info field not implemented
- pass
- elif field == "qual" and c[y] == ".":
- self.assertEqual(
- -1, val,
- "mismatch in field %s: expected %s, got %s" %
- (field, c[y], val))
-
- elif field == "format":
- # format field converted to list
- self.assertEqual(
- c[y].split(":"), val,
- "mismatch in field %s: expected %s, got %s" %
- (field, c[y], val))
-
- elif type(val) in (int, float):
- if c[y] == ".":
- self.assertEqual(
- None, val,
- "mismatch in field %s: expected %s, got %s" %
- (field, c[y], val))
- else:
- self.assertEqual(
- float(c[y]), float(val),
- "mismatch in field %s: expected %s, got %s" %
- (field, c[y], val))
- else:
- self.assertEqual(
- c[y], val,
- "mismatch in field %s: expected %s, got %s" %
- (field, c[y], val))
############################################################################
# create a test class for each example vcf file.
@@ -937,6 +1043,80 @@ for vcf_file in vcf_files:
globals()[n] = type(n, (TestVCFFromVCF,), dict(filename=vcf_file,))
+class TestVCFFromVariantFile(TestVCFFromVCF):
+
+ columns = ("chrom", "pos", "id",
+ "ref", "alts", "qual",
+ "filter", "info", "format")
+
+ fail_on_parsing = []
+ fail_on_opening = []
+ coordinate_offset = 0
+ check_samples = True
+ fail_on_samples = [
+ (9, "PL field not defined. Expected to be scalar, but is array"),
+ (12, "PL field not defined. Expected to be scalar, but is array"),
+ (18, "PL field not defined. Expected to be scalar, but is array"),
+ ]
+
+ # value returned for missing values
+ missing_value = None
+ missing_quality = None
+
+ def filter2value(self, r, v):
+ if r == "PASS":
+ return ["PASS"], list(v)
+ elif r == ".":
+ return [], list(v)
+ else:
+ return r.split(";"), list(v)
+
+ def alt2value(self, r, v):
+ if r == ".":
+ return None, v
+ else:
+ return r.split(","), list(v)
+
+ def sample2value(self, r, smp):
+
+ def convert_field(f):
+ if f is None:
+ return "."
+ elif isinstance(f, tuple):
+ return ",".join(map(convert_field, f))
+ else:
+ return str(f)
+
+ v = smp.values()
+
+ if 'GT' in smp:
+ alleles = [str(a) if a is not None else '.' for a in smp.allele_indices]
+ v[0] = '/|'[smp.phased].join(alleles)
+
+ comp = ":".join(map(convert_field, v))
+
+ if comp.endswith(":."):
+ comp = comp[:-2]
+
+ return r, comp
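The '/|'[smp.phased] expression above selects the genotype separator by indexing a two-character string with a boolean; a standalone illustration:

# False picks '/' (unphased), True picks '|' (phased)
for phased in (False, True):
    print('/|'[phased].join(["0", "1"]))  # 0/1, then 0|1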
+
+ def setUp(self):
+ TestVCF.setUp(self)
+ self.compare = loadAndConvert(self.filename, encode=False)
+
+ def get_iterator(self):
+ vcf = pysam.VariantFile(self.filename)
+ return vcf.fetch()
+
+ def get_field_value(self, record, field):
+ return getattr(record, field)
+
+
+for vcf_file in vcf_files:
+ n = "TestVCFFromVariantFile_%s" % os.path.basename(vcf_file[:-4])
+ globals()[n] = type(n, (TestVCFFromVariantFile,), dict(filename=vcf_file,))
+
+
class TestRemoteFileHTTP(unittest.TestCase):
url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/example_htslib.gtf.gz"
@@ -965,6 +1145,10 @@ class TestRemoteFileHTTP(unittest.TestCase):
self.remote_file,
"header")
+ def tearDown(self):
+ self.remote_file.close()
+ self.local_file.close()
+
class TestIndexArgument(unittest.TestCase):
@@ -977,17 +1161,24 @@ class TestIndexArgument(unittest.TestCase):
def testFetchAll(self):
shutil.copyfile(self.filename_src, self.filename_dst)
shutil.copyfile(self.index_src, self.index_dst)
- same_basename_file = pysam.TabixFile(
- self.filename_src, "r", index=self.index_src)
- same_basename_results = list(same_basename_file.fetch())
- diff_index_file = pysam.TabixFile(
- self.filename_dst, "r", index=self.index_dst)
- diff_index_result = list(diff_index_file.fetch())
+
+ with pysam.TabixFile(
+ self.filename_src, "r", index=self.index_src) as \
+ same_basename_file:
+ same_basename_results = list(same_basename_file.fetch())
+
+ with pysam.TabixFile(
+ self.filename_dst, "r", index=self.index_dst) as \
+ diff_index_file:
+ diff_index_result = list(diff_index_file.fetch())
self.assertEqual(len(same_basename_results), len(diff_index_result))
for x, y in zip(same_basename_results, diff_index_result):
self.assertEqual(x, y)
+ os.unlink(self.filename_dst)
+ os.unlink(self.index_dst)
+
def _TestMultipleIteratorsHelper(filename, multiple_iterators):
'''open file within scope, return iterator.'''
@@ -999,6 +1190,44 @@ def _TestMultipleIteratorsHelper(filename, multiple_iterators):
return iterator
+class TestBackwardsCompatibility(unittest.TestCase):
+ """check if error is raised if a tabix file from an
+ old version is accessed from pysam"""
+
+ def check(self, filename, raises=None):
+ with pysam.TabixFile(filename) as tf:
+ ref = loadAndConvert(filename)
+ if raises is None:
+ self.assertEqual(len(list(tf.fetch())), len(ref))
+ else:
+ self.assertRaises(raises, tf.fetch)
+
+ def testVCF0v23(self):
+ self.check(os.path.join(DATADIR, "example_0v23.vcf.gz"),
+ ValueError)
+
+ def testBED0v23(self):
+ self.check(os.path.join(DATADIR, "example_0v23.bed.gz"),
+ ValueError)
+
+ def testVCF0v26(self):
+ self.check(os.path.join(DATADIR, "example_0v26.vcf.gz"),
+ ValueError)
+
+ def testBED0v26(self):
+ self.check(os.path.join(DATADIR, "example_0v26.bed.gz"),
+ ValueError)
+
+ def testVCF(self):
+ self.check(os.path.join(DATADIR, "example.vcf.gz"))
+
+ def testBED(self):
+ self.check(os.path.join(DATADIR, "example.bed.gz"))
+
+ def testEmpty(self):
+ self.check(os.path.join(DATADIR, "empty.bed.gz"))
+
+
class TestMultipleIterators(unittest.TestCase):
filename = os.path.join(DATADIR, "example.gtf.gz")
@@ -1006,22 +1235,22 @@ class TestMultipleIterators(unittest.TestCase):
def testJoinedIterators(self):
# two iterators working on the same file
- tabix = pysam.TabixFile(self.filename)
- a = tabix.fetch(parser=pysam.asGTF()).next()
- b = tabix.fetch(parser=pysam.asGTF()).next()
- # the first two lines differ only by the feature field
- self.assertEqual(a.feature, "UTR")
- self.assertEqual(b.feature, "exon")
- self.assertEqual(re.sub("UTR", "", str(a)),
- re.sub("exon", "", str(b)))
+ with pysam.TabixFile(self.filename) as tabix:
+ a = tabix.fetch(parser=pysam.asGTF()).next()
+ b = tabix.fetch(parser=pysam.asGTF()).next()
+ # the first two lines differ only by the feature field
+ self.assertEqual(a.feature, "UTR")
+ self.assertEqual(b.feature, "exon")
+ self.assertEqual(re.sub("UTR", "", str(a)),
+ re.sub("exon", "", str(b)))
def testDisjointIterators(self):
# two iterators working on the same file
- tabix = pysam.TabixFile(self.filename)
- a = tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True).next()
- b = tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True).next()
- # both iterators are at top of file
- self.assertEqual(str(a), str(b))
+ with pysam.TabixFile(self.filename) as tabix:
+ a = tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True).next()
+ b = tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True).next()
+ # both iterators are at top of file
+ self.assertEqual(str(a), str(b))
def testScope(self):
# technically it does not really test if the scope is correct
@@ -1034,11 +1263,11 @@ class TestMultipleIterators(unittest.TestCase):
def testDoubleFetch(self):
- f = pysam.TabixFile(self.filename)
+ with pysam.TabixFile(self.filename) as f:
- for a, b in zip(f.fetch(multiple_iterators=True),
- f.fetch(multiple_iterators=True)):
- self.assertEqual(str(a), str(b))
+ for a, b in zip(f.fetch(multiple_iterators=True),
+ f.fetch(multiple_iterators=True)):
+ self.assertEqual(str(a), str(b))
class TestContextManager(unittest.TestCase):
@@ -1052,6 +1281,5 @@ class TestContextManager(unittest.TestCase):
self.assertEqual(tabixfile.closed, True)
-
if __name__ == "__main__":
unittest.main()
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pysam.git
More information about the debian-med-commit
mailing list