[med-svn] [python-pysam] 01/04: Imported Upstream version 0.11.2.2+ds
Afif Elghraoui
afif at moszumanska.debian.org
Tue Jul 4 04:25:06 UTC 2017
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository python-pysam.
commit 1520aab08562a7f44fd4570ea351a3dbd5db5a35
Author: Afif Elghraoui <afif at debian.org>
Date: Sun Jul 2 03:50:19 2017 -0400
Imported Upstream version 0.11.2.2+ds
---
.gitignore | 14 +-
MANIFEST.in | 7 +-
bcftools/HMM.c | 141 +-
bcftools/HMM.c.pysam.c | 141 +-
bcftools/HMM.h | 26 +-
{samtools => bcftools}/bam2bcf.c | 54 +-
{samtools => bcftools}/bam2bcf.c.pysam.c | 54 +-
{samtools => bcftools}/bam2bcf.h | 7 +-
{samtools => bcftools}/bam2bcf_indel.c | 141 +-
{samtools => bcftools}/bam2bcf_indel.c.pysam.c | 141 +-
bcftools/bam_sample.c | 393 ++
bcftools/bam_sample.c.pysam.c | 395 ++
samtools/errmod.h => bcftools/bam_sample.h | 41 +-
bcftools/bcftools.h | 1 +
bcftools/bin.c | 104 +
bcftools/bin.c.pysam.c | 106 +
bcftools/bin.h | 65 +
bcftools/call.h | 3 +-
bcftools/ccall.c | 10 +-
bcftools/ccall.c.pysam.c | 10 +-
bcftools/consensus.c | 56 +-
bcftools/consensus.c.pysam.c | 56 +-
bcftools/convert.c | 490 ++-
bcftools/convert.c.pysam.c | 490 ++-
bcftools/csq.c | 3824 +++++++++++++++++++
bcftools/csq.c.pysam.c | 3826 ++++++++++++++++++++
bcftools/filter.c | 357 +-
bcftools/filter.c.pysam.c | 357 +-
bcftools/hclust.c | 400 ++
bcftools/hclust.c.pysam.c | 402 ++
bcftools/hclust.h | 77 +
bcftools/kheap.h | 171 +
bcftools/main.c | 10 +
bcftools/main.c.pysam.c | 10 +
bcftools/mcall.c | 228 +-
bcftools/mcall.c.pysam.c | 228 +-
bcftools/mpileup.c | 1110 ++++++
bcftools/mpileup.c.pysam.c | 1112 ++++++
bcftools/mw.h | 1944 ++++++++++
bcftools/ploidy.c | 22 +-
bcftools/ploidy.c.pysam.c | 22 +-
bcftools/ploidy.h | 2 +-
bcftools/prob1.c | 12 +-
bcftools/prob1.c.pysam.c | 12 +-
bcftools/prob1.h | 2 +-
bcftools/regidx.c | 598 +++
bcftools/regidx.c.pysam.c | 600 +++
bcftools/regidx.h | 191 +
bcftools/smpl_ilist.c | 106 +
bcftools/smpl_ilist.c.pysam.c | 108 +
bcftools/smpl_ilist.h | 47 +
bcftools/tabix.c | 30 +-
bcftools/tabix.c.pysam.c | 30 +-
bcftools/tsv2vcf.c | 1 +
bcftools/tsv2vcf.c.pysam.c | 1 +
bcftools/vcfannotate.c | 744 ++--
bcftools/vcfannotate.c.pysam.c | 744 ++--
bcftools/vcfcall.c | 44 +-
bcftools/vcfcall.c.pysam.c | 44 +-
bcftools/vcfcnv.c | 46 +-
bcftools/vcfcnv.c.pysam.c | 46 +-
bcftools/vcfconcat.c | 164 +-
bcftools/vcfconcat.c.pysam.c | 164 +-
bcftools/vcfconvert.c | 120 +-
bcftools/vcfconvert.c.pysam.c | 120 +-
bcftools/vcffilter.c | 3 +-
bcftools/vcffilter.c.pysam.c | 3 +-
bcftools/vcfgtcheck.c | 298 +-
bcftools/vcfgtcheck.c.pysam.c | 298 +-
bcftools/vcfindex.c | 115 +-
bcftools/vcfindex.c.pysam.c | 115 +-
bcftools/vcfmerge.c | 1075 ++++--
bcftools/vcfmerge.c.pysam.c | 1075 ++++--
bcftools/vcfnorm.c | 82 +-
bcftools/vcfnorm.c.pysam.c | 82 +-
bcftools/vcfplugin.c | 56 +-
bcftools/vcfplugin.c.pysam.c | 56 +-
bcftools/vcfroh.c | 961 +++--
bcftools/vcfroh.c.pysam.c | 961 +++--
bcftools/vcfstats.c | 429 ++-
bcftools/vcfstats.c.pysam.c | 429 ++-
bcftools/vcfview.c | 50 +-
bcftools/vcfview.c.pysam.c | 50 +-
bcftools/version.h | 2 +-
buildwheels.sh | 2 +-
doc/api.rst | 4 +-
doc/release.rst | 75 +
doc/usage.rst | 19 +-
import.py | 29 +-
pysam/__init__.py | 2 +
pysam/cbcftools_util.h | 6 +
pysam/csamtools_util.h | 6 +
pysam/htslib_util.h | 20 -
pysam/libcalignedsegment.pxd | 6 -
pysam/libcalignedsegment.pyx | 288 +-
pysam/libcalignmentfile.pyx | 344 +-
pysam/libcbcf.pxd | 21 +-
pysam/libcbcf.pyx | 848 ++++-
pysam/libcbcftools.pxd | 3 +
pysam/libcbcftools.pyx | 2 +
pysam/libcbgzf.pyx | 53 +-
pysam/libcfaidx.pyx | 31 +-
pysam/libchtslib.pxd | 679 +++-
pysam/libchtslib.pyx | 285 +-
pysam/libcsamtools.pxd | 3 +
pysam/libcsamtools.pyx | 2 +
pysam/libctabix.pxd | 4 +
pysam/libctabix.pyx | 42 +-
pysam/libctabixproxies.pxd | 14 +-
pysam/libctabixproxies.pyx | 506 ++-
pysam/libcutils.pxd | 4 +-
pysam/libcutils.pyx | 30 +-
pysam/pysam_util.c | 6 +-
pysam/pysam_util.h | 4 +
pysam/samfile_util.c | 172 -
pysam/samfile_util.h | 3 -
pysam/tabix_util.c | 1 +
pysam/utils.py | 13 +-
pysam/version.py | 11 +-
samtools/bam.h | 2 +-
samtools/bam2bcf.c | 2 +-
samtools/bam2bcf.c.pysam.c | 2 +-
samtools/bam2bcf.h | 2 +-
samtools/bam2bcf_indel.c | 26 +-
samtools/bam2bcf_indel.c.pysam.c | 26 +-
samtools/bam2depth.c | 30 +-
samtools/bam2depth.c.pysam.c | 30 +-
samtools/bam_addrprg.c | 86 +-
samtools/bam_addrprg.c.pysam.c | 86 +-
samtools/bam_cat.c | 60 +-
samtools/bam_cat.c.pysam.c | 60 +-
samtools/bam_index.c | 52 +-
samtools/bam_index.c.pysam.c | 52 +-
samtools/bam_mate.c | 71 +-
samtools/bam_mate.c.pysam.c | 71 +-
samtools/bam_md.c | 202 +-
samtools/bam_md.c.pysam.c | 204 +-
samtools/bam_plcmd.c | 122 +-
samtools/bam_plcmd.c.pysam.c | 122 +-
samtools/bam_quickcheck.c | 52 +-
samtools/bam_quickcheck.c.pysam.c | 52 +-
samtools/bam_reheader.c | 4 +-
samtools/bam_reheader.c.pysam.c | 14 +-
samtools/bam_rmdup.c | 4 +-
samtools/bam_rmdup.c.pysam.c | 4 +-
samtools/bam_sort.c | 178 +-
samtools/bam_sort.c.pysam.c | 178 +-
samtools/bam_split.c | 99 +-
samtools/bam_split.c.pysam.c | 99 +-
samtools/bam_stat.c | 28 +-
samtools/bam_stat.c.pysam.c | 38 +-
samtools/bam_tview.c | 441 ---
samtools/bam_tview.c.pysam.c | 443 ---
samtools/bam_tview.h | 105 -
samtools/bam_tview_curses.c | 352 --
samtools/bam_tview_curses.c.pysam.c | 354 --
samtools/bam_tview_html.c | 377 --
samtools/bam_tview_html.c.pysam.c | 379 --
samtools/bamshuf.c | 22 +-
samtools/bamshuf.c.pysam.c | 22 +-
samtools/bamtk.c | 35 +-
samtools/bamtk.c.pysam.c | 39 +-
samtools/bedcov.c | 8 +-
samtools/bedcov.c.pysam.c | 8 +-
samtools/cut_target.c | 19 +-
samtools/cut_target.c.pysam.c | 19 +-
samtools/errmod.c | 194 -
samtools/errmod.c.pysam.c | 196 -
samtools/faidx.c | 74 +-
samtools/faidx.c.pysam.c | 74 +-
samtools/kprobaln.c | 282 --
samtools/kprobaln.c.pysam.c | 284 --
samtools/kprobaln.h | 49 -
samtools/misc/ace2sam.c | 5 +-
samtools/misc/ace2sam.c.pysam.c | 5 +-
samtools/padding.c | 4 +-
samtools/padding.c.pysam.c | 4 +-
samtools/phase.c | 6 +-
samtools/phase.c.pysam.c | 6 +-
samtools/sam.h | 2 +-
samtools/sam_opts.c | 8 +-
samtools/sam_opts.c.pysam.c | 8 +-
samtools/sam_opts.h | 7 +-
samtools/{test/test.c => sam_utils.c} | 51 +-
.../{test/test.c.pysam.c => sam_utils.c.pysam.c} | 51 +-
samtools/sam_view.c | 498 ++-
samtools/sam_view.c.pysam.c | 500 ++-
samtools/stats.c | 20 +-
samtools/stats.c.pysam.c | 29 +-
samtools/test/split/test_filter_header_rg.c | 15 +-
.../test/split/test_filter_header_rg.c.pysam.c | 15 +-
samtools/test/test.c | 8 +-
samtools/test/test.c.pysam.c | 8 +-
samtools/version.h | 2 +-
setup.py | 69 +-
tests/AlignedSegment_test.py | 93 +-
tests/AlignmentFile_test.py | 143 +-
tests/SamFile_test.py | 1990 ----------
tests/StreamFiledescriptors_test.py | 71 +-
tests/TestUtils.py | 32 +-
tests/VariantFile_test.py | 5 +-
tests/faidx_test.py | 14 +-
tests/samtools_test.py | 47 +-
tests/tabix_data/example.gff2.gz | Bin 0 -> 238 bytes
tests/tabix_data/example.gff2.gz.tbi | Bin 0 -> 107 bytes
tests/tabix_data/example.gff3.gz | Bin 0 -> 3067 bytes
tests/tabix_data/example.gff3.gz.tbi | Bin 0 -> 1457 bytes
tests/tabix_test.py | 262 +-
tests/tabixproxies_test.py | 318 ++
209 files changed, 28682 insertions(+), 12148 deletions(-)
diff --git a/.gitignore b/.gitignore
index 598948d..0910be8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,19 +23,7 @@ htslib/config.mk
pysam/config.py
# cython files
-pysam/TabProxies.c
-pysam/csamtools.c
-pysam/ctabix.c
-pysam/cvcf.c
-pysam/chtslib.c
-pysam/cutils.c
-pysam/calignedsegment.c
-pysam/calignmentfile.c
-pysam/cbcf.c
-pysam/cfaidx.c
-pysam/chtslib.c
-pysam/csamfile.c
-pysam/ctabixproxies.c
+pysam/libc*.c
###### Generic python ignores below ######
diff --git a/MANIFEST.in b/MANIFEST.in
index be43691..3f2a9cb 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -10,9 +10,9 @@ include KNOWN_BUGS
include THANKS
include cy_build.py
include requirements.txt
-include pysam/c*.pxd
-include pysam/c*.pyx
-include pysam/c*.c
+include pysam/libc*.pxd
+include pysam/libc*.pyx
+include pysam/libc*.c
include pysam/*.c
include pysam/*.h
include samtools/configure
@@ -29,6 +29,7 @@ include htslib/htslib_vars.mk
include htslib/configure
include htslib/config.mk.in
include htslib/config.h.in
+include htslib/htslib.pc.in
include htslib/htslib/*.h
include htslib/cram/*.c
include htslib/cram/*.h
diff --git a/bcftools/HMM.c b/bcftools/HMM.c
index 9196544..5795987 100644
--- a/bcftools/HMM.c
+++ b/bcftools/HMM.c
@@ -31,6 +31,17 @@
#include <htslib/hts.h>
#include "HMM.h"
+typedef struct
+{
+ int nstates; // number of hmm's states
+ int isite; // take snapshot at i-th position
+ uint32_t pos; // i-th site's position
+ double *vit_prob; // viterbi probabilities, NULL for uniform probs
+ double *fwd_prob; // transition probabilities
+ double *bwd_prob; // transition probabilities
+}
+snapshot_t;
+
struct _hmm_t
{
int nstates; // number of states
@@ -50,7 +61,8 @@ struct _hmm_t
set_tprob_f set_tprob; // Optional user function to set / modify transition probabilities
// at each site (one step of Viterbi algorithm)
void *set_tprob_data;
- double *init_probs; // Initial state probabilities, NULL for uniform probs
+ snapshot_t init; // Initial state probabilities. Set isite=1 when site should be used
+ snapshot_t *snapshot;
};
uint8_t *hmm_get_viterbi_path(hmm_t *hmm) { return hmm->vpath; }
@@ -78,28 +90,79 @@ static inline void multiply_matrix(int n, double *a, double *b, double *dst, dou
memcpy(dst,out,sizeof(double)*n*n);
}
+void hmm_init_states(hmm_t *hmm, double *probs)
+{
+ hmm->init.isite = 0;
+ hmm->init.pos = 0;
+ if ( !hmm->init.vit_prob )
+ hmm->init.vit_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+ if ( !hmm->init.fwd_prob )
+ hmm->init.fwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+ if ( !hmm->init.bwd_prob )
+ hmm->init.bwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+
+ int i;
+ if ( probs )
+ {
+ memcpy(hmm->init.vit_prob,probs,sizeof(double)*hmm->nstates);
+ double sum = 0;
+ for (i=0; i<hmm->nstates; i++) sum += hmm->init.vit_prob[i];
+ for (i=0; i<hmm->nstates; i++) hmm->init.vit_prob[i] /= sum;
+ }
+ else
+ for (i=0; i<hmm->nstates; i++) hmm->init.vit_prob[i] = 1./hmm->nstates;
+
+ memcpy(hmm->init.fwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+ memcpy(hmm->init.bwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+}
hmm_t *hmm_init(int nstates, double *tprob, int ntprob)
{
hmm_t *hmm = (hmm_t*) calloc(1,sizeof(hmm_t));
hmm->nstates = nstates;
hmm->curr_tprob = (double*) malloc(sizeof(double)*nstates*nstates);
hmm->tmp = (double*) malloc(sizeof(double)*nstates*nstates);
-
hmm_set_tprob(hmm, tprob, ntprob);
-
+ hmm_init_states(hmm, NULL);
return hmm;
}
-void hmm_init_states(hmm_t *hmm, double *probs)
+void *hmm_snapshot(hmm_t *hmm, void *_snapshot, int isite)
{
- if ( !probs )
+ snapshot_t *snapshot = (snapshot_t*) _snapshot;
+ if ( snapshot && snapshot->nstates!=hmm->nstates )
{
- free(hmm->init_probs);
- hmm->init_probs = NULL;
+ free(snapshot);
+ snapshot = NULL;
}
-
- if ( !hmm->init_probs ) hmm->init_probs = (double*) malloc(sizeof(double)*hmm->nstates);
- memcpy(hmm->init_probs,probs,sizeof(double)*hmm->nstates);
+ if ( !snapshot )
+ {
+ // Allocate the snapshot as a single memory block so that it can be
+ // free()-ed by the user. So make sure the arrays are aligned..
+ size_t str_size = sizeof(snapshot_t);
+ size_t dbl_size = sizeof(double);
+ size_t pad_size = (dbl_size - str_size % dbl_size) % dbl_size;
+ uint8_t *mem = (uint8_t*) malloc(str_size + pad_size + dbl_size*2*hmm->nstates);
+ snapshot = (snapshot_t*) mem;
+ snapshot->nstates = hmm->nstates;
+ snapshot->vit_prob = (double*) (mem + str_size + pad_size);
+ snapshot->fwd_prob = snapshot->vit_prob + hmm->nstates;
+ }
+ snapshot->isite = isite;
+ hmm->snapshot = snapshot;
+ return snapshot;
+}
+void hmm_restore(hmm_t *hmm, void *_snapshot)
+{
+ snapshot_t *snapshot = (snapshot_t*) _snapshot;
+ if ( !snapshot )
+ {
+ hmm->init.isite = 0;
+ return;
+ }
+ hmm->init.isite = 1;
+ hmm->init.pos = snapshot->pos;
+ memcpy(hmm->init.vit_prob,snapshot->vit_prob,sizeof(double)*hmm->nstates);
+ memcpy(hmm->init.fwd_prob,snapshot->fwd_prob,sizeof(double)*hmm->nstates);
}
void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob)
@@ -154,23 +217,18 @@ void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
hmm->vprob_tmp = (double*) malloc(sizeof(double)*hmm->nstates);
}
-
// Init all states with equal likelihood
int i,j, nstates = hmm->nstates;
- if ( hmm->init_probs )
- for (i=0; i<nstates; i++) hmm->vprob[i] = hmm->init_probs[i];
- else
- for (i=0; i<nstates; i++) hmm->vprob[i] = 1./nstates;
+ memcpy(hmm->vprob, hmm->init.vit_prob, sizeof(*hmm->init.vit_prob)*nstates);
+ uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
// Run Viterbi
- uint32_t prev_pos = sites[0];
for (i=0; i<n; i++)
{
uint8_t *vpath = &hmm->vpath[i*nstates];
double *eprob = &eprobs[i*nstates];
int pos_diff = sites[i] == prev_pos ? 0 : sites[i] - prev_pos - 1;
-
_set_tprob(hmm, pos_diff);
if ( hmm->set_tprob ) hmm->set_tprob(hmm, prev_pos, sites[i], hmm->set_tprob_data, hmm->curr_tprob);
prev_pos = sites[i];
@@ -191,6 +249,12 @@ void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
}
for (j=0; j<nstates; j++) hmm->vprob_tmp[j] /= vnorm;
double *tmp = hmm->vprob; hmm->vprob = hmm->vprob_tmp; hmm->vprob_tmp = tmp;
+
+ if ( hmm->snapshot && i==hmm->snapshot->isite )
+ {
+ hmm->snapshot->pos = sites[i];
+ memcpy(hmm->snapshot->vit_prob, hmm->vprob, sizeof(*hmm->vprob)*nstates);
+ }
}
// Find the most likely state
@@ -224,19 +288,12 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
// Init all states with equal likelihood
int i,j,k, nstates = hmm->nstates;
- if ( hmm->init_probs )
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = hmm->init_probs[i];
- for (i=0; i<nstates; i++) hmm->bwd[i] = hmm->init_probs[i];
- }
- else
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = 1./hmm->nstates;
- for (i=0; i<nstates; i++) hmm->bwd[i] = 1./hmm->nstates;
- }
+ memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates);
+ memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates);
+
+ uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
// Run fwd
- uint32_t prev_pos = sites[0];
for (i=0; i<n; i++)
{
double *fwd_prev = &hmm->fwd[i*nstates];
@@ -261,6 +318,13 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
for (j=0; j<nstates; j++) fwd[j] /= norm;
}
+ if ( hmm->snapshot )
+ {
+ i = hmm->snapshot->isite;
+ hmm->snapshot->pos = sites[i];
+ memcpy(hmm->snapshot->fwd_prob, hmm->fwd + (i+1)*nstates, sizeof(*hmm->fwd)*nstates);
+ }
+
// Run bwd
double *bwd = hmm->bwd, *bwd_tmp = hmm->bwd_tmp;
prev_pos = sites[n-1];
@@ -296,7 +360,7 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
}
}
-void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
+double *hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
{
// Init arrays when run for the first time
if ( hmm->nfwd < n )
@@ -312,16 +376,9 @@ void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
// Init all states with equal likelihood
int i,j,k, nstates = hmm->nstates;
- if ( hmm->init_probs )
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = hmm->init_probs[i];
- for (i=0; i<nstates; i++) hmm->bwd[i] = hmm->init_probs[i];
- }
- else
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = 1./hmm->nstates;
- for (i=0; i<nstates; i++) hmm->bwd[i] = 1./hmm->nstates;
- }
+ memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates);
+ memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates);
+ uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
// New transition matrix: temporary values
double *tmp_xi = (double*) calloc(nstates*nstates,sizeof(double));
@@ -329,7 +386,6 @@ void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
double *fwd_bwd = (double*) malloc(sizeof(double)*nstates);
// Run fwd
- uint32_t prev_pos = sites[0];
for (i=0; i<n; i++)
{
double *fwd_prev = &hmm->fwd[i*nstates];
@@ -416,11 +472,14 @@ void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
free(tmp_gamma);
free(tmp_xi);
free(fwd_bwd);
+ return hmm->curr_tprob;
}
void hmm_destroy(hmm_t *hmm)
{
- free(hmm->init_probs);
+ free(hmm->init.vit_prob);
+ free(hmm->init.fwd_prob);
+ free(hmm->init.bwd_prob);
free(hmm->vprob);
free(hmm->vprob_tmp);
free(hmm->vpath);
diff --git a/bcftools/HMM.c.pysam.c b/bcftools/HMM.c.pysam.c
index a3b91ff..513da35 100644
--- a/bcftools/HMM.c.pysam.c
+++ b/bcftools/HMM.c.pysam.c
@@ -33,6 +33,17 @@
#include <htslib/hts.h>
#include "HMM.h"
+typedef struct
+{
+ int nstates; // number of hmm's states
+ int isite; // take snapshot at i-th position
+ uint32_t pos; // i-th site's position
+ double *vit_prob; // viterbi probabilities, NULL for uniform probs
+ double *fwd_prob; // transition probabilities
+ double *bwd_prob; // transition probabilities
+}
+snapshot_t;
+
struct _hmm_t
{
int nstates; // number of states
@@ -52,7 +63,8 @@ struct _hmm_t
set_tprob_f set_tprob; // Optional user function to set / modify transition probabilities
// at each site (one step of Viterbi algorithm)
void *set_tprob_data;
- double *init_probs; // Initial state probabilities, NULL for uniform probs
+ snapshot_t init; // Initial state probabilities. Set isite=1 when site should be used
+ snapshot_t *snapshot;
};
uint8_t *hmm_get_viterbi_path(hmm_t *hmm) { return hmm->vpath; }
@@ -80,28 +92,79 @@ static inline void multiply_matrix(int n, double *a, double *b, double *dst, dou
memcpy(dst,out,sizeof(double)*n*n);
}
+void hmm_init_states(hmm_t *hmm, double *probs)
+{
+ hmm->init.isite = 0;
+ hmm->init.pos = 0;
+ if ( !hmm->init.vit_prob )
+ hmm->init.vit_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+ if ( !hmm->init.fwd_prob )
+ hmm->init.fwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+ if ( !hmm->init.bwd_prob )
+ hmm->init.bwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+
+ int i;
+ if ( probs )
+ {
+ memcpy(hmm->init.vit_prob,probs,sizeof(double)*hmm->nstates);
+ double sum = 0;
+ for (i=0; i<hmm->nstates; i++) sum += hmm->init.vit_prob[i];
+ for (i=0; i<hmm->nstates; i++) hmm->init.vit_prob[i] /= sum;
+ }
+ else
+ for (i=0; i<hmm->nstates; i++) hmm->init.vit_prob[i] = 1./hmm->nstates;
+
+ memcpy(hmm->init.fwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+ memcpy(hmm->init.bwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+}
hmm_t *hmm_init(int nstates, double *tprob, int ntprob)
{
hmm_t *hmm = (hmm_t*) calloc(1,sizeof(hmm_t));
hmm->nstates = nstates;
hmm->curr_tprob = (double*) malloc(sizeof(double)*nstates*nstates);
hmm->tmp = (double*) malloc(sizeof(double)*nstates*nstates);
-
hmm_set_tprob(hmm, tprob, ntprob);
-
+ hmm_init_states(hmm, NULL);
return hmm;
}
-void hmm_init_states(hmm_t *hmm, double *probs)
+void *hmm_snapshot(hmm_t *hmm, void *_snapshot, int isite)
{
- if ( !probs )
+ snapshot_t *snapshot = (snapshot_t*) _snapshot;
+ if ( snapshot && snapshot->nstates!=hmm->nstates )
{
- free(hmm->init_probs);
- hmm->init_probs = NULL;
+ free(snapshot);
+ snapshot = NULL;
}
-
- if ( !hmm->init_probs ) hmm->init_probs = (double*) malloc(sizeof(double)*hmm->nstates);
- memcpy(hmm->init_probs,probs,sizeof(double)*hmm->nstates);
+ if ( !snapshot )
+ {
+ // Allocate the snapshot as a single memory block so that it can be
+ // free()-ed by the user. So make sure the arrays are aligned..
+ size_t str_size = sizeof(snapshot_t);
+ size_t dbl_size = sizeof(double);
+ size_t pad_size = (dbl_size - str_size % dbl_size) % dbl_size;
+ uint8_t *mem = (uint8_t*) malloc(str_size + pad_size + dbl_size*2*hmm->nstates);
+ snapshot = (snapshot_t*) mem;
+ snapshot->nstates = hmm->nstates;
+ snapshot->vit_prob = (double*) (mem + str_size + pad_size);
+ snapshot->fwd_prob = snapshot->vit_prob + hmm->nstates;
+ }
+ snapshot->isite = isite;
+ hmm->snapshot = snapshot;
+ return snapshot;
+}
+void hmm_restore(hmm_t *hmm, void *_snapshot)
+{
+ snapshot_t *snapshot = (snapshot_t*) _snapshot;
+ if ( !snapshot )
+ {
+ hmm->init.isite = 0;
+ return;
+ }
+ hmm->init.isite = 1;
+ hmm->init.pos = snapshot->pos;
+ memcpy(hmm->init.vit_prob,snapshot->vit_prob,sizeof(double)*hmm->nstates);
+ memcpy(hmm->init.fwd_prob,snapshot->fwd_prob,sizeof(double)*hmm->nstates);
}
void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob)
@@ -156,23 +219,18 @@ void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
hmm->vprob_tmp = (double*) malloc(sizeof(double)*hmm->nstates);
}
-
// Init all states with equal likelihood
int i,j, nstates = hmm->nstates;
- if ( hmm->init_probs )
- for (i=0; i<nstates; i++) hmm->vprob[i] = hmm->init_probs[i];
- else
- for (i=0; i<nstates; i++) hmm->vprob[i] = 1./nstates;
+ memcpy(hmm->vprob, hmm->init.vit_prob, sizeof(*hmm->init.vit_prob)*nstates);
+ uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
// Run Viterbi
- uint32_t prev_pos = sites[0];
for (i=0; i<n; i++)
{
uint8_t *vpath = &hmm->vpath[i*nstates];
double *eprob = &eprobs[i*nstates];
int pos_diff = sites[i] == prev_pos ? 0 : sites[i] - prev_pos - 1;
-
_set_tprob(hmm, pos_diff);
if ( hmm->set_tprob ) hmm->set_tprob(hmm, prev_pos, sites[i], hmm->set_tprob_data, hmm->curr_tprob);
prev_pos = sites[i];
@@ -193,6 +251,12 @@ void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
}
for (j=0; j<nstates; j++) hmm->vprob_tmp[j] /= vnorm;
double *tmp = hmm->vprob; hmm->vprob = hmm->vprob_tmp; hmm->vprob_tmp = tmp;
+
+ if ( hmm->snapshot && i==hmm->snapshot->isite )
+ {
+ hmm->snapshot->pos = sites[i];
+ memcpy(hmm->snapshot->vit_prob, hmm->vprob, sizeof(*hmm->vprob)*nstates);
+ }
}
// Find the most likely state
@@ -226,19 +290,12 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
// Init all states with equal likelihood
int i,j,k, nstates = hmm->nstates;
- if ( hmm->init_probs )
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = hmm->init_probs[i];
- for (i=0; i<nstates; i++) hmm->bwd[i] = hmm->init_probs[i];
- }
- else
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = 1./hmm->nstates;
- for (i=0; i<nstates; i++) hmm->bwd[i] = 1./hmm->nstates;
- }
+ memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates);
+ memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates);
+
+ uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
// Run fwd
- uint32_t prev_pos = sites[0];
for (i=0; i<n; i++)
{
double *fwd_prev = &hmm->fwd[i*nstates];
@@ -263,6 +320,13 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
for (j=0; j<nstates; j++) fwd[j] /= norm;
}
+ if ( hmm->snapshot )
+ {
+ i = hmm->snapshot->isite;
+ hmm->snapshot->pos = sites[i];
+ memcpy(hmm->snapshot->fwd_prob, hmm->fwd + (i+1)*nstates, sizeof(*hmm->fwd)*nstates);
+ }
+
// Run bwd
double *bwd = hmm->bwd, *bwd_tmp = hmm->bwd_tmp;
prev_pos = sites[n-1];
@@ -298,7 +362,7 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
}
}
-void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
+double *hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
{
// Init arrays when run for the first time
if ( hmm->nfwd < n )
@@ -314,16 +378,9 @@ void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
// Init all states with equal likelihood
int i,j,k, nstates = hmm->nstates;
- if ( hmm->init_probs )
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = hmm->init_probs[i];
- for (i=0; i<nstates; i++) hmm->bwd[i] = hmm->init_probs[i];
- }
- else
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = 1./hmm->nstates;
- for (i=0; i<nstates; i++) hmm->bwd[i] = 1./hmm->nstates;
- }
+ memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates);
+ memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates);
+ uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
// New transition matrix: temporary values
double *tmp_xi = (double*) calloc(nstates*nstates,sizeof(double));
@@ -331,7 +388,6 @@ void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
double *fwd_bwd = (double*) malloc(sizeof(double)*nstates);
// Run fwd
- uint32_t prev_pos = sites[0];
for (i=0; i<n; i++)
{
double *fwd_prev = &hmm->fwd[i*nstates];
@@ -418,11 +474,14 @@ void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
free(tmp_gamma);
free(tmp_xi);
free(fwd_bwd);
+ return hmm->curr_tprob;
}
void hmm_destroy(hmm_t *hmm)
{
- free(hmm->init_probs);
+ free(hmm->init.vit_prob);
+ free(hmm->init.fwd_prob);
+ free(hmm->init.bwd_prob);
free(hmm->vprob);
free(hmm->vprob_tmp);
free(hmm->vpath);
diff --git a/bcftools/HMM.h b/bcftools/HMM.h
index 7f01245..3e5cf7f 100644
--- a/bcftools/HMM.h
+++ b/bcftools/HMM.h
@@ -44,6 +44,10 @@ typedef void (*set_tprob_f) (hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *
hmm_t *hmm_init(int nstates, double *tprob, int ntprob);
void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob);
+#define HMM_VIT 1
+#define HMM_FWD 2
+#define HMM_BWD 4
+
/**
* hmm_init_states() - initial state probabilities
* @probs: initial state probabilities or NULL to reset to default
@@ -53,6 +57,20 @@ void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob);
void hmm_init_states(hmm_t *hmm, double *probs);
/**
+ * hmm_snapshot() - take the model's snapshot, intended for sliding HMM
+ * @snapshot: NULL or snapshot returned by previous hmm_snapshot() call, must be free()-ed by the caller
+ * @isite: take the snapshot at i-th step
+ */
+void *hmm_snapshot(hmm_t *hmm, void *snapshot, int isite);
+
+/**
+ * hmm_restore() - restore model's snapshot, intended for sliding HMM
+ * @snapshot: snapshot returned by hmm_snapshot() call or NULL to reset
+ * @isite: take the snapshot at i-th step
+ */
+void hmm_restore(hmm_t *hmm, void *snapshot);
+
+/**
* hmm_get_tprob() - return the array of transition matrices, precalculated
* to ntprob positions. The first matrix is the initial tprob matrix
* set by hmm_init() or hmm_set_tprob()
@@ -103,11 +121,11 @@ double *hmm_get_fwd_bwd_prob(hmm_t *hmm);
* @eprob: emission probabilities for each site and state (nsites x nstates)
* @sites: list of positions
*
- * Same as hmm_run_fwd_bwd, in addition curr_tprob contains the new
- * transition probabilities. In this verison, emission probabilities
- * are not updated.
+ * Same as hmm_run_fwd_bwd, in addition a pointer to a matrix with the new
+ * transition probabilities is returned. In this verison, emission
+ * probabilities are not updated.
*/
-void hmm_run_baum_welch(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites);
+double *hmm_run_baum_welch(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites);
void hmm_destroy(hmm_t *hmm);
diff --git a/samtools/bam2bcf.c b/bcftools/bam2bcf.c
similarity index 96%
copy from samtools/bam2bcf.c
copy to bcftools/bam2bcf.c
index 85ce307..b4fb7f1 100644
--- a/samtools/bam2bcf.c
+++ b/bcftools/bam2bcf.c
@@ -23,17 +23,15 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
-#include <config.h>
-
#include <math.h>
#include <stdint.h>
#include <assert.h>
#include <float.h>
+#include <htslib/hts.h>
#include <htslib/sam.h>
#include <htslib/kstring.h>
#include <htslib/kfunc.h>
#include "bam2bcf.h"
-#include "errmod.h"
extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
@@ -46,7 +44,7 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
{
bcf_callaux_t *bca;
if (theta <= 0.) theta = CALL_DEFTHETA;
- bca = calloc(1, sizeof(bcf_callaux_t));
+ bca = (bcf_callaux_t*) calloc(1, sizeof(bcf_callaux_t));
bca->capQ = 60;
bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
bca->min_baseQ = min_baseQ;
@@ -55,15 +53,15 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
bca->min_support = 1;
bca->per_sample_flt = 0;
bca->npos = 100;
- bca->ref_pos = malloc(bca->npos*sizeof(int));
- bca->alt_pos = malloc(bca->npos*sizeof(int));
+ bca->ref_pos = (int*) malloc(bca->npos*sizeof(int));
+ bca->alt_pos = (int*) malloc(bca->npos*sizeof(int));
bca->nqual = 60;
- bca->ref_mq = malloc(bca->nqual*sizeof(int));
- bca->alt_mq = malloc(bca->nqual*sizeof(int));
- bca->ref_bq = malloc(bca->nqual*sizeof(int));
- bca->alt_bq = malloc(bca->nqual*sizeof(int));
- bca->fwd_mqs = malloc(bca->nqual*sizeof(int));
- bca->rev_mqs = malloc(bca->nqual*sizeof(int));
+ bca->ref_mq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->alt_mq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->ref_bq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->alt_bq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int));
+ bca->rev_mqs = (int*) malloc(bca->nqual*sizeof(int));
return bca;
}
@@ -352,11 +350,22 @@ double calc_chisq_bias(int *a, int *b, int n)
return prob;
}
+static double mann_whitney_1947_(int n, int m, int U)
+{
+ if (U<0) return 0;
+ if (n==0||m==0) return U==0 ? 1 : 0;
+ return (double)n/(n+m)*mann_whitney_1947_(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947_(n,m-1,U);
+}
+
double mann_whitney_1947(int n, int m, int U)
{
- if (U<0) return 0;
- if (n==0||m==0) return U==0 ? 1 : 0;
- return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U);
+ #include "mw.h"
+
+ assert(n >= 2 && m >= 2);
+
+ return (n < 8 && m < 8 && U < 50)
+ ? mw[n-2][m-2][U]
+ : mann_whitney_1947_(n,m,U);
}
double mann_whitney_1947_cdf(int n, int m, int U)
@@ -418,11 +427,16 @@ double calc_mwu_bias(int *a, int *b, int n)
double U = 0, ties = 0;
for (i=0; i<n; i++)
{
- na += a[i];
- U += a[i] * (nb + b[i]*0.5);
- nb += b[i];
- if ( a[i] && b[i] )
- {
+ if (!a[i]) {
+ if (!b[i]) continue;
+ nb += b[i];
+ } else if (!b[i]) {
+ na += a[i];
+ U += a[i] * nb;
+ } else {
+ na += a[i];
+ U += a[i] * (nb + b[i]*0.5);
+ nb += b[i];
double tie = a[i] + b[i];
ties += (tie*tie-1)*tie;
}
diff --git a/samtools/bam2bcf.c.pysam.c b/bcftools/bam2bcf.c.pysam.c
similarity index 96%
copy from samtools/bam2bcf.c.pysam.c
copy to bcftools/bam2bcf.c.pysam.c
index 6938ec0..5a1a443 100644
--- a/samtools/bam2bcf.c.pysam.c
+++ b/bcftools/bam2bcf.c.pysam.c
@@ -25,17 +25,15 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
-#include <config.h>
-
#include <math.h>
#include <stdint.h>
#include <assert.h>
#include <float.h>
+#include <htslib/hts.h>
#include <htslib/sam.h>
#include <htslib/kstring.h>
#include <htslib/kfunc.h>
#include "bam2bcf.h"
-#include "errmod.h"
extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
@@ -48,7 +46,7 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
{
bcf_callaux_t *bca;
if (theta <= 0.) theta = CALL_DEFTHETA;
- bca = calloc(1, sizeof(bcf_callaux_t));
+ bca = (bcf_callaux_t*) calloc(1, sizeof(bcf_callaux_t));
bca->capQ = 60;
bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
bca->min_baseQ = min_baseQ;
@@ -57,15 +55,15 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
bca->min_support = 1;
bca->per_sample_flt = 0;
bca->npos = 100;
- bca->ref_pos = malloc(bca->npos*sizeof(int));
- bca->alt_pos = malloc(bca->npos*sizeof(int));
+ bca->ref_pos = (int*) malloc(bca->npos*sizeof(int));
+ bca->alt_pos = (int*) malloc(bca->npos*sizeof(int));
bca->nqual = 60;
- bca->ref_mq = malloc(bca->nqual*sizeof(int));
- bca->alt_mq = malloc(bca->nqual*sizeof(int));
- bca->ref_bq = malloc(bca->nqual*sizeof(int));
- bca->alt_bq = malloc(bca->nqual*sizeof(int));
- bca->fwd_mqs = malloc(bca->nqual*sizeof(int));
- bca->rev_mqs = malloc(bca->nqual*sizeof(int));
+ bca->ref_mq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->alt_mq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->ref_bq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->alt_bq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int));
+ bca->rev_mqs = (int*) malloc(bca->nqual*sizeof(int));
return bca;
}
@@ -354,11 +352,22 @@ double calc_chisq_bias(int *a, int *b, int n)
return prob;
}
+static double mann_whitney_1947_(int n, int m, int U)
+{
+ if (U<0) return 0;
+ if (n==0||m==0) return U==0 ? 1 : 0;
+ return (double)n/(n+m)*mann_whitney_1947_(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947_(n,m-1,U);
+}
+
double mann_whitney_1947(int n, int m, int U)
{
- if (U<0) return 0;
- if (n==0||m==0) return U==0 ? 1 : 0;
- return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U);
+ #include "mw.h"
+
+ assert(n >= 2 && m >= 2);
+
+ return (n < 8 && m < 8 && U < 50)
+ ? mw[n-2][m-2][U]
+ : mann_whitney_1947_(n,m,U);
}
double mann_whitney_1947_cdf(int n, int m, int U)
@@ -420,11 +429,16 @@ double calc_mwu_bias(int *a, int *b, int n)
double U = 0, ties = 0;
for (i=0; i<n; i++)
{
- na += a[i];
- U += a[i] * (nb + b[i]*0.5);
- nb += b[i];
- if ( a[i] && b[i] )
- {
+ if (!a[i]) {
+ if (!b[i]) continue;
+ nb += b[i];
+ } else if (!b[i]) {
+ na += a[i];
+ U += a[i] * nb;
+ } else {
+ na += a[i];
+ U += a[i] * (nb + b[i]*0.5);
+ nb += b[i];
double tie = a[i] + b[i];
ties += (tie*tie-1)*tie;
}
diff --git a/samtools/bam2bcf.h b/bcftools/bam2bcf.h
similarity index 96%
copy from samtools/bam2bcf.h
copy to bcftools/bam2bcf.h
index 22c67cc..f81f9cf 100644
--- a/samtools/bam2bcf.h
+++ b/bcftools/bam2bcf.h
@@ -1,7 +1,7 @@
/* bam2bcf.h -- variant calling.
Copyright (C) 2010-2012 Broad Institute.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2014,2016 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -27,8 +27,8 @@ DEALINGS IN THE SOFTWARE. */
#define BAM2BCF_H
#include <stdint.h>
+#include <htslib/hts.h>
#include <htslib/vcf.h>
-#include "errmod.h"
/**
* A simplified version of Mann-Whitney U-test is calculated
@@ -128,8 +128,7 @@ extern "C" {
int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call);
int bcf_call2bcf(bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag,
const bcf_callaux_t *bca, const char *ref);
- int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
- const void *rghash);
+ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref);
void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call);
#ifdef __cplusplus
diff --git a/samtools/bam2bcf_indel.c b/bcftools/bam2bcf_indel.c
similarity index 83%
copy from samtools/bam2bcf_indel.c
copy to bcftools/bam2bcf_indel.c
index 5b353fc..52837b5 100644
--- a/samtools/bam2bcf_indel.c
+++ b/bcftools/bam2bcf_indel.c
@@ -1,7 +1,7 @@
/* bam2bcf_indel.c -- indel caller.
Copyright (C) 2010, 2011 Broad Institute.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2014,2016 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -23,70 +23,20 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
-#include <config.h>
-
#include <assert.h>
#include <ctype.h>
#include <string.h>
-#include "htslib/sam.h"
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/khash_str2int.h>
#include "bam2bcf.h"
-#include "kprobaln.h"
-#include "htslib/khash.h"
-KHASH_SET_INIT_STR(rg)
-#include "htslib/ksort.h"
+#include <htslib/ksort.h>
KSORT_INIT_GENERIC(uint32_t)
#define MINUS_CONST 0x10000000
#define INDEL_WINDOW_SIZE 50
-void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list)
-{
- const char *s, *p, *q, *r, *t;
- khash_t(rg) *hash;
- if (list == 0 || hdtext == 0) return _hash;
- if (_hash == 0) _hash = kh_init(rg);
- hash = (khash_t(rg)*)_hash;
- if ((s = strstr(hdtext, "@RG\t")) == 0) return hash;
- do {
- t = strstr(s + 4, "@RG\t"); // the next @RG
- if ((p = strstr(s, "\tID:")) != 0) p += 4;
- if ((q = strstr(s, "\tPL:")) != 0) q += 4;
- if (p && q && (t == 0 || (p < t && q < t))) { // ID and PL are both present
- int lp, lq;
- char *x;
- for (r = p; *r && *r != '\t' && *r != '\n'; ++r) { }
- lp = r - p;
- for (r = q; *r && *r != '\t' && *r != '\n'; ++r) { }
- lq = r - q;
- x = calloc((lp > lq? lp : lq) + 1, 1);
- for (r = q; *r && *r != '\t' && *r != '\n'; ++r) x[r-q] = *r;
- if (strstr(list, x)) { // insert ID to the hash table
- khint_t k;
- int ret;
- for (r = p; *r && *r != '\t' && *r != '\n'; ++r) x[r-p] = *r;
- x[r-p] = 0;
- k = kh_get(rg, hash, x);
- if (k == kh_end(hash)) k = kh_put(rg, hash, x, &ret);
- else free(x);
- } else free(x);
- }
- s = t;
- } while (s);
- return hash;
-}
-
-void bcf_call_del_rghash(void *_hash)
-{
- khint_t k;
- khash_t(rg) *hash = (khash_t(rg)*)_hash;
- if (hash == 0) return;
- for (k = kh_begin(hash); k < kh_end(hash); ++k)
- if (kh_exist(hash, k))
- free((char*)kh_key(hash, k));
- kh_destroy(rg, hash);
-}
-
static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
{
int k, x = c->pos, y = 0, last_y = 0;
@@ -146,30 +96,13 @@ static inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
- 8: estimated sequence quality .. (aux>>8)&0xff
- 8: indel quality .. aux&0xff
*/
-int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
- const void *rghash)
+int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref)
{
int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2;
int N, K, l_run, ref_type, n_alt;
char *inscns = 0, *ref2, *query, **ref_sample;
- khash_t(rg) *hash = (khash_t(rg)*)rghash;
if (ref == 0 || bca == 0) return -1;
- // mark filtered reads
- if (rghash) {
- N = 0;
- for (s = N = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- const uint8_t *rg = bam_aux_get(p->b, "RG");
- p->aux = 1; // filtered by default
- if (rg) {
- khint_t k = kh_get(rg, hash, (const char*)(rg + 1));
- if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered
- }
- }
- }
- if (N == 0) return -1; // no reads left
- }
+
// determine if there is a gap
for (s = N = 0; s < n; ++s) {
for (i = 0; i < n_plp[s]; ++i)
@@ -182,19 +115,17 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
bca->max_support = bca->max_frac = 0;
int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
uint32_t *aux;
- aux = calloc(N + 1, 4);
+ aux = (uint32_t*) calloc(N + 1, 4);
m = max_rd_len = 0;
aux[m++] = MINUS_CONST; // zero indel is always a type
for (s = 0; s < n; ++s) {
int na = 0, nt = 0;
for (i = 0; i < n_plp[s]; ++i) {
const bam_pileup1_t *p = plp[s] + i;
- if (rghash == 0 || p->aux == 0) {
- ++nt;
- if (p->indel != 0) {
- ++na;
- aux[m++] = MINUS_CONST + p->indel;
- }
+ ++nt;
+ if (p->indel != 0) {
+ ++na;
+ aux[m++] = MINUS_CONST + p->indel;
}
j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
if (j > max_rd_len) max_rd_len = j;
@@ -260,13 +191,13 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
int L = right - left + 1, max_i, max2_i;
uint32_t *cns, max, max2;
char *ref0, *r;
- ref_sample = calloc(n, sizeof(char*));
- cns = calloc(L, 4);
- ref0 = calloc(L, 1);
+ ref_sample = (char**) calloc(n, sizeof(char*));
+ cns = (uint32_t*) calloc(L, 4);
+ ref0 = (char*) calloc(L, 1);
for (i = 0; i < right - left; ++i)
ref0[i] = seq_nt16_table[(int)ref[i+left]];
for (s = 0; s < n; ++s) {
- r = ref_sample[s] = calloc(L, 1);
+ r = ref_sample[s] = (char*) calloc(L, 1);
memset(cns, 0, sizeof(int) * L);
// collect ref and non-ref counts
for (i = 0; i < n_plp[s]; ++i) {
@@ -317,7 +248,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
// construct the consensus sequence
max_ins = types[n_types - 1]; // max_ins is at least 0
if (max_ins > 0) {
- int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int));
+ int *inscns_aux = (int*) calloc(5 * n_types * max_ins, sizeof(int));
// count the number of occurrences of each base at each position for each type of insertion
for (t = 0; t < n_types; ++t) {
if (types[t] > 0) {
@@ -337,7 +268,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
}
// use the majority rule to construct the consensus
- inscns = calloc(n_types * max_ins, 1);
+ inscns = (char*) calloc(n_types * max_ins, 1);
for (t = 0; t < n_types; ++t) {
for (j = 0; j < types[t]; ++j) {
int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
@@ -352,14 +283,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
// compute the likelihood given each type of indel for each read
max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
- ref2 = calloc(max_ref2, 1);
- query = calloc(right - left + max_rd_len + max_ins + 2, 1);
- score1 = calloc(N * n_types, sizeof(int));
- score2 = calloc(N * n_types, sizeof(int));
+ ref2 = (char*) calloc(max_ref2, 1);
+ query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1);
+ score1 = (int*) calloc(N * n_types, sizeof(int));
+ score2 = (int*) calloc(N * n_types, sizeof(int));
bca->indelreg = 0;
for (t = 0; t < n_types; ++t) {
int l, ir;
- kpa_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
+ probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
apf1.bw = apf2.bw = abs(types[t]) + 3;
// compute indelreg
if (types[t] == 0) ir = 0;
@@ -404,7 +335,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
{ // do realignment; this is the bottleneck
const uint8_t *qual = bam_get_qual(p->b), *bq;
uint8_t *qq;
- qq = calloc(qend - qbeg, 1);
+ qq = (uint8_t*) calloc(qend - qbeg, 1);
bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
if (bq) ++bq; // skip type
for (l = qbeg; l < qend; ++l) {
@@ -412,14 +343,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
}
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
if (l > 255) l = 255;
score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
if (sc > 5) {
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
l = (int)(100. * sc / (qend - qbeg) + .499);
if (l > 255) l = 255;
score2[K*n_types + t] = sc<<8 | l;
@@ -439,10 +370,13 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
free(ref2); free(query);
{ // compute indelQ
- int *sc, tmp, *sumq;
- sc = alloca(n_types * sizeof(int));
- sumq = alloca(n_types * sizeof(int));
- memset(sumq, 0, sizeof(int) * n_types);
+ int sc_a[16], sumq_a[16];
+ int tmp, *sc = sc_a, *sumq = sumq_a;
+ if (n_types > 16) {
+ sc = (int *)malloc(n_types * sizeof(int));
+ sumq = (int *)malloc(n_types * sizeof(int));
+ }
+ memset(sumq, 0, n_types * sizeof(int));
for (s = K = 0; s < n; ++s) {
for (i = 0; i < n_plp[s]; ++i, ++K) {
bam_pileup1_t *p = plp[s] + i;
@@ -493,7 +427,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
// determine bca->indel_types[] and bca->inscns
bca->maxins = max_ins;
- bca->inscns = realloc(bca->inscns, bca->maxins * 4);
+ bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4);
for (t = 0; t < n_types; ++t)
sumq[t] = sumq[t]<<6 | t;
for (t = 1; t < n_types; ++t) // insertion sort
@@ -523,6 +457,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
//fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
}
}
+
+ if (sc != sc_a) free(sc);
+ if (sumq != sumq_a) free(sumq);
}
free(score1); free(score2);
// free
diff --git a/samtools/bam2bcf_indel.c.pysam.c b/bcftools/bam2bcf_indel.c.pysam.c
similarity index 83%
copy from samtools/bam2bcf_indel.c.pysam.c
copy to bcftools/bam2bcf_indel.c.pysam.c
index 21cbb03..0d36841 100644
--- a/samtools/bam2bcf_indel.c.pysam.c
+++ b/bcftools/bam2bcf_indel.c.pysam.c
@@ -3,7 +3,7 @@
/* bam2bcf_indel.c -- indel caller.
Copyright (C) 2010, 2011 Broad Institute.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2014,2016 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -25,70 +25,20 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
-#include <config.h>
-
#include <assert.h>
#include <ctype.h>
#include <string.h>
-#include "htslib/sam.h"
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/khash_str2int.h>
#include "bam2bcf.h"
-#include "kprobaln.h"
-#include "htslib/khash.h"
-KHASH_SET_INIT_STR(rg)
-#include "htslib/ksort.h"
+#include <htslib/ksort.h>
KSORT_INIT_GENERIC(uint32_t)
#define MINUS_CONST 0x10000000
#define INDEL_WINDOW_SIZE 50
-void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list)
-{
- const char *s, *p, *q, *r, *t;
- khash_t(rg) *hash;
- if (list == 0 || hdtext == 0) return _hash;
- if (_hash == 0) _hash = kh_init(rg);
- hash = (khash_t(rg)*)_hash;
- if ((s = strstr(hdtext, "@RG\t")) == 0) return hash;
- do {
- t = strstr(s + 4, "@RG\t"); // the next @RG
- if ((p = strstr(s, "\tID:")) != 0) p += 4;
- if ((q = strstr(s, "\tPL:")) != 0) q += 4;
- if (p && q && (t == 0 || (p < t && q < t))) { // ID and PL are both present
- int lp, lq;
- char *x;
- for (r = p; *r && *r != '\t' && *r != '\n'; ++r) { }
- lp = r - p;
- for (r = q; *r && *r != '\t' && *r != '\n'; ++r) { }
- lq = r - q;
- x = calloc((lp > lq? lp : lq) + 1, 1);
- for (r = q; *r && *r != '\t' && *r != '\n'; ++r) x[r-q] = *r;
- if (strstr(list, x)) { // insert ID to the hash table
- khint_t k;
- int ret;
- for (r = p; *r && *r != '\t' && *r != '\n'; ++r) x[r-p] = *r;
- x[r-p] = 0;
- k = kh_get(rg, hash, x);
- if (k == kh_end(hash)) k = kh_put(rg, hash, x, &ret);
- else free(x);
- } else free(x);
- }
- s = t;
- } while (s);
- return hash;
-}
-
-void bcf_call_del_rghash(void *_hash)
-{
- khint_t k;
- khash_t(rg) *hash = (khash_t(rg)*)_hash;
- if (hash == 0) return;
- for (k = kh_begin(hash); k < kh_end(hash); ++k)
- if (kh_exist(hash, k))
- free((char*)kh_key(hash, k));
- kh_destroy(rg, hash);
-}
-
static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
{
int k, x = c->pos, y = 0, last_y = 0;
@@ -148,30 +98,13 @@ static inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
- 8: estimated sequence quality .. (aux>>8)&0xff
- 8: indel quality .. aux&0xff
*/
-int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
- const void *rghash)
+int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref)
{
int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2;
int N, K, l_run, ref_type, n_alt;
char *inscns = 0, *ref2, *query, **ref_sample;
- khash_t(rg) *hash = (khash_t(rg)*)rghash;
if (ref == 0 || bca == 0) return -1;
- // mark filtered reads
- if (rghash) {
- N = 0;
- for (s = N = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- const uint8_t *rg = bam_aux_get(p->b, "RG");
- p->aux = 1; // filtered by default
- if (rg) {
- khint_t k = kh_get(rg, hash, (const char*)(rg + 1));
- if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered
- }
- }
- }
- if (N == 0) return -1; // no reads left
- }
+
// determine if there is a gap
for (s = N = 0; s < n; ++s) {
for (i = 0; i < n_plp[s]; ++i)
@@ -184,19 +117,17 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
bca->max_support = bca->max_frac = 0;
int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
uint32_t *aux;
- aux = calloc(N + 1, 4);
+ aux = (uint32_t*) calloc(N + 1, 4);
m = max_rd_len = 0;
aux[m++] = MINUS_CONST; // zero indel is always a type
for (s = 0; s < n; ++s) {
int na = 0, nt = 0;
for (i = 0; i < n_plp[s]; ++i) {
const bam_pileup1_t *p = plp[s] + i;
- if (rghash == 0 || p->aux == 0) {
- ++nt;
- if (p->indel != 0) {
- ++na;
- aux[m++] = MINUS_CONST + p->indel;
- }
+ ++nt;
+ if (p->indel != 0) {
+ ++na;
+ aux[m++] = MINUS_CONST + p->indel;
}
j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
if (j > max_rd_len) max_rd_len = j;
@@ -262,13 +193,13 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
int L = right - left + 1, max_i, max2_i;
uint32_t *cns, max, max2;
char *ref0, *r;
- ref_sample = calloc(n, sizeof(char*));
- cns = calloc(L, 4);
- ref0 = calloc(L, 1);
+ ref_sample = (char**) calloc(n, sizeof(char*));
+ cns = (uint32_t*) calloc(L, 4);
+ ref0 = (char*) calloc(L, 1);
for (i = 0; i < right - left; ++i)
ref0[i] = seq_nt16_table[(int)ref[i+left]];
for (s = 0; s < n; ++s) {
- r = ref_sample[s] = calloc(L, 1);
+ r = ref_sample[s] = (char*) calloc(L, 1);
memset(cns, 0, sizeof(int) * L);
// collect ref and non-ref counts
for (i = 0; i < n_plp[s]; ++i) {
@@ -319,7 +250,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
// construct the consensus sequence
max_ins = types[n_types - 1]; // max_ins is at least 0
if (max_ins > 0) {
- int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int));
+ int *inscns_aux = (int*) calloc(5 * n_types * max_ins, sizeof(int));
// count the number of occurrences of each base at each position for each type of insertion
for (t = 0; t < n_types; ++t) {
if (types[t] > 0) {
@@ -339,7 +270,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
}
// use the majority rule to construct the consensus
- inscns = calloc(n_types * max_ins, 1);
+ inscns = (char*) calloc(n_types * max_ins, 1);
for (t = 0; t < n_types; ++t) {
for (j = 0; j < types[t]; ++j) {
int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
@@ -354,14 +285,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
// compute the likelihood given each type of indel for each read
max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
- ref2 = calloc(max_ref2, 1);
- query = calloc(right - left + max_rd_len + max_ins + 2, 1);
- score1 = calloc(N * n_types, sizeof(int));
- score2 = calloc(N * n_types, sizeof(int));
+ ref2 = (char*) calloc(max_ref2, 1);
+ query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1);
+ score1 = (int*) calloc(N * n_types, sizeof(int));
+ score2 = (int*) calloc(N * n_types, sizeof(int));
bca->indelreg = 0;
for (t = 0; t < n_types; ++t) {
int l, ir;
- kpa_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
+ probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
apf1.bw = apf2.bw = abs(types[t]) + 3;
// compute indelreg
if (types[t] == 0) ir = 0;
@@ -406,7 +337,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
{ // do realignment; this is the bottleneck
const uint8_t *qual = bam_get_qual(p->b), *bq;
uint8_t *qq;
- qq = calloc(qend - qbeg, 1);
+ qq = (uint8_t*) calloc(qend - qbeg, 1);
bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
if (bq) ++bq; // skip type
for (l = qbeg; l < qend; ++l) {
@@ -414,14 +345,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
}
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
if (l > 255) l = 255;
score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
if (sc > 5) {
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
l = (int)(100. * sc / (qend - qbeg) + .499);
if (l > 255) l = 255;
score2[K*n_types + t] = sc<<8 | l;
@@ -441,10 +372,13 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
free(ref2); free(query);
{ // compute indelQ
- int *sc, tmp, *sumq;
- sc = alloca(n_types * sizeof(int));
- sumq = alloca(n_types * sizeof(int));
- memset(sumq, 0, sizeof(int) * n_types);
+ int sc_a[16], sumq_a[16];
+ int tmp, *sc = sc_a, *sumq = sumq_a;
+ if (n_types > 16) {
+ sc = (int *)malloc(n_types * sizeof(int));
+ sumq = (int *)malloc(n_types * sizeof(int));
+ }
+ memset(sumq, 0, n_types * sizeof(int));
for (s = K = 0; s < n; ++s) {
for (i = 0; i < n_plp[s]; ++i, ++K) {
bam_pileup1_t *p = plp[s] + i;
@@ -495,7 +429,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
// determine bca->indel_types[] and bca->inscns
bca->maxins = max_ins;
- bca->inscns = realloc(bca->inscns, bca->maxins * 4);
+ bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4);
for (t = 0; t < n_types; ++t)
sumq[t] = sumq[t]<<6 | t;
for (t = 1; t < n_types; ++t) // insertion sort
@@ -525,6 +459,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
//fprintf(pysam_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
}
}
+
+ if (sc != sc_a) free(sc);
+ if (sumq != sumq_a) free(sumq);
}
free(score1); free(score2);
// free
diff --git a/bcftools/bam_sample.c b/bcftools/bam_sample.c
new file mode 100644
index 0000000..66f5729
--- /dev/null
+++ b/bcftools/bam_sample.c
@@ -0,0 +1,393 @@
+/* bam_sample.c -- group data by sample.
+
+ Copyright (C) 2010, 2011 Broad Institute.
+ Copyright (C) 2013, 2016 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>, Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <htslib/hts.h>
+#include <htslib/kstring.h>
+#include <htslib/khash_str2int.h>
+#include <khash_str2str.h>
+#include "bam_sample.h"
+#include "bcftools.h"
+
+
+typedef struct
+{
+ char *fname;
+ void *rg2idx; // hash: read group name to BCF output sample index. Maintained by bsmpl_add_readgroup
+ int default_idx; // default BCF output sample index, set only when all readgroups are treated as one sample
+}
+file_t;
+
+struct _bam_smpl_t
+{
+ kstring_t tmp;
+ file_t *files;
+ int ignore_rg, nsmpl, nfiles;
+ char **smpl; // list of BCF output sample names. Maintained by bsmpl_add_readgroup
+ void *sample_list; // hash: BAM input sample name to BCF output sample name. This is the -s/-S list
+ int sample_logic; // the -s/-S logic, 1: include, 0: exclude
+ void *rg_list; // hash: BAM/rg_id to sample name or */rg_id for global ids. This is the -G list
+ int rg_logic; // the -G logic, 1: include, 0: exclude
+ void *name2idx; // hash: BCF output sample name to BCF output sample index. Maintained by bsmpl_add_readgroup
+};
+
+bam_smpl_t *bam_smpl_init(void)
+{
+ bam_smpl_t *bsmpl;
+ bsmpl = (bam_smpl_t*) calloc(1, sizeof(bam_smpl_t));
+ bsmpl->name2idx = khash_str2int_init();
+ return bsmpl;
+}
+
+void bam_smpl_destroy(bam_smpl_t *bsmpl)
+{
+ if ( !bsmpl ) return;
+ if ( bsmpl->name2idx ) khash_str2int_destroy_free(bsmpl->name2idx);
+ if ( bsmpl->sample_list ) khash_str2str_destroy_free_all(bsmpl->sample_list);
+ if ( bsmpl->rg_list ) khash_str2str_destroy_free_all(bsmpl->rg_list);
+ int i;
+ for (i=0; i<bsmpl->nfiles; i++)
+ {
+ file_t *file = &bsmpl->files[i];
+ if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
+ free(file->fname);
+ }
+ free(bsmpl->smpl);
+ free(bsmpl->files);
+ free(bsmpl->tmp.s);
+ free(bsmpl);
+}
+
+void bam_smpl_ignore_readgroups(bam_smpl_t* bsmpl)
+{
+ bsmpl->ignore_rg = 1;
+}
+
+static void bsmpl_add_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char *smpl_name)
+{
+ int ismpl = -1;
+ if ( smpl_name )
+ {
+ if ( khash_str2int_get(bsmpl->name2idx,smpl_name,&ismpl) < 0 )
+ {
+ // new sample
+ bsmpl->nsmpl++;
+ bsmpl->smpl = (char**) realloc(bsmpl->smpl,sizeof(char*)*bsmpl->nsmpl);
+ bsmpl->smpl[bsmpl->nsmpl-1] = strdup(smpl_name);
+ ismpl = khash_str2int_inc(bsmpl->name2idx,bsmpl->smpl[bsmpl->nsmpl-1]);
+ }
+ }
+ if ( !strcmp("*",rg_id) )
+ {
+ // all read groups in the bam treated as the same sample
+ file->default_idx = ismpl;
+ return;
+ }
+ if ( !file->rg2idx ) file->rg2idx = khash_str2int_init();
+ if ( khash_str2int_has_key(file->rg2idx,rg_id) ) return; // duplicate @RG:ID
+ khash_str2int_set(file->rg2idx, strdup(rg_id), ismpl);
+}
+static int bsmpl_keep_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char **smpl_name)
+{
+ char *rg_smpl = khash_str2str_get(bsmpl->rg_list,rg_id); // unique read group present in one bam only
+ if ( !rg_smpl )
+ {
+ // read group specific to this bam
+ bsmpl->tmp.l = 0;
+ ksprintf(&bsmpl->tmp,"%s\t%s",rg_id,file->fname);
+ rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
+ }
+ if ( !rg_smpl )
+ {
+ // any read group in this file?
+ bsmpl->tmp.l = 0;
+ ksprintf(&bsmpl->tmp,"*\t%s",file->fname);
+ rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
+ }
+ if ( !rg_smpl && bsmpl->rg_logic ) return 0;
+ if ( rg_smpl && !bsmpl->rg_logic ) return 0;
+
+ if ( rg_smpl && rg_smpl[0]!='\t' ) *smpl_name = rg_smpl; // rename the sample
+ return 1;
+}
+
+/*
+ The logic of this function is a bit complicated because we want to work
+ also with broken bams containing read groups that are not listed in the
+ header. The desired behavior is as follows:
+ - when -G is given, read groups which are not listed in the header must
+ be given explicitly using the "?" symbol in -G.
+ Otherwise:
+ - if the bam has no header, all reads in the file are assigned to a
+ single sample named after the file
+ - if there is at least one sample defined in the header, reads with no
+ read group id or with a read group id not listed in the header are
+ assigned to the first sample encountered in the header
+*/
+int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
+{
+ bsmpl->nfiles++;
+ bsmpl->files = (file_t*) realloc(bsmpl->files,bsmpl->nfiles*sizeof(file_t));
+ file_t *file = &bsmpl->files[bsmpl->nfiles-1];
+ memset(file,0,sizeof(file_t));
+ file->fname = strdup(fname);
+ file->default_idx = -1;
+
+ if ( bsmpl->ignore_rg || !bam_hdr )
+ {
+ // The option --ignore-RG is set or there is no BAM header: use the file name as the sample name
+ bsmpl_add_readgroup(bsmpl,file,"*",file->fname);
+ return bsmpl->nfiles-1;
+ }
+
+ void *bam_smpls = khash_str2int_init();
+ int first_smpl = -1, nskipped = 0;
+ const char *p = bam_hdr, *q, *r;
+ while ((q = strstr(p, "@RG")) != 0)
+ {
+ p = q + 3;
+ r = q = 0;
+ if ((q = strstr(p, "\tID:")) != 0) q += 4;
+ if ((r = strstr(p, "\tSM:")) != 0) r += 4;
+ if (r && q)
+ {
+ char *u, *v;
+ int ioq, ior;
+ for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
+ for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
+ ioq = *u; ior = *v; *u = *v = '\0';
+
+ // q now points to a null terminated read group id
+ // r points to a null terminated sample name
+ if ( !strcmp("*",q) || !strcmp("?",q) )
+ error("Error: the read group IDs \"*\" and \"?\" have a special meaning in the mpileup code. Please fix the code or the bam: %s\n", fname);
+
+ int accept_rg = 1;
+ if ( bsmpl->sample_list )
+ {
+ // restrict samples based on the -s/-S options
+ char *name = khash_str2str_get(bsmpl->sample_list,r);
+ if ( bsmpl->sample_logic==0 )
+ accept_rg = name ? 0 : 1;
+ else if ( !name )
+ accept_rg = 0;
+ else
+ r = name;
+ }
+ if ( accept_rg && bsmpl->rg_list )
+ {
+ // restrict readgroups based on the -G option, possibly renaming the sample
+ accept_rg = bsmpl_keep_readgroup(bsmpl,file,q,&r);
+ }
+ if ( accept_rg )
+ bsmpl_add_readgroup(bsmpl,file,q,r);
+ else
+ {
+ bsmpl_add_readgroup(bsmpl,file,q,NULL); // ignore this RG but note that it was seen in the header
+ nskipped++;
+ }
+
+ if ( first_smpl<0 )
+ khash_str2int_get(bsmpl->name2idx,r,&first_smpl);
+ if ( !khash_str2int_has_key(bam_smpls,r) )
+ khash_str2int_inc(bam_smpls,strdup(r));
+
+ *u = ioq; *v = ior;
+ }
+ else
+ break;
+ p = q > r ? q : r;
+ }
+ int nsmpls = khash_str2int_size(bam_smpls);
+ khash_str2int_destroy_free(bam_smpls);
+
+ const char *smpl_name = NULL;
+ int accept_null_rg = 1;
+ if ( bsmpl->rg_list && !bsmpl_keep_readgroup(bsmpl,file,"?",&smpl_name) ) accept_null_rg = 0;
+ if ( bsmpl->sample_list && first_smpl==-1 ) accept_null_rg = 0;
+
+ if ( !accept_null_rg && first_smpl==-1 )
+ {
+ // no suitable read group is available in this bam: ignore the whole file.
+ free(file->fname);
+ bsmpl->nfiles--;
+ return -1;
+ }
+ if ( !accept_null_rg ) return bsmpl->nfiles-1;
+ if ( nsmpls==1 && !nskipped )
+ {
+ file->default_idx = first_smpl;
+ return bsmpl->nfiles-1;
+ }
+ if ( !smpl_name ) smpl_name = first_smpl==-1 ? file->fname : bsmpl->smpl[first_smpl];
+
+ bsmpl_add_readgroup(bsmpl,file,"?",smpl_name);
+ return bsmpl->nfiles-1;
+}
+
+const char **bam_smpl_get_samples(bam_smpl_t *bsmpl, int *nsmpl)
+{
+ *nsmpl = bsmpl->nsmpl;
+ return (const char**)bsmpl->smpl;
+}
+
+int bam_smpl_get_sample_id(bam_smpl_t *bsmpl, int bam_id, bam1_t *bam_rec)
+{
+ file_t *file = &bsmpl->files[bam_id];
+ if ( file->default_idx >= 0 ) return file->default_idx;
+
+ char *aux_rg = (char*) bam_aux_get(bam_rec, "RG");
+ aux_rg = aux_rg ? aux_rg+1 : "?";
+
+ int rg_id;
+ if ( khash_str2int_get(file->rg2idx, aux_rg, &rg_id)==0 ) return rg_id;
+ if ( khash_str2int_get(file->rg2idx, "?", &rg_id)==0 ) return rg_id;
+ return -1;
+}
+
+int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file)
+{
+ if ( list[0]!='^' ) bsmpl->sample_logic = 1;
+ else list++;
+
+ int i, nsamples = 0;
+ char **samples = hts_readlist(list, is_file, &nsamples);
+ if ( !nsamples ) return 0;
+
+ kstring_t ori = {0,0,0};
+ kstring_t ren = {0,0,0};
+
+ bsmpl->sample_list = khash_str2str_init();
+ for (i=0; i<nsamples; i++)
+ {
+ char *ptr = samples[i];
+ ori.l = ren.l = 0;
+ int escaped = 0;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &ori);
+ escaped = 0;
+ ptr++;
+ }
+ if ( *ptr )
+ {
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &ren);
+ escaped = 0;
+ ptr++;
+ }
+ }
+ khash_str2str_set(bsmpl->sample_list,strdup(ori.s),strdup(ren.l?ren.s:ori.s));
+ free(samples[i]);
+ }
+ free(samples);
+ free(ori.s);
+ free(ren.s);
+ return nsamples;
+}
+
+int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file)
+{
+ if ( list[0]!='^' ) bsmpl->rg_logic = 1;
+ else list++;
+
+ int i, nrows = 0;
+ char **rows = hts_readlist(list, is_file, &nrows);
+ if ( !nrows ) return 0;
+
+ kstring_t fld1 = {0,0,0};
+ kstring_t fld2 = {0,0,0};
+ kstring_t fld3 = {0,0,0};
+
+ bsmpl->rg_list = khash_str2str_init();
+ for (i=0; i<nrows; i++)
+ {
+ char *ptr = rows[i];
+ fld1.l = fld2.l = fld3.l = 0;
+ int escaped = 0;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &fld1);
+ escaped = 0;
+ ptr++;
+ }
+ if ( *ptr )
+ {
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &fld2);
+ escaped = 0;
+ ptr++;
+ }
+ }
+ if ( *ptr )
+ {
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &fld3);
+ escaped = 0;
+ ptr++;
+ }
+ }
+ if ( fld3.l )
+ {
+ // ID FILE SAMPLE
+ kputc('\t',&fld1);
+ kputs(fld2.s,&fld1);
+ fld2.l = 0;
+ kputs(fld3.s,&fld2);
+ }
+ // fld2.s now contains a new sample name. If NULL, use \t to keep the bam header name
+ char *value = khash_str2str_get(bsmpl->rg_list,fld1.s);
+ if ( !value )
+ khash_str2str_set(bsmpl->rg_list,strdup(fld1.s),strdup(fld2.l?fld2.s:"\t"));
+ else if ( strcmp(value,fld2.l?fld2.s:"\t") )
+ error("Error: The read group \"%s\" was assigned to two different samples: \"%s\" and \"%s\"\n", fld1.s,value,fld2.l?fld2.s:"\t");
+ free(rows[i]);
+ }
+ free(rows);
+ free(fld1.s);
+ free(fld2.s);
+ free(fld3.s);
+ return nrows;
+}
+
+
diff --git a/bcftools/bam_sample.c.pysam.c b/bcftools/bam_sample.c.pysam.c
new file mode 100644
index 0000000..76d7a61
--- /dev/null
+++ b/bcftools/bam_sample.c.pysam.c
@@ -0,0 +1,395 @@
+#include "pysam.h"
+
+/* bam_sample.c -- group data by sample.
+
+ Copyright (C) 2010, 2011 Broad Institute.
+ Copyright (C) 2013, 2016 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>, Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <htslib/hts.h>
+#include <htslib/kstring.h>
+#include <htslib/khash_str2int.h>
+#include <khash_str2str.h>
+#include "bam_sample.h"
+#include "bcftools.h"
+
+
+typedef struct
+{
+ char *fname;
+ void *rg2idx; // hash: read group name to BCF output sample index. Maintained by bsmpl_add_readgroup
+ int default_idx; // default BCF output sample index, set only when all readgroups are treated as one sample
+}
+file_t;
+
+struct _bam_smpl_t
+{
+ kstring_t tmp;
+ file_t *files;
+ int ignore_rg, nsmpl, nfiles;
+ char **smpl; // list of BCF output sample names. Maintained by bsmpl_add_readgroup
+ void *sample_list; // hash: BAM input sample name to BCF output sample name. This is the -s/-S list
+ int sample_logic; // the -s/-S logic, 1: include, 0: exclude
+ void *rg_list; // hash: BAM/rg_id to sample name or */rg_id for global ids. This is the -G list
+ int rg_logic; // the -G logic, 1: include, 0: exclude
+ void *name2idx; // hash: BCF output sample name to BCF output sample index. Maintained by bsmpl_add_readgroup
+};
+
+bam_smpl_t *bam_smpl_init(void)
+{
+ bam_smpl_t *bsmpl;
+ bsmpl = (bam_smpl_t*) calloc(1, sizeof(bam_smpl_t));
+ bsmpl->name2idx = khash_str2int_init();
+ return bsmpl;
+}
+
+void bam_smpl_destroy(bam_smpl_t *bsmpl)
+{
+ if ( !bsmpl ) return;
+ if ( bsmpl->name2idx ) khash_str2int_destroy_free(bsmpl->name2idx);
+ if ( bsmpl->sample_list ) khash_str2str_destroy_free_all(bsmpl->sample_list);
+ if ( bsmpl->rg_list ) khash_str2str_destroy_free_all(bsmpl->rg_list);
+ int i;
+ for (i=0; i<bsmpl->nfiles; i++)
+ {
+ file_t *file = &bsmpl->files[i];
+ if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
+ free(file->fname);
+ }
+ free(bsmpl->smpl);
+ free(bsmpl->files);
+ free(bsmpl->tmp.s);
+ free(bsmpl);
+}
+
+void bam_smpl_ignore_readgroups(bam_smpl_t* bsmpl)
+{
+ bsmpl->ignore_rg = 1;
+}
+
+static void bsmpl_add_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char *smpl_name)
+{
+ int ismpl = -1;
+ if ( smpl_name )
+ {
+ if ( khash_str2int_get(bsmpl->name2idx,smpl_name,&ismpl) < 0 )
+ {
+ // new sample
+ bsmpl->nsmpl++;
+ bsmpl->smpl = (char**) realloc(bsmpl->smpl,sizeof(char*)*bsmpl->nsmpl);
+ bsmpl->smpl[bsmpl->nsmpl-1] = strdup(smpl_name);
+ ismpl = khash_str2int_inc(bsmpl->name2idx,bsmpl->smpl[bsmpl->nsmpl-1]);
+ }
+ }
+ if ( !strcmp("*",rg_id) )
+ {
+ // all read groups in the bam treated as the same sample
+ file->default_idx = ismpl;
+ return;
+ }
+ if ( !file->rg2idx ) file->rg2idx = khash_str2int_init();
+ if ( khash_str2int_has_key(file->rg2idx,rg_id) ) return; // duplicate @RG:ID
+ khash_str2int_set(file->rg2idx, strdup(rg_id), ismpl);
+}
+static int bsmpl_keep_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char **smpl_name)
+{
+ char *rg_smpl = khash_str2str_get(bsmpl->rg_list,rg_id); // unique read group present in one bam only
+ if ( !rg_smpl )
+ {
+ // read group specific to this bam
+ bsmpl->tmp.l = 0;
+ ksprintf(&bsmpl->tmp,"%s\t%s",rg_id,file->fname);
+ rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
+ }
+ if ( !rg_smpl )
+ {
+ // any read group in this file?
+ bsmpl->tmp.l = 0;
+ ksprintf(&bsmpl->tmp,"*\t%s",file->fname);
+ rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
+ }
+ if ( !rg_smpl && bsmpl->rg_logic ) return 0;
+ if ( rg_smpl && !bsmpl->rg_logic ) return 0;
+
+ if ( rg_smpl && rg_smpl[0]!='\t' ) *smpl_name = rg_smpl; // rename the sample
+ return 1;
+}
+
+/*
+ The logic of this function is a bit complicated because we want to work
+ also with broken bams containing read groups that are not listed in the
+ header. The desired behavior is as follows:
+ - when -G is given, read groups which are not listed in the header must
+ be given explicitly using the "?" symbol in -G.
+ Otherwise:
+ - if the bam has no header, all reads in the file are assigned to a
+ single sample named after the file
+ - if there is at least one sample defined in the header, reads with no
+ read group id or with a read group id not listed in the header are
+ assigned to the first sample encountered in the header
+*/
+int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
+{
+ bsmpl->nfiles++;
+ bsmpl->files = (file_t*) realloc(bsmpl->files,bsmpl->nfiles*sizeof(file_t));
+ file_t *file = &bsmpl->files[bsmpl->nfiles-1];
+ memset(file,0,sizeof(file_t));
+ file->fname = strdup(fname);
+ file->default_idx = -1;
+
+ if ( bsmpl->ignore_rg || !bam_hdr )
+ {
+ // The option --ignore-RG is set or there is no BAM header: use the file name as the sample name
+ bsmpl_add_readgroup(bsmpl,file,"*",file->fname);
+ return bsmpl->nfiles-1;
+ }
+
+ void *bam_smpls = khash_str2int_init();
+ int first_smpl = -1, nskipped = 0;
+ const char *p = bam_hdr, *q, *r;
+ while ((q = strstr(p, "@RG")) != 0)
+ {
+ p = q + 3;
+ r = q = 0;
+ if ((q = strstr(p, "\tID:")) != 0) q += 4;
+ if ((r = strstr(p, "\tSM:")) != 0) r += 4;
+ if (r && q)
+ {
+ char *u, *v;
+ int ioq, ior;
+ for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
+ for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
+ ioq = *u; ior = *v; *u = *v = '\0';
+
+ // q now points to a null terminated read group id
+ // r points to a null terminated sample name
+ if ( !strcmp("*",q) || !strcmp("?",q) )
+ error("Error: the read group IDs \"*\" and \"?\" have a special meaning in the mpileup code. Please fix the code or the bam: %s\n", fname);
+
+ int accept_rg = 1;
+ if ( bsmpl->sample_list )
+ {
+ // restrict samples based on the -s/-S options
+ char *name = khash_str2str_get(bsmpl->sample_list,r);
+ if ( bsmpl->sample_logic==0 )
+ accept_rg = name ? 0 : 1;
+ else if ( !name )
+ accept_rg = 0;
+ else
+ r = name;
+ }
+ if ( accept_rg && bsmpl->rg_list )
+ {
+ // restrict readgroups based on the -G option, possibly renaming the sample
+ accept_rg = bsmpl_keep_readgroup(bsmpl,file,q,&r);
+ }
+ if ( accept_rg )
+ bsmpl_add_readgroup(bsmpl,file,q,r);
+ else
+ {
+ bsmpl_add_readgroup(bsmpl,file,q,NULL); // ignore this RG but note that it was seen in the header
+ nskipped++;
+ }
+
+ if ( first_smpl<0 )
+ khash_str2int_get(bsmpl->name2idx,r,&first_smpl);
+ if ( !khash_str2int_has_key(bam_smpls,r) )
+ khash_str2int_inc(bam_smpls,strdup(r));
+
+ *u = ioq; *v = ior;
+ }
+ else
+ break;
+ p = q > r ? q : r;
+ }
+ int nsmpls = khash_str2int_size(bam_smpls);
+ khash_str2int_destroy_free(bam_smpls);
+
+ const char *smpl_name = NULL;
+ int accept_null_rg = 1;
+ if ( bsmpl->rg_list && !bsmpl_keep_readgroup(bsmpl,file,"?",&smpl_name) ) accept_null_rg = 0;
+ if ( bsmpl->sample_list && first_smpl==-1 ) accept_null_rg = 0;
+
+ if ( !accept_null_rg && first_smpl==-1 )
+ {
+ // no suitable read group is available in this bam: ignore the whole file.
+ free(file->fname);
+ bsmpl->nfiles--;
+ return -1;
+ }
+ if ( !accept_null_rg ) return bsmpl->nfiles-1;
+ if ( nsmpls==1 && !nskipped )
+ {
+ file->default_idx = first_smpl;
+ return bsmpl->nfiles-1;
+ }
+ if ( !smpl_name ) smpl_name = first_smpl==-1 ? file->fname : bsmpl->smpl[first_smpl];
+
+ bsmpl_add_readgroup(bsmpl,file,"?",smpl_name);
+ return bsmpl->nfiles-1;
+}
+
+const char **bam_smpl_get_samples(bam_smpl_t *bsmpl, int *nsmpl)
+{
+ *nsmpl = bsmpl->nsmpl;
+ return (const char**)bsmpl->smpl;
+}
+
+int bam_smpl_get_sample_id(bam_smpl_t *bsmpl, int bam_id, bam1_t *bam_rec)
+{
+ file_t *file = &bsmpl->files[bam_id];
+ if ( file->default_idx >= 0 ) return file->default_idx;
+
+ char *aux_rg = (char*) bam_aux_get(bam_rec, "RG");
+ aux_rg = aux_rg ? aux_rg+1 : "?";
+
+ int rg_id;
+ if ( khash_str2int_get(file->rg2idx, aux_rg, &rg_id)==0 ) return rg_id;
+ if ( khash_str2int_get(file->rg2idx, "?", &rg_id)==0 ) return rg_id;
+ return -1;
+}
+
+int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file)
+{
+ if ( list[0]!='^' ) bsmpl->sample_logic = 1;
+ else list++;
+
+ int i, nsamples = 0;
+ char **samples = hts_readlist(list, is_file, &nsamples);
+ if ( !nsamples ) return 0;
+
+ kstring_t ori = {0,0,0};
+ kstring_t ren = {0,0,0};
+
+ bsmpl->sample_list = khash_str2str_init();
+ for (i=0; i<nsamples; i++)
+ {
+ char *ptr = samples[i];
+ ori.l = ren.l = 0;
+ int escaped = 0;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &ori);
+ escaped = 0;
+ ptr++;
+ }
+ if ( *ptr )
+ {
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &ren);
+ escaped = 0;
+ ptr++;
+ }
+ }
+ khash_str2str_set(bsmpl->sample_list,strdup(ori.s),strdup(ren.l?ren.s:ori.s));
+ free(samples[i]);
+ }
+ free(samples);
+ free(ori.s);
+ free(ren.s);
+ return nsamples;
+}
+
+int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file)
+{
+ if ( list[0]!='^' ) bsmpl->rg_logic = 1;
+ else list++;
+
+ int i, nrows = 0;
+ char **rows = hts_readlist(list, is_file, &nrows);
+ if ( !nrows ) return 0;
+
+ kstring_t fld1 = {0,0,0};
+ kstring_t fld2 = {0,0,0};
+ kstring_t fld3 = {0,0,0};
+
+ bsmpl->rg_list = khash_str2str_init();
+ for (i=0; i<nrows; i++)
+ {
+ char *ptr = rows[i];
+ fld1.l = fld2.l = fld3.l = 0;
+ int escaped = 0;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &fld1);
+ escaped = 0;
+ ptr++;
+ }
+ if ( *ptr )
+ {
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &fld2);
+ escaped = 0;
+ ptr++;
+ }
+ }
+ if ( *ptr )
+ {
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &fld3);
+ escaped = 0;
+ ptr++;
+ }
+ }
+ if ( fld3.l )
+ {
+ // ID FILE SAMPLE
+ kputc('\t',&fld1);
+ kputs(fld2.s,&fld1);
+ fld2.l = 0;
+ kputs(fld3.s,&fld2);
+ }
+ // fld2.s now contains a new sample name. If NULL, use \t to keep the bam header name
+ char *value = khash_str2str_get(bsmpl->rg_list,fld1.s);
+ if ( !value )
+ khash_str2str_set(bsmpl->rg_list,strdup(fld1.s),strdup(fld2.l?fld2.s:"\t"));
+ else if ( strcmp(value,fld2.l?fld2.s:"\t") )
+ error("Error: The read group \"%s\" was assigned to two different samples: \"%s\" and \"%s\"\n", fld1.s,value,fld2.l?fld2.s:"\t");
+ free(rows[i]);
+ }
+ free(rows);
+ free(fld1.s);
+ free(fld2.s);
+ free(fld3.s);
+ return nrows;
+}
+
+
diff --git a/samtools/errmod.h b/bcftools/bam_sample.h
similarity index 53%
rename from samtools/errmod.h
rename to bcftools/bam_sample.h
index 6db46f4..5cbcc39 100644
--- a/samtools/errmod.h
+++ b/bcftools/bam_sample.h
@@ -1,9 +1,9 @@
-/* errmod.h -- revised MAQ error model.
+/* bam_sample.h -- group data by sample.
Copyright (C) 2010 Broad Institute.
- Copyright (C) 2012 Genome Research Ltd.
+ Copyright (C) 2016 Genome Research Ltd.
- Author: Heng Li <lh3 at sanger.ac.uk>
+ Author: Heng Li <lh3 at sanger.ac.uk>, Petr Danecek <pd3 at sanger.ac.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -23,27 +23,28 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
-#ifndef ERRMOD_H
-#define ERRMOD_H
+#ifndef BAM_SAMPLE_H
+#define BAM_SAMPLE_H
-#include <stdint.h>
+#include <htslib/sam.h>
-struct __errmod_coef_t;
+typedef struct _bam_smpl_t bam_smpl_t;
-typedef struct {
- double depcorr;
- struct __errmod_coef_t *coef;
-} errmod_t;
+bam_smpl_t *bam_smpl_init(void);
-errmod_t *errmod_init(double depcorr);
-void errmod_destroy(errmod_t *em);
+int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file);
+int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file);
+void bam_smpl_ignore_readgroups(bam_smpl_t* bsmpl);
-/*
- n: number of bases
- m: maximum base
- bases[i]: qual:6, strand:1, base:4
- q[i*m+j]: phred-scaled likelihood of (i,j)
- */
-int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q);
+// The above should be called only before bams are added. Returns the BAM id
+// to be passed to bam_smpl_get_sample_id() later. It is safe to assume
+// sequential numbering, starting from 0.
+//
+int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname);
+
+const char **bam_smpl_get_samples(bam_smpl_t *bsmpl, int *nsmpl);
+int bam_smpl_get_sample_id(bam_smpl_t *bsmpl, int bam_id, bam1_t *bam_rec);
+
+void bam_smpl_destroy(bam_smpl_t *bsmpl);
#endif
diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h
index d4e856d..7d2d49f 100644
--- a/bcftools/bcftools.h
+++ b/bcftools/bcftools.h
@@ -30,6 +30,7 @@ THE SOFTWARE. */
#include <htslib/vcf.h>
#include <math.h>
+#define FT_TAB_TEXT 0 // custom tab-delimited text file
#define FT_GZ 1
#define FT_VCF 2
#define FT_VCF_GZ (FT_GZ|FT_VCF)
diff --git a/bcftools/bin.c b/bcftools/bin.c
new file mode 100644
index 0000000..b558b20
--- /dev/null
+++ b/bcftools/bin.c
@@ -0,0 +1,104 @@
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include "bcftools.h"
+#include "bin.h"
+
+struct _bin_t
+{
+ float *bins;
+ int nbins;
+};
+
+bin_t *bin_init(const char *list_def, float min, float max)
+{
+ bin_t *bin = (bin_t*) calloc(1,sizeof(bin_t));
+
+ // a comma indicates a list, otherwise a file
+ int is_file = strchr(list_def,',') ? 0 : 1;
+ int i, nlist;
+ char **list = hts_readlist(list_def, is_file, &nlist);
+ bin->nbins = nlist;
+ bin->bins = (float*) malloc(sizeof(float)*nlist);
+ for (i=0; i<nlist; i++)
+ {
+ char *tmp;
+ bin->bins[i] = strtod(list[i],&tmp);
+ if ( !tmp ) error("Could not parse %s: %s\n", list_def, list[i]);
+ if ( min!=max && (bin->bins[i]<min || bin->bins[i]>max) )
+ error("Expected values from the interval [%f,%f], found %s\n", list[i]);
+ free(list[i]);
+ }
+ free(list);
+
+ if ( min!=max )
+ {
+ // make sure we've got both boundaries: min,max.
+ assert( nlist>1 );
+ float max_err = (bin->bins[1] - bin->bins[0])*1e-6;
+ if ( fabs(bin->bins[0] - min) > max_err )
+ {
+ bin->bins = (float*) realloc(bin->bins, (++bin->nbins)*sizeof(float));
+ memmove(bin->bins+1, bin->bins, sizeof(float)*(bin->nbins-1));
+ bin->bins[0] = min;
+ }
+ if ( fabs(bin->bins[bin->nbins-1] - max) > max_err )
+ {
+ bin->bins = (float*) realloc(bin->bins, (++bin->nbins)*sizeof(float));
+ bin->bins[bin->nbins-1] = max;
+ }
+ }
+ return bin;
+}
+
+void bin_destroy(bin_t *bin)
+{
+ free(bin->bins);
+ free(bin);
+}
+
+int bin_get_size(bin_t *bin) { return bin->nbins; }
+
+float bin_get_value(bin_t *bin, int idx) { return bin->bins[idx]; }
+
+int bin_get_idx(bin_t *bin, float value)
+{
+ if ( bin->bins[bin->nbins-1] < value ) return bin->nbins-1;
+
+ // Binary search in half-closed,half-open intervals [)
+ int imin = 0, imax = bin->nbins - 2;
+ while ( imin<imax )
+ {
+ int i = (imin+imax)/2;
+ if ( value < bin->bins[i] ) imax = i - 1;
+ else if ( value > bin->bins[i] ) imin = i + 1;
+ else return i;
+ }
+ if ( bin->bins[imax] <= value ) return imax;
+ return imin - 1;
+}
+
diff --git a/bcftools/bin.c.pysam.c b/bcftools/bin.c.pysam.c
new file mode 100644
index 0000000..6469b57
--- /dev/null
+++ b/bcftools/bin.c.pysam.c
@@ -0,0 +1,106 @@
+#include "pysam.h"
+
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include "bcftools.h"
+#include "bin.h"
+
+struct _bin_t
+{
+ float *bins;
+ int nbins;
+};
+
+bin_t *bin_init(const char *list_def, float min, float max)
+{
+ bin_t *bin = (bin_t*) calloc(1,sizeof(bin_t));
+
+ // a comma indicates a list, otherwise a file
+ int is_file = strchr(list_def,',') ? 0 : 1;
+ int i, nlist;
+ char **list = hts_readlist(list_def, is_file, &nlist);
+ bin->nbins = nlist;
+ bin->bins = (float*) malloc(sizeof(float)*nlist);
+ for (i=0; i<nlist; i++)
+ {
+ char *tmp;
+ bin->bins[i] = strtod(list[i],&tmp);
+ if ( !tmp ) error("Could not parse %s: %s\n", list_def, list[i]);
+ if ( min!=max && (bin->bins[i]<min || bin->bins[i]>max) )
+ error("Expected values from the interval [%f,%f], found %s\n", list[i]);
+ free(list[i]);
+ }
+ free(list);
+
+ if ( min!=max )
+ {
+ // make sure we've got both boundaries: min,max.
+ assert( nlist>1 );
+ float max_err = (bin->bins[1] - bin->bins[0])*1e-6;
+ if ( fabs(bin->bins[0] - min) > max_err )
+ {
+ bin->bins = (float*) realloc(bin->bins, (++bin->nbins)*sizeof(float));
+ memmove(bin->bins+1, bin->bins, sizeof(float)*(bin->nbins-1));
+ bin->bins[0] = min;
+ }
+ if ( fabs(bin->bins[bin->nbins-1] - max) > max_err )
+ {
+ bin->bins = (float*) realloc(bin->bins, (++bin->nbins)*sizeof(float));
+ bin->bins[bin->nbins-1] = max;
+ }
+ }
+ return bin;
+}
+
+void bin_destroy(bin_t *bin)
+{
+ free(bin->bins);
+ free(bin);
+}
+
+int bin_get_size(bin_t *bin) { return bin->nbins; }
+
+float bin_get_value(bin_t *bin, int idx) { return bin->bins[idx]; }
+
+int bin_get_idx(bin_t *bin, float value)
+{
+ if ( bin->bins[bin->nbins-1] < value ) return bin->nbins-1;
+
+ // Binary search in half-closed,half-open intervals [)
+ int imin = 0, imax = bin->nbins - 2;
+ while ( imin<imax )
+ {
+ int i = (imin+imax)/2;
+ if ( value < bin->bins[i] ) imax = i - 1;
+ else if ( value > bin->bins[i] ) imin = i + 1;
+ else return i;
+ }
+ if ( bin->bins[imax] <= value ) return imax;
+ return imin - 1;
+}
+
diff --git a/bcftools/bin.h b/bcftools/bin.h
new file mode 100644
index 0000000..ab9e5b1
--- /dev/null
+++ b/bcftools/bin.h
@@ -0,0 +1,65 @@
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+/*
+ Simple binning of float values into predefined bins
+*/
+
+#ifndef __BIN_H__
+#define __BIN_H__
+
+#include <stdio.h>
+
+typedef struct _bin_t bin_t;
+
+/*
+ * bin_init() - init bins
+ * @list: list of half-open intervals [). If the list does not contain commas,
+ * it is interpreted as a file name.
+ * @min,max: extreme values. This is for user convenience so that well-known
+ * extremes can be left out from the list. Ignored if min=max
+ */
+bin_t *bin_init(const char *list, float min, float max);
+void bin_destroy(bin_t *bin);
+
+/*
+ * bin_get_size() - number of boundaries, subtract 1 to get the number of bins
+ */
+int bin_get_size(bin_t *bin);
+
+/*
+ bin_get_idx() - find the bin index which corresponds to the value (binary search)
+ Returns the bin index 0 <= idx <= size-2 or -1,size-1 for out of range values.
+ */
+int bin_get_idx(bin_t *bin, float value);
+
+/*
+ bin_get_value() - get the i-th boundary value, i=0,..,size-1
+ */
+float bin_get_value(bin_t *bin, int ith);
+
+#endif
+
diff --git a/bcftools/call.h b/bcftools/call.h
index bbf0a52..0d707a0 100644
--- a/bcftools/call.h
+++ b/bcftools/call.h
@@ -72,6 +72,7 @@ typedef struct
double trio_Pm_SNPs, trio_Pm_del, trio_Pm_ins; // P(mendelian) for trio calling, see mcall_call_trio_genotypes()
int32_t *ugts, *cgts; // unconstraind and constrained GTs
uint32_t output_tags;
+ char *prior_AN, *prior_AC; // reference panel AF tags (AF=AC/AN)
// ccall only
double indel_frac, min_perm_p, min_lrt;
@@ -102,7 +103,7 @@ call_t;
void error(const char *format, ...);
/*
- * *call() - return negative value on error or the number of non-reference
+ * call() - return -1 value on critical error; -2 to skip the site; or the number of non-reference
* alleles on success.
*/
int mcall(call_t *call, bcf1_t *rec); // multiallic and rare-variant calling model
diff --git a/bcftools/ccall.c b/bcftools/ccall.c
index bb43d61..9f6958a 100644
--- a/bcftools/ccall.c
+++ b/bcftools/ccall.c
@@ -189,8 +189,6 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double
bcf_update_info_string(call->hdr, rec, "CGT", tmp);
}
}
- if (pr == 0) return 1;
-
is_var = (pr->p_ref < call->pref);
r = is_var? pr->p_ref : pr->p_var;
@@ -232,11 +230,7 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double
// Remove unused alleles
int nals_ori = rec->n_allele, nals = !is_var && !(call->flag & CALL_KEEPALT) ? 1 : pr->rank0 < 2? 2 : pr->rank0+1;
- if ( call->flag & CALL_KEEPALT && call->unseen>0 )
- {
- assert( call->unseen==nals-1 );
- nals--;
- }
+ if ( call->flag & CALL_KEEPALT && call->unseen==nals-1 ) nals--;
if ( nals<rec->n_allele )
{
@@ -272,7 +266,7 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double
int i;
for (i=0; i<rec->n_sample; i++)
{
- int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i) : 2;
+ int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i, is_var) : 2;
int gt = x&3;
if ( !call->ploidy || call->ploidy[i]==2 )
{
diff --git a/bcftools/ccall.c.pysam.c b/bcftools/ccall.c.pysam.c
index d4ceb01..1765d84 100644
--- a/bcftools/ccall.c.pysam.c
+++ b/bcftools/ccall.c.pysam.c
@@ -191,8 +191,6 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double
bcf_update_info_string(call->hdr, rec, "CGT", tmp);
}
}
- if (pr == 0) return 1;
-
is_var = (pr->p_ref < call->pref);
r = is_var? pr->p_ref : pr->p_var;
@@ -234,11 +232,7 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double
// Remove unused alleles
int nals_ori = rec->n_allele, nals = !is_var && !(call->flag & CALL_KEEPALT) ? 1 : pr->rank0 < 2? 2 : pr->rank0+1;
- if ( call->flag & CALL_KEEPALT && call->unseen>0 )
- {
- assert( call->unseen==nals-1 );
- nals--;
- }
+ if ( call->flag & CALL_KEEPALT && call->unseen==nals-1 ) nals--;
if ( nals<rec->n_allele )
{
@@ -274,7 +268,7 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double
int i;
for (i=0; i<rec->n_sample; i++)
{
- int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i) : 2;
+ int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i, is_var) : 2;
int gt = x&3;
if ( !call->ploidy || call->ploidy[i]==2 )
{
diff --git a/bcftools/consensus.c b/bcftools/consensus.c
index 051f353..4fccc4f 100644
--- a/bcftools/consensus.c
+++ b/bcftools/consensus.c
@@ -27,6 +27,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <getopt.h>
#include <unistd.h>
@@ -35,7 +36,7 @@
#include <htslib/kstring.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/kseq.h>
-#include <htslib/regidx.h>
+#include "regidx.h"
#include "bcftools.h"
#include "rbuf.h"
@@ -68,6 +69,7 @@ typedef struct
int nvcf_buf, rid;
regidx_t *mask;
+ regitr_t *itr;
int chain_id; // chain_id, to provide a unique ID to each chain in the chain output
chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
@@ -202,6 +204,7 @@ static void init_data(args_t *args)
{
args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
if ( !args->mask ) error("Failed to initialize mask regions\n");
+ args->itr = regitr_init(args->mask);
}
// In case we want to store the chains
if ( args->chain_fname )
@@ -228,6 +231,7 @@ static void destroy_data(args_t *args)
free(args->vcf_buf);
free(args->fa_buf.s);
if ( args->mask ) regidx_destroy(args->mask);
+ if ( args->itr ) regitr_destroy(args->itr);
if ( args->chain_fname )
if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
@@ -409,12 +413,27 @@ static void apply_variant(args_t *args, bcf1_t *rec)
rec->d.allele[1][0] = gt2iupac(ial,jal);
}
+ int len_diff = 0, alen = 0;
int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
- if ( idx<0 || idx>=args->fa_buf.l )
+ if ( idx<0 )
+ {
+ fprintf(stderr,"Warning: ignoring overlapping variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ return;
+ }
+ if ( rec->rlen > args->fa_buf.l - idx )
+ {
+ rec->rlen = args->fa_buf.l - idx;
+ alen = strlen(rec->d.allele[ialt]);
+ if ( alen > rec->rlen )
+ {
+ rec->d.allele[ialt][rec->rlen] = 0;
+ fprintf(stderr,"Warning: trimming variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ }
+ }
+ if ( idx>=args->fa_buf.l )
error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off);
// sanity check the reference base
- int len_diff = 0, alen = 0;
if ( rec->d.allele[ialt][0]=='<' )
{
if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
@@ -495,18 +514,16 @@ static void mask_region(args_t *args, char *seq, int len)
int start = args->fa_src_pos - len;
int end = args->fa_src_pos;
- regitr_t itr;
- if ( !regidx_overlap(args->mask, chr,start,end, &itr) ) return;
+ if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return;
int idx_start, idx_end, i;
- while ( REGITR_OVERLAP(itr,start,end) )
+ while ( regitr_overlap(args->itr) )
{
- idx_start = REGITR_START(itr) - start;
- idx_end = REGITR_END(itr) - start;
+ idx_start = args->itr->beg - start;
+ idx_end = args->itr->end - start;
if ( idx_start < 0 ) idx_start = 0;
if ( idx_end >= len ) idx_end = len - 1;
for (i=idx_start; i<=idx_end; i++) seq[i] = 'N';
- itr.i++;
}
}
@@ -519,7 +536,7 @@ static void consensus(args_t *args)
{
if ( str.s[0]=='>' )
{
- // new sequence encountered, apply all chached variants
+ // new sequence encountered, apply all cached variants
while ( args->vcf_rbuf.n )
{
if (args->chain) {
@@ -576,7 +593,17 @@ static void consensus(args_t *args)
}
if ( !rec_ptr ) flush_fa_buffer(args, 60);
}
- if (args->chain) {
+ bcf1_t **rec_ptr = NULL;
+ while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
+ {
+ bcf1_t *rec = *rec_ptr;
+ if ( rec->rid!=args->rid ) break;
+ if ( args->fa_end_pos && rec->pos > args->fa_end_pos ) break;
+ if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos ) break;
+ apply_variant(args, rec);
+ }
+ if (args->chain)
+ {
print_chain(args);
destroy_chain(args);
}
@@ -588,8 +615,11 @@ static void consensus(args_t *args)
static void usage(args_t *args)
{
fprintf(stderr, "\n");
- fprintf(stderr, "About: Create consensus sequence by applying VCF variants to a reference\n");
- fprintf(stderr, " fasta file.\n");
+ fprintf(stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n");
+ fprintf(stderr, " file. By default, the program will apply all ALT variants. Using the\n");
+ fprintf(stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n");
+ fprintf(stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
+ fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n");
fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c
index 91aa5ae..51d9339 100644
--- a/bcftools/consensus.c.pysam.c
+++ b/bcftools/consensus.c.pysam.c
@@ -29,6 +29,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <getopt.h>
#include <unistd.h>
@@ -37,7 +38,7 @@
#include <htslib/kstring.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/kseq.h>
-#include <htslib/regidx.h>
+#include "regidx.h"
#include "bcftools.h"
#include "rbuf.h"
@@ -70,6 +71,7 @@ typedef struct
int nvcf_buf, rid;
regidx_t *mask;
+ regitr_t *itr;
int chain_id; // chain_id, to provide a unique ID to each chain in the chain output
chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
@@ -204,6 +206,7 @@ static void init_data(args_t *args)
{
args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
if ( !args->mask ) error("Failed to initialize mask regions\n");
+ args->itr = regitr_init(args->mask);
}
// In case we want to store the chains
if ( args->chain_fname )
@@ -230,6 +233,7 @@ static void destroy_data(args_t *args)
free(args->vcf_buf);
free(args->fa_buf.s);
if ( args->mask ) regidx_destroy(args->mask);
+ if ( args->itr ) regitr_destroy(args->itr);
if ( args->chain_fname )
if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
@@ -411,12 +415,27 @@ static void apply_variant(args_t *args, bcf1_t *rec)
rec->d.allele[1][0] = gt2iupac(ial,jal);
}
+ int len_diff = 0, alen = 0;
int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
- if ( idx<0 || idx>=args->fa_buf.l )
+ if ( idx<0 )
+ {
+ fprintf(pysam_stderr,"Warning: ignoring overlapping variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ return;
+ }
+ if ( rec->rlen > args->fa_buf.l - idx )
+ {
+ rec->rlen = args->fa_buf.l - idx;
+ alen = strlen(rec->d.allele[ialt]);
+ if ( alen > rec->rlen )
+ {
+ rec->d.allele[ialt][rec->rlen] = 0;
+ fprintf(pysam_stderr,"Warning: trimming variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ }
+ }
+ if ( idx>=args->fa_buf.l )
error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off);
// sanity check the reference base
- int len_diff = 0, alen = 0;
if ( rec->d.allele[ialt][0]=='<' )
{
if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
@@ -497,18 +516,16 @@ static void mask_region(args_t *args, char *seq, int len)
int start = args->fa_src_pos - len;
int end = args->fa_src_pos;
- regitr_t itr;
- if ( !regidx_overlap(args->mask, chr,start,end, &itr) ) return;
+ if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return;
int idx_start, idx_end, i;
- while ( REGITR_OVERLAP(itr,start,end) )
+ while ( regitr_overlap(args->itr) )
{
- idx_start = REGITR_START(itr) - start;
- idx_end = REGITR_END(itr) - start;
+ idx_start = args->itr->beg - start;
+ idx_end = args->itr->end - start;
if ( idx_start < 0 ) idx_start = 0;
if ( idx_end >= len ) idx_end = len - 1;
for (i=idx_start; i<=idx_end; i++) seq[i] = 'N';
- itr.i++;
}
}
@@ -521,7 +538,7 @@ static void consensus(args_t *args)
{
if ( str.s[0]=='>' )
{
- // new sequence encountered, apply all chached variants
+ // new sequence encountered, apply all cached variants
while ( args->vcf_rbuf.n )
{
if (args->chain) {
@@ -578,7 +595,17 @@ static void consensus(args_t *args)
}
if ( !rec_ptr ) flush_fa_buffer(args, 60);
}
- if (args->chain) {
+ bcf1_t **rec_ptr = NULL;
+ while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
+ {
+ bcf1_t *rec = *rec_ptr;
+ if ( rec->rid!=args->rid ) break;
+ if ( args->fa_end_pos && rec->pos > args->fa_end_pos ) break;
+ if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos ) break;
+ apply_variant(args, rec);
+ }
+ if (args->chain)
+ {
print_chain(args);
destroy_chain(args);
}
@@ -590,8 +617,11 @@ static void consensus(args_t *args)
static void usage(args_t *args)
{
fprintf(pysam_stderr, "\n");
- fprintf(pysam_stderr, "About: Create consensus sequence by applying VCF variants to a reference\n");
- fprintf(pysam_stderr, " fasta file.\n");
+ fprintf(pysam_stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n");
+ fprintf(pysam_stderr, " file. By default, the program will apply all ALT variants. Using the\n");
+ fprintf(pysam_stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n");
+ fprintf(pysam_stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
+ fprintf(pysam_stderr, " information, such as INFO/AD or FORMAT/AD.\n");
fprintf(pysam_stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
fprintf(pysam_stderr, "Options:\n");
fprintf(pysam_stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
diff --git a/bcftools/convert.c b/bcftools/convert.c
index 3e289f0..05dce01 100644
--- a/bcftools/convert.c
+++ b/bcftools/convert.c
@@ -1,6 +1,6 @@
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -62,13 +62,19 @@ THE SOFTWARE. */
#define T_IUPAC_GT 23
#define T_GT_TO_HAP 24 // not publicly advertised
#define T_GT_TO_HAP2 25 // not publicly advertised
+#define T_TBCSQ 26
+#define T_END 27
+#define T_POS0 28
+#define T_END0 29
typedef struct _fmt_t
{
int type, id, is_gt_field, ready, subscript;
char *key;
bcf_fmt_t *fmt;
+ void *usr; // user data (optional)
void (*handler)(convert_t *, bcf1_t *, struct _fmt_t *, int, kstring_t *);
+ void (*destroy)(void*); // clean user data (optional)
}
fmt_t;
@@ -88,9 +94,19 @@ struct _convert_t
int allow_undef_tags;
};
+typedef struct
+{
+ kstring_t hap1,hap2;
+ char **str;
+ int n, m;
+}
+bcsq_t;
static void process_chrom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str); }
static void process_pos(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+1, str); }
+static void process_pos0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos, str); }
+static void process_end(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+line->rlen, str); }
+static void process_end0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+line->rlen-1, str); }
static void process_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.id, str); }
static void process_ref(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.allele[0], str); }
static void process_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
@@ -125,7 +141,7 @@ static void process_first_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
static void process_qual(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
if ( bcf_float_is_missing(line->qual) ) kputc('.', str);
- else ksprintf(str, "%g", line->qual);
+ else kputd(line->qual, str);
}
static void process_filter(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
@@ -193,7 +209,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
case BCF_BT_INT8: if ( info->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
case BCF_BT_INT16: if ( info->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
- case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else ksprintf(str, "%g", info->v1.f); break;
+ case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else kputd(info->v1.f, str); break;
case BCF_BT_CHAR: kputc(info->v1.i, str); break;
default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
}
@@ -215,7 +231,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
case BCF_BT_INT8: BRANCH(int8_t, val==bcf_int8_missing, val==bcf_int8_vector_end, kputw(val, str)); break;
case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
- case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), ksprintf(str, "%g", val)); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
@@ -226,6 +242,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
static void init_format(convert_t *convert, bcf1_t *line, fmt_t *fmt)
{
fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key);
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,fmt->id) ) fmt->id = -1;
fmt->fmt = NULL;
if ( fmt->id >= 0 )
{
@@ -261,7 +278,7 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
if ( bcf_float_is_missing(ptr[fmt->subscript]) || bcf_float_is_vector_end(ptr[fmt->subscript]) )
kputc('.', str);
else
- ksprintf(str, "%g", ptr[fmt->subscript]);
+ kputd(ptr[fmt->subscript], str);
}
else if ( fmt->fmt->type != BCF_BT_CHAR )
{
@@ -316,6 +333,111 @@ static void process_tgt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isampl
}
if (l == 0) kputc('.', str);
}
+static void destroy_tbcsq(void *usr)
+{
+ if ( !usr ) return;
+ bcsq_t *csq = (bcsq_t*) usr;
+ free(csq->hap1.s);
+ free(csq->hap2.s);
+ if ( csq->n )
+ free(csq->str[0]);
+ free(csq->str);
+ free(csq);
+}
+static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( !fmt->ready )
+ {
+ init_format(convert, line, fmt);
+
+ bcsq_t *csq;
+ if ( fmt->usr )
+ {
+ csq = (bcsq_t*) fmt->usr;
+ if ( csq->n )
+ free(csq->str[0]);
+ csq->n = 0;
+ }
+ else
+ csq = (bcsq_t*) calloc(1,sizeof(bcsq_t));
+ fmt->usr = csq;
+
+ int i=0, len = 0;
+ char *tmp = NULL;
+ if ( bcf_get_info_string(convert->header,line,fmt->key,&tmp,&len)<0 )
+ {
+ csq->n = 0;
+ return;
+ }
+ do
+ {
+ csq->n++;
+ hts_expand(char*, csq->n, csq->m, csq->str);
+ csq->str[ csq->n-1 ] = tmp + i;
+ while ( i<len && tmp[i]!=',' ) i++;
+ if ( i<len && tmp[i]==',' ) tmp[i++] = 0;
+ }
+ while ( i<len );
+ }
+
+ bcsq_t *csq = (bcsq_t*)fmt->usr;
+
+ if ( fmt->fmt==NULL || !csq->n ) return;
+
+ csq->hap1.l = 0;
+ csq->hap2.l = 0;
+
+ int mask = fmt->subscript==0 ? 3 : 1; // merge both haplotypes if subscript==0
+
+ #define BRANCH(type_t, nbits) { \
+ type_t *x = (type_t*)(fmt->fmt->p + isample*fmt->fmt->size); \
+ int i,j; \
+ if ( fmt->subscript<=0 || fmt->subscript==1 ) \
+ { \
+ for (j=0; j < fmt->fmt->n; j++) \
+ { \
+ type_t val = x[j]; \
+ if ( !val ) continue; \
+ for (i=0; i<nbits; i+=2) \
+ if ( val & (mask<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
+ } \
+ } \
+ if ( fmt->subscript<0 || fmt->subscript==2 ) \
+ { \
+ for (j=0; j < fmt->fmt->n; j++) \
+ { \
+ type_t val = x[j]; \
+ if ( !val ) continue; \
+ for (i=1; i<nbits; i+=2) \
+ if ( val & (1<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
+ } \
+ } \
+ }
+ switch (fmt->fmt->type)
+ {
+ case BCF_BT_INT8: BRANCH(uint8_t, 8); break;
+ case BCF_BT_INT16: BRANCH(uint16_t,16); break;
+ case BCF_BT_INT32: BRANCH(uint32_t,32); break;
+ default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break;
+ }
+ #undef BRANCH
+
+ if ( !csq->hap1.l && !csq->hap2.l ) return;
+
+ if ( csq->hap1.l ) csq->hap1.s[--csq->hap1.l] = 0;
+ if ( csq->hap2.l ) csq->hap2.s[--csq->hap2.l] = 0;
+
+ if ( fmt->subscript<0 )
+ {
+ kputs(csq->hap1.l?csq->hap1.s:".", str);
+ kputc_('\t', str);
+ kputs(csq->hap2.l?csq->hap2.s:".", str);
+ }
+ else if ( fmt->subscript<2 )
+ kputs(csq->hap1.l?csq->hap1.s:".", str);
+ else
+ kputs(csq->hap2.l?csq->hap2.s:".", str);
+}
static void init_format_iupac(convert_t *convert, bcf1_t *line, fmt_t *fmt)
{
init_format(convert, line, fmt);
@@ -409,6 +531,7 @@ static void process_type(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
if ( line_type & VCF_MNP ) { if (i) kputc(',',str); kputs("MNP", str); i++; }
if ( line_type & VCF_INDEL ) { if (i) kputc(',',str); kputs("INDEL", str); i++; }
if ( line_type & VCF_OTHER ) { if (i) kputc(',',str); kputs("OTHER", str); i++; }
+ if ( line_type & VCF_BND ) { if (i) kputc(',',str); kputs("BND", str); i++; }
}
static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
@@ -597,103 +720,260 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
// the allele (0/1) and the asterisk (*); e.g., "0* 1*" for a
// heterozygous genotype of unknown phase.
- int m, n, i;
-
- m = convert->ndat / sizeof(int32_t);
- n = bcf_get_genotypes(convert->header, line, &convert->dat, &m);
- convert->ndat = m * sizeof(int32_t);
-
- if ( n<=0 )
- {
- // Throw an error or silently proceed?
- //
- // for (i=0; i<convert->nsamples; i++) kputs(" ...", str);
- // return;
-
- error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
- }
-
- n /= convert->nsamples;
+ int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+ bcf_fmt_t *fmt_gt = NULL;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
+ if ( !fmt_gt )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+
+ // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99
+ if ( line->n_allele > 100 )
+ error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+ if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
+ error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
+
+ if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
+ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+
+ int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
for (i=0; i<convert->nsamples; i++)
{
- int32_t *ptr = (int32_t*)convert->dat + i*n;
- int j;
- for (j=0; j<n; j++)
- if ( ptr[j]==bcf_int32_vector_end ) break;
-
- if (i>0) kputs(" ", str); // no space separation for first column
- if ( j==2 )
+ ptr += fmt_gt->n;
+ if ( ptr[0]==2 )
{
- // diploid
- if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) {
- kputs("? ?", str);
+ if ( ptr[1]==3 ) /* 0|0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 0|1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 0/0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 0/1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 0/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
- else if ( bcf_gt_is_phased(ptr[1])) {
- ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 0|x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
}
- else {
- ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else /* 0/x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
}
}
- else if ( j==1 )
+ else if ( ptr[0]==4 )
{
- // haploid
- if ( bcf_gt_is_missing(ptr[0]) )
- kputs("? -", str);
- else if ( bcf_gt_allele(ptr[0])==1 )
- kputs("1 -", str); // first ALT allele
- else
- kputs("0 -", str); // REF or something else than first ALT
+ if ( ptr[1]==3 ) /* 1|0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 1|1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 1/0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 1/1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 1/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 1|x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
+ }
+ else /* 1/x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ }
+ else if ( bcf_gt_is_missing(ptr[0]) )
+ {
+ if ( ptr[1]==bcf_int8_vector_end )
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ }
+ else if ( ptr[1]==bcf_int8_vector_end )
+ {
+ /* use REF for something else than first ALT */
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else
+ {
+ kputw(bcf_gt_allele(ptr[0]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
}
- else error("FIXME: not ready for ploidy %d\n", j);
}
+ str->s[--str->l] = 0; // delete the last space
}
static void process_gt_to_hap2(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
// same as process_gt_to_hap but converts haploid genotypes into diploid
- int m, n, i;
-
- m = convert->ndat / sizeof(int32_t);
- n = bcf_get_genotypes(convert->header, line, &convert->dat, &m);
- convert->ndat = m * sizeof(int32_t);
-
- if ( n<=0 )
- error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
- n /= convert->nsamples;
+ int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+ bcf_fmt_t *fmt_gt = NULL;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
+ if ( !fmt_gt )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+
+ // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99
+ if ( line->n_allele > 100 )
+ error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+ if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
+ error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
+
+ if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
+ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+
+ int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
for (i=0; i<convert->nsamples; i++)
{
- int32_t *ptr = (int32_t*)convert->dat + i*n;
- int j;
- for (j=0; j<n; j++)
- if ( ptr[j]==bcf_int32_vector_end ) break;
-
- if (i>0) kputs(" ", str); // no space separation for first column
- if ( j==2 )
+ ptr += fmt_gt->n;
+ if ( ptr[0]==2 )
{
- // diploid
- if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) {
- kputs("? ?", str);
+ if ( ptr[1]==3 ) /* 0|0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 0|1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 0 -> 0|0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 0/0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 0/1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
}
- else if ( bcf_gt_is_phased(ptr[1])) {
- ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 0/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
- else {
- ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 0|x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
+ }
+ else /* 0/x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
}
}
- else if ( j==1 )
+ else if ( ptr[0]==4 )
{
- // haploid
- if ( bcf_gt_is_missing(ptr[0]) )
- kputs("? ?", str);
- else if ( bcf_gt_allele(ptr[0])==1 )
- kputs("1 1", str); // first ALT allele
- else
- kputs("0 0", str); // REF or something else than first ALT
+ if ( ptr[1]==3 ) /* 1|0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 1|1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 1 -> 1|1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 1/0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 1/1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 1/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 1|x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
+ }
+ else /* 1/x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ }
+ else if ( bcf_gt_is_missing(ptr[0]) )
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end )
+ {
+ /* use REF for something else than first ALT */
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else
+ {
+ kputw(bcf_gt_allele(ptr[0]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
}
- else error("FIXME: not ready for ploidy %d\n", j);
}
+ str->s[--str->l] = 0; // delete the last space
}
static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
@@ -709,6 +989,8 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
fmt->key = key ? strdup(key) : NULL;
fmt->is_gt_field = is_gtf;
fmt->subscript = -1;
+ fmt->usr = NULL;
+ fmt->destroy = NULL;
// Allow non-format tags, such as CHROM, INFO, etc., to appear amongst the format tags.
if ( key )
@@ -718,6 +1000,9 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
{
if ( !strcmp("CHROM",key) ) { fmt->type = T_CHROM; }
else if ( !strcmp("POS",key) ) { fmt->type = T_POS; }
+ else if ( !strcmp("POS0",key) ) { fmt->type = T_POS0; }
+ else if ( !strcmp("END",key) ) { fmt->type = T_END; }
+ else if ( !strcmp("END0",key) ) { fmt->type = T_END0; }
else if ( !strcmp("ID",key) ) { fmt->type = T_ID; }
else if ( !strcmp("REF",key) ) { fmt->type = T_REF; }
else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
@@ -742,6 +1027,9 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
case T_GP_TO_PROB3: fmt->handler = &process_gp_to_prob3; break;
case T_CHROM: fmt->handler = &process_chrom; break;
case T_POS: fmt->handler = &process_pos; break;
+ case T_POS0: fmt->handler = &process_pos0; break;
+ case T_END: fmt->handler = &process_end; break;
+ case T_END0: fmt->handler = &process_end0; break;
case T_ID: fmt->handler = &process_id; break;
case T_REF: fmt->handler = &process_ref; break;
case T_ALT: fmt->handler = &process_alt; break;
@@ -759,15 +1047,17 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
case T_IUPAC_GT: fmt->handler = &process_iupac_gt; convert->max_unpack |= BCF_UN_FMT; break;
case T_GT_TO_HAP: fmt->handler = &process_gt_to_hap; convert->max_unpack |= BCF_UN_FMT; break;
case T_GT_TO_HAP2: fmt->handler = &process_gt_to_hap2; convert->max_unpack |= BCF_UN_FMT; break;
- case T_LINE: fmt->handler = &process_line; break;
+ case T_TBCSQ: fmt->handler = &process_tbcsq; fmt->destroy = &destroy_tbcsq; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_LINE: fmt->handler = &process_line; convert->max_unpack |= BCF_UN_FMT; break;
default: error("TODO: handler for type %d\n", fmt->type);
}
- if ( key )
+ if ( key && fmt->type==T_INFO )
{
- if ( fmt->type==T_INFO )
+ fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,fmt->id) )
{
- fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
- if ( fmt->id==-1 ) convert->undef_info_tag = strdup(key);
+ fmt->id = -1;
+ convert->undef_info_tag = strdup(key);
}
}
return fmt;
@@ -797,6 +1087,16 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf);
else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf);
else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf);
+ else if ( !strcmp(str.s, "TBCSQ") )
+ {
+ fmt_t *fmt = register_tag(convert, T_TBCSQ, "BCSQ", is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ if ( fmt->subscript==-1 )
+ {
+ if ( !strncmp(q,"{*}",3) ) { fmt->subscript = 0; q += 3; }
+ }
+ else fmt->subscript++;
+ }
else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf);
else if ( !strcmp(str.s, "INFO") )
{
@@ -819,6 +1119,9 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
{
if ( !strcmp(str.s, "CHROM") ) register_tag(convert, T_CHROM, str.s, is_gtf);
else if ( !strcmp(str.s, "POS") ) register_tag(convert, T_POS, str.s, is_gtf);
+ else if ( !strcmp(str.s, "POS0") ) register_tag(convert, T_POS0, str.s, is_gtf);
+ else if ( !strcmp(str.s, "END") ) register_tag(convert, T_END, str.s, is_gtf);
+ else if ( !strcmp(str.s, "END0") ) register_tag(convert, T_END0, str.s, is_gtf);
else if ( !strcmp(str.s, "ID") ) register_tag(convert, T_ID, str.s, is_gtf);
else if ( !strcmp(str.s, "REF") ) register_tag(convert, T_REF, str.s, is_gtf);
else if ( !strcmp(str.s, "ALT") )
@@ -903,6 +1206,8 @@ convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *
default: p = parse_sep(convert, p, is_gtf); break;
}
}
+ if ( is_gtf )
+ error("Could not parse the format string, missing the square bracket \"]\": %s\n", convert->format_str);
if ( nsamples )
{
@@ -923,7 +1228,10 @@ void convert_destroy(convert_t *convert)
{
int i;
for (i=0; i<convert->nfmt; i++)
+ {
+ if ( convert->fmt[i].destroy ) convert->fmt[i].destroy(convert->fmt[i].usr);
free(convert->fmt[i].key);
+ }
free(convert->fmt);
free(convert->undef_info_tag);
free(convert->dat);
@@ -984,7 +1292,7 @@ int convert_header(convert_t *convert, kstring_t *str)
int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
{
if ( !convert->allow_undef_tags && convert->undef_info_tag )
- error("Error: no such tag defined in the VCF header: INFO/%s\n", convert->undef_info_tag);
+ error("Error: no such tag defined in the VCF header: INFO/%s. FORMAT fields must be in square brackets, e.g. \"[ %s]\"\n", convert->undef_info_tag,convert->undef_info_tag);
int l_ori = str->l;
bcf_unpack(line, convert->max_unpack);
@@ -993,17 +1301,24 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
str->l = 0;
for (i=0; i<convert->nfmt; i++)
{
- // Genotype fields
+ // Genotype fields.
if ( convert->fmt[i].is_gt_field )
{
int j = i, js, k;
- while ( convert->fmt[j].is_gt_field )
+ while ( j<convert->nfmt && convert->fmt[j].is_gt_field )
{
convert->fmt[j].ready = 0;
j++;
}
for (js=0; js<convert->nsamples; js++)
{
+ // Here comes a hack designed for TBCSQ. When running on large files,
+ // such as 1000GP, there are too many empty fields in the output and
+ // it's very very slow. Therefore in case the handler does not add
+ // anything to the string, we trim all genotype fields enclosed in square
+ // brackets here. This may be changed in future, time will show...
+ size_t l_start = str->l;
+
int ks = convert->samples[js];
for (k=i; k<j; k++)
{
@@ -1013,7 +1328,11 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
kputc(bcf_sr_has_line(convert->readers,ir)?'1':'0', str);
}
else if ( convert->fmt[k].handler )
+ {
+ size_t l = str->l;
convert->fmt[k].handler(convert, line, &convert->fmt[k], ks, str);
+ if ( l==str->l ) { str->l = l_start; break; } // only TBCSQ does this
+ }
}
}
i = j-1;
@@ -1027,6 +1346,7 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
}
else if ( convert->fmt[i].handler )
convert->fmt[i].handler(convert, line, &convert->fmt[i], -1, str);
+
}
return str->l - l_ori;
}
diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c
index 084ef50..95814b7 100644
--- a/bcftools/convert.c.pysam.c
+++ b/bcftools/convert.c.pysam.c
@@ -2,7 +2,7 @@
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -64,13 +64,19 @@ THE SOFTWARE. */
#define T_IUPAC_GT 23
#define T_GT_TO_HAP 24 // not publicly advertised
#define T_GT_TO_HAP2 25 // not publicly advertised
+#define T_TBCSQ 26
+#define T_END 27
+#define T_POS0 28
+#define T_END0 29
typedef struct _fmt_t
{
int type, id, is_gt_field, ready, subscript;
char *key;
bcf_fmt_t *fmt;
+ void *usr; // user data (optional)
void (*handler)(convert_t *, bcf1_t *, struct _fmt_t *, int, kstring_t *);
+ void (*destroy)(void*); // clean user data (optional)
}
fmt_t;
@@ -90,9 +96,19 @@ struct _convert_t
int allow_undef_tags;
};
+typedef struct
+{
+ kstring_t hap1,hap2;
+ char **str;
+ int n, m;
+}
+bcsq_t;
static void process_chrom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str); }
static void process_pos(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+1, str); }
+static void process_pos0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos, str); }
+static void process_end(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+line->rlen, str); }
+static void process_end0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+line->rlen-1, str); }
static void process_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.id, str); }
static void process_ref(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.allele[0], str); }
static void process_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
@@ -127,7 +143,7 @@ static void process_first_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
static void process_qual(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
if ( bcf_float_is_missing(line->qual) ) kputc('.', str);
- else ksprintf(str, "%g", line->qual);
+ else kputd(line->qual, str);
}
static void process_filter(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
@@ -195,7 +211,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
case BCF_BT_INT8: if ( info->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
case BCF_BT_INT16: if ( info->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
- case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else ksprintf(str, "%g", info->v1.f); break;
+ case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else kputd(info->v1.f, str); break;
case BCF_BT_CHAR: kputc(info->v1.i, str); break;
default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break;
}
@@ -217,7 +233,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
case BCF_BT_INT8: BRANCH(int8_t, val==bcf_int8_missing, val==bcf_int8_vector_end, kputw(val, str)); break;
case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
- case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), ksprintf(str, "%g", val)); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
@@ -228,6 +244,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
static void init_format(convert_t *convert, bcf1_t *line, fmt_t *fmt)
{
fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key);
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,fmt->id) ) fmt->id = -1;
fmt->fmt = NULL;
if ( fmt->id >= 0 )
{
@@ -263,7 +280,7 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
if ( bcf_float_is_missing(ptr[fmt->subscript]) || bcf_float_is_vector_end(ptr[fmt->subscript]) )
kputc('.', str);
else
- ksprintf(str, "%g", ptr[fmt->subscript]);
+ kputd(ptr[fmt->subscript], str);
}
else if ( fmt->fmt->type != BCF_BT_CHAR )
{
@@ -318,6 +335,111 @@ static void process_tgt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isampl
}
if (l == 0) kputc('.', str);
}
+static void destroy_tbcsq(void *usr)
+{
+ if ( !usr ) return;
+ bcsq_t *csq = (bcsq_t*) usr;
+ free(csq->hap1.s);
+ free(csq->hap2.s);
+ if ( csq->n )
+ free(csq->str[0]);
+ free(csq->str);
+ free(csq);
+}
+static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( !fmt->ready )
+ {
+ init_format(convert, line, fmt);
+
+ bcsq_t *csq;
+ if ( fmt->usr )
+ {
+ csq = (bcsq_t*) fmt->usr;
+ if ( csq->n )
+ free(csq->str[0]);
+ csq->n = 0;
+ }
+ else
+ csq = (bcsq_t*) calloc(1,sizeof(bcsq_t));
+ fmt->usr = csq;
+
+ int i=0, len = 0;
+ char *tmp = NULL;
+ if ( bcf_get_info_string(convert->header,line,fmt->key,&tmp,&len)<0 )
+ {
+ csq->n = 0;
+ return;
+ }
+ do
+ {
+ csq->n++;
+ hts_expand(char*, csq->n, csq->m, csq->str);
+ csq->str[ csq->n-1 ] = tmp + i;
+ while ( i<len && tmp[i]!=',' ) i++;
+ if ( i<len && tmp[i]==',' ) tmp[i++] = 0;
+ }
+ while ( i<len );
+ }
+
+ bcsq_t *csq = (bcsq_t*)fmt->usr;
+
+ if ( fmt->fmt==NULL || !csq->n ) return;
+
+ csq->hap1.l = 0;
+ csq->hap2.l = 0;
+
+ int mask = fmt->subscript==0 ? 3 : 1; // merge both haplotypes if subscript==0
+
+ #define BRANCH(type_t, nbits) { \
+ type_t *x = (type_t*)(fmt->fmt->p + isample*fmt->fmt->size); \
+ int i,j; \
+ if ( fmt->subscript<=0 || fmt->subscript==1 ) \
+ { \
+ for (j=0; j < fmt->fmt->n; j++) \
+ { \
+ type_t val = x[j]; \
+ if ( !val ) continue; \
+ for (i=0; i<nbits; i+=2) \
+ if ( val & (mask<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
+ } \
+ } \
+ if ( fmt->subscript<0 || fmt->subscript==2 ) \
+ { \
+ for (j=0; j < fmt->fmt->n; j++) \
+ { \
+ type_t val = x[j]; \
+ if ( !val ) continue; \
+ for (i=1; i<nbits; i+=2) \
+ if ( val & (1<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
+ } \
+ } \
+ }
+ switch (fmt->fmt->type)
+ {
+ case BCF_BT_INT8: BRANCH(uint8_t, 8); break;
+ case BCF_BT_INT16: BRANCH(uint16_t,16); break;
+ case BCF_BT_INT32: BRANCH(uint32_t,32); break;
+ default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break;
+ }
+ #undef BRANCH
+
+ if ( !csq->hap1.l && !csq->hap2.l ) return;
+
+ if ( csq->hap1.l ) csq->hap1.s[--csq->hap1.l] = 0;
+ if ( csq->hap2.l ) csq->hap2.s[--csq->hap2.l] = 0;
+
+ if ( fmt->subscript<0 )
+ {
+ kputs(csq->hap1.l?csq->hap1.s:".", str);
+ kputc_('\t', str);
+ kputs(csq->hap2.l?csq->hap2.s:".", str);
+ }
+ else if ( fmt->subscript<2 )
+ kputs(csq->hap1.l?csq->hap1.s:".", str);
+ else
+ kputs(csq->hap2.l?csq->hap2.s:".", str);
+}
static void init_format_iupac(convert_t *convert, bcf1_t *line, fmt_t *fmt)
{
init_format(convert, line, fmt);
@@ -411,6 +533,7 @@ static void process_type(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
if ( line_type & VCF_MNP ) { if (i) kputc(',',str); kputs("MNP", str); i++; }
if ( line_type & VCF_INDEL ) { if (i) kputc(',',str); kputs("INDEL", str); i++; }
if ( line_type & VCF_OTHER ) { if (i) kputc(',',str); kputs("OTHER", str); i++; }
+ if ( line_type & VCF_BND ) { if (i) kputc(',',str); kputs("BND", str); i++; }
}
static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
@@ -599,103 +722,260 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
// the allele (0/1) and the asterisk (*); e.g., "0* 1*" for a
// heterozygous genotype of unknown phase.
- int m, n, i;
-
- m = convert->ndat / sizeof(int32_t);
- n = bcf_get_genotypes(convert->header, line, &convert->dat, &m);
- convert->ndat = m * sizeof(int32_t);
-
- if ( n<=0 )
- {
- // Throw an error or silently proceed?
- //
- // for (i=0; i<convert->nsamples; i++) kputs(" ...", str);
- // return;
-
- error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
- }
-
- n /= convert->nsamples;
+ int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+ bcf_fmt_t *fmt_gt = NULL;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
+ if ( !fmt_gt )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+
+ // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99
+ if ( line->n_allele > 100 )
+ error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+ if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
+ error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
+
+ if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
+ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+
+ int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
for (i=0; i<convert->nsamples; i++)
{
- int32_t *ptr = (int32_t*)convert->dat + i*n;
- int j;
- for (j=0; j<n; j++)
- if ( ptr[j]==bcf_int32_vector_end ) break;
-
- if (i>0) kputs(" ", str); // no space separation for first column
- if ( j==2 )
+ ptr += fmt_gt->n;
+ if ( ptr[0]==2 )
{
- // diploid
- if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) {
- kputs("? ?", str);
+ if ( ptr[1]==3 ) /* 0|0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 0|1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 0/0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 0/1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 0/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
- else if ( bcf_gt_is_phased(ptr[1])) {
- ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 0|x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
}
- else {
- ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else /* 0/x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
}
}
- else if ( j==1 )
+ else if ( ptr[0]==4 )
{
- // haploid
- if ( bcf_gt_is_missing(ptr[0]) )
- kputs("? -", str);
- else if ( bcf_gt_allele(ptr[0])==1 )
- kputs("1 -", str); // first ALT allele
- else
- kputs("0 -", str); // REF or something else than first ALT
+ if ( ptr[1]==3 ) /* 1|0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 1|1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 1/0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 1/1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 1/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 1|x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
+ }
+ else /* 1/x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ }
+ else if ( bcf_gt_is_missing(ptr[0]) )
+ {
+ if ( ptr[1]==bcf_int8_vector_end )
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ }
+ else if ( ptr[1]==bcf_int8_vector_end )
+ {
+ /* use REF for something else than first ALT */
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else
+ {
+ kputw(bcf_gt_allele(ptr[0]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
}
- else error("FIXME: not ready for ploidy %d\n", j);
}
+ str->s[--str->l] = 0; // delete the last space
}
static void process_gt_to_hap2(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
// same as process_gt_to_hap but converts haploid genotypes into diploid
- int m, n, i;
-
- m = convert->ndat / sizeof(int32_t);
- n = bcf_get_genotypes(convert->header, line, &convert->dat, &m);
- convert->ndat = m * sizeof(int32_t);
-
- if ( n<=0 )
- error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
- n /= convert->nsamples;
+ int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+ bcf_fmt_t *fmt_gt = NULL;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
+ if ( !fmt_gt )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+
+ // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99
+ if ( line->n_allele > 100 )
+ error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+ if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
+ error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
+
+ if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
+ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+
+ int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
for (i=0; i<convert->nsamples; i++)
{
- int32_t *ptr = (int32_t*)convert->dat + i*n;
- int j;
- for (j=0; j<n; j++)
- if ( ptr[j]==bcf_int32_vector_end ) break;
-
- if (i>0) kputs(" ", str); // no space separation for first column
- if ( j==2 )
+ ptr += fmt_gt->n;
+ if ( ptr[0]==2 )
{
- // diploid
- if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) {
- kputs("? ?", str);
+ if ( ptr[1]==3 ) /* 0|0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 0|1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 0 -> 0|0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 0/0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 0/1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
}
- else if ( bcf_gt_is_phased(ptr[1])) {
- ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 0/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
- else {
- ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 0|x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
+ }
+ else /* 0/x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
}
}
- else if ( j==1 )
+ else if ( ptr[0]==4 )
{
- // haploid
- if ( bcf_gt_is_missing(ptr[0]) )
- kputs("? ?", str);
- else if ( bcf_gt_allele(ptr[0])==1 )
- kputs("1 1", str); // first ALT allele
- else
- kputs("0 0", str); // REF or something else than first ALT
+ if ( ptr[1]==3 ) /* 1|0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 1|1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 1 -> 1|1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 1/0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 1/1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 1/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 1|x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
+ }
+ else /* 1/x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ }
+ else if ( bcf_gt_is_missing(ptr[0]) )
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end )
+ {
+ /* use REF for something else than first ALT */
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else
+ {
+ kputw(bcf_gt_allele(ptr[0]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
}
- else error("FIXME: not ready for ploidy %d\n", j);
}
+ str->s[--str->l] = 0; // delete the last space
}
static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
@@ -711,6 +991,8 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
fmt->key = key ? strdup(key) : NULL;
fmt->is_gt_field = is_gtf;
fmt->subscript = -1;
+ fmt->usr = NULL;
+ fmt->destroy = NULL;
// Allow non-format tags, such as CHROM, INFO, etc., to appear amongst the format tags.
if ( key )
@@ -720,6 +1002,9 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
{
if ( !strcmp("CHROM",key) ) { fmt->type = T_CHROM; }
else if ( !strcmp("POS",key) ) { fmt->type = T_POS; }
+ else if ( !strcmp("POS0",key) ) { fmt->type = T_POS0; }
+ else if ( !strcmp("END",key) ) { fmt->type = T_END; }
+ else if ( !strcmp("END0",key) ) { fmt->type = T_END0; }
else if ( !strcmp("ID",key) ) { fmt->type = T_ID; }
else if ( !strcmp("REF",key) ) { fmt->type = T_REF; }
else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
@@ -744,6 +1029,9 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
case T_GP_TO_PROB3: fmt->handler = &process_gp_to_prob3; break;
case T_CHROM: fmt->handler = &process_chrom; break;
case T_POS: fmt->handler = &process_pos; break;
+ case T_POS0: fmt->handler = &process_pos0; break;
+ case T_END: fmt->handler = &process_end; break;
+ case T_END0: fmt->handler = &process_end0; break;
case T_ID: fmt->handler = &process_id; break;
case T_REF: fmt->handler = &process_ref; break;
case T_ALT: fmt->handler = &process_alt; break;
@@ -761,15 +1049,17 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
case T_IUPAC_GT: fmt->handler = &process_iupac_gt; convert->max_unpack |= BCF_UN_FMT; break;
case T_GT_TO_HAP: fmt->handler = &process_gt_to_hap; convert->max_unpack |= BCF_UN_FMT; break;
case T_GT_TO_HAP2: fmt->handler = &process_gt_to_hap2; convert->max_unpack |= BCF_UN_FMT; break;
- case T_LINE: fmt->handler = &process_line; break;
+ case T_TBCSQ: fmt->handler = &process_tbcsq; fmt->destroy = &destroy_tbcsq; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_LINE: fmt->handler = &process_line; convert->max_unpack |= BCF_UN_FMT; break;
default: error("TODO: handler for type %d\n", fmt->type);
}
- if ( key )
+ if ( key && fmt->type==T_INFO )
{
- if ( fmt->type==T_INFO )
+ fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,fmt->id) )
{
- fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
- if ( fmt->id==-1 ) convert->undef_info_tag = strdup(key);
+ fmt->id = -1;
+ convert->undef_info_tag = strdup(key);
}
}
return fmt;
@@ -799,6 +1089,16 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf);
else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf);
else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf);
+ else if ( !strcmp(str.s, "TBCSQ") )
+ {
+ fmt_t *fmt = register_tag(convert, T_TBCSQ, "BCSQ", is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ if ( fmt->subscript==-1 )
+ {
+ if ( !strncmp(q,"{*}",3) ) { fmt->subscript = 0; q += 3; }
+ }
+ else fmt->subscript++;
+ }
else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf);
else if ( !strcmp(str.s, "INFO") )
{
@@ -821,6 +1121,9 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
{
if ( !strcmp(str.s, "CHROM") ) register_tag(convert, T_CHROM, str.s, is_gtf);
else if ( !strcmp(str.s, "POS") ) register_tag(convert, T_POS, str.s, is_gtf);
+ else if ( !strcmp(str.s, "POS0") ) register_tag(convert, T_POS0, str.s, is_gtf);
+ else if ( !strcmp(str.s, "END") ) register_tag(convert, T_END, str.s, is_gtf);
+ else if ( !strcmp(str.s, "END0") ) register_tag(convert, T_END0, str.s, is_gtf);
else if ( !strcmp(str.s, "ID") ) register_tag(convert, T_ID, str.s, is_gtf);
else if ( !strcmp(str.s, "REF") ) register_tag(convert, T_REF, str.s, is_gtf);
else if ( !strcmp(str.s, "ALT") )
@@ -905,6 +1208,8 @@ convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *
default: p = parse_sep(convert, p, is_gtf); break;
}
}
+ if ( is_gtf )
+ error("Could not parse the format string, missing the square bracket \"]\": %s\n", convert->format_str);
if ( nsamples )
{
@@ -925,7 +1230,10 @@ void convert_destroy(convert_t *convert)
{
int i;
for (i=0; i<convert->nfmt; i++)
+ {
+ if ( convert->fmt[i].destroy ) convert->fmt[i].destroy(convert->fmt[i].usr);
free(convert->fmt[i].key);
+ }
free(convert->fmt);
free(convert->undef_info_tag);
free(convert->dat);
@@ -986,7 +1294,7 @@ int convert_header(convert_t *convert, kstring_t *str)
int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
{
if ( !convert->allow_undef_tags && convert->undef_info_tag )
- error("Error: no such tag defined in the VCF header: INFO/%s\n", convert->undef_info_tag);
+ error("Error: no such tag defined in the VCF header: INFO/%s. FORMAT fields must be in square brackets, e.g. \"[ %s]\"\n", convert->undef_info_tag,convert->undef_info_tag);
int l_ori = str->l;
bcf_unpack(line, convert->max_unpack);
@@ -995,17 +1303,24 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
str->l = 0;
for (i=0; i<convert->nfmt; i++)
{
- // Genotype fields
+ // Genotype fields.
if ( convert->fmt[i].is_gt_field )
{
int j = i, js, k;
- while ( convert->fmt[j].is_gt_field )
+ while ( j<convert->nfmt && convert->fmt[j].is_gt_field )
{
convert->fmt[j].ready = 0;
j++;
}
for (js=0; js<convert->nsamples; js++)
{
+ // Here comes a hack designed for TBCSQ. When running on large files,
+ // such as 1000GP, there are too many empty fields in the output and
+ // it's very very slow. Therefore in case the handler does not add
+ // anything to the string, we trim all genotype fields enclosed in square
+ // brackets here. This may be changed in future, time will show...
+ size_t l_start = str->l;
+
int ks = convert->samples[js];
for (k=i; k<j; k++)
{
@@ -1015,7 +1330,11 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
kputc(bcf_sr_has_line(convert->readers,ir)?'1':'0', str);
}
else if ( convert->fmt[k].handler )
+ {
+ size_t l = str->l;
convert->fmt[k].handler(convert, line, &convert->fmt[k], ks, str);
+ if ( l==str->l ) { str->l = l_start; break; } // only TBCSQ does this
+ }
}
}
i = j-1;
@@ -1029,6 +1348,7 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
}
else if ( convert->fmt[i].handler )
convert->fmt[i].handler(convert, line, &convert->fmt[i], -1, str);
+
}
return str->l - l_ori;
}
diff --git a/bcftools/csq.c b/bcftools/csq.c
new file mode 100644
index 0000000..b1db103
--- /dev/null
+++ b/bcftools/csq.c
@@ -0,0 +1,3824 @@
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+/*
+ Things that would be nice to have
+ - for stop-lost events (also in frameshifts) report the number of truncated aa's
+ - memory could be greatly reduced by indexing gff (but it is quite compact already)
+ - deletions that go beyond transcript boundaries are not checked at sequence level
+ - alloc tscript->ref in hap_finalize, introduce fa_off_beg:16,fa_off_end:16
+ - see test/csq/ENST00000573314/insertion-overlap.vcf #1476288882
+
+ Read about transcript types here
+ http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
+ http://www.ensembl.org/info/genome/variation/predicted_data.html
+ http://www.gencodegenes.org/gencode_biotypes.html
+
+ List of supported biotypes
+ antisense
+ IG_C_gene
+ IG_D_gene
+ IG_J_gene
+ IG_LV_gene
+ IG_V_gene
+ lincRNA
+ macro_lncRNA
+ miRNA
+ misc_RNA
+ Mt_rRNA
+ Mt_tRNA
+ polymorphic_pseudogene
+ processed_transcript
+ protein_coding
+ ribozyme
+ rRNA
+ sRNA
+ scRNA
+ scaRNA
+ sense_intronic
+ sense_overlapping
+ snRNA
+ snoRNA
+ TR_C_gene
+ TR_D_gene
+ TR_J_gene
+ TR_V_gene
+
+ The gff parsing logic
+ We collect features such by combining gff lines A,B,C as follows:
+ A .. gene line with a supported biotype
+ A.ID=~/^gene:/
+
+ B .. transcript line referencing A
+ B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
+
+ C .. corresponding CDS, exon, and UTR lines:
+ C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/
+
+ For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the
+ complete chain link C -> B -> A is required. For the rest, link B -> A suffices.
+
+
+ The supported consequence types, sorted by impact:
+ splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron)
+ splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron)
+ stop_gained .. DNA sequence variant resulting in a stop codon
+ frameshift_variant .. number of inserted/deleted bases not a multiple of three, disrupted translational frame
+ stop_lost .. elongated transcript, stop codon changed
+ start_lost .. the first codon changed
+ inframe_altering .. combination of indels leading to unchanged reading frame and length
+ inframe_insertion .. inserted coding sequence, unchanged reading frame
+ inframe_deletion .. deleted coding sequence, unchanged reading frame
+ missense_variant .. amino acid (aa) change, unchanged length
+ splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron
+ synonymous_variant .. DNA sequence variant resulting in no amino acid change
+ stop_retained_variant .. different stop codon
+ non_coding_variant .. variant in non-coding sequence, such as RNA gene
+ 5_prime_UTR_variant
+ 3_prime_UTR_variant
+ intron_variant .. reported only if none of the above
+ intergenic_variant .. reported only if none of the above
+
+
+ The annotation algorithm.
+ The algorithm checks if the variant falls in a region of a supported type. The
+ search is performed in the following order, until a match is found:
+ 1. idx_cds(gf_cds_t) - lookup CDS by position, create haplotypes, call consequences
+ 2. idx_utr(gf_utr_t) - check UTR hits
+ 3. idx_exon(gf_exon_t) - check for splice variants
+ 4. idx_tscript(tscript_t) - check for intronic variants, RNAs, etc.
+
+ These regidx indexes are created by parsing a gff3 file as follows:
+ 1. create the array "ftr" of all UTR, CDS, exons. This will be
+ processed later and pruned based on transcript types we want to keep.
+ In the same go, create the hash "id2tr" of transcripts to keep
+ (based on biotype) which maps from transcript_id to a transcript. At
+ the same time also build the hash "gid2gene" which maps from gene_id to
+ gf_gene_t pointer.
+
+ 2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes.
+ Use only features from "ftr" which are present in "id2tr".
+
+ 3. clean data that won't be needed anymore: ftr, id2tr, gid2gene.
+
+ Data structures.
+ idx_cds, idx_utr, idx_exon, idx_tscript:
+ as described above, regidx structures for fast lookup of exons/transcripts
+ overlapping a region, the payload is a pointer to tscript.cds
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <math.h>
+#include <htslib/hts.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/khash.h>
+#include <htslib/khash_str2int.h>
+#include <htslib/kseq.h>
+#include <htslib/faidx.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "filter.h"
+#include "regidx.h"
+#include "kheap.h"
+#include "smpl_ilist.h"
+#include "rbuf.h"
+
+#ifndef __FUNCTION__
+# define __FUNCTION__ __func__
+#endif
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+// Definition of splice_region, splice_acceptor and splice_donor
+#define N_SPLICE_DONOR 2
+#define N_SPLICE_REGION_EXON 3
+#define N_SPLICE_REGION_INTRON 8
+
+// Ensembl ID format, e.g.
+// ENST00000423372 for human .. ENST%011d
+// ENSMUST00000120394 for mouse .. ENSMUST%011d
+char ENSID_BUF[32], *ENSID_FMT = NULL;
+static inline char *ENSID(uint32_t id)
+{
+ sprintf(ENSID_BUF,ENSID_FMT,id);
+ return ENSID_BUF;
+}
+
+
+#define N_REF_PAD 10 // number of bases to avoid boundary effects
+
+#define STRAND_REV 0
+#define STRAND_FWD 1
+
+#define TRIM_NONE 0
+#define TRIM_5PRIME 1
+#define TRIM_3PRIME 2
+
+// How to treat phased/unphased genotypes
+#define PHASE_REQUIRE 0 // --phase r
+#define PHASE_MERGE 1 // --phase m
+#define PHASE_AS_IS 2 // --phase a
+#define PHASE_SKIP 3 // --phase s
+#define PHASE_NON_REF 4 // --phase R
+#define PHASE_DROP_GT 5 // --samples -
+
+// Node types in the haplotype tree
+#define HAP_CDS 0
+#define HAP_ROOT 1
+#define HAP_SSS 2 // start/stop/splice
+
+#define CSQ_PRINTED_UPSTREAM (1<<0)
+#define CSQ_SYNONYMOUS_VARIANT (1<<1)
+#define CSQ_MISSENSE_VARIANT (1<<2)
+#define CSQ_STOP_LOST (1<<3)
+#define CSQ_STOP_GAINED (1<<4)
+#define CSQ_INFRAME_DELETION (1<<5)
+#define CSQ_INFRAME_INSERTION (1<<6)
+#define CSQ_FRAMESHIFT_VARIANT (1<<7)
+#define CSQ_SPLICE_ACCEPTOR (1<<8)
+#define CSQ_SPLICE_DONOR (1<<9)
+#define CSQ_START_LOST (1<<10)
+#define CSQ_SPLICE_REGION (1<<11)
+#define CSQ_STOP_RETAINED (1<<12)
+#define CSQ_UTR5 (1<<13)
+#define CSQ_UTR3 (1<<14)
+#define CSQ_NON_CODING (1<<15)
+#define CSQ_INTRON (1<<16)
+//#define CSQ_INTERGENIC (1<<17)
+#define CSQ_INFRAME_ALTERING (1<<18)
+#define CSQ_UPSTREAM_STOP (1<<19) // adds * in front of the csq string
+#define CSQ_INCOMPLETE_CDS (1<<20) // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf
+#define CSQ_CODING_SEQUENCE (1<<21) // cannot tell exactly what it is, but it does affect the coding sequence
+
+// Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345
+#define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \
+ CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \
+ CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \
+ CSQ_UPSTREAM_STOP)
+#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST)
+
+#define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
+#define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING))
+#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
+
+// see kput_vcsq()
+const char *csq_strings[] =
+{
+ NULL,
+ "synonymous",
+ "missense",
+ "stop_lost",
+ "stop_gained",
+ "inframe_deletion",
+ "inframe_insertion",
+ "frameshift",
+ "splice_acceptor",
+ "splice_donor",
+ "start_lost",
+ "splice_region",
+ "stop_retained",
+ "5_prime_utr",
+ "3_prime_utr",
+ "non_coding",
+ "intron",
+ "intergenic",
+ "inframe_altering",
+ NULL,
+ NULL,
+ "coding_sequence"
+};
+
+
+// GFF line types
+#define GFF_TSCRIPT_LINE 1
+#define GFF_GENE_LINE 2
+
+
+/*
+ Genomic features, for fast lookup by position to overlapping features
+*/
+#define GF_coding_bit 6
+#define GF_is_coding(x) ((x) & (1<<GF_coding_bit))
+#define GF_MT_rRNA 1 // non-coding: 1, 2, ...
+#define GF_MT_tRNA 2
+#define GF_lincRNA 3
+#define GF_miRNA 4
+#define GF_MISC_RNA 5
+#define GF_rRNA 6
+#define GF_snRNA 7
+#define GF_snoRNA 8
+#define GF_PROCESSED_TRANSCRIPT 9
+#define GF_ANTISENSE 10
+#define GF_macro_lncRNA 11
+#define GF_ribozyme 12
+#define GF_sRNA 13
+#define GF_scRNA 14
+#define GF_scaRNA 15
+#define GF_SENSE_INTRONIC 16
+#define GF_SENSE_OVERLAPPING 17
+#define GF_PSEUDOGENE 18
+#define GF_PROCESSED_PSEUDOGENE 19
+#define GF_ARTIFACT 20
+#define GF_IG_PSEUDOGENE 21
+#define GF_IG_C_PSEUDOGENE 22
+#define GF_IG_J_PSEUDOGENE 23
+#define GF_IG_V_PSEUDOGENE 24
+#define GF_TR_V_PSEUDOGENE 25
+#define GF_TR_J_PSEUDOGENE 26
+#define GF_MT_tRNA_PSEUDOGENE 27
+#define GF_misc_RNA_PSEUDOGENE 28
+#define GF_miRNA_PSEUDOGENE 29
+#define GF_RIBOZYME 30
+#define GF_RETAINED_INTRON 31
+#define GF_RETROTRANSPOSED 32
+#define GF_tRNA_PSEUDOGENE 33
+#define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE 34
+#define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE 35
+#define GF_TRANSCRIBED_UNITARY_PSEUDOGENE 36
+#define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE 37
+#define GF_TRANSLATED_PROCESSED_PSEUDOGENE 38
+#define GF_KNOWN_NCRNA 39
+#define GF_UNITARY_PSEUDOGENE 40
+#define GF_UNPROCESSED_PSEUDOGENE 41
+#define GF_LRG_GENE 42
+#define GF_3PRIME_OVERLAPPING_ncRNA 43
+#define GF_DISRUPTED_DOMAIN 44
+#define GF_vaultRNA 45
+#define GF_BIDIRECTIONAL_PROMOTER_lncRNA 46
+#define GF_AMBIGUOUS_ORF 47
+#define GF_PROTEIN_CODING (1|(1<<GF_coding_bit)) // coding: 65, 66, ...
+#define GF_POLYMORPHIC_PSEUDOGENE (2|(1<<GF_coding_bit))
+#define GF_IG_C (3|(1<<GF_coding_bit))
+#define GF_IG_D (4|(1<<GF_coding_bit))
+#define GF_IG_J (5|(1<<GF_coding_bit))
+#define GF_IG_LV (6|(1<<GF_coding_bit))
+#define GF_IG_V (7|(1<<GF_coding_bit))
+#define GF_TR_C (8|(1<<GF_coding_bit))
+#define GF_TR_D (9|(1<<GF_coding_bit))
+#define GF_TR_J (10|(1<<GF_coding_bit))
+#define GF_TR_V (11|(1<<GF_coding_bit))
+#define GF_NMD (12|(1<<GF_coding_bit))
+#define GF_NON_STOP_DECAY (13|(1<<GF_coding_bit))
+#define GF_CDS ((1<<(GF_coding_bit+1))+1) // special types: 129, 130, ...
+#define GF_EXON ((1<<(GF_coding_bit+1))+2)
+#define GF_UTR3 ((1<<(GF_coding_bit+1))+3)
+#define GF_UTR5 ((1<<(GF_coding_bit+1))+4)
+// GF_MAX = (1<<30)-1, see hap_node_t
+
+typedef struct _tscript_t tscript_t;
+typedef struct
+{
+ tscript_t *tr; // transcript
+ uint32_t beg; // the start coordinate of the CDS (on the reference strand, 0-based)
+ uint32_t pos; // 0-based index of the first exon base within the transcript (only to
+ // update hap_node_t.sbeg in hap_init, could be calculated on the fly)
+ uint32_t len; // exon length
+ uint32_t icds:30, // exon index within the transcript
+ phase:2; // offset of the CDS
+}
+gf_cds_t;
+typedef struct
+{
+ char *name; // human readable name, e.g. ORF45
+ uint8_t iseq;
+}
+gf_gene_t;
+typedef struct
+{
+ uint32_t beg,end;
+ tscript_t *tr;
+}
+gf_exon_t;
+typedef enum { prime3, prime5 } utr_t;
+typedef struct
+{
+ utr_t which;
+ uint32_t beg,end;
+ tscript_t *tr;
+}
+gf_utr_t;
+
+
+/*
+ Structures related to VCF output:
+
+ vcsq_t
+ information required to assemble consequence lines such as "inframe_deletion|XYZ|ENST01|+|5TY>5I|121ACG>A+124TA>T"
+
+ vcrec_t
+ single VCF record and csq tied to this record. (Haplotype can have multiple
+ consequences in several VCF records. Each record can have multiple consequences
+ from multiple haplotypes.)
+
+ csq_t
+ a top-level consequence tied to a haplotype
+
+ vbuf_t
+ pos2vbuf
+ VCF records with the same position clustered together for a fast lookup via pos2vbuf
+*/
+typedef struct _vbuf_t vbuf_t;
+typedef struct _vcsq_t vcsq_t;
+struct _vcsq_t
+{
+ uint32_t strand:1,
+ type:31; // one of CSQ_* types
+ uint32_t trid;
+ uint32_t biotype; // one of GF_* types
+ char *gene; // gene name
+ bcf1_t *ref; // if type&CSQ_PRINTED_UPSTREAM, ref consequence "@1234"
+ kstring_t vstr; // variant string, eg 5TY>5I|121ACG>A+124TA>T
+};
+typedef struct
+{
+ bcf1_t *line;
+ uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved
+ uint32_t nfmt:4, nvcsq:28, mvcsq;
+ vcsq_t *vcsq; // there can be multiple consequences for a single VCF record
+}
+vrec_t;
+typedef struct
+{
+ uint32_t pos;
+ vrec_t *vrec; // vcf line that this csq is tied to; needed when printing haplotypes (hap_stage_vcf)
+ int idx; // 0-based index of the csq at the VCF line, for FMT/BCSQ
+ vcsq_t type;
+}
+csq_t;
+struct _vbuf_t
+{
+ vrec_t **vrec; // buffer of VCF lines with the same position
+ int n, m;
+};
+KHASH_MAP_INIT_INT(pos2vbuf, vbuf_t*)
+
+
+/*
+ Structures related to haplotype-aware consequences in coding regions
+
+ hap_node_t
+ node of a haplotype tree. Each transcript has one tree
+
+ tscript_t
+ despite its general name, it is intended for coding transcripts only
+
+ hap_t
+ hstack_t
+ for traversal of the haplotype tree and braking combined
+ consequences into independent parts
+*/
+typedef struct _hap_node_t hap_node_t;
+struct _hap_node_t
+{
+ char *seq; // cds segment [parent_node,this_node)
+ char *var; // variant "ref>alt"
+ uint32_t type:2, // HAP_ROOT or HAP_CDS
+ csq:30; // this node's consequence
+ int dlen; // alt minus ref length: <0 del, >0 ins, 0 substitution
+ uint32_t rbeg; // variant's VCF position (0-based, inclusive)
+ int32_t rlen; // variant's rlen; alen=rlen+dlen; fake for non CDS types
+ uint32_t sbeg; // variant's position on the spliced reference transcript (0-based, inclusive, N_REF_PAD not included)
+ uint32_t icds; // which exon does this node's variant overlaps
+ hap_node_t **child, *prev; // children haplotypes and previous coding node
+ int nchild, mchild;
+ bcf1_t *cur_rec, *rec; // current VCF record and node's VCF record
+ uint32_t nend; // number of haplotypes ending in this node
+ int *cur_child, mcur_child; // mapping from the allele to the currently active child
+ csq_t *csq_list; // list of haplotype's consequences, broken by position
+ int ncsq_list, mcsq_list;
+};
+struct _tscript_t
+{
+ uint32_t id; // transcript id
+ uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
+ uint32_t strand:1, // STRAND_REV or STRAND_FWD
+ ncds:31, // number of exons
+ mcds;
+ gf_cds_t **cds; // ordered list of exons
+ char *ref; // reference sequence, padded with N_REF_PAD bases on both ends
+ char *sref; // spliced reference sequence, padded with N_REF_PAD bases on both ends
+ hap_node_t *root; // root of the haplotype tree
+ hap_node_t **hap; // pointer to haplotype leaves, two for each sample
+ int nhap, nsref; // number of haplotypes and length of sref, including 2*N_REF_PAD
+ uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types
+ type:30; // one of GF_* types
+ gf_gene_t *gene;
+};
+static inline int cmp_tscript(tscript_t **a, tscript_t **b)
+{
+ return ( (*a)->end < (*b)->end ) ? 1 : 0;
+}
+KHEAP_INIT(trhp, tscript_t*, cmp_tscript)
+typedef khp_trhp_t tr_heap_t;
+typedef struct
+{
+ hap_node_t *node; // current node
+ int ichild; // current child in the active node
+ int dlen; // total dlen, from the root to the active node
+ size_t slen; // total sequence length, from the root to the active node
+}
+hstack_t;
+typedef struct
+{
+ int mstack;
+ hstack_t *stack;
+ tscript_t *tr; // tr->ref: spliced transcript on ref strand
+ kstring_t sseq; // spliced haplotype sequence on ref strand
+ kstring_t tseq; // the variable part of translated haplotype transcript, coding strand
+ kstring_t tref; // the variable part of translated reference transcript, coding strand
+ uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS
+ int upstream_stop;
+}
+hap_t;
+
+
+/*
+ Helper structures, only for initialization
+
+ ftr_t
+ temporary list of all exons, CDS, UTRs
+*/
+KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
+KHASH_MAP_INIT_INT(int2int, int)
+KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
+typedef struct
+{
+ int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
+ uint32_t beg;
+ uint32_t end;
+ uint32_t trid;
+ uint32_t strand:1; // STRAND_REV,STRAND_FWD
+ uint32_t phase:2; // 0, 1 or 2
+ uint32_t iseq:29;
+}
+ftr_t;
+typedef struct
+{
+ // all exons, CDS, UTRs
+ ftr_t *ftr;
+ int nftr, mftr;
+
+ // mapping from transcript ensembl id to gene id
+ kh_int2gene_t *gid2gene;
+
+ // mapping from transcript id to tscript, for quick CDS anchoring
+ kh_int2tscript_t *id2tr;
+
+ // sequences
+ void *seq2int;
+ char **seq;
+ int nseq, mseq;
+
+ // ignored biotypes
+ void *ignored_biotypes;
+}
+aux_t;
+
+typedef struct _args_t
+{
+ // the main regidx lookups, from chr:beg-end to overlapping features and
+ // index iterator
+ regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
+ regitr_t *itr;
+
+ // temporary structures, deleted after initializtion
+ aux_t init;
+
+ // text tab-delimited output (out) or vcf/bcf output (out_fh)
+ FILE *out;
+ htsFile *out_fh;
+
+ // vcf
+ bcf_srs_t *sr;
+ bcf_hdr_t *hdr;
+ int hdr_nsmpl; // actual number of samples in the vcf, for bcf_update_format_values()
+
+ // include or exclude sites which match the filters
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // FLT_INCLUDE or FLT_EXCLUDE
+
+ // samples to process
+ int sample_is_file;
+ char *sample_list;
+ smpl_ilist_t *smpl;
+
+ char *outdir, **argv, *fa_fname, *gff_fname, *output_fname;
+ char *bcsq_tag;
+ int argc, output_type;
+ int phase, quiet, local_csq;
+ int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ
+ int ncsq_small_warned;
+
+ int rid; // current chromosome
+ tr_heap_t *active_tr; // heap of active transcripts for quick flushing
+ hap_t *hap; // transcript haplotype recursion
+ vbuf_t **vcf_buf; // buffered VCF lines to annotate with CSQ and flush
+ rbuf_t vcf_rbuf; // round buffer indexes to vcf_buf
+ kh_pos2vbuf_t *pos2vbuf; // fast lookup of buffered lines by position
+ tscript_t **rm_tr; // buffer of transcripts to clean
+ int nrm_tr, mrm_tr;
+ csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
+ int ncsq_buf, mcsq_buf;
+
+ faidx_t *fai;
+ kstring_t str, str2;
+ int32_t *gt_arr, mgt_arr;
+}
+args_t;
+
+// AAA, AAC, ...
+const char *gencode = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";
+const uint8_t nt4[] =
+{
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 3,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 3
+};
+const uint8_t cnt4[] =
+{
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 0,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 0
+};
+#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
+#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
+
+static const char *gf_strings_noncoding[] =
+{
+ "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
+ "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
+ "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
+ "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
+ "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
+ "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene",
+ "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
+ "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
+};
+static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
+static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
+
+const char *gf_type2gff_string(int type)
+{
+ if ( !GF_is_coding(type) )
+ {
+ if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
+ type &= (1<<(GF_coding_bit+1)) - 1;
+ return gf_strings_special[type - 1];
+ }
+ type &= (1<<GF_coding_bit) - 1;
+ return gf_strings_coding[type - 1];
+}
+
+/*
+ gff parsing functions
+*/
+static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end)
+{
+ aux_t *aux = &args->init;
+ char c = chr_end[1];
+ chr_end[1] = 0;
+ int iseq;
+ if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
+ {
+ hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
+ aux->seq[aux->nseq] = strdup(chr_beg);
+ iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
+ aux->nseq++;
+ assert( aux->nseq < 256 ); // see gf_gene_t.iseq
+ }
+ chr_end[1] = c;
+ return iseq;
+}
+static inline char *gff_skip(const char *line, char *ss)
+{
+ while ( *ss && *ss!='\t' ) ss++;
+ if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ return ss+1;
+}
+static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end)
+{
+ char *se = (char*) line;
+ while ( *se && *se!='\t' ) se++;
+ if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ *chr_beg = (char*) line;
+ *chr_end = se-1;
+}
+static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
+{
+ char *se = ss;
+ *beg = strtol(ss, &se, 10) - 1;
+ if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
+ ss = se+1;
+ *end = strtol(ss, &se, 10) - 1;
+ if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ return se+1;
+}
+static inline uint32_t gff_parse_id(const char *line, const char *needle, char *ss)
+{
+ ss = strstr(ss,needle);
+ if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
+ ss += strlen(needle);
+ while ( *ss && !isdigit(*ss) ) ss++;
+ if ( !ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
+ char *se;
+ uint32_t id = strtol(ss, &se, 10);
+ if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
+ if ( *se && *se!=';' && *se!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ assert( id <= 0xffffff ); // see gf_gene_t.id. Ensembl IDs are never that big in practice
+ return id;
+}
+static void gff_parse_ensid_fmt(const char *line, const char *needle, char *ss)
+{
+ ss = strstr(ss,needle);
+ if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
+ ss += strlen(needle);
+ char *se = ss;
+ while ( *se && !isdigit(*se) ) se++;
+ kstring_t str = {0,0,0};
+ kputsn(ss,se-ss,&str);
+ ss = se;
+ while ( *se && isdigit(*se) ) se++;
+ ksprintf(&str,"%%0%dd",(int)(se-ss));
+ ENSID_FMT = str.s;
+}
+static inline int gff_parse_type(char *line)
+{
+ line = strstr(line,"ID=");
+ if ( !line ) return -1;
+ line += 3;
+ if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE;
+ else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE;
+ return -1;
+}
+static inline int gff_parse_biotype(char *_line)
+{
+ char *line = strstr(_line,"biotype=");
+ if ( !line ) return -1;
+
+ line += 8;
+ switch (*line)
+ {
+ case 'p':
+ if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
+ else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
+ else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
+ else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
+ break;
+ case 'a':
+ if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
+ else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
+ else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
+ break;
+ case 'I':
+ if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C;
+ else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D;
+ else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J;
+ else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV;
+ else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V;
+ else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
+ else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
+ else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
+ else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
+ break;
+ case 'T':
+ if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C;
+ else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D;
+ else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J;
+ else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V;
+ else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
+ else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
+ break;
+ case 'M':
+ if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
+ else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
+ else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA;
+ break;
+ case 'l':
+ if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
+ break;
+ case 'm':
+ if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
+ else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
+ else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
+ else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
+ else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
+ break;
+ case 'r':
+ if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
+ else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
+ else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
+ else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
+ break;
+ case 's':
+ if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
+ else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
+ else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
+ else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
+ else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
+ else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
+ else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
+ break;
+ case 't':
+ if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
+ else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
+ break;
+ case 'n':
+ if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
+ else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
+ break;
+ case 'k':
+ if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
+ break;
+ case 'u':
+ if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
+ else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
+ break;
+ case 'L':
+ if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
+ break;
+ case '3':
+ if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+ break;
+ case 'd':
+ if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
+ break;
+ case 'v':
+ if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
+ break;
+ case 'b':
+ if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
+ break;
+ }
+ return 0;
+}
+static inline int gff_ignored_biotype(args_t *args, char *ss)
+{
+ ss = strstr(ss,"biotype=");
+ if ( !ss ) return 0;
+
+ ss += 8;
+ char *se = ss, tmp;
+ while ( *se && *se!=';' ) se++;
+ tmp = *se;
+ *se = 0;
+
+ char *key = ss;
+ int n = 0;
+ if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss);
+ khash_str2int_set(args->init.ignored_biotypes, key, n+1);
+
+ *se = tmp;
+ return 1;
+}
+gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
+{
+ khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
+ gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k);
+ if ( !gene )
+ {
+ gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
+ int ret;
+ k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret);
+ kh_val(aux->gid2gene,k) = gene;
+ }
+ return gene;
+}
+void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
+{
+ aux_t *aux = &args->init;
+ int biotype = gff_parse_biotype(ss);
+ if ( biotype <= 0 )
+ {
+ if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(stderr,"ignored transcript: %s\n",line);
+ return;
+ }
+
+ // create a mapping from transcript_id to gene_id
+ uint32_t trid = gff_parse_id(line, "ID=transcript:", ss);
+ uint32_t gene_id = gff_parse_id(line, "Parent=gene:", ss);
+
+ if ( !ENSID_FMT ) gff_parse_ensid_fmt(line, "ID=transcript:", ss); // id prefix different across species
+
+ tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
+ tr->id = trid;
+ tr->strand = ftr->strand;
+ tr->gene = gene_init(aux, gene_id);
+ tr->type = biotype;
+ tr->beg = ftr->beg;
+ tr->end = ftr->end;
+
+ khint_t k;
+ int ret;
+ k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret);
+ kh_val(aux->id2tr,k) = tr;
+}
+void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr)
+{
+ int biotype = gff_parse_biotype(ss);
+ if ( biotype <= 0 )
+ {
+ if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(stderr,"ignored gene: %s\n",line);
+ return;
+ }
+
+ aux_t *aux = &args->init;
+
+ // substring search for "ID=gene:ENSG00000437963"
+ uint32_t gene_id = gff_parse_id(line, "ID=gene:", ss);
+ gf_gene_t *gene = gene_init(aux, gene_id);
+ assert( !gene->name ); // the gene_id should be unique
+
+ gene->iseq = feature_set_seq(args, chr_beg,chr_end);
+
+ // substring search for "Name=OR4F5"
+ ss = strstr(chr_end+2,"Name=");
+ if ( !ss ) error("Could not parse the line, \"Name=\" not present: %s\n", line);
+ ss += 5;
+ char *se = ss;
+ while ( *se && *se!=';' && !isspace(*se) ) se++;
+ gene->name = (char*) malloc(se-ss+1);
+ memcpy(gene->name,ss,se-ss);
+ gene->name[se-ss] = 0;
+}
+int gff_parse(args_t *args, char *line, ftr_t *ftr)
+{
+ // - skip empty lines and commented lines
+ // - columns
+ // 1. chr
+ // 2. <skip>
+ // 3. CDS, transcript, gene, ...
+ // 4-5. beg,end
+ // 6. <skip>
+ // 7. strand
+ // 8. phase
+ // 9. Parent=transcript:ENST(\d+);ID=... etc
+
+ char *ss = line;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *chr_beg, *chr_end;
+ gff_parse_chr(line, &chr_beg, &chr_end);
+ ss = gff_skip(line, chr_end + 2);
+
+ // 3. column: is this a CDS, transcript, gene, etc.
+ if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
+ else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
+ else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
+ else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
+ else
+ {
+ ss = gff_skip(line, ss);
+ ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
+ ss = gff_skip(line, ss);
+ int type = gff_parse_type(ss);
+ if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
+ {
+ // we ignore these, debug print to see new types:
+ ss = strstr(ss,"ID=");
+ if ( !ss ) return -1; // no ID, ignore the line
+ if ( !strncmp("chromosome",ss+3,10) ) return -1;
+ if ( !strncmp("supercontig",ss+3,11) ) return -1;
+ if ( args->quiet<2 ) fprintf(stderr,"ignored: %s\n", line);
+ return -1;
+ }
+
+ // 7. column: strand
+ if ( *ss == '+' ) ftr->strand = STRAND_FWD;
+ else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+ else error("Unknown strand: %c .. %s\n", *ss,ss);
+
+ if ( type==GFF_TSCRIPT_LINE )
+ gff_parse_transcript(args, line, ss, ftr);
+ else
+ gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr);
+
+ return -1;
+ }
+ ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
+ ss = gff_skip(line, ss);
+
+ // 7. column: strand
+ if ( *ss == '+' ) ftr->strand = STRAND_FWD;
+ else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+ else { if ( args->quiet<2 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; }
+ ss += 2;
+
+ // 8. column: phase (codon offset)
+ if ( *ss == '0' ) ftr->phase = 0;
+ else if ( *ss == '1' ) ftr->phase = 1;
+ else if ( *ss == '2' ) ftr->phase = 2;
+ else if ( *ss == '.' ) ftr->phase = 0; // exons do not have phase
+ else { if ( args->quiet<2 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; }
+ ss += 2;
+
+ // substring search for "Parent=transcript:ENST00000437963"
+ ftr->trid = gff_parse_id(line, "Parent=transcript:", ss);
+ ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
+ return 0;
+}
+
+static int cmp_cds_ptr(const void *a, const void *b)
+{
+ // comparison function for qsort of transcripts's CDS
+ if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1;
+ if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1;
+ return 0;
+}
+
+static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
+{
+ *chr_beg = *chr_end = aux->seq[iseq];
+ while ( (*chr_end)[1] ) (*chr_end)++;
+}
+tscript_t *tscript_init(aux_t *aux, uint32_t trid)
+{
+ khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
+ tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k);
+ assert( tr );
+ return tr;
+}
+void register_cds(args_t *args, ftr_t *ftr)
+{
+ // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet.
+ // ftr is the result of parsing a gff CDS line
+ aux_t *aux = &args->init;
+
+ tscript_t *tr = tscript_init(aux, ftr->trid);
+ if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
+
+ gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
+ cds->tr = tr;
+ cds->beg = ftr->beg;
+ cds->len = ftr->end - ftr->beg + 1;
+ cds->icds = 0; // to keep valgrind on mac happy
+ cds->phase = ftr->phase;
+
+ hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
+ tr->cds[tr->ncds++] = cds;
+}
+void register_utr(args_t *args, ftr_t *ftr)
+{
+ aux_t *aux = &args->init;
+ gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
+ utr->which = ftr->type==GF_UTR3 ? prime3 : prime5;
+ utr->beg = ftr->beg;
+ utr->end = ftr->end;
+ utr->tr = tscript_init(aux, ftr->trid);
+
+ char *chr_beg, *chr_end;
+ chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end);
+ regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr);
+}
+void register_exon(args_t *args, ftr_t *ftr)
+{
+ aux_t *aux = &args->init;
+ gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
+ exon->beg = ftr->beg;
+ exon->end = ftr->end;
+ exon->tr = tscript_init(aux, ftr->trid);
+
+ char *chr_beg, *chr_end;
+ chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end);
+ regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
+}
+
+void tscript_init_cds(args_t *args)
+{
+ aux_t *aux = &args->init;
+
+ // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
+ khint_t k;
+ for (k=0; k<kh_end(aux->id2tr); k++)
+ {
+ if ( !kh_exist(aux->id2tr, k) ) continue;
+ tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k);
+
+ // position-to-tscript lookup
+ char *chr_beg, *chr_end;
+ chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
+ regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
+
+ if ( !tr->ncds ) continue; // transcript with no CDS
+
+ // sort CDs
+ qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
+
+ // trim non-coding start
+ int i, len = 0;
+ if ( tr->strand==STRAND_FWD )
+ {
+ if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
+ tr->cds[0]->beg += tr->cds[0]->phase;
+ tr->cds[0]->len -= tr->cds[0]->phase;
+ tr->cds[0]->phase = 0;
+
+ // sanity check phase
+ for (i=0; i<tr->ncds; i++)
+ {
+ int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+ if ( phase!=len%3)
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ assert( phase == len%3 );
+ len += tr->cds[i]->len;
+ }
+ }
+ else
+ {
+ // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
+ // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
+ // todo: the same for the fwd strand
+ i = tr->ncds - 1;
+ int phase = tr->cds[i]->phase;
+ if ( phase ) tr->trim |= TRIM_5PRIME;
+ while ( i>=0 && phase > tr->cds[i]->len )
+ {
+ phase -= tr->cds[i]->len;
+ tr->cds[i]->phase = 0;
+ tr->cds[i]->len = 0;
+ i--;
+ }
+ tr->cds[i]->len -= tr->cds[i]->phase;
+ tr->cds[i]->phase = 0;
+
+ // sanity check phase
+ for (i=tr->ncds-1; i>=0; i--)
+ {
+ int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+ if ( phase!=len%3)
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ len += tr->cds[i]->len;
+ }
+ }
+
+ // set len. At the same check that CDS within a transcript do not overlap
+ len = 0;
+ for (i=0; i<tr->ncds; i++)
+ {
+ tr->cds[i]->icds = i;
+ len += tr->cds[i]->len;
+ if ( !i ) continue;
+
+ gf_cds_t *a = tr->cds[i-1];
+ gf_cds_t *b = tr->cds[i];
+ if ( a->beg + a->len - 1 >= b->beg )
+ error("Error: CDS overlap in the transcript %"PRIu32": %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32"\n",
+ kh_key(aux->id2tr, k), a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+ }
+ if ( len%3 != 0 )
+ {
+ // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
+ // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
+ // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
+
+ tr->trim |= TRIM_3PRIME;
+ if ( tr->strand==STRAND_FWD )
+ {
+ i = tr->ncds - 1;
+ while ( i>=0 && len%3 )
+ {
+ int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+ tr->cds[i]->len -= dlen;
+ len -= dlen;
+ i--;
+ }
+ }
+ else
+ {
+ i = 0;
+ while ( i<tr->ncds && len%3 )
+ {
+ int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+ tr->cds[i]->len -= dlen;
+ tr->cds[i]->beg += dlen;
+ len -= dlen;
+ i++;
+ }
+ }
+ }
+
+ // set CDS offsets and insert into regidx
+ len=0;
+ for (i=0; i<tr->ncds; i++)
+ {
+ tr->cds[i]->pos = len;
+ len += tr->cds[i]->len;
+ regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
+ }
+ }
+}
+
+void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); }
+void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); }
+
+void init_gff(args_t *args)
+{
+ aux_t *aux = &args->init;
+ aux->seq2int = khash_str2int_init(); // chrom's numeric id
+ aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene
+ aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
+ args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
+ aux->ignored_biotypes = khash_str2int_init();
+
+ // parse gff
+ kstring_t str = {0,0,0};
+ htsFile *fp = hts_open(args->gff_fname,"r");
+ if ( !fp ) error("Failed to read %s\n", args->gff_fname);
+ while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+ {
+ hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
+ int ret = gff_parse(args, str.s, aux->ftr + aux->nftr);
+ if ( !ret ) aux->nftr++;
+ }
+ free(str.s);
+ if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname);
+
+
+ // process gff information: connect CDS and exons to transcripts
+ args->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
+ args->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
+ args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
+ args->itr = regitr_init(NULL);
+
+ int i;
+ for (i=0; i<aux->nftr; i++)
+ {
+ ftr_t *ftr = &aux->ftr[i];
+
+ // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
+ khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
+ if ( k==kh_end(aux->id2tr) ) continue; // no such transcript
+
+ tscript_t *tr = kh_val(aux->id2tr,k);
+ if ( !tr->gene->name )
+ {
+ // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript)
+ regidx_free_tscript(&tr);
+ kh_del(int2tscript, aux->id2tr,k);
+ continue;
+ }
+
+ // populate regidx by category:
+ // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
+ // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
+ if ( ftr->type==GF_CDS ) register_cds(args, ftr);
+ else if ( ftr->type==GF_EXON ) register_exon(args, ftr);
+ else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
+ else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
+ else
+ error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,ENSID(ftr->trid),gf_type2gff_string(ftr->type));
+ }
+ tscript_init_cds(args);
+
+ if ( !args->quiet )
+ {
+ fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
+ regidx_nregs(args->idx_tscript),
+ regidx_nregs(args->idx_exon),
+ regidx_nregs(args->idx_cds),
+ regidx_nregs(args->idx_utr));
+ }
+
+ free(aux->ftr);
+ khash_str2int_destroy_free(aux->seq2int);
+ // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
+ kh_destroy(int2tscript,aux->id2tr);
+ free(aux->seq);
+
+ if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) )
+ {
+ khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
+ fprintf(stderr,"Ignored the following biotypes:\n");
+ for (i = kh_begin(ign); i < kh_end(ign); i++)
+ {
+ if ( !kh_exist(ign,i)) continue;
+ fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), kh_key(ign,i));
+ }
+ }
+ khash_str2int_destroy_free(aux->ignored_biotypes);
+}
+
+void init_data(args_t *args)
+{
+ args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32;
+
+ if ( !args->quiet ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname);
+ init_gff(args);
+
+ args->rid = -1;
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
+
+ args->fai = fai_load(args->fa_fname);
+ if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
+
+ args->pos2vbuf = kh_init(pos2vbuf);
+ args->active_tr = khp_init(trhp);
+ args->hap = (hap_t*) calloc(1,sizeof(hap_t));
+
+ // init samples
+ if ( !bcf_hdr_nsamples(args->hdr) ) args->phase = PHASE_DROP_GT;
+ if ( args->sample_list && !strcmp("-",args->sample_list) )
+ {
+ // ignore all samples
+ if ( args->output_type==FT_TAB_TEXT )
+ {
+ // significant speedup for plain VCFs
+ bcf_hdr_set_samples(args->hdr,NULL,0);
+ }
+ args->phase = PHASE_DROP_GT;
+ }
+ else
+ args->smpl = smpl_ilist_init(args->hdr, args->sample_list, args->sample_is_file, SMPL_STRICT);
+ args->hdr_nsmpl = args->phase==PHASE_DROP_GT ? 0 : bcf_hdr_nsamples(args->hdr);
+
+ if ( args->output_type==FT_TAB_TEXT )
+ {
+ args->out = args->output_fname ? fopen(args->output_fname,"w") : stdout;
+ if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname,strerror(errno));
+
+ fprintf(args->out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version());
+ fprintf(args->out,"# The command line was:\tbcftools +%s", args->argv[0]);
+ int i;
+ for (i=1; i<args->argc; i++)
+ fprintf(args->out," %s",args->argv[i]);
+ fprintf(args->out,"\n");
+ fprintf(args->out,"# LOG\t[2]Message\n");
+ fprintf(args->out,"# CSQ"); i = 1;
+ fprintf(args->out,"\t[%d]Sample", ++i);
+ fprintf(args->out,"\t[%d]Haplotype", ++i);
+ fprintf(args->out,"\t[%d]Chromosome", ++i);
+ fprintf(args->out,"\t[%d]Position", ++i);
+ fprintf(args->out,"\t[%d]Consequence", ++i);
+ fprintf(args->out,"\n");
+ }
+ else
+ {
+ args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
+ bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq");
+ bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=.,Type=String,Description=\"%s consequence annotation from BCFtools/csq. Format: '[*]consequence|gene|transcript|biotype[|strand|amino_acid_change|dna_change]' or, for consequences of variants split across multiple sites, a pointer to the record storing the consequences '@position'. '*' prefix indicates a consequence downstream from a stop \">",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware");
+ if ( args->hdr_nsmpl )
+ bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
+ bcf_hdr_write(args->out_fh, args->hdr);
+ }
+ if ( !args->quiet ) fprintf(stderr,"Calling...\n");
+}
+
+void destroy_data(args_t *args)
+{
+ regidx_destroy(args->idx_cds);
+ regidx_destroy(args->idx_utr);
+ regidx_destroy(args->idx_exon);
+ regidx_destroy(args->idx_tscript);
+ regitr_destroy(args->itr);
+
+ khint_t k,i,j;
+ for (k=0; k<kh_end(args->init.gid2gene); k++)
+ {
+ if ( !kh_exist(args->init.gid2gene, k) ) continue;
+ gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k);
+ free(gene->name);
+ free(gene);
+ }
+ kh_destroy(int2gene,args->init.gid2gene);
+
+ if ( args->filter )
+ filter_destroy(args->filter);
+
+ khp_destroy(trhp,args->active_tr);
+ kh_destroy(pos2vbuf,args->pos2vbuf);
+ if ( args->smpl ) smpl_ilist_destroy(args->smpl);
+ int ret;
+ if ( args->out_fh )
+ ret = hts_close(args->out_fh);
+ else
+ ret = fclose(args->out);
+ if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ for (i=0; i<args->vcf_rbuf.m; i++)
+ {
+ vbuf_t *vbuf = args->vcf_buf[i];
+ if ( !vbuf ) continue;
+ for (j=0; j<vbuf->m; j++)
+ {
+ if ( !vbuf->vrec[j] ) continue;
+ if ( vbuf->vrec[j]->line ) bcf_destroy(vbuf->vrec[j]->line);
+ free(vbuf->vrec[j]->smpl);
+ free(vbuf->vrec[j]->vcsq);
+ free(vbuf->vrec[j]);
+ }
+ free(vbuf->vrec);
+ free(vbuf);
+ }
+ free(args->vcf_buf);
+ free(args->rm_tr);
+ free(args->csq_buf);
+ free(args->hap->stack);
+ free(args->hap->sseq.s);
+ free(args->hap->tseq.s);
+ free(args->hap->tref.s);
+ free(args->hap);
+ fai_destroy(args->fai);
+ free(args->gt_arr);
+ free(args->str.s);
+ free(args->str2.s);
+ free(ENSID_FMT);
+}
+
+/*
+ The splice_* functions are for consquences around splice sites: start,stop,splice_*
+ */
+#define SPLICE_VAR_REF 0 // ref: ACGT>ACGT, csq not applicable, skip completely
+#define SPLICE_OUTSIDE 1 // splice acceptor or similar; csq set and is done, does not overlap the region
+#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed
+#define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq
+typedef struct
+{
+ tscript_t *tr;
+ struct {
+ int32_t pos, rlen, alen;
+ char *ref, *alt;
+ bcf1_t *rec;
+ } vcf;
+ uint16_t check_acceptor:1, // check distance from exon start (fwd) or end (rev)
+ check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon
+ check_stop:1, // this is the last coding exon (relative to transcript orientation), check last (fwd) or first (rev) codon
+ check_donor:1, // as with check_acceptor
+ check_region_beg:1, // do/don't check for splices at this end, eg. in the first or last exon
+ check_region_end:1, //
+ check_utr:1, // check splice sites (acceptor/donor/region_*) only if not in utr
+ set_refalt:1; // set kref,kalt, if set, check also for synonymous events
+ uint32_t csq;
+ int tbeg, tend; // number of trimmed bases from beg and end of ref,alt allele
+ uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives
+ ref_end; // a more conservative csq (the first and last base in kref.s)
+ kstring_t kref, kalt; // trimmed alleles, set only with SPLICE_OLAP
+}
+splice_t;
+void splice_init(splice_t *splice, bcf1_t *rec)
+{
+ memset(splice,0,sizeof(*splice));
+ splice->vcf.rec = rec;
+ splice->vcf.pos = rec->pos;
+ splice->vcf.rlen = rec->rlen;
+ splice->vcf.ref = rec->d.allele[0];
+}
+static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len)
+{
+ // len>0 .. beg is the first base, del filled from right
+ // len<0 .. beg is the last base, del filled from left
+
+ int rlen, alen, rbeg, abeg; // first base to include (ref coordinates)
+ if ( len<0 )
+ {
+ rlen = alen = -len;
+ rbeg = beg - rlen + 1;
+ int dlen = splice->vcf.alen - splice->vcf.rlen;
+ if ( dlen<0 && beg < splice->ref_end ) // incomplete del, beg is in the middle
+ dlen += splice->ref_end - beg;
+ abeg = rbeg + dlen;
+ }
+ else
+ {
+ rbeg = abeg = beg;
+ rlen = alen = len;
+ // check for incomplete del as above??
+ }
+
+#define XDBG 0
+#if XDBG
+fprintf(stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg);
+#endif
+ splice->kref.l = 0;
+ splice->kalt.l = 0;
+
+ // add the part before vcf.ref, in the vcf.ref and after vcf.ref
+ int roff; // how many vcf.ref bases already used
+ if ( rbeg < splice->vcf.pos )
+ {
+ assert( splice->tr->beg <= rbeg ); // this can be extended thanks to N_REF_PAD
+ kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
+ roff = 0;
+ }
+ else
+ roff = rbeg - splice->vcf.pos;
+#if XDBG
+fprintf(stderr,"r1: %s roff=%d\n",splice->kref.s,roff);
+#endif
+
+ if ( roff < splice->vcf.rlen && splice->kref.l < rlen )
+ {
+ int len = splice->vcf.rlen - roff; // len still available in vcf.ref
+ if ( len > rlen - splice->kref.l ) len = rlen - splice->kref.l; // how much of ref allele is still needed
+ kputsn(splice->vcf.ref + roff, len, &splice->kref);
+ }
+#if XDBG
+fprintf(stderr,"r2: %s\n",splice->kref.s);
+#endif
+
+ uint32_t end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele
+ if ( splice->kref.l < rlen )
+ {
+ if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD)
+ rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end;
+ if ( splice->kref.l < rlen )
+ kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
+ }
+#if XDBG
+fprintf(stderr,"r3: %s\n",splice->kref.s);
+#endif
+
+
+ int aoff;
+ if ( abeg < splice->vcf.pos )
+ {
+ assert( splice->tr->beg <= abeg );
+ kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
+ aoff = 0;
+ }
+ else
+ aoff = abeg - splice->vcf.pos;
+#if XDBG
+fprintf(stderr,"a1: %s aoff=%d\n",splice->kalt.s,aoff);
+#endif
+
+ if ( aoff < splice->vcf.alen && splice->kalt.l < alen )
+ {
+ int len = splice->vcf.alen - aoff; // len still available in vcf.alt
+ if ( len > alen - splice->kalt.l ) len = alen - splice->kalt.l; // how much of alt allele is still needed
+ kputsn(splice->vcf.alt + aoff, len, &splice->kalt);
+ aoff -= len;
+ }
+ if ( aoff < 0 ) aoff = 0;
+ else aoff--;
+#if XDBG
+fprintf(stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff);
+#endif
+
+ end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele
+ if ( splice->kalt.l < alen )
+ {
+ if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long
+ alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end;
+ if ( alen > 0 && alen > splice->kalt.l )
+ kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
+ }
+#if XDBG
+fprintf(stderr,"a3: %s\n",splice->kalt.s);
+fprintf(stderr," [%s]\n [%s]\n\n",splice->kref.s,splice->kalt.s);
+#endif
+}
+void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec);
+static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid)
+{
+ while ( regitr_overlap(itr) )
+ {
+ gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
+ tscript_t *tr = utr->tr;
+ if ( tr->id != trid ) continue;
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+ return csq.type.type;
+ }
+ return 0;
+}
+static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type)
+{
+#if XDBG
+fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
+#endif
+ if ( !type ) return;
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = type;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+}
+static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp
+ // before and after the inserted bases
+ if ( splice->tbeg || splice->vcf.ref[0]!=splice->vcf.alt[0] )
+ {
+ splice->ref_beg = splice->vcf.pos + splice->tbeg - 1;
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
+ }
+ else
+ {
+ if ( splice->tend ) splice->tend--;
+ splice->ref_beg = splice->vcf.pos;
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
+ }
+#if XDBG
+fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
+#endif
+
+ int ret;
+ if ( splice->ref_beg >= ex_end ) // fully outside, beyond the exon
+ {
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
+ {
+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ if ( ret!=0 )
+ {
+ regitr_destroy(itr);
+ return SPLICE_OUTSIDE; // overlaps utr
+ }
+ }
+ regitr_destroy(itr);
+ }
+ if ( !splice->check_region_end ) return SPLICE_OUTSIDE;
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON);
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && !strncmp(ref,alt,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+ if ( splice->ref_end < ex_beg || (splice->ref_end == ex_beg && !splice->check_region_beg) ) // fully outside, before the exon
+ {
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr
+ {
+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ if ( ret!=0 )
+ {
+ regitr_destroy(itr);
+ return SPLICE_OUTSIDE; // overlaps utr
+ }
+ }
+ regitr_destroy(itr);
+ }
+ if ( !splice->check_region_beg ) return SPLICE_OUTSIDE;
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON);
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_end > ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_end > ex_beg - N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+ // overlaps the exon or inside the exon
+ // possible todo: find better alignment for frameshifting variants?
+ if ( splice->ref_beg <= ex_beg + 2 ) // in the first 3bp
+ {
+ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->ref_end > ex_end - 2 )
+ {
+ if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->set_refalt )
+ {
+ // Make sure the variant will not end up left aligned to avoid overlapping vcf records
+ // splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1);
+ // splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
+ // if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; }
+ if ( splice->ref_beg < splice->vcf.pos ) // this must have been caused by too much trimming from right
+ {
+ int dlen = splice->vcf.pos - splice->ref_beg;
+ assert( dlen==1 );
+ splice->tbeg += dlen;
+ if ( splice->tbeg + splice->tend == splice->vcf.rlen ) splice->tend -= dlen;
+ splice->ref_beg = splice->vcf.pos;
+ }
+ if ( splice->ref_end==ex_beg ) splice->tend--; // prevent zero-length ref allele
+ splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1);
+ splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
+ if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; }
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_INSIDE;
+}
+
+static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG
+ splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; // 1b before the deleted base
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; // the last deleted base
+
+#if XDBG
+fprintf(stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
+#endif
+
+ if ( splice->ref_beg + 1 < ex_beg ) // the part before the exon; ref_beg is off by -1
+ {
+ if ( splice->check_region_beg )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ // filling from the left does not work for ENST00000341065/frame3.vcf
+ // CAG.GTGGCCAG CAG.GTGGCCAG
+ // CA-.--GGCCAG vs CAG.---GCCAG
+ // splice_build_hap(splice, ex_beg-1, -N_SPLICE_REGION_INTRON);
+ //
+ // filling from the right:
+ splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON);
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ }
+ }
+ if ( splice->ref_end >= ex_beg )
+ {
+ splice->tbeg = splice->ref_beg - splice->vcf.pos + 1;
+ splice->ref_beg = ex_beg - 1;
+ if ( splice->tbeg + splice->tend == splice->vcf.alen )
+ {
+ // the deletion overlaps ex_beg and cannot be easily realigned to the right
+ if ( !splice->tend )
+ {
+ splice->csq |= CSQ_CODING_SEQUENCE;
+ return SPLICE_OVERLAP;
+ }
+ splice->tend--;
+ }
+ }
+ }
+ if ( ex_end < splice->ref_end ) // the part after the exon
+ {
+ if ( splice->check_region_end )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON); // ref,alt positioned at the first intron base
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ }
+ }
+ if ( splice->ref_beg < ex_end )
+ {
+ splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
+ splice->ref_end = ex_end;
+ }
+ }
+ if ( splice->ref_end < ex_beg || splice->ref_beg >= ex_end )
+ {
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+
+ if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1
+ {
+ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->ref_end > ex_end - 3 )
+ {
+ if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->set_refalt )
+ {
+ if ( splice->tbeg>0 ) splice->tbeg--; //why is this?
+ if ( splice->vcf.rlen > splice->tbeg + splice->tend && splice->vcf.alen > splice->tbeg + splice->tend )
+ {
+ splice->vcf.rlen -= splice->tbeg + splice->tend;
+ splice->vcf.alen -= splice->tbeg + splice->tend;
+ }
+ splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
+ splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
+ if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf
+ {
+ splice->csq |= (splice->ref_end - splice->ref_beg + 1)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
+ return SPLICE_OVERLAP;
+ }
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_INSIDE;
+}
+
+static inline int splice_csq_mnp(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ // not a real variant, can be ignored: eg ACGT>ACGT
+ if ( splice->tbeg + splice->tend == splice->vcf.rlen ) return SPLICE_VAR_REF;
+
+ splice->ref_beg = splice->vcf.pos + splice->tbeg;
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1;
+
+#if XDBG
+fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
+#endif
+
+ if ( splice->ref_beg < ex_beg ) // the part before the exon
+ {
+ if ( splice->check_region_beg )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ }
+ }
+ }
+ if ( splice->ref_end >= ex_beg )
+ {
+ splice->tbeg = splice->ref_beg - splice->vcf.pos;
+ splice->ref_beg = ex_beg;
+ }
+ }
+ if ( ex_end < splice->ref_end ) // the part after the exon
+ {
+ if ( splice->check_region_end )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ if ( splice->ref_beg <= ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->ref_beg <= ex_end + N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ }
+ }
+ }
+ if ( splice->ref_beg <= ex_end )
+ {
+ splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
+ splice->ref_end = ex_end;
+ }
+ }
+ if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end )
+ {
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+
+ if ( splice->ref_beg < ex_beg + 3 )
+ {
+ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->ref_end > ex_end - 3 )
+ {
+ if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->set_refalt )
+ {
+ splice->vcf.rlen -= splice->tbeg + splice->tend;
+ splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
+ splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_INSIDE;
+}
+static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ splice->csq = 0;
+ splice->vcf.alen = strlen(splice->vcf.alt);
+
+ int rlen1 = splice->vcf.rlen - 1, alen1 = splice->vcf.alen - 1, i = 0;
+ splice->tbeg = 0, splice->tend = 0;
+
+ // trim from the right, then from the left
+ while ( i<=rlen1 && i<=alen1 )
+ {
+ if ( splice->vcf.ref[rlen1-i] != splice->vcf.alt[alen1-i] ) break;
+ i++;
+ }
+ splice->tend = i;
+ rlen1 -= i, alen1 -= i, i = 0;
+ while ( i<=rlen1 && i<=alen1 )
+ {
+ if ( splice->vcf.ref[i] != splice->vcf.alt[i] ) break;
+ i++;
+ }
+ splice->tbeg = i;
+
+ // The mnp, ins and del code was split into near-identical functions for clarity and debugging;
+ // possible todo: generalize once stable
+ if ( splice->vcf.rlen==splice->vcf.alen ) return splice_csq_mnp(args, splice, ex_beg, ex_end);
+ if ( splice->vcf.rlen < splice->vcf.alen ) return splice_csq_ins(args, splice, ex_beg, ex_end);
+ if ( splice->vcf.rlen > splice->vcf.alen ) return splice_csq_del(args, splice, ex_beg, ex_end);
+
+ return 0;
+}
+
+// return value: 0 added, 1 overlapping variant, 2 silent discard (intronic,alt=ref)
+int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, bcf1_t *rec, int ial)
+{
+ int i;
+ kstring_t str = {0,0,0};
+ tscript_t *tr = cds->tr;
+ child->icds = cds->icds; // index of cds in the tscript's list of exons
+
+ splice_t splice;
+ splice_init(&splice, rec);
+ splice.tr = tr;
+ splice.vcf.alt = rec->d.allele[ial];
+ splice.check_acceptor = splice.check_donor = splice.set_refalt = splice.check_utr = 1;
+ if ( !(tr->trim & TRIM_5PRIME) )
+ {
+ if ( tr->strand==STRAND_FWD ) { if ( child->icds==0 ) splice.check_start = 1; }
+ else { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; }
+ }
+ if ( !(tr->trim & TRIM_3PRIME) )
+ {
+ if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; }
+ else { if ( child->icds==0 ) splice.check_stop = 1; }
+ }
+ if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M
+ {
+ if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
+ else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+ }
+ if ( child->icds!=0 ) splice.check_region_beg = 1;
+ if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
+
+#if XDBG
+fprintf(stderr,"\n%d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop);
+#endif
+ int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1);
+#if XDBG
+fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n",splice.vcf.pos+1,splice.kref.s,splice.kalt.s,splice.ref_beg+1,splice.ref_end+1,ret,splice.csq);
+#endif
+
+ if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA
+ if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq
+ {
+ free(splice.kref.s);
+ free(splice.kalt.s);
+
+ if ( !splice.csq ) return 2; // fully intronic, no csq
+
+ // splice_region/acceptor/donor
+ child->seq = NULL;
+ child->sbeg = 0;
+ child->rbeg = rec->pos;
+ child->rlen = 0;
+ child->dlen = 0;
+ kputs(rec->d.allele[0],&str);
+ kputc('>',&str);
+ kputs(rec->d.allele[ial],&str);
+ child->var = str.s;
+ child->type = HAP_SSS;
+ child->csq = splice.csq;
+ child->prev = parent->type==HAP_SSS ? parent->prev : parent;
+ child->rec = rec;
+ return 0;
+ }
+ if ( splice.csq & CSQ_SYNONYMOUS_VARIANT ) splice.csq &= ~CSQ_SYNONYMOUS_VARIANT; // synonymous&splice,frame could become synonymous&frame,splice
+
+ int dbeg = 0;
+ if ( splice.ref_beg < cds->beg )
+ {
+ // The vcf record overlaps the exon boundary, but the variant itself
+ // should fit inside since we are here. This will need more work.
+ // #1475227917
+ dbeg = cds->beg - splice.ref_beg;
+ splice.kref.l -= dbeg;
+ splice.ref_beg = cds->beg;
+ assert( dbeg <= splice.kalt.l );
+ }
+
+ if ( parent->type==HAP_SSS ) parent = parent->prev;
+ if ( parent->type==HAP_CDS )
+ {
+ i = parent->icds;
+ if ( i!=cds->icds )
+ {
+ // the variant is on a new exon, finish up the previous
+ int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg;
+ if ( len > 0 )
+ kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ }
+
+ // append any skipped non-variant exons
+ while ( ++i < cds->icds )
+ kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
+
+ if ( parent->icds==child->icds )
+ {
+ int len = splice.ref_beg - parent->rbeg - parent->rlen;
+ if ( len < 0 ) // overlapping variants
+ {
+ free(str.s);
+ return 1;
+ }
+ kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ }
+ else
+ kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
+ }
+ kputs(splice.kalt.s + dbeg, &str);
+
+ child->seq = str.s;
+ child->sbeg = cds->pos + (splice.ref_beg - cds->beg);
+ child->rbeg = splice.ref_beg;
+ child->rlen = splice.kref.l;
+ child->type = HAP_CDS;
+ child->prev = parent;
+ child->rec = rec;
+ child->csq = splice.csq;
+
+ // set vlen and the "ref>alt" string
+ {
+ int rlen = strlen(rec->d.allele[0]);
+ int alen = strlen(rec->d.allele[ial]);
+ child->dlen = alen - rlen;
+ child->var = (char*) malloc(rlen+alen+2);
+ memcpy(child->var,rec->d.allele[0],rlen);
+ child->var[rlen] = '>';
+ memcpy(child->var+rlen+1,rec->d.allele[ial],alen);
+ child->var[rlen+alen+1] = 0;
+ }
+
+ // yuck, the whole CDS is modified/deleted, not ready for this, todo.
+ if ( child->rbeg + child->rlen > cds->beg + cds->len )
+ {
+ child->type = HAP_SSS;
+ if ( !child->csq ) child->csq |= CSQ_CODING_SEQUENCE; // hack, specifically for ENST00000390520/deletion-overlap.vcf
+ }
+
+ free(splice.kref.s);
+ free(splice.kalt.s);
+ return 0;
+}
+void hap_destroy(hap_node_t *hap)
+{
+ int i;
+ for (i=0; i<hap->nchild; i++)
+ if ( hap->child[i] ) hap_destroy(hap->child[i]);
+ for (i=0; i<hap->mcsq_list; i++) free(hap->csq_list[i].type.vstr.s);
+ free(hap->csq_list);
+ free(hap->child);
+ free(hap->cur_child);
+ free(hap->seq);
+ free(hap->var);
+ free(hap);
+}
+
+
+/*
+ ref: spliced reference and its length (ref.l)
+ seq: part of the spliced query transcript on the reference strand to translate, its
+ length (seq.l) and the total length of the complete transcript (seq.m)
+ sbeg: seq offset within the spliced query transcript
+ rbeg: seq offset within ref, 0-based
+ rend: last base of seq within ref, plus one. If seq does not contain indels, it is rend=rbeg+seq->l
+ strand: coding strand - 0:rev, 1:fwd
+ tseq: translated sequence (aa)
+ fill: frameshift, fill until the end (strand=fwd) or from the start (strand=rev)
+ */
+void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill)
+{
+#if XDBG
+fprintf(stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
+#endif
+ char tmp[3], *codon, *end;
+ int i, len, npad;
+
+ kstring_t ref = *_ref;
+ kstring_t seq = *_seq;
+
+ tseq->l = 0;
+ if ( !seq.l )
+ {
+ kputc('?', tseq);
+ return;
+ }
+
+#define DBG 0
+#if DBG
+ fprintf(stderr,"translate: sbeg,rbeg,rend=%d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
+ fprintf(stderr," ref: l=%d %s\n", (int)ref.l,ref.s);
+ fprintf(stderr," seq: l=%d m=%d ", (int)seq.l,(int)seq.m);
+ for (i=0; i<seq.l; i++) fprintf(stderr,"%c",seq.s[i]); fprintf(stderr,"\n");
+ fprintf(stderr," sbeg,rbeg,rend: %d,%d,%d\n", sbeg,rbeg,rend);
+ fprintf(stderr," strand,fill: %d,%d\n", strand,fill);
+#endif
+
+ if ( strand==STRAND_FWD )
+ {
+ // left padding
+ npad = sbeg % 3;
+#if DBG>1
+ fprintf(stderr," npad: %d\n",npad);
+#endif
+ assert( npad<=rbeg );
+
+ for (i=0; i<npad; i++)
+ tmp[i] = ref.s[rbeg+i-npad+N_REF_PAD];
+ for (; i<3 && i-npad<seq.l; i++)
+ tmp[i] = seq.s[i-npad];
+ len = seq.l - i + npad; // the remaining length of padded sseq
+#if DBG>1
+ fprintf(stderr,"\t i=%d\n", i);
+#endif
+ if ( i==3 )
+ {
+ kputc_(dna2aa(tmp), tseq);
+#if DBG>1
+ fprintf(stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
+#endif
+ codon = seq.s + 3 - npad; // next codon
+ end = codon + len - 1 - (len % 3); // last position of a valid codon
+ while ( codon < end )
+ {
+ kputc_(dna2aa(codon), tseq);
+#if DBG>1
+ fprintf(stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]);
+#endif
+ codon += 3;
+ }
+ end = seq.s + seq.l - 1;
+ for (i=0; codon+i<=end; i++) tmp[i] = codon[i];
+ }
+
+ // right padding
+ codon = ref.s + rend + N_REF_PAD;
+ if ( i>0 )
+ {
+#if DBG>1
+ if(i==1)fprintf(stderr,"[3]%c\n",tmp[0]);
+ if(i==2)fprintf(stderr,"[3]%c%c\n",tmp[0],tmp[1]);
+#endif
+ for (; i<3; i++)
+ {
+ tmp[i] = *codon;
+ codon++;
+ }
+ kputc_(dna2aa(tmp), tseq);
+#if DBG>1
+ fprintf(stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
+#endif
+ }
+ if ( fill!=0 )
+ {
+ end = ref.s + ref.l - N_REF_PAD;
+ while ( codon+3 <= end )
+ {
+ kputc_(dna2aa(codon), tseq);
+#if DBG>1
+ fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon));
+#endif
+ codon += 3;
+ }
+ }
+ }
+ else // STRAND_REV
+ {
+ // right padding - number of bases to take from ref
+ npad = (seq.m - (sbeg + seq.l)) % 3;
+#if DBG>1
+ fprintf(stderr," npad: %d\n",npad);
+#endif
+if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(stderr,"sbeg=%d seq.l=%d seq.m=%d\n",sbeg,(int)seq.l,(int)seq.m);
+ assert( npad>=0 && sbeg+seq.l+npad<=seq.m ); // todo: first codon on the rev strand
+
+ if ( npad==2 )
+ {
+ tmp[1] = ref.s[rend+N_REF_PAD];
+ tmp[2] = ref.s[rend+N_REF_PAD+1];
+ i = 0;
+ }
+ else if ( npad==1 )
+ {
+ tmp[2] = ref.s[rend+N_REF_PAD];
+ i = 1;
+ }
+ else
+ i = 2;
+
+ end = seq.s + seq.l;
+ for (; i>=0 && end>seq.s; i--) tmp[i] = *(--end);
+#if DBG>1
+ fprintf(stderr,"\t i=%d\n", i);
+ if(i==1)fprintf(stderr,"[0] %c\n",tmp[2]);
+ if(i==0)fprintf(stderr,"[0] %c%c\n",tmp[1],tmp[2]);
+#endif
+ if ( i==-1 )
+ {
+#if DBG>1
+ fprintf(stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp));
+#endif
+ kputc_(cdna2aa(tmp), tseq);
+ codon = end - 3;
+ while ( codon >= seq.s )
+ {
+ kputc_(cdna2aa(codon), tseq);
+#if DBG>1
+ fprintf(stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon));
+#endif
+ codon -= 3;
+ }
+ if ( seq.s-codon==2 )
+ {
+ tmp[2] = seq.s[0];
+ i = 1;
+ }
+ else if ( seq.s-codon==1 )
+ {
+ tmp[1] = seq.s[0];
+ tmp[2] = seq.s[1];
+ i = 0;
+ }
+ else
+ i = -1;
+#if DBG>1
+ if(i==1)fprintf(stderr,"[3] %c\n",tmp[2]);
+ if(i==0)fprintf(stderr,"[3] %c%c\n",tmp[1],tmp[2]);
+#endif
+ }
+ // left padding
+ end = ref.s + N_REF_PAD + rbeg;
+ if ( i>=0 )
+ {
+ for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end);
+ kputc_(cdna2aa(tmp), tseq);
+#if DBG>1
+ fprintf(stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp));
+#endif
+ }
+ if ( fill!=0 )
+ {
+ codon = end - 3;
+ while ( codon >= ref.s + N_REF_PAD )
+ {
+ kputc_(cdna2aa(codon), tseq);
+#if DBG>1
+ fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon));
+#endif
+ codon -= 3;
+ }
+ }
+ }
+ kputc_(0,tseq); tseq->l--;
+#if DBG
+ fprintf(stderr," tseq: %s\n", tseq->s);
+#endif
+}
+
+void tscript_splice_ref(tscript_t *tr)
+{
+ int i, len = 0;
+ for (i=0; i<tr->ncds; i++)
+ len += tr->cds[i]->len;
+
+ tr->nsref = len + 2*N_REF_PAD;
+ tr->sref = (char*) malloc(len + 1 + 2*N_REF_PAD);
+ len = 0;
+
+ memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
+ len += N_REF_PAD;
+
+ for (i=0; i<tr->ncds; i++)
+ {
+ memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
+ len += tr->cds[i]->len;
+ }
+ memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
+ len += N_REF_PAD;
+
+ tr->sref[len] = 0;
+}
+
+// returns: 0 if consequence was added, 1 if it already exists or could not be added
+int csq_push(args_t *args, csq_t *csq, bcf1_t *rec)
+{
+#if XDBG
+fprintf(stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type);
+#endif
+ khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos);
+ vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k);
+ if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
+
+ int i;
+ for (i=0; i<vbuf->n; i++)
+ if ( vbuf->vrec[i]->line==rec ) break;
+ if ( i==vbuf->n ) error("This should not happen.. %s:%d %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
+ vrec_t *vrec = vbuf->vrec[i];
+
+ // if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor
+ if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) )
+ csq->type.type &= ~CSQ_SPLICE_REGION;
+
+ if ( csq->type.type & CSQ_PRINTED_UPSTREAM )
+ {
+ for (i=0; i<vrec->nvcsq; i++)
+ {
+ // Same as below, to avoid records like
+ // 3630 .. @3632,stop_lost|AL627309.1|ENST00000423372|protein_coding|-
+ // 3632 .. stop_lost|AL627309.1|ENST00000423372|protein_coding|-|260*>260G|3630T>A+3632A>C
+ if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
+ {
+ vrec->vcsq[i] = csq->type;
+ goto exit_duplicate;
+ }
+ if ( !(vrec->vcsq[i].type & CSQ_PRINTED_UPSTREAM) ) continue;
+ if ( csq->type.ref != vrec->vcsq[i].ref ) continue;
+ goto exit_duplicate;
+ }
+ }
+ else if ( csq->type.type & CSQ_COMPOUND )
+ {
+ for (i=0; i<vrec->nvcsq; i++)
+ {
+ if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT ) continue;
+ if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
+ if ( csq->type.gene != vrec->vcsq[i].gene ) continue;
+ if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s )
+ {
+ // This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function
+ // can trigger stop/start events based on indel overlap, then another stop/start event can be triggered
+ // from add_csq() or test_cds_local() based on sequence comparison, and on output we could find two
+ // consequences:
+ // stop_lost|AL627309.1|ENST00000423372|protein_coding|-
+ // stop_lost&inframe_insertion|AL627309.1|ENST00000423372|protein_coding|-|260*>260CL|3630T>TAAA
+ if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s )
+ {
+ if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
+ {
+ vrec->vcsq[i].type |= csq->type.type;
+
+ // remove stop_lost&synonymous if stop_retained set
+ if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED )
+ vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT);
+
+ if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr;
+ goto exit_duplicate;
+ }
+ continue;
+ }
+ if ( strcmp(csq->type.vstr.s,vrec->vcsq[i].vstr.s) ) continue;
+ }
+ vrec->vcsq[i].type |= csq->type.type;
+ goto exit_duplicate;
+ }
+ }
+ else
+ {
+ for (i=0; i<vrec->nvcsq; i++)
+ {
+ if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT) continue;
+ if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
+ if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) )
+ {
+ vrec->vcsq[i].type |= csq->type.type;
+ goto exit_duplicate;
+ }
+ if ( vrec->vcsq[i].type==(vrec->vcsq[i].type|csq->type.type) ) goto exit_duplicate;
+ }
+ }
+ // no such csq yet in this vcf record
+ csq->vrec = vrec;
+ csq->idx = i;
+ vrec->nvcsq++;
+ hts_expand0(vcsq_t, vrec->nvcsq, vrec->mvcsq, vrec->vcsq);
+ vrec->vcsq[i] = csq->type;
+ return 0;
+
+exit_duplicate:
+ csq->vrec = vrec;
+ csq->idx = i;
+ return 1;
+}
+
+// soff .. position of the variant within the trimmed query transcript
+// sbeg .. position of the variant within the query transcript
+// rbeg .. position on the reference transcript (if there are no indels, then rbeg=send)
+// rpos .. VCF position
+#define node2soff(i) (hap->stack[i].slen - (hap->stack[i].node->rlen + hap->stack[i].node->dlen))
+#define node2sbeg(i) (hap->sbeg + node2soff(i))
+#define node2send(i) (hap->sbeg + hap->stack[i].slen)
+#define node2rbeg(i) (hap->stack[i].node->sbeg)
+#define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen)
+#define node2rpos(i) (hap->stack[i].node->rec->pos)
+
+void kput_vcsq(vcsq_t *csq, kstring_t *str)
+{
+ // Remove start/stop from incomplete CDS, but only if there is another
+ // consequence as something must be reported
+ if ( csq->type & CSQ_INCOMPLETE_CDS && (csq->type & ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS|CSQ_UPSTREAM_STOP)) ) csq->type &= ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS);
+
+ // Remove missense from start/stops
+ if ( csq->type & CSQ_START_STOP && csq->type & CSQ_MISSENSE_VARIANT ) csq->type &= ~CSQ_MISSENSE_VARIANT;
+
+ if ( csq->type & CSQ_PRINTED_UPSTREAM && csq->ref )
+ {
+ kputc_('@',str);
+ kputw(csq->ref->pos+1, str);
+ return;
+ }
+ if ( csq->type & CSQ_UPSTREAM_STOP )
+ kputc_('*',str);
+
+ int i, n = sizeof(csq_strings)/sizeof(char*);
+ for (i=1; i<n; i++)
+ if ( csq_strings[i] && csq->type&(1<<i) ) { kputs(csq_strings[i],str); break; }
+ i++;
+ for (; i<n; i++)
+ if ( csq_strings[i] && csq->type&(1<<i) ) { kputc_('&',str); kputs(csq_strings[i],str); }
+
+ kputc_('|', str);
+ if ( csq->gene ) kputs(csq->gene , str);
+
+ kputc_('|', str);
+ if ( csq->type & CSQ_PRN_TSCRIPT ) ksprintf(str, "%s",ENSID(csq->trid));
+
+ kputc_('|', str);
+ kputs(gf_type2gff_string(csq->biotype), str);
+
+ if ( CSQ_PRN_STRAND(csq->type) || csq->vstr.l )
+ kputs(csq->strand==STRAND_FWD ? "|+" : "|-", str);
+
+ if ( csq->vstr.l )
+ kputs(csq->vstr.s, str);
+}
+
+void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
+{
+ int i;
+ tscript_t *tr = hap->tr;
+ int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
+
+ int icsq = node->ncsq_list++;
+ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
+ csq_t *csq = &node->csq_list[icsq];
+ csq->pos = hap->stack[ref_node].node->rec->pos;
+ csq->type.trid = tr->id;
+ csq->type.gene = tr->gene->name;
+ csq->type.strand = tr->strand;
+ csq->type.biotype = tr->type;
+
+ // only now we see the translated sequence and can determine if the stop/start changes are real
+ int rm_csq = 0;
+ csq->type.type = 0;
+ for (i=ibeg; i<=iend; i++)
+ csq->type.type |= hap->stack[i].node->csq & CSQ_COMPOUND;
+ if ( dlen==0 && indel ) csq->type.type |= CSQ_INFRAME_ALTERING;
+
+ int has_upstream_stop = hap->upstream_stop;
+ if ( hap->stack[ibeg].node->type != HAP_SSS )
+ {
+ // check for truncating stops
+ for (i=0; i<hap->tref.l; i++)
+ if ( hap->tref.s[i]=='*' ) break;
+ if ( i!=hap->tref.l )
+ {
+ hap->tref.l = i+1;
+ hap->tref.s[i+1] = 0;
+ }
+ for (i=0; i<hap->tseq.l; i++)
+ if ( hap->tseq.s[i]=='*' ) break;
+ if ( i!=hap->tseq.l )
+ {
+ hap->tseq.l = i+1;
+ hap->tseq.s[i+1] = 0;
+ hap->upstream_stop = 1;
+ }
+ if ( csq->type.type & CSQ_STOP_LOST )
+ {
+ if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] )
+ {
+ rm_csq |= CSQ_STOP_LOST;
+ csq->type.type |= CSQ_STOP_RETAINED;
+ }
+ else if ( hap->tref.s[hap->tref.l-1]!='*' )
+ {
+ // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
+ // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
+ if ( hap->tseq.s[hap->tseq.l-1] == '*' )
+ {
+ rm_csq |= CSQ_STOP_GAINED;
+ csq->type.type |= CSQ_STOP_RETAINED;
+ }
+ else
+ csq->type.type |= CSQ_INCOMPLETE_CDS;
+ }
+ }
+ if ( csq->type.type & CSQ_START_LOST && hap->tref.s[0]!='M' )
+ {
+ rm_csq |= CSQ_START_LOST;
+ csq->type.type &= ~CSQ_START_LOST;
+ }
+ if ( dlen!=0 )
+ {
+ if ( dlen%3 )
+ csq->type.type |= CSQ_FRAMESHIFT_VARIANT;
+ else if ( dlen<0 )
+ csq->type.type |= CSQ_INFRAME_DELETION;
+ else
+ csq->type.type |= CSQ_INFRAME_INSERTION;
+ }
+ else
+ {
+ for (i=0; i<hap->tref.l; i++)
+ if ( hap->tref.s[i] != hap->tseq.s[i] ) break;
+ if ( i==hap->tref.l )
+ csq->type.type |= CSQ_SYNONYMOUS_VARIANT;
+ else if ( hap->tref.s[i] == '*' )
+ csq->type.type |= CSQ_STOP_LOST;
+ else if ( hap->tseq.s[i] == '*' )
+ csq->type.type |= CSQ_STOP_GAINED;
+ else
+ csq->type.type |= CSQ_MISSENSE_VARIANT;
+ }
+ }
+ if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP;
+ csq->type.type &= ~rm_csq;
+
+ if ( hap->stack[ibeg].node->type == HAP_SSS )
+ {
+ node->csq_list[icsq].type.type |= hap->stack[ibeg].node->csq & ~rm_csq;
+ node->csq_list[icsq].type.ref = hap->stack[ibeg].node->rec;
+ node->csq_list[icsq].type.biotype = tr->type;
+ csq_push(args, node->csq_list+icsq, hap->stack[ibeg].node->rec);
+ return;
+ }
+
+ kstring_t str = node->csq_list[icsq].type.vstr;
+ str.l = 0;
+
+ // create the aa variant string
+ int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
+ int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
+ kputc_('|', &str);
+ kputw(aa_rbeg, &str);
+ kputs(hap->tref.s, &str);
+ if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) )
+ {
+ kputc_('>', &str);
+ kputw(aa_sbeg, &str);
+ kputs(hap->tseq.s, &str);
+ }
+ kputc_('|', &str);
+
+ // create the dna variant string and, in case of combined variants,
+ // insert silent CSQ_PRINTED_UPSTREAM variants
+ for (i=ibeg; i<=iend; i++)
+ {
+ if ( i>ibeg ) kputc_('+', &str);
+ kputw(node2rpos(i)+1, &str);
+ kputs(hap->stack[i].node->var, &str);
+ }
+ node->csq_list[icsq].type.vstr = str;
+ csq_push(args, node->csq_list+icsq, hap->stack[ref_node].node->rec);
+
+ for (i=ibeg; i<=iend; i++)
+ {
+ // csq are printed at one position only for combined variants, the rest is
+ // silent and references the first
+ if ( hap->stack[i].node->csq & ~CSQ_COMPOUND )
+ {
+ node->ncsq_list++;
+ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
+ csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
+ tmp_csq->pos = hap->stack[i].node->rec->pos;
+ tmp_csq->type.trid = tr->id;
+ tmp_csq->type.gene = tr->gene->name;
+ tmp_csq->type.strand = tr->strand;
+ tmp_csq->type.type = hap->stack[i].node->csq & ~CSQ_COMPOUND & ~rm_csq;
+ tmp_csq->type.biotype = tr->type;
+ tmp_csq->type.vstr.l = 0;
+ kputs(str.s,&tmp_csq->type.vstr);
+ csq_push(args, tmp_csq, hap->stack[i].node->rec);
+ }
+ if ( i!=ref_node && (node->csq_list[icsq].type.type & CSQ_COMPOUND || !(hap->stack[i].node->csq & ~CSQ_COMPOUND)) )
+ {
+ node->ncsq_list++;
+ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
+ csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
+ tmp_csq->pos = hap->stack[i].node->rec->pos;
+ tmp_csq->type.trid = tr->id;
+ tmp_csq->type.gene = tr->gene->name;
+ tmp_csq->type.strand = tr->strand;
+ tmp_csq->type.type = CSQ_PRINTED_UPSTREAM | hap->stack[i].node->csq;
+ tmp_csq->type.biotype = tr->type;
+ tmp_csq->type.ref = hap->stack[ref_node].node->rec;
+ tmp_csq->type.vstr.l = 0;
+ csq_push(args, tmp_csq, hap->stack[i].node->rec);
+ }
+ }
+}
+
+void hap_finalize(args_t *args, hap_t *hap)
+{
+ tscript_t *tr = hap->tr;
+ if ( !tr->sref )
+ tscript_splice_ref(tr);
+
+ kstring_t sref;
+ sref.s = tr->sref;
+ sref.l = tr->nsref;
+ sref.m = sref.l;
+
+ int istack = 0;
+ hts_expand(hstack_t,1,hap->mstack,hap->stack);
+
+ hap->sseq.l = 0;
+ hap->tseq.l = 0;
+ hap->stack[0].node = tr->root;
+ hap->stack[0].ichild = -1;
+ hap->stack[0].slen = 0;
+ hap->stack[0].dlen = 0;
+
+ while ( istack>=0 )
+ {
+ hstack_t *stack = &hap->stack[istack];
+ hap_node_t *node = hap->stack[istack].node;
+ while ( ++hap->stack[istack].ichild < node->nchild )
+ {
+ if ( node->child[stack->ichild] ) break;
+ }
+ if ( stack->ichild == node->nchild ) { istack--; continue; }
+
+ node = node->child[stack->ichild];
+
+ istack++;
+ hts_expand(hstack_t,istack+1,hap->mstack,hap->stack);
+ stack = &hap->stack[istack-1];
+
+ hap->stack[istack].node = node;
+ hap->stack[istack].ichild = -1;
+
+ hap->sseq.l = stack->slen;
+ if ( node->type==HAP_CDS ) kputs(node->seq, &hap->sseq);
+ hap->stack[istack].slen = hap->sseq.l;
+ hap->stack[istack].dlen = hap->stack[istack-1].dlen + node->dlen;
+
+ if ( !node->nend ) continue; // not a leaf node
+
+ // The spliced sequence has been built for the current haplotype and stored
+ // in hap->sseq. Now we break it and output as independent parts
+
+ kstring_t sseq;
+ sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; // total length of the spliced query transcript
+ hap->upstream_stop = 0;
+
+ int i = 1, dlen = 0, ibeg, indel = 0;
+ while ( i<istack && hap->stack[i].node->type == HAP_SSS ) i++;
+ hap->sbeg = hap->stack[i].node->sbeg;
+
+ if ( tr->strand==STRAND_FWD )
+ {
+ i = 0, ibeg = -1;
+ while ( ++i <= istack )
+ {
+ if ( hap->stack[i].node->type == HAP_SSS )
+ {
+ // start/stop/splice site overlap: don't know how to build the haplotypes correctly, skipping
+ hap_add_csq(args,hap,node,0,i,i,0,0);
+ continue;
+ }
+ dlen += hap->stack[i].node->dlen;
+ if ( hap->stack[i].node->dlen ) indel = 1;
+ if ( i<istack )
+ {
+ if ( dlen%3 ) // frameshift
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ int icur = node2sbeg(i);
+ int inext = node2sbeg(i+1);
+ if ( icur/3 == inext/3 ) // in the same codon, can't be flushed yet
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ }
+ if ( ibeg<0 ) ibeg = i;
+
+ int ioff = node2soff(ibeg);
+ int icur = node2sbeg(ibeg);
+ int rbeg = node2rbeg(ibeg);
+ int rend = node2rend(i);
+ int fill = dlen%3;
+
+ // alt
+ if ( hap->sseq.l )
+ {
+ sseq.l = hap->stack[i].slen - ioff;
+ sseq.s = hap->sseq.s + ioff;
+ }
+ else // splice site overlap, see #1475227917
+ sseq.l = fill = 0;
+ cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
+
+ // ref
+ sseq.l = node2rend(i) - rbeg;
+ sseq.s = sref.s + N_REF_PAD + rbeg;
+ sseq.m = sref.m - 2*N_REF_PAD;
+ cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
+ sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
+
+ hap_add_csq(args,hap,node,0, ibeg,i,dlen,indel);
+ ibeg = -1;
+ dlen = 0;
+ indel = 0;
+ }
+ }
+ else
+ {
+ i = istack + 1, ibeg = -1;
+ while ( --i > 0 )
+ {
+ if ( hap->stack[i].node->type == HAP_SSS )
+ {
+ hap_add_csq(args,hap,node,0,i,i,0,0);
+ continue;
+ }
+ dlen += hap->stack[i].node->dlen;
+ if ( hap->stack[i].node->dlen ) indel = 1;
+ if ( i>1 && hap->stack[i-1].node->type != HAP_SSS )
+ {
+ if ( dlen%3 )
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ int icur = sseq.m - 1 - node2sbeg(i);
+ int inext = sseq.m - 1 - node2sbeg(i-1);
+ if ( icur/3 == inext/3 )
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ }
+ if ( ibeg<0 ) ibeg = i;
+ int ioff = node2soff(i);
+ int icur = node2sbeg(i);
+ int rbeg = node2rbeg(i);
+ int rend = node2rend(ibeg);
+ int fill = dlen%3;
+
+ // alt
+ if ( hap->sseq.l )
+ {
+ sseq.l = hap->stack[ibeg].slen - ioff;
+ sseq.s = hap->sseq.s + ioff;
+ }
+ else // splice site overlap, see #1475227917
+ sseq.l = fill = 0;
+ cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
+
+ // ref
+ sseq.l = node2rend(ibeg) - rbeg;
+ sseq.s = sref.s + N_REF_PAD + rbeg;
+ sseq.m = sref.m - 2*N_REF_PAD;
+ cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
+ sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
+
+ hap_add_csq(args,hap,node,sseq.m, i,ibeg,dlen,indel);
+ ibeg = -1;
+ dlen = 0;
+ indel = 0;
+ }
+ }
+ }
+}
+
+static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
+{
+ if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) return;
+
+ char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-";
+ const char *chr = bcf_hdr_id2name(args->hdr,args->rid);
+
+ fprintf(args->out,"CSQ\t%s\t", smpl);
+ if ( ihap>0 )
+ fprintf(args->out,"%d", ihap);
+ else
+ fprintf(args->out,"-");
+
+ args->str.l = 0;
+ kput_vcsq(&csq->type, &args->str);
+ fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
+}
+static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+{
+ if ( !node || !node->ncsq_list ) return;
+
+ char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-";
+ const char *chr = bcf_hdr_id2name(args->hdr,args->rid);
+
+ int i;
+ for (i=0; i<node->ncsq_list; i++)
+ {
+ csq_t *csq = node->csq_list + i;
+ if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) continue;
+ assert( csq->type.vstr.l );
+
+ fprintf(args->out,"CSQ\t%s\t", smpl);
+ if ( ihap>0 )
+ fprintf(args->out,"%d", ihap);
+ else
+ fprintf(args->out,"-");
+
+ args->str.l = 0;
+ kput_vcsq(&csq->type, &args->str);
+ fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
+ }
+}
+
+static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+{
+ if ( !node || !node->ncsq_list || ismpl<0 ) return;
+
+ int i;
+ for (i=0; i<node->ncsq_list; i++)
+ {
+ csq_t *csq = node->csq_list + i;
+ vrec_t *vrec = csq->vrec;
+ int icsq = 2*csq->idx + ihap;
+ if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+ {
+ int print_warning = 1;
+ if ( args->quiet )
+ {
+ if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0;
+ args->ncsq_small_warned = 1;
+ }
+ if ( print_warning )
+ {
+ fprintf(stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n",
+ args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1);
+ if ( args->quiet ) fprintf(stderr,"(This warning is printed only once)\n");
+ }
+ break;
+ }
+ if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
+ vrec->smpl[ismpl*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+ }
+}
+
+void hap_flush(args_t *args, uint32_t pos)
+{
+ int i,j;
+ tr_heap_t *heap = args->active_tr;
+
+ while ( heap->ndat && heap->dat[0]->end<=pos )
+ {
+ tscript_t *tr = heap->dat[0];
+ khp_delete(trhp, heap);
+
+ args->hap->tr = tr;
+ if ( tr->root && tr->root->nchild ) // normal, non-localized calling
+ {
+ hap_finalize(args, args->hap);
+
+ if ( args->output_type==FT_TAB_TEXT ) // plain text output, not a vcf
+ {
+ if ( args->phase==PHASE_DROP_GT )
+ hap_print_text(args, tr, -1,0, tr->hap[0]);
+ else
+ {
+ for (i=0; i<args->smpl->n; i++)
+ {
+ for (j=0; j<2; j++)
+ hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]);
+ }
+ }
+ }
+ else if ( args->phase!=PHASE_DROP_GT )
+ {
+ for (i=0; i<args->smpl->n; i++)
+ {
+ for (j=0; j<2; j++)
+ hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]);
+ }
+ }
+ }
+
+ // mark the transcript for deletion. Cannot delete it immediately because
+ // by-position VCF output will need them when flushed by vcf_buf_push
+ args->nrm_tr++;
+ hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
+ args->rm_tr[args->nrm_tr-1] = tr;
+ }
+}
+
+#define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
+
+void vbuf_push(args_t *args, bcf1_t **rec_ptr)
+{
+ int i;
+
+ assert(rec_ptr);
+ bcf1_t *rec = *rec_ptr;
+
+ // check for duplicate records
+ i = args->vcf_rbuf.n ? rbuf_last(&args->vcf_rbuf) : -1;
+ if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos )
+ {
+ // vcf record with a new pos
+ rbuf_expand0(&args->vcf_rbuf, vbuf_t*, args->vcf_rbuf.n+1, args->vcf_buf);
+ i = rbuf_append(&args->vcf_rbuf);
+ if ( !args->vcf_buf[i] ) args->vcf_buf[i] = (vbuf_t*) calloc(1,sizeof(vbuf_t));
+ args->vcf_buf[i]->n = 0;
+ }
+ vbuf_t *vbuf = args->vcf_buf[i];
+ vbuf->n++;
+ hts_expand0(vrec_t*, vbuf->n, vbuf->m, vbuf->vrec);
+ if ( !vbuf->vrec[vbuf->n - 1] )
+ vbuf->vrec[vbuf->n - 1] = (vrec_t*) calloc(1,sizeof(vrec_t));
+
+ vrec_t *vrec = vbuf->vrec[vbuf->n - 1];
+ if ( args->phase!=PHASE_DROP_GT && args->smpl->n )
+ {
+ if ( !vrec->smpl ) vrec->smpl = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->smpl) * args->nfmt_bcsq);
+ else memset(vrec->smpl,0,args->hdr_nsmpl*sizeof(*vrec->smpl) * args->nfmt_bcsq);
+ }
+ if ( !vrec->line ) vrec->line = bcf_init1();
+ SWAP(bcf1_t*, (*rec_ptr), vrec->line);
+
+ int ret;
+ khint_t k = kh_put(pos2vbuf, args->pos2vbuf, (int)rec->pos, &ret);
+ kh_val(args->pos2vbuf,k) = vbuf;
+}
+
+void vbuf_flush(args_t *args)
+{
+ if ( args->active_tr->ndat ) return; // cannot output buffered VCF lines (args.vbuf) until all active transcripts are gone
+
+ int i,j;
+ while ( (i=rbuf_shift(&args->vcf_rbuf))>=0 )
+ {
+ vbuf_t *vbuf = args->vcf_buf[i];
+ for (i=0; i<vbuf->n; i++)
+ {
+ vrec_t *vrec = vbuf->vrec[i];
+ if ( !args->out_fh ) // not a VCF output
+ {
+ vrec->nvcsq = 0;
+ continue;
+ }
+ if ( !vrec->nvcsq )
+ {
+ bcf_write(args->out_fh, args->hdr, vrec->line);
+ continue;
+ }
+
+ args->str.l = 0;
+ kput_vcsq(&vrec->vcsq[0], &args->str);
+ for (j=1; j<vrec->nvcsq; j++)
+ {
+ kputc_(',', &args->str);
+ kput_vcsq(&vrec->vcsq[j], &args->str);
+ }
+ bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s);
+ if ( args->hdr_nsmpl )
+ {
+ if ( vrec->nfmt < args->nfmt_bcsq )
+ for (j=1; j<args->hdr_nsmpl; j++) memcpy(vrec->smpl+j*vrec->nfmt, vrec->smpl+j*args->nfmt_bcsq, vrec->nfmt*sizeof(*vrec->smpl));
+ bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt);
+ }
+ vrec->nvcsq = 0;
+ bcf_write(args->out_fh, args->hdr, vrec->line);
+ }
+ if ( vbuf->n )
+ {
+ khint_t k = kh_get(pos2vbuf, args->pos2vbuf, vbuf->vrec[0]->line->pos);
+ if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k);
+ }
+ vbuf->n = 0;
+ }
+
+ for (i=0; i<args->nrm_tr; i++)
+ {
+ tscript_t *tr = args->rm_tr[i];
+ if ( tr->root ) hap_destroy(tr->root);
+ tr->root = NULL;
+ free(tr->hap);
+ free(tr->ref);
+ free(tr->sref);
+ }
+ args->nrm_tr = 0;
+ args->ncsq_buf = 0;
+}
+
+void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
+{
+ int i, len;
+ int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
+
+ tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
+ if ( !tr->ref )
+ error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
+
+ int pad_end = len - (tr->end - tr->beg + 1 + pad_beg);
+ if ( pad_beg + pad_end != 2*N_REF_PAD )
+ {
+ char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD);
+ for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N';
+ memcpy(ref+i, tr->ref, len);
+ for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N';
+ free(tr->ref);
+ tr->ref = ref;
+ }
+}
+
+static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
+{
+ char *ref = tr->ref + (rec->pos + N_REF_PAD >= tr->beg ? rec->pos - tr->beg + N_REF_PAD : 0);
+ char *vcf = rec->d.allele[0] + (rec->pos + N_REF_PAD >= tr->beg ? 0 : tr->beg - N_REF_PAD - rec->pos);
+ assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) );
+ while ( *ref && *vcf )
+ {
+ if ( *ref!=*vcf && toupper(*ref)!=toupper(*vcf) )
+ error("Error: the fasta reference does not match the VCF REF allele at %s:%d .. %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,rec->d.allele[0]);
+ ref++;
+ vcf++;
+ }
+}
+
+int test_cds_local(args_t *args, bcf1_t *rec)
+{
+ int i,j, ret = 0;
+ const char *chr = bcf_seqname(args->hdr,rec);
+ // note that the off-by-one extension of rlen is deliberate to account for insertions
+ if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+
+ // structures to fake the normal test_cds machinery
+ hap_node_t root, node;
+ root.type = HAP_ROOT;
+ kstring_t *tref = &args->hap->tref, *tseq = &args->hap->tseq;
+
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
+ tscript_t *tr = cds->tr;
+ if ( !GF_is_coding(tr->type) ) continue;
+ ret = 1;
+
+ if ( !tr->ref )
+ {
+ tscript_init_ref(args, tr, chr);
+ tscript_splice_ref(tr);
+ khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards
+ }
+
+ sanity_check_ref(args, tr, rec);
+
+ kstring_t sref;
+ sref.s = tr->sref;
+ sref.l = tr->nsref;
+ sref.m = sref.l;
+
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue;
+
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+
+ int csq_type = node.csq;
+
+ // code repetition: it would be nice to reuse the code from hap_add_csq, needs have refactoring though
+ if ( node.type == HAP_SSS )
+ {
+ csq.type.type = csq_type;
+ csq_stage(args, &csq, rec);
+ }
+ else
+ {
+ kstring_t sseq;
+ sseq.m = sref.m - 2*N_REF_PAD + node.dlen;
+ sseq.s = node.seq;
+ int alen = sseq.l = strlen(sseq.s);
+ int fill = node.dlen%3 && alen ? 1 : 0; // see #1475227917
+ cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, fill);
+
+ sseq.m = sref.m - 2*N_REF_PAD;
+ sseq.s = sref.s + N_REF_PAD + node.sbeg;
+ sseq.l = node.rlen;
+ cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, fill);
+
+ // check for truncating stops
+ for (j=0; j<tref->l; j++)
+ if ( tref->s[j]=='*' ) break;
+ if ( j!=tref->l )
+ {
+ tref->l = j+1;
+ tref->s[j+1] = 0;
+ }
+ for (j=0; j<tseq->l; j++)
+ if ( tseq->s[j]=='*' ) break;
+ if ( j!=tseq->l )
+ {
+ tseq->l = j+1;
+ tseq->s[j+1] = 0;
+ }
+ if ( csq_type & CSQ_STOP_LOST )
+ {
+ if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] )
+ {
+ csq_type &= ~CSQ_STOP_LOST;
+ csq_type |= CSQ_STOP_RETAINED;
+ }
+ else if (tref->s[tref->l-1]!='*' )
+ {
+ // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
+ // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
+ if ( tseq->s[tseq->l-1] == '*' )
+ {
+ csq_type &= ~CSQ_STOP_GAINED;
+ csq_type |= CSQ_STOP_RETAINED;
+ }
+ else
+ csq_type |= CSQ_INCOMPLETE_CDS;
+ }
+ }
+ if ( csq_type & CSQ_START_LOST && tref->s[0]!='M' )
+ csq_type &= ~CSQ_START_LOST;
+ if ( node.dlen!=0 )
+ {
+ if ( node.dlen%3 )
+ csq_type |= CSQ_FRAMESHIFT_VARIANT;
+ else if ( node.dlen<0 )
+ csq_type |= CSQ_INFRAME_DELETION;
+ else
+ csq_type |= CSQ_INFRAME_INSERTION;
+ }
+ else
+ {
+ for (j=0; j<tref->l; j++)
+ if ( tref->s[j] != tseq->s[j] ) break;
+ if ( j==tref->l )
+ csq_type |= CSQ_SYNONYMOUS_VARIANT;
+ else if ( tref->s[j] == '*' )
+ csq_type |= CSQ_STOP_LOST;
+ else if ( tseq->s[j] == '*' )
+ csq_type |= CSQ_STOP_GAINED;
+ else
+ csq_type |= CSQ_MISSENSE_VARIANT;
+ }
+ if ( csq_type & CSQ_COMPOUND )
+ {
+ // create the aa variant string
+ kstring_t str = {0,0,0};
+ int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
+ int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
+ kputc_('|', &str);
+ kputw(aa_rbeg, &str);
+ kputs(tref->s, &str);
+ if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) )
+ {
+ kputc_('>', &str);
+ kputw(aa_sbeg, &str);
+ kputs(tseq->s, &str);
+ }
+ kputc_('|', &str);
+ kputw(rec->pos+1, &str);
+ kputs(node.var, &str);
+ csq.type.vstr = str;
+ csq.type.type = csq_type & CSQ_COMPOUND;
+ csq_stage(args, &csq, rec);
+
+ // all this only to clean vstr when vrec is flushed
+ if ( !tr->root )
+ tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ tr->root->ncsq_list++;
+ hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
+ csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
+ rm_csq->type.vstr = str;
+ }
+ if ( csq_type & ~CSQ_COMPOUND )
+ {
+ csq.type.type = csq_type & ~CSQ_COMPOUND;
+ csq.type.vstr.l = 0;
+ csq_stage(args, &csq, rec);
+ }
+ }
+ free(node.seq);
+ free(node.var);
+ }
+ }
+ return ret;
+}
+
+int test_cds(args_t *args, bcf1_t *rec)
+{
+ int i, ret = 0, hap_ret;
+ const char *chr = bcf_seqname(args->hdr,rec);
+ // note that the off-by-one extension of rlen is deliberate to account for insertions
+ if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
+ tscript_t *tr = cds->tr;
+ if ( !GF_is_coding(tr->type) ) continue;
+ ret = 1;
+ if ( !tr->root )
+ {
+ // initialize the transcript and its haplotype tree, fetch the reference sequence
+ tscript_init_ref(args, tr, chr);
+
+ tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid
+ tr->hap = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*));
+ for (i=0; i<tr->nhap; i++) tr->hap[i] = NULL;
+ tr->root->nend = tr->nhap;
+ tr->root->type = HAP_ROOT;
+
+ khp_insert(trhp, args->active_tr, &tr);
+ }
+
+ sanity_check_ref(args, tr, rec);
+
+ if ( args->phase==PHASE_DROP_GT )
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
+ hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
+ if ( (hap_ret=hap_init(args, parent, child, cds, rec, 1))!=0 )
+ {
+ // overlapping or intron variant, cannot apply
+ if ( hap_ret==1 )
+ {
+ if ( !args->quiet )
+ fprintf(stderr,"Warning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ if ( args->out )
+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ }
+ else ret = 1; // prevent reporting as intron in test_tscript
+ free(child);
+ continue;
+ }
+ parent->nend--;
+ parent->nchild = 1;
+ parent->mchild = 1;
+ parent->child = (hap_node_t**) malloc(sizeof(hap_node_t*));
+ parent->child[0] = child;
+ tr->hap[0] = child;
+ tr->hap[0]->nend = 1;
+ continue;
+ }
+
+ // apply the VCF variants and extend the haplotype tree
+ int j, ismpl, ihap, ngts = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
+ ngts /= bcf_hdr_nsamples(args->hdr);
+ if ( ngts!=1 && ngts!=2 )
+ {
+ if ( !args->quiet )
+ fprintf(stderr,"Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ if ( args->out )
+ fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ continue;
+ }
+ for (ismpl=0; ismpl<args->smpl->n; ismpl++)
+ {
+ int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
+ if ( gt[0]==bcf_gt_missing ) continue;
+
+ if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end )
+ {
+ if ( args->phase==PHASE_MERGE )
+ {
+ if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
+ }
+ if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
+ {
+ if ( args->phase==PHASE_REQUIRE )
+ error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
+ if ( args->phase==PHASE_SKIP )
+ continue;
+ if ( args->phase==PHASE_NON_REF )
+ {
+ if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
+ else if ( !bcf_gt_allele(gt[1]) ) gt[1] = gt[0];
+ }
+ }
+ }
+
+ for (ihap=0; ihap<ngts; ihap++)
+ {
+ if ( gt[ihap]==bcf_gt_missing || gt[ihap]==bcf_int32_vector_end ) continue;
+
+ i = 2*ismpl + ihap;
+
+ int ial = bcf_gt_allele(gt[ihap]);
+ if ( !ial ) continue;
+ assert( ial < rec->n_allele );
+ if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }
+
+ hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root;
+ if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 )
+ {
+ // this haplotype has been seen in another sample
+ tr->hap[i] = parent->child[ parent->cur_child[ial] ];
+ tr->hap[i]->nend++;
+ parent->nend--;
+ continue;
+ }
+
+ hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
+ if ( (hap_ret=hap_init(args, parent, child, cds, rec, ial))!=0 )
+ {
+ // overlapping or intron variant, cannot apply
+ if ( hap_ret==1 )
+ {
+ if ( !args->quiet )
+ fprintf(stderr,"Warning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n",
+ chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
+ if ( args->out )
+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n",
+ chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
+ }
+ free(child);
+ continue;
+ }
+
+ if ( parent->cur_rec!=rec )
+ {
+ hts_expand(int,rec->n_allele,parent->mcur_child,parent->cur_child);
+ for (j=0; j<rec->n_allele; j++) parent->cur_child[j] = -1;
+ parent->cur_rec = rec;
+ }
+
+ j = parent->nchild++;
+ hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child);
+ parent->cur_child[ial] = j;
+ parent->child[j] = child;
+ tr->hap[i] = child;
+ tr->hap[i]->nend++;
+ parent->nend--;
+ }
+ }
+ }
+ return ret;
+}
+
+void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
+{
+ // known issues: tab output leads to unsorted output. This is because
+ // coding haplotypes are printed in one go and buffering is not used
+ // with tab output. VCF output is OK though.
+ if ( csq_push(args, csq, rec)!=0 ) return; // the consequence already exists
+
+ int i,j,ngt = 0;
+ if ( args->phase!=PHASE_DROP_GT )
+ {
+ ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
+ if ( ngt>0 ) ngt /= bcf_hdr_nsamples(args->hdr);
+ }
+ if ( ngt<=0 )
+ {
+ if ( args->output_type==FT_TAB_TEXT )
+ csq_print_text(args, csq, -1,0);
+ return;
+ }
+ assert( ngt<=2 );
+
+ if ( args->output_type==FT_TAB_TEXT )
+ {
+ for (i=0; i<args->smpl->n; i++)
+ {
+ int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
+ for (j=0; j<ngt; j++)
+ {
+ if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
+ csq_print_text(args, csq, args->smpl->idx[i],j+1);
+ }
+ }
+ return;
+ }
+
+ vrec_t *vrec = csq->vrec;
+ for (i=0; i<args->smpl->n; i++)
+ {
+ int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
+ for (j=0; j<ngt; j++)
+ {
+ if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
+
+ int icsq = 2*csq->idx + j;
+ if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+ {
+ int ismpl = args->smpl->idx[i];
+ int print_warning = 1;
+ if ( args->quiet )
+ {
+ if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0;
+ args->ncsq_small_warned = 1;
+ }
+ if ( print_warning )
+ {
+ fprintf(stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n",
+ args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1);
+ if ( args->quiet ) fprintf(stderr,"(This warning is printed only once)\n");
+ }
+ break;
+ }
+ if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
+ vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+ }
+ }
+}
+int test_utr(args_t *args, bcf1_t *rec)
+{
+ const char *chr = bcf_seqname(args->hdr,rec);
+ // note that the off-by-one extension of rlen is deliberate to account for insertions
+ if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+
+ splice_t splice;
+ splice_init(&splice, rec);
+
+ int i, ret = 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
+ tscript_t *tr = splice.tr = utr->tr;
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ splice.vcf.alt = rec->d.allele[i];
+ int splice_ret = splice_csq(args, &splice, utr->beg, utr->end);
+ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue;
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+ ret = 1;
+ }
+ }
+ assert(!splice.kref.s);
+ assert(!splice.kalt.s);
+ return ret;
+}
+int test_splice(args_t *args, bcf1_t *rec)
+{
+ const char *chr = bcf_seqname(args->hdr,rec);
+ if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
+
+ splice_t splice;
+ splice_init(&splice, rec);
+ splice.check_acceptor = splice.check_donor = 1;
+
+ int i, ret = 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*);
+ splice.tr = exon->tr;
+ if ( !splice.tr->ncds ) continue; // not a coding transcript, no interest in splice sites
+
+ splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1;
+ splice.check_region_end = splice.tr->end==exon->end ? 0 : 1;
+
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ splice.vcf.alt = rec->d.allele[i];
+ splice_csq(args, &splice, exon->beg, exon->end);
+ if ( splice.csq ) ret = 1;
+ }
+ }
+ free(splice.kref.s);
+ free(splice.kalt.s);
+ return ret;
+}
+int test_tscript(args_t *args, bcf1_t *rec)
+{
+ const char *chr = bcf_seqname(args->hdr,rec);
+ if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+
+ splice_t splice;
+ splice_init(&splice, rec);
+
+ int i, ret = 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ splice.vcf.alt = rec->d.allele[i];
+ int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
+ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+ ret = 1;
+ }
+ }
+ assert(!splice.kref.s);
+ assert(!splice.kalt.s);
+ return ret;
+}
+
+void process(args_t *args, bcf1_t **rec_ptr)
+{
+ if ( !rec_ptr )
+ {
+ hap_flush(args, REGIDX_MAX);
+ vbuf_flush(args);
+ return;
+ }
+
+ bcf1_t *rec = *rec_ptr;
+
+ int call_csq = 1;
+ if ( !rec->n_allele ) call_csq = 0; // no alternate allele
+ else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele
+ else if ( args->filter )
+ {
+ call_csq = filter_test(args->filter, rec, NULL);
+ if ( args->filter_logic==FLT_EXCLUDE ) call_csq = call_csq ? 0 : 1;
+ }
+ if ( !call_csq )
+ {
+ if ( !args->out_fh ) return; // not a VCF output
+ vbuf_push(args, rec_ptr);
+ vbuf_flush(args);
+ return;
+ }
+
+ if ( args->rid != rec->rid )
+ {
+ hap_flush(args, REGIDX_MAX);
+ vbuf_flush(args);
+ }
+ args->rid = rec->rid;
+ vbuf_push(args, rec_ptr);
+
+ int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec);
+ hit += test_utr(args, rec);
+ hit += test_splice(args, rec);
+ if ( !hit ) test_tscript(args, rec);
+
+ hap_flush(args, rec->pos-1);
+ vbuf_flush(args);
+
+ return;
+}
+
+const char *usage(void)
+{
+ return
+ "\n"
+ "About: Haplotype-aware consequence caller.\n"
+ "Usage: bcftools csq [options] in.vcf\n"
+ "\n"
+ "Required options:\n"
+ " -f, --fasta-ref <file> reference file in fasta format\n"
+ " -g, --gff-annot <file> gff3 annotation file\n"
+ "\n"
+ "CSQ options:\n"
+ " -c, --custom-tag <string> use this tag instead of the default BCSQ\n"
+ " -l, --local-csq localized predictions, consider only one VCF record at a time\n"
+ " -n, --ncsq <int> maximum number of consequences to consider per site [16]\n"
+ " -p, --phase <a|m|r|R|s> how to construct haplotypes and how to deal with unphased data: [r]\n"
+ " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
+ " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
+ " r: require phased GTs, throw an error on unphased het GTs\n"
+ " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
+ " s: skip unphased GTs\n"
+ "Options:\n"
+ " -e, --exclude <expr> exclude sites for which the expression is true\n"
+ " -i, --include <expr> select sites for which the expression is true\n"
+ " -o, --output <file> write output to a file [standard output]\n"
+ " -O, --output-type <b|u|z|v|t> b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
+ " v: uncompressed VCF, t: plain tab-delimited text output [v]\n"
+ " -q, --quiet suppress warning messages. Can be given two times for even less messages\n"
+ " -r, --regions <region> restrict to comma-separated list of regions\n"
+ " -R, --regions-file <file> restrict to regions listed in a file\n"
+ " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n"
+ " -S, --samples-file <file> samples to include\n"
+ " -t, --targets <region> similar to -r but streams rather than index-jumps\n"
+ " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n"
+ "\n"
+ "Example:\n"
+ " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
+ "\n"
+ " # GFF3 annotation files can be downloaded from Ensembl. e.g. for human:\n"
+ " ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n"
+ " ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/\n"
+ "\n";
+}
+
+int main_csq(int argc, char *argv[])
+{
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->output_type = FT_VCF;
+ args->bcsq_tag = "BCSQ";
+ args->ncsq_max = 2*16;
+
+ static struct option loptions[] =
+ {
+ {"help",0,0,'h'},
+ {"ncsq",1,0,'n'},
+ {"custom-tag",1,0,'c'},
+ {"local-csq",0,0,'l'},
+ {"gff-annot",1,0,'g'},
+ {"fasta-ref",1,0,'f'},
+ {"include",1,0,'i'},
+ {"exclude",1,0,'e'},
+ {"output",1,0,'o'},
+ {"output-type",1,NULL,'O'},
+ {"phase",1,0,'p'},
+ {"quiet",0,0,'q'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {0,0,0,0}
+ };
+ int c, targets_is_file = 0, regions_is_file = 0;
+ char *targets_list = NULL, *regions_list = NULL;
+ while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:",loptions,NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 'l': args->local_csq = 1; break;
+ case 'c': args->bcsq_tag = optarg; break;
+ case 'q': args->quiet++; break;
+ case 'p':
+ switch (optarg[0])
+ {
+ case 'a': args->phase = PHASE_AS_IS; break;
+ case 'm': args->phase = PHASE_MERGE; break;
+ case 'r': args->phase = PHASE_REQUIRE; break;
+ case 'R': args->phase = PHASE_NON_REF; break;
+ case 's': args->phase = PHASE_SKIP; break;
+ default: error("The -p code \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'f': args->fa_fname = optarg; break;
+ case 'g': args->gff_fname = optarg; break;
+ case 'n':
+ args->ncsq_max = 2 * atoi(optarg);
+ if ( args->ncsq_max <=0 ) error("Expected positive integer with -n, got %s\n", optarg);
+ break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 't': args->output_type = FT_TAB_TEXT; break;
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'r': regions_list = optarg; break;
+ case 'R': regions_list = optarg; regions_is_file = 1; break;
+ case 's': args->sample_list = optarg; break;
+ case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
+ case 't': targets_list = optarg; break;
+ case 'T': targets_list = optarg; targets_is_file = 1; break;
+ case 'h':
+ case '?': error("%s",usage());
+ default: error("The option not recognised: %s\n\n", optarg); break;
+ }
+ }
+ char *fname = NULL;
+ if ( optind==argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else error("%s", usage());
+ }
+ else fname = argv[optind];
+ if ( argc - optind>1 ) error("%s", usage());
+ if ( !args->fa_fname ) error("Missing the --fa-ref option\n");
+ if ( !args->gff_fname ) error("Missing the --gff option\n");
+ args->sr = bcf_sr_init();
+ if ( targets_list && bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", targets_list);
+ if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", regions_list);
+ if ( !bcf_sr_add_reader(args->sr, fname) )
+ error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->sr->errnum));
+ args->hdr = bcf_sr_get_header(args->sr,0);
+
+ init_data(args);
+ while ( bcf_sr_next_line(args->sr) )
+ {
+ process(args, &args->sr->readers[0].buffer[0]);
+ }
+ process(args,NULL);
+
+ destroy_data(args);
+ bcf_sr_destroy(args->sr);
+ free(args);
+
+ return 0;
+}
+
diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c
new file mode 100644
index 0000000..b79a030
--- /dev/null
+++ b/bcftools/csq.c.pysam.c
@@ -0,0 +1,3826 @@
+#include "pysam.h"
+
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+/*
+ Things that would be nice to have
+ - for stop-lost events (also in frameshifts) report the number of truncated aa's
+ - memory could be greatly reduced by indexing gff (but it is quite compact already)
+ - deletions that go beyond transcript boundaries are not checked at sequence level
+ - alloc tscript->ref in hap_finalize, introduce fa_off_beg:16,fa_off_end:16
+ - see test/csq/ENST00000573314/insertion-overlap.vcf #1476288882
+
+ Read about transcript types here
+ http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
+ http://www.ensembl.org/info/genome/variation/predicted_data.html
+ http://www.gencodegenes.org/gencode_biotypes.html
+
+ List of supported biotypes
+ antisense
+ IG_C_gene
+ IG_D_gene
+ IG_J_gene
+ IG_LV_gene
+ IG_V_gene
+ lincRNA
+ macro_lncRNA
+ miRNA
+ misc_RNA
+ Mt_rRNA
+ Mt_tRNA
+ polymorphic_pseudogene
+ processed_transcript
+ protein_coding
+ ribozyme
+ rRNA
+ sRNA
+ scRNA
+ scaRNA
+ sense_intronic
+ sense_overlapping
+ snRNA
+ snoRNA
+ TR_C_gene
+ TR_D_gene
+ TR_J_gene
+ TR_V_gene
+
+ The gff parsing logic
+ We collect features such by combining gff lines A,B,C as follows:
+ A .. gene line with a supported biotype
+ A.ID=~/^gene:/
+
+ B .. transcript line referencing A
+ B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
+
+ C .. corresponding CDS, exon, and UTR lines:
+ C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/
+
+ For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the
+ complete chain link C -> B -> A is required. For the rest, link B -> A suffices.
+
+
+ The supported consequence types, sorted by impact:
+ splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron)
+ splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron)
+ stop_gained .. DNA sequence variant resulting in a stop codon
+ frameshift_variant .. number of inserted/deleted bases not a multiple of three, disrupted translational frame
+ stop_lost .. elongated transcript, stop codon changed
+ start_lost .. the first codon changed
+ inframe_altering .. combination of indels leading to unchanged reading frame and length
+ inframe_insertion .. inserted coding sequence, unchanged reading frame
+ inframe_deletion .. deleted coding sequence, unchanged reading frame
+ missense_variant .. amino acid (aa) change, unchanged length
+ splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron
+ synonymous_variant .. DNA sequence variant resulting in no amino acid change
+ stop_retained_variant .. different stop codon
+ non_coding_variant .. variant in non-coding sequence, such as RNA gene
+ 5_prime_UTR_variant
+ 3_prime_UTR_variant
+ intron_variant .. reported only if none of the above
+ intergenic_variant .. reported only if none of the above
+
+
+ The annotation algorithm.
+ The algorithm checks if the variant falls in a region of a supported type. The
+ search is performed in the following order, until a match is found:
+ 1. idx_cds(gf_cds_t) - lookup CDS by position, create haplotypes, call consequences
+ 2. idx_utr(gf_utr_t) - check UTR hits
+ 3. idx_exon(gf_exon_t) - check for splice variants
+ 4. idx_tscript(tscript_t) - check for intronic variants, RNAs, etc.
+
+ These regidx indexes are created by parsing a gff3 file as follows:
+ 1. create the array "ftr" of all UTR, CDS, exons. This will be
+ processed later and pruned based on transcript types we want to keep.
+ In the same go, create the hash "id2tr" of transcripts to keep
+ (based on biotype) which maps from transcript_id to a transcript. At
+ the same time also build the hash "gid2gene" which maps from gene_id to
+ gf_gene_t pointer.
+
+ 2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes.
+ Use only features from "ftr" which are present in "id2tr".
+
+ 3. clean data that won't be needed anymore: ftr, id2tr, gid2gene.
+
+ Data structures.
+ idx_cds, idx_utr, idx_exon, idx_tscript:
+ as described above, regidx structures for fast lookup of exons/transcripts
+ overlapping a region, the payload is a pointer to tscript.cds
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <math.h>
+#include <htslib/hts.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/khash.h>
+#include <htslib/khash_str2int.h>
+#include <htslib/kseq.h>
+#include <htslib/faidx.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "filter.h"
+#include "regidx.h"
+#include "kheap.h"
+#include "smpl_ilist.h"
+#include "rbuf.h"
+
+#ifndef __FUNCTION__
+# define __FUNCTION__ __func__
+#endif
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+// Definition of splice_region, splice_acceptor and splice_donor
+#define N_SPLICE_DONOR 2
+#define N_SPLICE_REGION_EXON 3
+#define N_SPLICE_REGION_INTRON 8
+
+// Ensembl ID format, e.g.
+// ENST00000423372 for human .. ENST%011d
+// ENSMUST00000120394 for mouse .. ENSMUST%011d
+char ENSID_BUF[32], *ENSID_FMT = NULL;
+static inline char *ENSID(uint32_t id)
+{
+ sprintf(ENSID_BUF,ENSID_FMT,id);
+ return ENSID_BUF;
+}
+
+
+#define N_REF_PAD 10 // number of bases to avoid boundary effects
+
+#define STRAND_REV 0
+#define STRAND_FWD 1
+
+#define TRIM_NONE 0
+#define TRIM_5PRIME 1
+#define TRIM_3PRIME 2
+
+// How to treat phased/unphased genotypes
+#define PHASE_REQUIRE 0 // --phase r
+#define PHASE_MERGE 1 // --phase m
+#define PHASE_AS_IS 2 // --phase a
+#define PHASE_SKIP 3 // --phase s
+#define PHASE_NON_REF 4 // --phase R
+#define PHASE_DROP_GT 5 // --samples -
+
+// Node types in the haplotype tree
+#define HAP_CDS 0
+#define HAP_ROOT 1
+#define HAP_SSS 2 // start/stop/splice
+
+#define CSQ_PRINTED_UPSTREAM (1<<0)
+#define CSQ_SYNONYMOUS_VARIANT (1<<1)
+#define CSQ_MISSENSE_VARIANT (1<<2)
+#define CSQ_STOP_LOST (1<<3)
+#define CSQ_STOP_GAINED (1<<4)
+#define CSQ_INFRAME_DELETION (1<<5)
+#define CSQ_INFRAME_INSERTION (1<<6)
+#define CSQ_FRAMESHIFT_VARIANT (1<<7)
+#define CSQ_SPLICE_ACCEPTOR (1<<8)
+#define CSQ_SPLICE_DONOR (1<<9)
+#define CSQ_START_LOST (1<<10)
+#define CSQ_SPLICE_REGION (1<<11)
+#define CSQ_STOP_RETAINED (1<<12)
+#define CSQ_UTR5 (1<<13)
+#define CSQ_UTR3 (1<<14)
+#define CSQ_NON_CODING (1<<15)
+#define CSQ_INTRON (1<<16)
+//#define CSQ_INTERGENIC (1<<17)
+#define CSQ_INFRAME_ALTERING (1<<18)
+#define CSQ_UPSTREAM_STOP (1<<19) // adds * in front of the csq string
+#define CSQ_INCOMPLETE_CDS (1<<20) // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf
+#define CSQ_CODING_SEQUENCE (1<<21) // cannot tell exactly what it is, but it does affect the coding sequence
+
+// Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345
+#define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \
+ CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \
+ CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \
+ CSQ_UPSTREAM_STOP)
+#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST)
+
+#define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
+#define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING))
+#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
+
+// see kput_vcsq()
+const char *csq_strings[] =
+{
+ NULL,
+ "synonymous",
+ "missense",
+ "stop_lost",
+ "stop_gained",
+ "inframe_deletion",
+ "inframe_insertion",
+ "frameshift",
+ "splice_acceptor",
+ "splice_donor",
+ "start_lost",
+ "splice_region",
+ "stop_retained",
+ "5_prime_utr",
+ "3_prime_utr",
+ "non_coding",
+ "intron",
+ "intergenic",
+ "inframe_altering",
+ NULL,
+ NULL,
+ "coding_sequence"
+};
+
+
+// GFF line types
+#define GFF_TSCRIPT_LINE 1
+#define GFF_GENE_LINE 2
+
+
+/*
+ Genomic features, for fast lookup by position to overlapping features
+*/
+#define GF_coding_bit 6
+#define GF_is_coding(x) ((x) & (1<<GF_coding_bit))
+#define GF_MT_rRNA 1 // non-coding: 1, 2, ...
+#define GF_MT_tRNA 2
+#define GF_lincRNA 3
+#define GF_miRNA 4
+#define GF_MISC_RNA 5
+#define GF_rRNA 6
+#define GF_snRNA 7
+#define GF_snoRNA 8
+#define GF_PROCESSED_TRANSCRIPT 9
+#define GF_ANTISENSE 10
+#define GF_macro_lncRNA 11
+#define GF_ribozyme 12
+#define GF_sRNA 13
+#define GF_scRNA 14
+#define GF_scaRNA 15
+#define GF_SENSE_INTRONIC 16
+#define GF_SENSE_OVERLAPPING 17
+#define GF_PSEUDOGENE 18
+#define GF_PROCESSED_PSEUDOGENE 19
+#define GF_ARTIFACT 20
+#define GF_IG_PSEUDOGENE 21
+#define GF_IG_C_PSEUDOGENE 22
+#define GF_IG_J_PSEUDOGENE 23
+#define GF_IG_V_PSEUDOGENE 24
+#define GF_TR_V_PSEUDOGENE 25
+#define GF_TR_J_PSEUDOGENE 26
+#define GF_MT_tRNA_PSEUDOGENE 27
+#define GF_misc_RNA_PSEUDOGENE 28
+#define GF_miRNA_PSEUDOGENE 29
+#define GF_RIBOZYME 30
+#define GF_RETAINED_INTRON 31
+#define GF_RETROTRANSPOSED 32
+#define GF_tRNA_PSEUDOGENE 33
+#define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE 34
+#define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE 35
+#define GF_TRANSCRIBED_UNITARY_PSEUDOGENE 36
+#define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE 37
+#define GF_TRANSLATED_PROCESSED_PSEUDOGENE 38
+#define GF_KNOWN_NCRNA 39
+#define GF_UNITARY_PSEUDOGENE 40
+#define GF_UNPROCESSED_PSEUDOGENE 41
+#define GF_LRG_GENE 42
+#define GF_3PRIME_OVERLAPPING_ncRNA 43
+#define GF_DISRUPTED_DOMAIN 44
+#define GF_vaultRNA 45
+#define GF_BIDIRECTIONAL_PROMOTER_lncRNA 46
+#define GF_AMBIGUOUS_ORF 47
+#define GF_PROTEIN_CODING (1|(1<<GF_coding_bit)) // coding: 65, 66, ...
+#define GF_POLYMORPHIC_PSEUDOGENE (2|(1<<GF_coding_bit))
+#define GF_IG_C (3|(1<<GF_coding_bit))
+#define GF_IG_D (4|(1<<GF_coding_bit))
+#define GF_IG_J (5|(1<<GF_coding_bit))
+#define GF_IG_LV (6|(1<<GF_coding_bit))
+#define GF_IG_V (7|(1<<GF_coding_bit))
+#define GF_TR_C (8|(1<<GF_coding_bit))
+#define GF_TR_D (9|(1<<GF_coding_bit))
+#define GF_TR_J (10|(1<<GF_coding_bit))
+#define GF_TR_V (11|(1<<GF_coding_bit))
+#define GF_NMD (12|(1<<GF_coding_bit))
+#define GF_NON_STOP_DECAY (13|(1<<GF_coding_bit))
+#define GF_CDS ((1<<(GF_coding_bit+1))+1) // special types: 129, 130, ...
+#define GF_EXON ((1<<(GF_coding_bit+1))+2)
+#define GF_UTR3 ((1<<(GF_coding_bit+1))+3)
+#define GF_UTR5 ((1<<(GF_coding_bit+1))+4)
+// GF_MAX = (1<<30)-1, see hap_node_t
+
+typedef struct _tscript_t tscript_t;
+typedef struct
+{
+ tscript_t *tr; // transcript
+ uint32_t beg; // the start coordinate of the CDS (on the reference strand, 0-based)
+ uint32_t pos; // 0-based index of the first exon base within the transcript (only to
+ // update hap_node_t.sbeg in hap_init, could be calculated on the fly)
+ uint32_t len; // exon length
+ uint32_t icds:30, // exon index within the transcript
+ phase:2; // offset of the CDS
+}
+gf_cds_t;
+typedef struct
+{
+ char *name; // human readable name, e.g. ORF45
+ uint8_t iseq;
+}
+gf_gene_t;
+typedef struct
+{
+ uint32_t beg,end;
+ tscript_t *tr;
+}
+gf_exon_t;
+typedef enum { prime3, prime5 } utr_t;
+typedef struct
+{
+ utr_t which;
+ uint32_t beg,end;
+ tscript_t *tr;
+}
+gf_utr_t;
+
+
+/*
+ Structures related to VCF output:
+
+ vcsq_t
+ information required to assemble consequence lines such as "inframe_deletion|XYZ|ENST01|+|5TY>5I|121ACG>A+124TA>T"
+
+ vcrec_t
+ single VCF record and csq tied to this record. (Haplotype can have multiple
+ consequences in several VCF records. Each record can have multiple consequences
+ from multiple haplotypes.)
+
+ csq_t
+ a top-level consequence tied to a haplotype
+
+ vbuf_t
+ pos2vbuf
+ VCF records with the same position clustered together for a fast lookup via pos2vbuf
+*/
+typedef struct _vbuf_t vbuf_t;
+typedef struct _vcsq_t vcsq_t;
+struct _vcsq_t
+{
+ uint32_t strand:1,
+ type:31; // one of CSQ_* types
+ uint32_t trid;
+ uint32_t biotype; // one of GF_* types
+ char *gene; // gene name
+ bcf1_t *ref; // if type&CSQ_PRINTED_UPSTREAM, ref consequence "@1234"
+ kstring_t vstr; // variant string, eg 5TY>5I|121ACG>A+124TA>T
+};
+typedef struct
+{
+ bcf1_t *line;
+ uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved
+ uint32_t nfmt:4, nvcsq:28, mvcsq;
+ vcsq_t *vcsq; // there can be multiple consequences for a single VCF record
+}
+vrec_t;
+typedef struct
+{
+ uint32_t pos;
+ vrec_t *vrec; // vcf line that this csq is tied to; needed when printing haplotypes (hap_stage_vcf)
+ int idx; // 0-based index of the csq at the VCF line, for FMT/BCSQ
+ vcsq_t type;
+}
+csq_t;
+struct _vbuf_t
+{
+ vrec_t **vrec; // buffer of VCF lines with the same position
+ int n, m;
+};
+KHASH_MAP_INIT_INT(pos2vbuf, vbuf_t*)
+
+
+/*
+ Structures related to haplotype-aware consequences in coding regions
+
+ hap_node_t
+ node of a haplotype tree. Each transcript has one tree
+
+ tscript_t
+ despite its general name, it is intended for coding transcripts only
+
+ hap_t
+ hstack_t
+ for traversal of the haplotype tree and braking combined
+ consequences into independent parts
+*/
+typedef struct _hap_node_t hap_node_t;
+struct _hap_node_t
+{
+ char *seq; // cds segment [parent_node,this_node)
+ char *var; // variant "ref>alt"
+ uint32_t type:2, // HAP_ROOT or HAP_CDS
+ csq:30; // this node's consequence
+ int dlen; // alt minus ref length: <0 del, >0 ins, 0 substitution
+ uint32_t rbeg; // variant's VCF position (0-based, inclusive)
+ int32_t rlen; // variant's rlen; alen=rlen+dlen; fake for non CDS types
+ uint32_t sbeg; // variant's position on the spliced reference transcript (0-based, inclusive, N_REF_PAD not included)
+ uint32_t icds; // which exon does this node's variant overlaps
+ hap_node_t **child, *prev; // children haplotypes and previous coding node
+ int nchild, mchild;
+ bcf1_t *cur_rec, *rec; // current VCF record and node's VCF record
+ uint32_t nend; // number of haplotypes ending in this node
+ int *cur_child, mcur_child; // mapping from the allele to the currently active child
+ csq_t *csq_list; // list of haplotype's consequences, broken by position
+ int ncsq_list, mcsq_list;
+};
+struct _tscript_t
+{
+ uint32_t id; // transcript id
+ uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
+ uint32_t strand:1, // STRAND_REV or STRAND_FWD
+ ncds:31, // number of exons
+ mcds;
+ gf_cds_t **cds; // ordered list of exons
+ char *ref; // reference sequence, padded with N_REF_PAD bases on both ends
+ char *sref; // spliced reference sequence, padded with N_REF_PAD bases on both ends
+ hap_node_t *root; // root of the haplotype tree
+ hap_node_t **hap; // pointer to haplotype leaves, two for each sample
+ int nhap, nsref; // number of haplotypes and length of sref, including 2*N_REF_PAD
+ uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types
+ type:30; // one of GF_* types
+ gf_gene_t *gene;
+};
+static inline int cmp_tscript(tscript_t **a, tscript_t **b)
+{
+ return ( (*a)->end < (*b)->end ) ? 1 : 0;
+}
+KHEAP_INIT(trhp, tscript_t*, cmp_tscript)
+typedef khp_trhp_t tr_heap_t;
+typedef struct
+{
+ hap_node_t *node; // current node
+ int ichild; // current child in the active node
+ int dlen; // total dlen, from the root to the active node
+ size_t slen; // total sequence length, from the root to the active node
+}
+hstack_t;
+typedef struct
+{
+ int mstack;
+ hstack_t *stack;
+ tscript_t *tr; // tr->ref: spliced transcript on ref strand
+ kstring_t sseq; // spliced haplotype sequence on ref strand
+ kstring_t tseq; // the variable part of translated haplotype transcript, coding strand
+ kstring_t tref; // the variable part of translated reference transcript, coding strand
+ uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS
+ int upstream_stop;
+}
+hap_t;
+
+
+/*
+ Helper structures, only for initialization
+
+ ftr_t
+ temporary list of all exons, CDS, UTRs
+*/
+KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
+KHASH_MAP_INIT_INT(int2int, int)
+KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
+typedef struct
+{
+ int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
+ uint32_t beg;
+ uint32_t end;
+ uint32_t trid;
+ uint32_t strand:1; // STRAND_REV,STRAND_FWD
+ uint32_t phase:2; // 0, 1 or 2
+ uint32_t iseq:29;
+}
+ftr_t;
+typedef struct
+{
+ // all exons, CDS, UTRs
+ ftr_t *ftr;
+ int nftr, mftr;
+
+ // mapping from transcript ensembl id to gene id
+ kh_int2gene_t *gid2gene;
+
+ // mapping from transcript id to tscript, for quick CDS anchoring
+ kh_int2tscript_t *id2tr;
+
+ // sequences
+ void *seq2int;
+ char **seq;
+ int nseq, mseq;
+
+ // ignored biotypes
+ void *ignored_biotypes;
+}
+aux_t;
+
+typedef struct _args_t
+{
+ // the main regidx lookups, from chr:beg-end to overlapping features and
+ // index iterator
+ regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
+ regitr_t *itr;
+
+ // temporary structures, deleted after initializtion
+ aux_t init;
+
+ // text tab-delimited output (out) or vcf/bcf output (out_fh)
+ FILE *out;
+ htsFile *out_fh;
+
+ // vcf
+ bcf_srs_t *sr;
+ bcf_hdr_t *hdr;
+ int hdr_nsmpl; // actual number of samples in the vcf, for bcf_update_format_values()
+
+ // include or exclude sites which match the filters
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // FLT_INCLUDE or FLT_EXCLUDE
+
+ // samples to process
+ int sample_is_file;
+ char *sample_list;
+ smpl_ilist_t *smpl;
+
+ char *outdir, **argv, *fa_fname, *gff_fname, *output_fname;
+ char *bcsq_tag;
+ int argc, output_type;
+ int phase, quiet, local_csq;
+ int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ
+ int ncsq_small_warned;
+
+ int rid; // current chromosome
+ tr_heap_t *active_tr; // heap of active transcripts for quick flushing
+ hap_t *hap; // transcript haplotype recursion
+ vbuf_t **vcf_buf; // buffered VCF lines to annotate with CSQ and flush
+ rbuf_t vcf_rbuf; // round buffer indexes to vcf_buf
+ kh_pos2vbuf_t *pos2vbuf; // fast lookup of buffered lines by position
+ tscript_t **rm_tr; // buffer of transcripts to clean
+ int nrm_tr, mrm_tr;
+ csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
+ int ncsq_buf, mcsq_buf;
+
+ faidx_t *fai;
+ kstring_t str, str2;
+ int32_t *gt_arr, mgt_arr;
+}
+args_t;
+
+// AAA, AAC, ...
+const char *gencode = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";
+const uint8_t nt4[] =
+{
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 3,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 3
+};
+const uint8_t cnt4[] =
+{
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 0,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 0
+};
+#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
+#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
+
+static const char *gf_strings_noncoding[] =
+{
+ "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
+ "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
+ "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
+ "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
+ "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
+ "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene",
+ "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
+ "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
+};
+static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
+static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
+
+const char *gf_type2gff_string(int type)
+{
+ if ( !GF_is_coding(type) )
+ {
+ if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
+ type &= (1<<(GF_coding_bit+1)) - 1;
+ return gf_strings_special[type - 1];
+ }
+ type &= (1<<GF_coding_bit) - 1;
+ return gf_strings_coding[type - 1];
+}
+
+/*
+ gff parsing functions
+*/
+static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end)
+{
+ aux_t *aux = &args->init;
+ char c = chr_end[1];
+ chr_end[1] = 0;
+ int iseq;
+ if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
+ {
+ hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
+ aux->seq[aux->nseq] = strdup(chr_beg);
+ iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
+ aux->nseq++;
+ assert( aux->nseq < 256 ); // see gf_gene_t.iseq
+ }
+ chr_end[1] = c;
+ return iseq;
+}
+static inline char *gff_skip(const char *line, char *ss)
+{
+ while ( *ss && *ss!='\t' ) ss++;
+ if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ return ss+1;
+}
+static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end)
+{
+ char *se = (char*) line;
+ while ( *se && *se!='\t' ) se++;
+ if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ *chr_beg = (char*) line;
+ *chr_end = se-1;
+}
+static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
+{
+ char *se = ss;
+ *beg = strtol(ss, &se, 10) - 1;
+ if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
+ ss = se+1;
+ *end = strtol(ss, &se, 10) - 1;
+ if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ return se+1;
+}
+static inline uint32_t gff_parse_id(const char *line, const char *needle, char *ss)
+{
+ ss = strstr(ss,needle);
+ if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
+ ss += strlen(needle);
+ while ( *ss && !isdigit(*ss) ) ss++;
+ if ( !ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
+ char *se;
+ uint32_t id = strtol(ss, &se, 10);
+ if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
+ if ( *se && *se!=';' && *se!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ assert( id <= 0xffffff ); // see gf_gene_t.id. Ensembl IDs are never that big in practice
+ return id;
+}
+static void gff_parse_ensid_fmt(const char *line, const char *needle, char *ss)
+{
+ ss = strstr(ss,needle);
+ if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
+ ss += strlen(needle);
+ char *se = ss;
+ while ( *se && !isdigit(*se) ) se++;
+ kstring_t str = {0,0,0};
+ kputsn(ss,se-ss,&str);
+ ss = se;
+ while ( *se && isdigit(*se) ) se++;
+ ksprintf(&str,"%%0%dd",(int)(se-ss));
+ ENSID_FMT = str.s;
+}
+static inline int gff_parse_type(char *line)
+{
+ line = strstr(line,"ID=");
+ if ( !line ) return -1;
+ line += 3;
+ if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE;
+ else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE;
+ return -1;
+}
+static inline int gff_parse_biotype(char *_line)
+{
+ char *line = strstr(_line,"biotype=");
+ if ( !line ) return -1;
+
+ line += 8;
+ switch (*line)
+ {
+ case 'p':
+ if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
+ else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
+ else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
+ else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
+ break;
+ case 'a':
+ if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
+ else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
+ else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
+ break;
+ case 'I':
+ if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C;
+ else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D;
+ else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J;
+ else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV;
+ else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V;
+ else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
+ else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
+ else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
+ else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
+ break;
+ case 'T':
+ if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C;
+ else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D;
+ else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J;
+ else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V;
+ else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
+ else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
+ break;
+ case 'M':
+ if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
+ else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
+ else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA;
+ break;
+ case 'l':
+ if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
+ break;
+ case 'm':
+ if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
+ else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
+ else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
+ else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
+ else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
+ break;
+ case 'r':
+ if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
+ else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
+ else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
+ else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
+ break;
+ case 's':
+ if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
+ else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
+ else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
+ else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
+ else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
+ else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
+ else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
+ break;
+ case 't':
+ if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
+ else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
+ break;
+ case 'n':
+ if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
+ else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
+ break;
+ case 'k':
+ if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
+ break;
+ case 'u':
+ if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
+ else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
+ break;
+ case 'L':
+ if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
+ break;
+ case '3':
+ if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+ break;
+ case 'd':
+ if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
+ break;
+ case 'v':
+ if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
+ break;
+ case 'b':
+ if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
+ break;
+ }
+ return 0;
+}
+static inline int gff_ignored_biotype(args_t *args, char *ss)
+{
+ ss = strstr(ss,"biotype=");
+ if ( !ss ) return 0;
+
+ ss += 8;
+ char *se = ss, tmp;
+ while ( *se && *se!=';' ) se++;
+ tmp = *se;
+ *se = 0;
+
+ char *key = ss;
+ int n = 0;
+ if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss);
+ khash_str2int_set(args->init.ignored_biotypes, key, n+1);
+
+ *se = tmp;
+ return 1;
+}
+gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
+{
+ khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
+ gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k);
+ if ( !gene )
+ {
+ gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
+ int ret;
+ k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret);
+ kh_val(aux->gid2gene,k) = gene;
+ }
+ return gene;
+}
+void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
+{
+ aux_t *aux = &args->init;
+ int biotype = gff_parse_biotype(ss);
+ if ( biotype <= 0 )
+ {
+ if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(pysam_stderr,"ignored transcript: %s\n",line);
+ return;
+ }
+
+ // create a mapping from transcript_id to gene_id
+ uint32_t trid = gff_parse_id(line, "ID=transcript:", ss);
+ uint32_t gene_id = gff_parse_id(line, "Parent=gene:", ss);
+
+ if ( !ENSID_FMT ) gff_parse_ensid_fmt(line, "ID=transcript:", ss); // id prefix different across species
+
+ tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
+ tr->id = trid;
+ tr->strand = ftr->strand;
+ tr->gene = gene_init(aux, gene_id);
+ tr->type = biotype;
+ tr->beg = ftr->beg;
+ tr->end = ftr->end;
+
+ khint_t k;
+ int ret;
+ k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret);
+ kh_val(aux->id2tr,k) = tr;
+}
+void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr)
+{
+ int biotype = gff_parse_biotype(ss);
+ if ( biotype <= 0 )
+ {
+ if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(pysam_stderr,"ignored gene: %s\n",line);
+ return;
+ }
+
+ aux_t *aux = &args->init;
+
+ // substring search for "ID=gene:ENSG00000437963"
+ uint32_t gene_id = gff_parse_id(line, "ID=gene:", ss);
+ gf_gene_t *gene = gene_init(aux, gene_id);
+ assert( !gene->name ); // the gene_id should be unique
+
+ gene->iseq = feature_set_seq(args, chr_beg,chr_end);
+
+ // substring search for "Name=OR4F5"
+ ss = strstr(chr_end+2,"Name=");
+ if ( !ss ) error("Could not parse the line, \"Name=\" not present: %s\n", line);
+ ss += 5;
+ char *se = ss;
+ while ( *se && *se!=';' && !isspace(*se) ) se++;
+ gene->name = (char*) malloc(se-ss+1);
+ memcpy(gene->name,ss,se-ss);
+ gene->name[se-ss] = 0;
+}
+int gff_parse(args_t *args, char *line, ftr_t *ftr)
+{
+ // - skip empty lines and commented lines
+ // - columns
+ // 1. chr
+ // 2. <skip>
+ // 3. CDS, transcript, gene, ...
+ // 4-5. beg,end
+ // 6. <skip>
+ // 7. strand
+ // 8. phase
+ // 9. Parent=transcript:ENST(\d+);ID=... etc
+
+ char *ss = line;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *chr_beg, *chr_end;
+ gff_parse_chr(line, &chr_beg, &chr_end);
+ ss = gff_skip(line, chr_end + 2);
+
+ // 3. column: is this a CDS, transcript, gene, etc.
+ if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
+ else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
+ else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
+ else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
+ else
+ {
+ ss = gff_skip(line, ss);
+ ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
+ ss = gff_skip(line, ss);
+ int type = gff_parse_type(ss);
+ if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
+ {
+ // we ignore these, debug print to see new types:
+ ss = strstr(ss,"ID=");
+ if ( !ss ) return -1; // no ID, ignore the line
+ if ( !strncmp("chromosome",ss+3,10) ) return -1;
+ if ( !strncmp("supercontig",ss+3,11) ) return -1;
+ if ( args->quiet<2 ) fprintf(pysam_stderr,"ignored: %s\n", line);
+ return -1;
+ }
+
+ // 7. column: strand
+ if ( *ss == '+' ) ftr->strand = STRAND_FWD;
+ else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+ else error("Unknown strand: %c .. %s\n", *ss,ss);
+
+ if ( type==GFF_TSCRIPT_LINE )
+ gff_parse_transcript(args, line, ss, ftr);
+ else
+ gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr);
+
+ return -1;
+ }
+ ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
+ ss = gff_skip(line, ss);
+
+ // 7. column: strand
+ if ( *ss == '+' ) ftr->strand = STRAND_FWD;
+ else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+ else { if ( args->quiet<2 ) fprintf(pysam_stderr,"Skipping unknown strand: %c\n", *ss); return -1; }
+ ss += 2;
+
+ // 8. column: phase (codon offset)
+ if ( *ss == '0' ) ftr->phase = 0;
+ else if ( *ss == '1' ) ftr->phase = 1;
+ else if ( *ss == '2' ) ftr->phase = 2;
+ else if ( *ss == '.' ) ftr->phase = 0; // exons do not have phase
+ else { if ( args->quiet<2 ) fprintf(pysam_stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; }
+ ss += 2;
+
+ // substring search for "Parent=transcript:ENST00000437963"
+ ftr->trid = gff_parse_id(line, "Parent=transcript:", ss);
+ ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
+ return 0;
+}
+
+static int cmp_cds_ptr(const void *a, const void *b)
+{
+ // comparison function for qsort of transcripts's CDS
+ if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1;
+ if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1;
+ return 0;
+}
+
+static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
+{
+ *chr_beg = *chr_end = aux->seq[iseq];
+ while ( (*chr_end)[1] ) (*chr_end)++;
+}
+tscript_t *tscript_init(aux_t *aux, uint32_t trid)
+{
+ khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
+ tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k);
+ assert( tr );
+ return tr;
+}
+void register_cds(args_t *args, ftr_t *ftr)
+{
+ // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet.
+ // ftr is the result of parsing a gff CDS line
+ aux_t *aux = &args->init;
+
+ tscript_t *tr = tscript_init(aux, ftr->trid);
+ if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
+
+ gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
+ cds->tr = tr;
+ cds->beg = ftr->beg;
+ cds->len = ftr->end - ftr->beg + 1;
+ cds->icds = 0; // to keep valgrind on mac happy
+ cds->phase = ftr->phase;
+
+ hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
+ tr->cds[tr->ncds++] = cds;
+}
+void register_utr(args_t *args, ftr_t *ftr)
+{
+ aux_t *aux = &args->init;
+ gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
+ utr->which = ftr->type==GF_UTR3 ? prime3 : prime5;
+ utr->beg = ftr->beg;
+ utr->end = ftr->end;
+ utr->tr = tscript_init(aux, ftr->trid);
+
+ char *chr_beg, *chr_end;
+ chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end);
+ regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr);
+}
+void register_exon(args_t *args, ftr_t *ftr)
+{
+ aux_t *aux = &args->init;
+ gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
+ exon->beg = ftr->beg;
+ exon->end = ftr->end;
+ exon->tr = tscript_init(aux, ftr->trid);
+
+ char *chr_beg, *chr_end;
+ chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end);
+ regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
+}
+
+void tscript_init_cds(args_t *args)
+{
+ aux_t *aux = &args->init;
+
+ // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
+ khint_t k;
+ for (k=0; k<kh_end(aux->id2tr); k++)
+ {
+ if ( !kh_exist(aux->id2tr, k) ) continue;
+ tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k);
+
+ // position-to-tscript lookup
+ char *chr_beg, *chr_end;
+ chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
+ regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
+
+ if ( !tr->ncds ) continue; // transcript with no CDS
+
+ // sort CDs
+ qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
+
+ // trim non-coding start
+ int i, len = 0;
+ if ( tr->strand==STRAND_FWD )
+ {
+ if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
+ tr->cds[0]->beg += tr->cds[0]->phase;
+ tr->cds[0]->len -= tr->cds[0]->phase;
+ tr->cds[0]->phase = 0;
+
+ // sanity check phase
+ for (i=0; i<tr->ncds; i++)
+ {
+ int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+ if ( phase!=len%3)
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ assert( phase == len%3 );
+ len += tr->cds[i]->len;
+ }
+ }
+ else
+ {
+ // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
+ // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
+ // todo: the same for the fwd strand
+ i = tr->ncds - 1;
+ int phase = tr->cds[i]->phase;
+ if ( phase ) tr->trim |= TRIM_5PRIME;
+ while ( i>=0 && phase > tr->cds[i]->len )
+ {
+ phase -= tr->cds[i]->len;
+ tr->cds[i]->phase = 0;
+ tr->cds[i]->len = 0;
+ i--;
+ }
+ tr->cds[i]->len -= tr->cds[i]->phase;
+ tr->cds[i]->phase = 0;
+
+ // sanity check phase
+ for (i=tr->ncds-1; i>=0; i--)
+ {
+ int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+ if ( phase!=len%3)
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ len += tr->cds[i]->len;
+ }
+ }
+
+ // set len. At the same check that CDS within a transcript do not overlap
+ len = 0;
+ for (i=0; i<tr->ncds; i++)
+ {
+ tr->cds[i]->icds = i;
+ len += tr->cds[i]->len;
+ if ( !i ) continue;
+
+ gf_cds_t *a = tr->cds[i-1];
+ gf_cds_t *b = tr->cds[i];
+ if ( a->beg + a->len - 1 >= b->beg )
+ error("Error: CDS overlap in the transcript %"PRIu32": %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32"\n",
+ kh_key(aux->id2tr, k), a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+ }
+ if ( len%3 != 0 )
+ {
+ // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
+ // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
+ // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
+
+ tr->trim |= TRIM_3PRIME;
+ if ( tr->strand==STRAND_FWD )
+ {
+ i = tr->ncds - 1;
+ while ( i>=0 && len%3 )
+ {
+ int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+ tr->cds[i]->len -= dlen;
+ len -= dlen;
+ i--;
+ }
+ }
+ else
+ {
+ i = 0;
+ while ( i<tr->ncds && len%3 )
+ {
+ int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+ tr->cds[i]->len -= dlen;
+ tr->cds[i]->beg += dlen;
+ len -= dlen;
+ i++;
+ }
+ }
+ }
+
+ // set CDS offsets and insert into regidx
+ len=0;
+ for (i=0; i<tr->ncds; i++)
+ {
+ tr->cds[i]->pos = len;
+ len += tr->cds[i]->len;
+ regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
+ }
+ }
+}
+
+void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); }
+void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); }
+
+void init_gff(args_t *args)
+{
+ aux_t *aux = &args->init;
+ aux->seq2int = khash_str2int_init(); // chrom's numeric id
+ aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene
+ aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
+ args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
+ aux->ignored_biotypes = khash_str2int_init();
+
+ // parse gff
+ kstring_t str = {0,0,0};
+ htsFile *fp = hts_open(args->gff_fname,"r");
+ if ( !fp ) error("Failed to read %s\n", args->gff_fname);
+ while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+ {
+ hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
+ int ret = gff_parse(args, str.s, aux->ftr + aux->nftr);
+ if ( !ret ) aux->nftr++;
+ }
+ free(str.s);
+ if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname);
+
+
+ // process gff information: connect CDS and exons to transcripts
+ args->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
+ args->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
+ args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
+ args->itr = regitr_init(NULL);
+
+ int i;
+ for (i=0; i<aux->nftr; i++)
+ {
+ ftr_t *ftr = &aux->ftr[i];
+
+ // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
+ khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
+ if ( k==kh_end(aux->id2tr) ) continue; // no such transcript
+
+ tscript_t *tr = kh_val(aux->id2tr,k);
+ if ( !tr->gene->name )
+ {
+ // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript)
+ regidx_free_tscript(&tr);
+ kh_del(int2tscript, aux->id2tr,k);
+ continue;
+ }
+
+ // populate regidx by category:
+ // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
+ // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
+ if ( ftr->type==GF_CDS ) register_cds(args, ftr);
+ else if ( ftr->type==GF_EXON ) register_exon(args, ftr);
+ else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
+ else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
+ else
+ error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,ENSID(ftr->trid),gf_type2gff_string(ftr->type));
+ }
+ tscript_init_cds(args);
+
+ if ( !args->quiet )
+ {
+ fprintf(pysam_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
+ regidx_nregs(args->idx_tscript),
+ regidx_nregs(args->idx_exon),
+ regidx_nregs(args->idx_cds),
+ regidx_nregs(args->idx_utr));
+ }
+
+ free(aux->ftr);
+ khash_str2int_destroy_free(aux->seq2int);
+ // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
+ kh_destroy(int2tscript,aux->id2tr);
+ free(aux->seq);
+
+ if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) )
+ {
+ khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
+ fprintf(pysam_stderr,"Ignored the following biotypes:\n");
+ for (i = kh_begin(ign); i < kh_end(ign); i++)
+ {
+ if ( !kh_exist(ign,i)) continue;
+ fprintf(pysam_stderr,"\t%dx\t.. %s\n", kh_value(ign,i), kh_key(ign,i));
+ }
+ }
+ khash_str2int_destroy_free(aux->ignored_biotypes);
+}
+
+void init_data(args_t *args)
+{
+ args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32;
+
+ if ( !args->quiet ) fprintf(pysam_stderr,"Parsing %s ...\n", args->gff_fname);
+ init_gff(args);
+
+ args->rid = -1;
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
+
+ args->fai = fai_load(args->fa_fname);
+ if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
+
+ args->pos2vbuf = kh_init(pos2vbuf);
+ args->active_tr = khp_init(trhp);
+ args->hap = (hap_t*) calloc(1,sizeof(hap_t));
+
+ // init samples
+ if ( !bcf_hdr_nsamples(args->hdr) ) args->phase = PHASE_DROP_GT;
+ if ( args->sample_list && !strcmp("-",args->sample_list) )
+ {
+ // ignore all samples
+ if ( args->output_type==FT_TAB_TEXT )
+ {
+ // significant speedup for plain VCFs
+ bcf_hdr_set_samples(args->hdr,NULL,0);
+ }
+ args->phase = PHASE_DROP_GT;
+ }
+ else
+ args->smpl = smpl_ilist_init(args->hdr, args->sample_list, args->sample_is_file, SMPL_STRICT);
+ args->hdr_nsmpl = args->phase==PHASE_DROP_GT ? 0 : bcf_hdr_nsamples(args->hdr);
+
+ if ( args->output_type==FT_TAB_TEXT )
+ {
+ args->out = args->output_fname ? fopen(args->output_fname,"w") : pysam_stdout;
+ if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname,strerror(errno));
+
+ fprintf(args->out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version());
+ fprintf(args->out,"# The command line was:\tbcftools +%s", args->argv[0]);
+ int i;
+ for (i=1; i<args->argc; i++)
+ fprintf(args->out," %s",args->argv[i]);
+ fprintf(args->out,"\n");
+ fprintf(args->out,"# LOG\t[2]Message\n");
+ fprintf(args->out,"# CSQ"); i = 1;
+ fprintf(args->out,"\t[%d]Sample", ++i);
+ fprintf(args->out,"\t[%d]Haplotype", ++i);
+ fprintf(args->out,"\t[%d]Chromosome", ++i);
+ fprintf(args->out,"\t[%d]Position", ++i);
+ fprintf(args->out,"\t[%d]Consequence", ++i);
+ fprintf(args->out,"\n");
+ }
+ else
+ {
+ args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
+ bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq");
+ bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=.,Type=String,Description=\"%s consequence annotation from BCFtools/csq. Format: '[*]consequence|gene|transcript|biotype[|strand|amino_acid_change|dna_change]' or, for consequences of variants split across multiple sites, a pointer to the record storing the consequences '@position'. '*' prefix indicates a consequence downstream from a stop \">",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware");
+ if ( args->hdr_nsmpl )
+ bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
+ bcf_hdr_write(args->out_fh, args->hdr);
+ }
+ if ( !args->quiet ) fprintf(pysam_stderr,"Calling...\n");
+}
+
+void destroy_data(args_t *args)
+{
+ regidx_destroy(args->idx_cds);
+ regidx_destroy(args->idx_utr);
+ regidx_destroy(args->idx_exon);
+ regidx_destroy(args->idx_tscript);
+ regitr_destroy(args->itr);
+
+ khint_t k,i,j;
+ for (k=0; k<kh_end(args->init.gid2gene); k++)
+ {
+ if ( !kh_exist(args->init.gid2gene, k) ) continue;
+ gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k);
+ free(gene->name);
+ free(gene);
+ }
+ kh_destroy(int2gene,args->init.gid2gene);
+
+ if ( args->filter )
+ filter_destroy(args->filter);
+
+ khp_destroy(trhp,args->active_tr);
+ kh_destroy(pos2vbuf,args->pos2vbuf);
+ if ( args->smpl ) smpl_ilist_destroy(args->smpl);
+ int ret;
+ if ( args->out_fh )
+ ret = hts_close(args->out_fh);
+ else
+ ret = fclose(args->out);
+ if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"pysam_stdout");
+ for (i=0; i<args->vcf_rbuf.m; i++)
+ {
+ vbuf_t *vbuf = args->vcf_buf[i];
+ if ( !vbuf ) continue;
+ for (j=0; j<vbuf->m; j++)
+ {
+ if ( !vbuf->vrec[j] ) continue;
+ if ( vbuf->vrec[j]->line ) bcf_destroy(vbuf->vrec[j]->line);
+ free(vbuf->vrec[j]->smpl);
+ free(vbuf->vrec[j]->vcsq);
+ free(vbuf->vrec[j]);
+ }
+ free(vbuf->vrec);
+ free(vbuf);
+ }
+ free(args->vcf_buf);
+ free(args->rm_tr);
+ free(args->csq_buf);
+ free(args->hap->stack);
+ free(args->hap->sseq.s);
+ free(args->hap->tseq.s);
+ free(args->hap->tref.s);
+ free(args->hap);
+ fai_destroy(args->fai);
+ free(args->gt_arr);
+ free(args->str.s);
+ free(args->str2.s);
+ free(ENSID_FMT);
+}
+
+/*
+ The splice_* functions are for consquences around splice sites: start,stop,splice_*
+ */
+#define SPLICE_VAR_REF 0 // ref: ACGT>ACGT, csq not applicable, skip completely
+#define SPLICE_OUTSIDE 1 // splice acceptor or similar; csq set and is done, does not overlap the region
+#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed
+#define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq
+typedef struct
+{
+ tscript_t *tr;
+ struct {
+ int32_t pos, rlen, alen;
+ char *ref, *alt;
+ bcf1_t *rec;
+ } vcf;
+ uint16_t check_acceptor:1, // check distance from exon start (fwd) or end (rev)
+ check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon
+ check_stop:1, // this is the last coding exon (relative to transcript orientation), check last (fwd) or first (rev) codon
+ check_donor:1, // as with check_acceptor
+ check_region_beg:1, // do/don't check for splices at this end, eg. in the first or last exon
+ check_region_end:1, //
+ check_utr:1, // check splice sites (acceptor/donor/region_*) only if not in utr
+ set_refalt:1; // set kref,kalt, if set, check also for synonymous events
+ uint32_t csq;
+ int tbeg, tend; // number of trimmed bases from beg and end of ref,alt allele
+ uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives
+ ref_end; // a more conservative csq (the first and last base in kref.s)
+ kstring_t kref, kalt; // trimmed alleles, set only with SPLICE_OLAP
+}
+splice_t;
+void splice_init(splice_t *splice, bcf1_t *rec)
+{
+ memset(splice,0,sizeof(*splice));
+ splice->vcf.rec = rec;
+ splice->vcf.pos = rec->pos;
+ splice->vcf.rlen = rec->rlen;
+ splice->vcf.ref = rec->d.allele[0];
+}
+static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len)
+{
+ // len>0 .. beg is the first base, del filled from right
+ // len<0 .. beg is the last base, del filled from left
+
+ int rlen, alen, rbeg, abeg; // first base to include (ref coordinates)
+ if ( len<0 )
+ {
+ rlen = alen = -len;
+ rbeg = beg - rlen + 1;
+ int dlen = splice->vcf.alen - splice->vcf.rlen;
+ if ( dlen<0 && beg < splice->ref_end ) // incomplete del, beg is in the middle
+ dlen += splice->ref_end - beg;
+ abeg = rbeg + dlen;
+ }
+ else
+ {
+ rbeg = abeg = beg;
+ rlen = alen = len;
+ // check for incomplete del as above??
+ }
+
+#define XDBG 0
+#if XDBG
+fprintf(pysam_stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg);
+#endif
+ splice->kref.l = 0;
+ splice->kalt.l = 0;
+
+ // add the part before vcf.ref, in the vcf.ref and after vcf.ref
+ int roff; // how many vcf.ref bases already used
+ if ( rbeg < splice->vcf.pos )
+ {
+ assert( splice->tr->beg <= rbeg ); // this can be extended thanks to N_REF_PAD
+ kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
+ roff = 0;
+ }
+ else
+ roff = rbeg - splice->vcf.pos;
+#if XDBG
+fprintf(pysam_stderr,"r1: %s roff=%d\n",splice->kref.s,roff);
+#endif
+
+ if ( roff < splice->vcf.rlen && splice->kref.l < rlen )
+ {
+ int len = splice->vcf.rlen - roff; // len still available in vcf.ref
+ if ( len > rlen - splice->kref.l ) len = rlen - splice->kref.l; // how much of ref allele is still needed
+ kputsn(splice->vcf.ref + roff, len, &splice->kref);
+ }
+#if XDBG
+fprintf(pysam_stderr,"r2: %s\n",splice->kref.s);
+#endif
+
+ uint32_t end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele
+ if ( splice->kref.l < rlen )
+ {
+ if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD)
+ rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end;
+ if ( splice->kref.l < rlen )
+ kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
+ }
+#if XDBG
+fprintf(pysam_stderr,"r3: %s\n",splice->kref.s);
+#endif
+
+
+ int aoff;
+ if ( abeg < splice->vcf.pos )
+ {
+ assert( splice->tr->beg <= abeg );
+ kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
+ aoff = 0;
+ }
+ else
+ aoff = abeg - splice->vcf.pos;
+#if XDBG
+fprintf(pysam_stderr,"a1: %s aoff=%d\n",splice->kalt.s,aoff);
+#endif
+
+ if ( aoff < splice->vcf.alen && splice->kalt.l < alen )
+ {
+ int len = splice->vcf.alen - aoff; // len still available in vcf.alt
+ if ( len > alen - splice->kalt.l ) len = alen - splice->kalt.l; // how much of alt allele is still needed
+ kputsn(splice->vcf.alt + aoff, len, &splice->kalt);
+ aoff -= len;
+ }
+ if ( aoff < 0 ) aoff = 0;
+ else aoff--;
+#if XDBG
+fprintf(pysam_stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff);
+#endif
+
+ end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele
+ if ( splice->kalt.l < alen )
+ {
+ if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long
+ alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end;
+ if ( alen > 0 && alen > splice->kalt.l )
+ kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
+ }
+#if XDBG
+fprintf(pysam_stderr,"a3: %s\n",splice->kalt.s);
+fprintf(pysam_stderr," [%s]\n [%s]\n\n",splice->kref.s,splice->kalt.s);
+#endif
+}
+void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec);
+static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid)
+{
+ while ( regitr_overlap(itr) )
+ {
+ gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
+ tscript_t *tr = utr->tr;
+ if ( tr->id != trid ) continue;
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+ return csq.type.type;
+ }
+ return 0;
+}
+static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type)
+{
+#if XDBG
+fprintf(pysam_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
+#endif
+ if ( !type ) return;
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = type;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+}
+static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp
+ // before and after the inserted bases
+ if ( splice->tbeg || splice->vcf.ref[0]!=splice->vcf.alt[0] )
+ {
+ splice->ref_beg = splice->vcf.pos + splice->tbeg - 1;
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
+ }
+ else
+ {
+ if ( splice->tend ) splice->tend--;
+ splice->ref_beg = splice->vcf.pos;
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
+ }
+#if XDBG
+fprintf(pysam_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
+#endif
+
+ int ret;
+ if ( splice->ref_beg >= ex_end ) // fully outside, beyond the exon
+ {
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
+ {
+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ if ( ret!=0 )
+ {
+ regitr_destroy(itr);
+ return SPLICE_OUTSIDE; // overlaps utr
+ }
+ }
+ regitr_destroy(itr);
+ }
+ if ( !splice->check_region_end ) return SPLICE_OUTSIDE;
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON);
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && !strncmp(ref,alt,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+ if ( splice->ref_end < ex_beg || (splice->ref_end == ex_beg && !splice->check_region_beg) ) // fully outside, before the exon
+ {
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr
+ {
+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ if ( ret!=0 )
+ {
+ regitr_destroy(itr);
+ return SPLICE_OUTSIDE; // overlaps utr
+ }
+ }
+ regitr_destroy(itr);
+ }
+ if ( !splice->check_region_beg ) return SPLICE_OUTSIDE;
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON);
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_end > ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_end > ex_beg - N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+ // overlaps the exon or inside the exon
+ // possible todo: find better alignment for frameshifting variants?
+ if ( splice->ref_beg <= ex_beg + 2 ) // in the first 3bp
+ {
+ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->ref_end > ex_end - 2 )
+ {
+ if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->set_refalt )
+ {
+ // Make sure the variant will not end up left aligned to avoid overlapping vcf records
+ // splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1);
+ // splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
+ // if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; }
+ if ( splice->ref_beg < splice->vcf.pos ) // this must have been caused by too much trimming from right
+ {
+ int dlen = splice->vcf.pos - splice->ref_beg;
+ assert( dlen==1 );
+ splice->tbeg += dlen;
+ if ( splice->tbeg + splice->tend == splice->vcf.rlen ) splice->tend -= dlen;
+ splice->ref_beg = splice->vcf.pos;
+ }
+ if ( splice->ref_end==ex_beg ) splice->tend--; // prevent zero-length ref allele
+ splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1);
+ splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
+ if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; }
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_INSIDE;
+}
+
+static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG
+ splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; // 1b before the deleted base
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; // the last deleted base
+
+#if XDBG
+fprintf(pysam_stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
+#endif
+
+ if ( splice->ref_beg + 1 < ex_beg ) // the part before the exon; ref_beg is off by -1
+ {
+ if ( splice->check_region_beg )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ // filling from the left does not work for ENST00000341065/frame3.vcf
+ // CAG.GTGGCCAG CAG.GTGGCCAG
+ // CA-.--GGCCAG vs CAG.---GCCAG
+ // splice_build_hap(splice, ex_beg-1, -N_SPLICE_REGION_INTRON);
+ //
+ // filling from the right:
+ splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON);
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ }
+ }
+ if ( splice->ref_end >= ex_beg )
+ {
+ splice->tbeg = splice->ref_beg - splice->vcf.pos + 1;
+ splice->ref_beg = ex_beg - 1;
+ if ( splice->tbeg + splice->tend == splice->vcf.alen )
+ {
+ // the deletion overlaps ex_beg and cannot be easily realigned to the right
+ if ( !splice->tend )
+ {
+ splice->csq |= CSQ_CODING_SEQUENCE;
+ return SPLICE_OVERLAP;
+ }
+ splice->tend--;
+ }
+ }
+ }
+ if ( ex_end < splice->ref_end ) // the part after the exon
+ {
+ if ( splice->check_region_end )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON); // ref,alt positioned at the first intron base
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ }
+ }
+ if ( splice->ref_beg < ex_end )
+ {
+ splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
+ splice->ref_end = ex_end;
+ }
+ }
+ if ( splice->ref_end < ex_beg || splice->ref_beg >= ex_end )
+ {
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+
+ if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1
+ {
+ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->ref_end > ex_end - 3 )
+ {
+ if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->set_refalt )
+ {
+ if ( splice->tbeg>0 ) splice->tbeg--; //why is this?
+ if ( splice->vcf.rlen > splice->tbeg + splice->tend && splice->vcf.alen > splice->tbeg + splice->tend )
+ {
+ splice->vcf.rlen -= splice->tbeg + splice->tend;
+ splice->vcf.alen -= splice->tbeg + splice->tend;
+ }
+ splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
+ splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
+ if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf
+ {
+ splice->csq |= (splice->ref_end - splice->ref_beg + 1)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
+ return SPLICE_OVERLAP;
+ }
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_INSIDE;
+}
+
+static inline int splice_csq_mnp(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ // not a real variant, can be ignored: eg ACGT>ACGT
+ if ( splice->tbeg + splice->tend == splice->vcf.rlen ) return SPLICE_VAR_REF;
+
+ splice->ref_beg = splice->vcf.pos + splice->tbeg;
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1;
+
+#if XDBG
+fprintf(pysam_stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
+#endif
+
+ if ( splice->ref_beg < ex_beg ) // the part before the exon
+ {
+ if ( splice->check_region_beg )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ }
+ }
+ }
+ if ( splice->ref_end >= ex_beg )
+ {
+ splice->tbeg = splice->ref_beg - splice->vcf.pos;
+ splice->ref_beg = ex_beg;
+ }
+ }
+ if ( ex_end < splice->ref_end ) // the part after the exon
+ {
+ if ( splice->check_region_end )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ if ( splice->ref_beg <= ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->ref_beg <= ex_end + N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ }
+ }
+ }
+ if ( splice->ref_beg <= ex_end )
+ {
+ splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
+ splice->ref_end = ex_end;
+ }
+ }
+ if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end )
+ {
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+
+ if ( splice->ref_beg < ex_beg + 3 )
+ {
+ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->ref_end > ex_end - 3 )
+ {
+ if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->set_refalt )
+ {
+ splice->vcf.rlen -= splice->tbeg + splice->tend;
+ splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
+ splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_INSIDE;
+}
+static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ splice->csq = 0;
+ splice->vcf.alen = strlen(splice->vcf.alt);
+
+ int rlen1 = splice->vcf.rlen - 1, alen1 = splice->vcf.alen - 1, i = 0;
+ splice->tbeg = 0, splice->tend = 0;
+
+ // trim from the right, then from the left
+ while ( i<=rlen1 && i<=alen1 )
+ {
+ if ( splice->vcf.ref[rlen1-i] != splice->vcf.alt[alen1-i] ) break;
+ i++;
+ }
+ splice->tend = i;
+ rlen1 -= i, alen1 -= i, i = 0;
+ while ( i<=rlen1 && i<=alen1 )
+ {
+ if ( splice->vcf.ref[i] != splice->vcf.alt[i] ) break;
+ i++;
+ }
+ splice->tbeg = i;
+
+ // The mnp, ins and del code was split into near-identical functions for clarity and debugging;
+ // possible todo: generalize once stable
+ if ( splice->vcf.rlen==splice->vcf.alen ) return splice_csq_mnp(args, splice, ex_beg, ex_end);
+ if ( splice->vcf.rlen < splice->vcf.alen ) return splice_csq_ins(args, splice, ex_beg, ex_end);
+ if ( splice->vcf.rlen > splice->vcf.alen ) return splice_csq_del(args, splice, ex_beg, ex_end);
+
+ return 0;
+}
+
+// return value: 0 added, 1 overlapping variant, 2 silent discard (intronic,alt=ref)
+int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, bcf1_t *rec, int ial)
+{
+ int i;
+ kstring_t str = {0,0,0};
+ tscript_t *tr = cds->tr;
+ child->icds = cds->icds; // index of cds in the tscript's list of exons
+
+ splice_t splice;
+ splice_init(&splice, rec);
+ splice.tr = tr;
+ splice.vcf.alt = rec->d.allele[ial];
+ splice.check_acceptor = splice.check_donor = splice.set_refalt = splice.check_utr = 1;
+ if ( !(tr->trim & TRIM_5PRIME) )
+ {
+ if ( tr->strand==STRAND_FWD ) { if ( child->icds==0 ) splice.check_start = 1; }
+ else { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; }
+ }
+ if ( !(tr->trim & TRIM_3PRIME) )
+ {
+ if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; }
+ else { if ( child->icds==0 ) splice.check_stop = 1; }
+ }
+ if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M
+ {
+ if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
+ else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+ }
+ if ( child->icds!=0 ) splice.check_region_beg = 1;
+ if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
+
+#if XDBG
+fprintf(pysam_stderr,"\n%d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop);
+#endif
+ int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1);
+#if XDBG
+fprintf(pysam_stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n",splice.vcf.pos+1,splice.kref.s,splice.kalt.s,splice.ref_beg+1,splice.ref_end+1,ret,splice.csq);
+#endif
+
+ if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA
+ if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq
+ {
+ free(splice.kref.s);
+ free(splice.kalt.s);
+
+ if ( !splice.csq ) return 2; // fully intronic, no csq
+
+ // splice_region/acceptor/donor
+ child->seq = NULL;
+ child->sbeg = 0;
+ child->rbeg = rec->pos;
+ child->rlen = 0;
+ child->dlen = 0;
+ kputs(rec->d.allele[0],&str);
+ kputc('>',&str);
+ kputs(rec->d.allele[ial],&str);
+ child->var = str.s;
+ child->type = HAP_SSS;
+ child->csq = splice.csq;
+ child->prev = parent->type==HAP_SSS ? parent->prev : parent;
+ child->rec = rec;
+ return 0;
+ }
+ if ( splice.csq & CSQ_SYNONYMOUS_VARIANT ) splice.csq &= ~CSQ_SYNONYMOUS_VARIANT; // synonymous&splice,frame could become synonymous&frame,splice
+
+ int dbeg = 0;
+ if ( splice.ref_beg < cds->beg )
+ {
+ // The vcf record overlaps the exon boundary, but the variant itself
+ // should fit inside since we are here. This will need more work.
+ // #1475227917
+ dbeg = cds->beg - splice.ref_beg;
+ splice.kref.l -= dbeg;
+ splice.ref_beg = cds->beg;
+ assert( dbeg <= splice.kalt.l );
+ }
+
+ if ( parent->type==HAP_SSS ) parent = parent->prev;
+ if ( parent->type==HAP_CDS )
+ {
+ i = parent->icds;
+ if ( i!=cds->icds )
+ {
+ // the variant is on a new exon, finish up the previous
+ int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg;
+ if ( len > 0 )
+ kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ }
+
+ // append any skipped non-variant exons
+ while ( ++i < cds->icds )
+ kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
+
+ if ( parent->icds==child->icds )
+ {
+ int len = splice.ref_beg - parent->rbeg - parent->rlen;
+ if ( len < 0 ) // overlapping variants
+ {
+ free(str.s);
+ return 1;
+ }
+ kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ }
+ else
+ kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
+ }
+ kputs(splice.kalt.s + dbeg, &str);
+
+ child->seq = str.s;
+ child->sbeg = cds->pos + (splice.ref_beg - cds->beg);
+ child->rbeg = splice.ref_beg;
+ child->rlen = splice.kref.l;
+ child->type = HAP_CDS;
+ child->prev = parent;
+ child->rec = rec;
+ child->csq = splice.csq;
+
+ // set vlen and the "ref>alt" string
+ {
+ int rlen = strlen(rec->d.allele[0]);
+ int alen = strlen(rec->d.allele[ial]);
+ child->dlen = alen - rlen;
+ child->var = (char*) malloc(rlen+alen+2);
+ memcpy(child->var,rec->d.allele[0],rlen);
+ child->var[rlen] = '>';
+ memcpy(child->var+rlen+1,rec->d.allele[ial],alen);
+ child->var[rlen+alen+1] = 0;
+ }
+
+ // yuck, the whole CDS is modified/deleted, not ready for this, todo.
+ if ( child->rbeg + child->rlen > cds->beg + cds->len )
+ {
+ child->type = HAP_SSS;
+ if ( !child->csq ) child->csq |= CSQ_CODING_SEQUENCE; // hack, specifically for ENST00000390520/deletion-overlap.vcf
+ }
+
+ free(splice.kref.s);
+ free(splice.kalt.s);
+ return 0;
+}
+void hap_destroy(hap_node_t *hap)
+{
+ int i;
+ for (i=0; i<hap->nchild; i++)
+ if ( hap->child[i] ) hap_destroy(hap->child[i]);
+ for (i=0; i<hap->mcsq_list; i++) free(hap->csq_list[i].type.vstr.s);
+ free(hap->csq_list);
+ free(hap->child);
+ free(hap->cur_child);
+ free(hap->seq);
+ free(hap->var);
+ free(hap);
+}
+
+
+/*
+ ref: spliced reference and its length (ref.l)
+ seq: part of the spliced query transcript on the reference strand to translate, its
+ length (seq.l) and the total length of the complete transcript (seq.m)
+ sbeg: seq offset within the spliced query transcript
+ rbeg: seq offset within ref, 0-based
+ rend: last base of seq within ref, plus one. If seq does not contain indels, it is rend=rbeg+seq->l
+ strand: coding strand - 0:rev, 1:fwd
+ tseq: translated sequence (aa)
+ fill: frameshift, fill until the end (strand=fwd) or from the start (strand=rev)
+ */
+void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill)
+{
+#if XDBG
+fprintf(pysam_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
+#endif
+ char tmp[3], *codon, *end;
+ int i, len, npad;
+
+ kstring_t ref = *_ref;
+ kstring_t seq = *_seq;
+
+ tseq->l = 0;
+ if ( !seq.l )
+ {
+ kputc('?', tseq);
+ return;
+ }
+
+#define DBG 0
+#if DBG
+ fprintf(pysam_stderr,"translate: sbeg,rbeg,rend=%d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
+ fprintf(pysam_stderr," ref: l=%d %s\n", (int)ref.l,ref.s);
+ fprintf(pysam_stderr," seq: l=%d m=%d ", (int)seq.l,(int)seq.m);
+ for (i=0; i<seq.l; i++) fprintf(pysam_stderr,"%c",seq.s[i]); fprintf(pysam_stderr,"\n");
+ fprintf(pysam_stderr," sbeg,rbeg,rend: %d,%d,%d\n", sbeg,rbeg,rend);
+ fprintf(pysam_stderr," strand,fill: %d,%d\n", strand,fill);
+#endif
+
+ if ( strand==STRAND_FWD )
+ {
+ // left padding
+ npad = sbeg % 3;
+#if DBG>1
+ fprintf(pysam_stderr," npad: %d\n",npad);
+#endif
+ assert( npad<=rbeg );
+
+ for (i=0; i<npad; i++)
+ tmp[i] = ref.s[rbeg+i-npad+N_REF_PAD];
+ for (; i<3 && i-npad<seq.l; i++)
+ tmp[i] = seq.s[i-npad];
+ len = seq.l - i + npad; // the remaining length of padded sseq
+#if DBG>1
+ fprintf(pysam_stderr,"\t i=%d\n", i);
+#endif
+ if ( i==3 )
+ {
+ kputc_(dna2aa(tmp), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
+#endif
+ codon = seq.s + 3 - npad; // next codon
+ end = codon + len - 1 - (len % 3); // last position of a valid codon
+ while ( codon < end )
+ {
+ kputc_(dna2aa(codon), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]);
+#endif
+ codon += 3;
+ }
+ end = seq.s + seq.l - 1;
+ for (i=0; codon+i<=end; i++) tmp[i] = codon[i];
+ }
+
+ // right padding
+ codon = ref.s + rend + N_REF_PAD;
+ if ( i>0 )
+ {
+#if DBG>1
+ if(i==1)fprintf(pysam_stderr,"[3]%c\n",tmp[0]);
+ if(i==2)fprintf(pysam_stderr,"[3]%c%c\n",tmp[0],tmp[1]);
+#endif
+ for (; i<3; i++)
+ {
+ tmp[i] = *codon;
+ codon++;
+ }
+ kputc_(dna2aa(tmp), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
+#endif
+ }
+ if ( fill!=0 )
+ {
+ end = ref.s + ref.l - N_REF_PAD;
+ while ( codon+3 <= end )
+ {
+ kputc_(dna2aa(codon), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon));
+#endif
+ codon += 3;
+ }
+ }
+ }
+ else // STRAND_REV
+ {
+ // right padding - number of bases to take from ref
+ npad = (seq.m - (sbeg + seq.l)) % 3;
+#if DBG>1
+ fprintf(pysam_stderr," npad: %d\n",npad);
+#endif
+if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(pysam_stderr,"sbeg=%d seq.l=%d seq.m=%d\n",sbeg,(int)seq.l,(int)seq.m);
+ assert( npad>=0 && sbeg+seq.l+npad<=seq.m ); // todo: first codon on the rev strand
+
+ if ( npad==2 )
+ {
+ tmp[1] = ref.s[rend+N_REF_PAD];
+ tmp[2] = ref.s[rend+N_REF_PAD+1];
+ i = 0;
+ }
+ else if ( npad==1 )
+ {
+ tmp[2] = ref.s[rend+N_REF_PAD];
+ i = 1;
+ }
+ else
+ i = 2;
+
+ end = seq.s + seq.l;
+ for (; i>=0 && end>seq.s; i--) tmp[i] = *(--end);
+#if DBG>1
+ fprintf(pysam_stderr,"\t i=%d\n", i);
+ if(i==1)fprintf(pysam_stderr,"[0] %c\n",tmp[2]);
+ if(i==0)fprintf(pysam_stderr,"[0] %c%c\n",tmp[1],tmp[2]);
+#endif
+ if ( i==-1 )
+ {
+#if DBG>1
+ fprintf(pysam_stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp));
+#endif
+ kputc_(cdna2aa(tmp), tseq);
+ codon = end - 3;
+ while ( codon >= seq.s )
+ {
+ kputc_(cdna2aa(codon), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon));
+#endif
+ codon -= 3;
+ }
+ if ( seq.s-codon==2 )
+ {
+ tmp[2] = seq.s[0];
+ i = 1;
+ }
+ else if ( seq.s-codon==1 )
+ {
+ tmp[1] = seq.s[0];
+ tmp[2] = seq.s[1];
+ i = 0;
+ }
+ else
+ i = -1;
+#if DBG>1
+ if(i==1)fprintf(pysam_stderr,"[3] %c\n",tmp[2]);
+ if(i==0)fprintf(pysam_stderr,"[3] %c%c\n",tmp[1],tmp[2]);
+#endif
+ }
+ // left padding
+ end = ref.s + N_REF_PAD + rbeg;
+ if ( i>=0 )
+ {
+ for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end);
+ kputc_(cdna2aa(tmp), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp));
+#endif
+ }
+ if ( fill!=0 )
+ {
+ codon = end - 3;
+ while ( codon >= ref.s + N_REF_PAD )
+ {
+ kputc_(cdna2aa(codon), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon));
+#endif
+ codon -= 3;
+ }
+ }
+ }
+ kputc_(0,tseq); tseq->l--;
+#if DBG
+ fprintf(pysam_stderr," tseq: %s\n", tseq->s);
+#endif
+}
+
+void tscript_splice_ref(tscript_t *tr)
+{
+ int i, len = 0;
+ for (i=0; i<tr->ncds; i++)
+ len += tr->cds[i]->len;
+
+ tr->nsref = len + 2*N_REF_PAD;
+ tr->sref = (char*) malloc(len + 1 + 2*N_REF_PAD);
+ len = 0;
+
+ memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
+ len += N_REF_PAD;
+
+ for (i=0; i<tr->ncds; i++)
+ {
+ memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
+ len += tr->cds[i]->len;
+ }
+ memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
+ len += N_REF_PAD;
+
+ tr->sref[len] = 0;
+}
+
+// returns: 0 if consequence was added, 1 if it already exists or could not be added
+int csq_push(args_t *args, csq_t *csq, bcf1_t *rec)
+{
+#if XDBG
+fprintf(pysam_stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type);
+#endif
+ khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos);
+ vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k);
+ if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
+
+ int i;
+ for (i=0; i<vbuf->n; i++)
+ if ( vbuf->vrec[i]->line==rec ) break;
+ if ( i==vbuf->n ) error("This should not happen.. %s:%d %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
+ vrec_t *vrec = vbuf->vrec[i];
+
+ // if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor
+ if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) )
+ csq->type.type &= ~CSQ_SPLICE_REGION;
+
+ if ( csq->type.type & CSQ_PRINTED_UPSTREAM )
+ {
+ for (i=0; i<vrec->nvcsq; i++)
+ {
+ // Same as below, to avoid records like
+ // 3630 .. @3632,stop_lost|AL627309.1|ENST00000423372|protein_coding|-
+ // 3632 .. stop_lost|AL627309.1|ENST00000423372|protein_coding|-|260*>260G|3630T>A+3632A>C
+ if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
+ {
+ vrec->vcsq[i] = csq->type;
+ goto exit_duplicate;
+ }
+ if ( !(vrec->vcsq[i].type & CSQ_PRINTED_UPSTREAM) ) continue;
+ if ( csq->type.ref != vrec->vcsq[i].ref ) continue;
+ goto exit_duplicate;
+ }
+ }
+ else if ( csq->type.type & CSQ_COMPOUND )
+ {
+ for (i=0; i<vrec->nvcsq; i++)
+ {
+ if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT ) continue;
+ if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
+ if ( csq->type.gene != vrec->vcsq[i].gene ) continue;
+ if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s )
+ {
+ // This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function
+ // can trigger stop/start events based on indel overlap, then another stop/start event can be triggered
+ // from add_csq() or test_cds_local() based on sequence comparison, and on output we could find two
+ // consequences:
+ // stop_lost|AL627309.1|ENST00000423372|protein_coding|-
+ // stop_lost&inframe_insertion|AL627309.1|ENST00000423372|protein_coding|-|260*>260CL|3630T>TAAA
+ if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s )
+ {
+ if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
+ {
+ vrec->vcsq[i].type |= csq->type.type;
+
+ // remove stop_lost&synonymous if stop_retained set
+ if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED )
+ vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT);
+
+ if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr;
+ goto exit_duplicate;
+ }
+ continue;
+ }
+ if ( strcmp(csq->type.vstr.s,vrec->vcsq[i].vstr.s) ) continue;
+ }
+ vrec->vcsq[i].type |= csq->type.type;
+ goto exit_duplicate;
+ }
+ }
+ else
+ {
+ for (i=0; i<vrec->nvcsq; i++)
+ {
+ if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT) continue;
+ if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
+ if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) )
+ {
+ vrec->vcsq[i].type |= csq->type.type;
+ goto exit_duplicate;
+ }
+ if ( vrec->vcsq[i].type==(vrec->vcsq[i].type|csq->type.type) ) goto exit_duplicate;
+ }
+ }
+ // no such csq yet in this vcf record
+ csq->vrec = vrec;
+ csq->idx = i;
+ vrec->nvcsq++;
+ hts_expand0(vcsq_t, vrec->nvcsq, vrec->mvcsq, vrec->vcsq);
+ vrec->vcsq[i] = csq->type;
+ return 0;
+
+exit_duplicate:
+ csq->vrec = vrec;
+ csq->idx = i;
+ return 1;
+}
+
+// soff .. position of the variant within the trimmed query transcript
+// sbeg .. position of the variant within the query transcript
+// rbeg .. position on the reference transcript (if there are no indels, then rbeg=send)
+// rpos .. VCF position
+#define node2soff(i) (hap->stack[i].slen - (hap->stack[i].node->rlen + hap->stack[i].node->dlen))
+#define node2sbeg(i) (hap->sbeg + node2soff(i))
+#define node2send(i) (hap->sbeg + hap->stack[i].slen)
+#define node2rbeg(i) (hap->stack[i].node->sbeg)
+#define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen)
+#define node2rpos(i) (hap->stack[i].node->rec->pos)
+
+void kput_vcsq(vcsq_t *csq, kstring_t *str)
+{
+ // Remove start/stop from incomplete CDS, but only if there is another
+ // consequence as something must be reported
+ if ( csq->type & CSQ_INCOMPLETE_CDS && (csq->type & ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS|CSQ_UPSTREAM_STOP)) ) csq->type &= ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS);
+
+ // Remove missense from start/stops
+ if ( csq->type & CSQ_START_STOP && csq->type & CSQ_MISSENSE_VARIANT ) csq->type &= ~CSQ_MISSENSE_VARIANT;
+
+ if ( csq->type & CSQ_PRINTED_UPSTREAM && csq->ref )
+ {
+ kputc_('@',str);
+ kputw(csq->ref->pos+1, str);
+ return;
+ }
+ if ( csq->type & CSQ_UPSTREAM_STOP )
+ kputc_('*',str);
+
+ int i, n = sizeof(csq_strings)/sizeof(char*);
+ for (i=1; i<n; i++)
+ if ( csq_strings[i] && csq->type&(1<<i) ) { kputs(csq_strings[i],str); break; }
+ i++;
+ for (; i<n; i++)
+ if ( csq_strings[i] && csq->type&(1<<i) ) { kputc_('&',str); kputs(csq_strings[i],str); }
+
+ kputc_('|', str);
+ if ( csq->gene ) kputs(csq->gene , str);
+
+ kputc_('|', str);
+ if ( csq->type & CSQ_PRN_TSCRIPT ) ksprintf(str, "%s",ENSID(csq->trid));
+
+ kputc_('|', str);
+ kputs(gf_type2gff_string(csq->biotype), str);
+
+ if ( CSQ_PRN_STRAND(csq->type) || csq->vstr.l )
+ kputs(csq->strand==STRAND_FWD ? "|+" : "|-", str);
+
+ if ( csq->vstr.l )
+ kputs(csq->vstr.s, str);
+}
+
+void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
+{
+ int i;
+ tscript_t *tr = hap->tr;
+ int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
+
+ int icsq = node->ncsq_list++;
+ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
+ csq_t *csq = &node->csq_list[icsq];
+ csq->pos = hap->stack[ref_node].node->rec->pos;
+ csq->type.trid = tr->id;
+ csq->type.gene = tr->gene->name;
+ csq->type.strand = tr->strand;
+ csq->type.biotype = tr->type;
+
+ // only now we see the translated sequence and can determine if the stop/start changes are real
+ int rm_csq = 0;
+ csq->type.type = 0;
+ for (i=ibeg; i<=iend; i++)
+ csq->type.type |= hap->stack[i].node->csq & CSQ_COMPOUND;
+ if ( dlen==0 && indel ) csq->type.type |= CSQ_INFRAME_ALTERING;
+
+ int has_upstream_stop = hap->upstream_stop;
+ if ( hap->stack[ibeg].node->type != HAP_SSS )
+ {
+ // check for truncating stops
+ for (i=0; i<hap->tref.l; i++)
+ if ( hap->tref.s[i]=='*' ) break;
+ if ( i!=hap->tref.l )
+ {
+ hap->tref.l = i+1;
+ hap->tref.s[i+1] = 0;
+ }
+ for (i=0; i<hap->tseq.l; i++)
+ if ( hap->tseq.s[i]=='*' ) break;
+ if ( i!=hap->tseq.l )
+ {
+ hap->tseq.l = i+1;
+ hap->tseq.s[i+1] = 0;
+ hap->upstream_stop = 1;
+ }
+ if ( csq->type.type & CSQ_STOP_LOST )
+ {
+ if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] )
+ {
+ rm_csq |= CSQ_STOP_LOST;
+ csq->type.type |= CSQ_STOP_RETAINED;
+ }
+ else if ( hap->tref.s[hap->tref.l-1]!='*' )
+ {
+ // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
+ // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
+ if ( hap->tseq.s[hap->tseq.l-1] == '*' )
+ {
+ rm_csq |= CSQ_STOP_GAINED;
+ csq->type.type |= CSQ_STOP_RETAINED;
+ }
+ else
+ csq->type.type |= CSQ_INCOMPLETE_CDS;
+ }
+ }
+ if ( csq->type.type & CSQ_START_LOST && hap->tref.s[0]!='M' )
+ {
+ rm_csq |= CSQ_START_LOST;
+ csq->type.type &= ~CSQ_START_LOST;
+ }
+ if ( dlen!=0 )
+ {
+ if ( dlen%3 )
+ csq->type.type |= CSQ_FRAMESHIFT_VARIANT;
+ else if ( dlen<0 )
+ csq->type.type |= CSQ_INFRAME_DELETION;
+ else
+ csq->type.type |= CSQ_INFRAME_INSERTION;
+ }
+ else
+ {
+ for (i=0; i<hap->tref.l; i++)
+ if ( hap->tref.s[i] != hap->tseq.s[i] ) break;
+ if ( i==hap->tref.l )
+ csq->type.type |= CSQ_SYNONYMOUS_VARIANT;
+ else if ( hap->tref.s[i] == '*' )
+ csq->type.type |= CSQ_STOP_LOST;
+ else if ( hap->tseq.s[i] == '*' )
+ csq->type.type |= CSQ_STOP_GAINED;
+ else
+ csq->type.type |= CSQ_MISSENSE_VARIANT;
+ }
+ }
+ if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP;
+ csq->type.type &= ~rm_csq;
+
+ if ( hap->stack[ibeg].node->type == HAP_SSS )
+ {
+ node->csq_list[icsq].type.type |= hap->stack[ibeg].node->csq & ~rm_csq;
+ node->csq_list[icsq].type.ref = hap->stack[ibeg].node->rec;
+ node->csq_list[icsq].type.biotype = tr->type;
+ csq_push(args, node->csq_list+icsq, hap->stack[ibeg].node->rec);
+ return;
+ }
+
+ kstring_t str = node->csq_list[icsq].type.vstr;
+ str.l = 0;
+
+ // create the aa variant string
+ int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
+ int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
+ kputc_('|', &str);
+ kputw(aa_rbeg, &str);
+ kputs(hap->tref.s, &str);
+ if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) )
+ {
+ kputc_('>', &str);
+ kputw(aa_sbeg, &str);
+ kputs(hap->tseq.s, &str);
+ }
+ kputc_('|', &str);
+
+ // create the dna variant string and, in case of combined variants,
+ // insert silent CSQ_PRINTED_UPSTREAM variants
+ for (i=ibeg; i<=iend; i++)
+ {
+ if ( i>ibeg ) kputc_('+', &str);
+ kputw(node2rpos(i)+1, &str);
+ kputs(hap->stack[i].node->var, &str);
+ }
+ node->csq_list[icsq].type.vstr = str;
+ csq_push(args, node->csq_list+icsq, hap->stack[ref_node].node->rec);
+
+ for (i=ibeg; i<=iend; i++)
+ {
+ // csq are printed at one position only for combined variants, the rest is
+ // silent and references the first
+ if ( hap->stack[i].node->csq & ~CSQ_COMPOUND )
+ {
+ node->ncsq_list++;
+ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
+ csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
+ tmp_csq->pos = hap->stack[i].node->rec->pos;
+ tmp_csq->type.trid = tr->id;
+ tmp_csq->type.gene = tr->gene->name;
+ tmp_csq->type.strand = tr->strand;
+ tmp_csq->type.type = hap->stack[i].node->csq & ~CSQ_COMPOUND & ~rm_csq;
+ tmp_csq->type.biotype = tr->type;
+ tmp_csq->type.vstr.l = 0;
+ kputs(str.s,&tmp_csq->type.vstr);
+ csq_push(args, tmp_csq, hap->stack[i].node->rec);
+ }
+ if ( i!=ref_node && (node->csq_list[icsq].type.type & CSQ_COMPOUND || !(hap->stack[i].node->csq & ~CSQ_COMPOUND)) )
+ {
+ node->ncsq_list++;
+ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
+ csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
+ tmp_csq->pos = hap->stack[i].node->rec->pos;
+ tmp_csq->type.trid = tr->id;
+ tmp_csq->type.gene = tr->gene->name;
+ tmp_csq->type.strand = tr->strand;
+ tmp_csq->type.type = CSQ_PRINTED_UPSTREAM | hap->stack[i].node->csq;
+ tmp_csq->type.biotype = tr->type;
+ tmp_csq->type.ref = hap->stack[ref_node].node->rec;
+ tmp_csq->type.vstr.l = 0;
+ csq_push(args, tmp_csq, hap->stack[i].node->rec);
+ }
+ }
+}
+
+void hap_finalize(args_t *args, hap_t *hap)
+{
+ tscript_t *tr = hap->tr;
+ if ( !tr->sref )
+ tscript_splice_ref(tr);
+
+ kstring_t sref;
+ sref.s = tr->sref;
+ sref.l = tr->nsref;
+ sref.m = sref.l;
+
+ int istack = 0;
+ hts_expand(hstack_t,1,hap->mstack,hap->stack);
+
+ hap->sseq.l = 0;
+ hap->tseq.l = 0;
+ hap->stack[0].node = tr->root;
+ hap->stack[0].ichild = -1;
+ hap->stack[0].slen = 0;
+ hap->stack[0].dlen = 0;
+
+ while ( istack>=0 )
+ {
+ hstack_t *stack = &hap->stack[istack];
+ hap_node_t *node = hap->stack[istack].node;
+ while ( ++hap->stack[istack].ichild < node->nchild )
+ {
+ if ( node->child[stack->ichild] ) break;
+ }
+ if ( stack->ichild == node->nchild ) { istack--; continue; }
+
+ node = node->child[stack->ichild];
+
+ istack++;
+ hts_expand(hstack_t,istack+1,hap->mstack,hap->stack);
+ stack = &hap->stack[istack-1];
+
+ hap->stack[istack].node = node;
+ hap->stack[istack].ichild = -1;
+
+ hap->sseq.l = stack->slen;
+ if ( node->type==HAP_CDS ) kputs(node->seq, &hap->sseq);
+ hap->stack[istack].slen = hap->sseq.l;
+ hap->stack[istack].dlen = hap->stack[istack-1].dlen + node->dlen;
+
+ if ( !node->nend ) continue; // not a leaf node
+
+ // The spliced sequence has been built for the current haplotype and stored
+ // in hap->sseq. Now we break it and output as independent parts
+
+ kstring_t sseq;
+ sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; // total length of the spliced query transcript
+ hap->upstream_stop = 0;
+
+ int i = 1, dlen = 0, ibeg, indel = 0;
+ while ( i<istack && hap->stack[i].node->type == HAP_SSS ) i++;
+ hap->sbeg = hap->stack[i].node->sbeg;
+
+ if ( tr->strand==STRAND_FWD )
+ {
+ i = 0, ibeg = -1;
+ while ( ++i <= istack )
+ {
+ if ( hap->stack[i].node->type == HAP_SSS )
+ {
+ // start/stop/splice site overlap: don't know how to build the haplotypes correctly, skipping
+ hap_add_csq(args,hap,node,0,i,i,0,0);
+ continue;
+ }
+ dlen += hap->stack[i].node->dlen;
+ if ( hap->stack[i].node->dlen ) indel = 1;
+ if ( i<istack )
+ {
+ if ( dlen%3 ) // frameshift
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ int icur = node2sbeg(i);
+ int inext = node2sbeg(i+1);
+ if ( icur/3 == inext/3 ) // in the same codon, can't be flushed yet
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ }
+ if ( ibeg<0 ) ibeg = i;
+
+ int ioff = node2soff(ibeg);
+ int icur = node2sbeg(ibeg);
+ int rbeg = node2rbeg(ibeg);
+ int rend = node2rend(i);
+ int fill = dlen%3;
+
+ // alt
+ if ( hap->sseq.l )
+ {
+ sseq.l = hap->stack[i].slen - ioff;
+ sseq.s = hap->sseq.s + ioff;
+ }
+ else // splice site overlap, see #1475227917
+ sseq.l = fill = 0;
+ cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
+
+ // ref
+ sseq.l = node2rend(i) - rbeg;
+ sseq.s = sref.s + N_REF_PAD + rbeg;
+ sseq.m = sref.m - 2*N_REF_PAD;
+ cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
+ sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
+
+ hap_add_csq(args,hap,node,0, ibeg,i,dlen,indel);
+ ibeg = -1;
+ dlen = 0;
+ indel = 0;
+ }
+ }
+ else
+ {
+ i = istack + 1, ibeg = -1;
+ while ( --i > 0 )
+ {
+ if ( hap->stack[i].node->type == HAP_SSS )
+ {
+ hap_add_csq(args,hap,node,0,i,i,0,0);
+ continue;
+ }
+ dlen += hap->stack[i].node->dlen;
+ if ( hap->stack[i].node->dlen ) indel = 1;
+ if ( i>1 && hap->stack[i-1].node->type != HAP_SSS )
+ {
+ if ( dlen%3 )
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ int icur = sseq.m - 1 - node2sbeg(i);
+ int inext = sseq.m - 1 - node2sbeg(i-1);
+ if ( icur/3 == inext/3 )
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ }
+ if ( ibeg<0 ) ibeg = i;
+ int ioff = node2soff(i);
+ int icur = node2sbeg(i);
+ int rbeg = node2rbeg(i);
+ int rend = node2rend(ibeg);
+ int fill = dlen%3;
+
+ // alt
+ if ( hap->sseq.l )
+ {
+ sseq.l = hap->stack[ibeg].slen - ioff;
+ sseq.s = hap->sseq.s + ioff;
+ }
+ else // splice site overlap, see #1475227917
+ sseq.l = fill = 0;
+ cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
+
+ // ref
+ sseq.l = node2rend(ibeg) - rbeg;
+ sseq.s = sref.s + N_REF_PAD + rbeg;
+ sseq.m = sref.m - 2*N_REF_PAD;
+ cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
+ sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
+
+ hap_add_csq(args,hap,node,sseq.m, i,ibeg,dlen,indel);
+ ibeg = -1;
+ dlen = 0;
+ indel = 0;
+ }
+ }
+ }
+}
+
+static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
+{
+ if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) return;
+
+ char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-";
+ const char *chr = bcf_hdr_id2name(args->hdr,args->rid);
+
+ fprintf(args->out,"CSQ\t%s\t", smpl);
+ if ( ihap>0 )
+ fprintf(args->out,"%d", ihap);
+ else
+ fprintf(args->out,"-");
+
+ args->str.l = 0;
+ kput_vcsq(&csq->type, &args->str);
+ fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
+}
+static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+{
+ if ( !node || !node->ncsq_list ) return;
+
+ char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-";
+ const char *chr = bcf_hdr_id2name(args->hdr,args->rid);
+
+ int i;
+ for (i=0; i<node->ncsq_list; i++)
+ {
+ csq_t *csq = node->csq_list + i;
+ if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) continue;
+ assert( csq->type.vstr.l );
+
+ fprintf(args->out,"CSQ\t%s\t", smpl);
+ if ( ihap>0 )
+ fprintf(args->out,"%d", ihap);
+ else
+ fprintf(args->out,"-");
+
+ args->str.l = 0;
+ kput_vcsq(&csq->type, &args->str);
+ fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
+ }
+}
+
+static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+{
+ if ( !node || !node->ncsq_list || ismpl<0 ) return;
+
+ int i;
+ for (i=0; i<node->ncsq_list; i++)
+ {
+ csq_t *csq = node->csq_list + i;
+ vrec_t *vrec = csq->vrec;
+ int icsq = 2*csq->idx + ihap;
+ if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+ {
+ int print_warning = 1;
+ if ( args->quiet )
+ {
+ if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0;
+ args->ncsq_small_warned = 1;
+ }
+ if ( print_warning )
+ {
+ fprintf(pysam_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n",
+ args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1);
+ if ( args->quiet ) fprintf(pysam_stderr,"(This warning is printed only once)\n");
+ }
+ break;
+ }
+ if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
+ vrec->smpl[ismpl*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+ }
+}
+
+void hap_flush(args_t *args, uint32_t pos)
+{
+ int i,j;
+ tr_heap_t *heap = args->active_tr;
+
+ while ( heap->ndat && heap->dat[0]->end<=pos )
+ {
+ tscript_t *tr = heap->dat[0];
+ khp_delete(trhp, heap);
+
+ args->hap->tr = tr;
+ if ( tr->root && tr->root->nchild ) // normal, non-localized calling
+ {
+ hap_finalize(args, args->hap);
+
+ if ( args->output_type==FT_TAB_TEXT ) // plain text output, not a vcf
+ {
+ if ( args->phase==PHASE_DROP_GT )
+ hap_print_text(args, tr, -1,0, tr->hap[0]);
+ else
+ {
+ for (i=0; i<args->smpl->n; i++)
+ {
+ for (j=0; j<2; j++)
+ hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]);
+ }
+ }
+ }
+ else if ( args->phase!=PHASE_DROP_GT )
+ {
+ for (i=0; i<args->smpl->n; i++)
+ {
+ for (j=0; j<2; j++)
+ hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]);
+ }
+ }
+ }
+
+ // mark the transcript for deletion. Cannot delete it immediately because
+ // by-position VCF output will need them when flushed by vcf_buf_push
+ args->nrm_tr++;
+ hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
+ args->rm_tr[args->nrm_tr-1] = tr;
+ }
+}
+
+#define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
+
+void vbuf_push(args_t *args, bcf1_t **rec_ptr)
+{
+ int i;
+
+ assert(rec_ptr);
+ bcf1_t *rec = *rec_ptr;
+
+ // check for duplicate records
+ i = args->vcf_rbuf.n ? rbuf_last(&args->vcf_rbuf) : -1;
+ if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos )
+ {
+ // vcf record with a new pos
+ rbuf_expand0(&args->vcf_rbuf, vbuf_t*, args->vcf_rbuf.n+1, args->vcf_buf);
+ i = rbuf_append(&args->vcf_rbuf);
+ if ( !args->vcf_buf[i] ) args->vcf_buf[i] = (vbuf_t*) calloc(1,sizeof(vbuf_t));
+ args->vcf_buf[i]->n = 0;
+ }
+ vbuf_t *vbuf = args->vcf_buf[i];
+ vbuf->n++;
+ hts_expand0(vrec_t*, vbuf->n, vbuf->m, vbuf->vrec);
+ if ( !vbuf->vrec[vbuf->n - 1] )
+ vbuf->vrec[vbuf->n - 1] = (vrec_t*) calloc(1,sizeof(vrec_t));
+
+ vrec_t *vrec = vbuf->vrec[vbuf->n - 1];
+ if ( args->phase!=PHASE_DROP_GT && args->smpl->n )
+ {
+ if ( !vrec->smpl ) vrec->smpl = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->smpl) * args->nfmt_bcsq);
+ else memset(vrec->smpl,0,args->hdr_nsmpl*sizeof(*vrec->smpl) * args->nfmt_bcsq);
+ }
+ if ( !vrec->line ) vrec->line = bcf_init1();
+ SWAP(bcf1_t*, (*rec_ptr), vrec->line);
+
+ int ret;
+ khint_t k = kh_put(pos2vbuf, args->pos2vbuf, (int)rec->pos, &ret);
+ kh_val(args->pos2vbuf,k) = vbuf;
+}
+
+void vbuf_flush(args_t *args)
+{
+ if ( args->active_tr->ndat ) return; // cannot output buffered VCF lines (args.vbuf) until all active transcripts are gone
+
+ int i,j;
+ while ( (i=rbuf_shift(&args->vcf_rbuf))>=0 )
+ {
+ vbuf_t *vbuf = args->vcf_buf[i];
+ for (i=0; i<vbuf->n; i++)
+ {
+ vrec_t *vrec = vbuf->vrec[i];
+ if ( !args->out_fh ) // not a VCF output
+ {
+ vrec->nvcsq = 0;
+ continue;
+ }
+ if ( !vrec->nvcsq )
+ {
+ bcf_write(args->out_fh, args->hdr, vrec->line);
+ continue;
+ }
+
+ args->str.l = 0;
+ kput_vcsq(&vrec->vcsq[0], &args->str);
+ for (j=1; j<vrec->nvcsq; j++)
+ {
+ kputc_(',', &args->str);
+ kput_vcsq(&vrec->vcsq[j], &args->str);
+ }
+ bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s);
+ if ( args->hdr_nsmpl )
+ {
+ if ( vrec->nfmt < args->nfmt_bcsq )
+ for (j=1; j<args->hdr_nsmpl; j++) memcpy(vrec->smpl+j*vrec->nfmt, vrec->smpl+j*args->nfmt_bcsq, vrec->nfmt*sizeof(*vrec->smpl));
+ bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt);
+ }
+ vrec->nvcsq = 0;
+ bcf_write(args->out_fh, args->hdr, vrec->line);
+ }
+ if ( vbuf->n )
+ {
+ khint_t k = kh_get(pos2vbuf, args->pos2vbuf, vbuf->vrec[0]->line->pos);
+ if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k);
+ }
+ vbuf->n = 0;
+ }
+
+ for (i=0; i<args->nrm_tr; i++)
+ {
+ tscript_t *tr = args->rm_tr[i];
+ if ( tr->root ) hap_destroy(tr->root);
+ tr->root = NULL;
+ free(tr->hap);
+ free(tr->ref);
+ free(tr->sref);
+ }
+ args->nrm_tr = 0;
+ args->ncsq_buf = 0;
+}
+
+void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
+{
+ int i, len;
+ int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
+
+ tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
+ if ( !tr->ref )
+ error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
+
+ int pad_end = len - (tr->end - tr->beg + 1 + pad_beg);
+ if ( pad_beg + pad_end != 2*N_REF_PAD )
+ {
+ char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD);
+ for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N';
+ memcpy(ref+i, tr->ref, len);
+ for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N';
+ free(tr->ref);
+ tr->ref = ref;
+ }
+}
+
+static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
+{
+ char *ref = tr->ref + (rec->pos + N_REF_PAD >= tr->beg ? rec->pos - tr->beg + N_REF_PAD : 0);
+ char *vcf = rec->d.allele[0] + (rec->pos + N_REF_PAD >= tr->beg ? 0 : tr->beg - N_REF_PAD - rec->pos);
+ assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) );
+ while ( *ref && *vcf )
+ {
+ if ( *ref!=*vcf && toupper(*ref)!=toupper(*vcf) )
+ error("Error: the fasta reference does not match the VCF REF allele at %s:%d .. %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,rec->d.allele[0]);
+ ref++;
+ vcf++;
+ }
+}
+
+int test_cds_local(args_t *args, bcf1_t *rec)
+{
+ int i,j, ret = 0;
+ const char *chr = bcf_seqname(args->hdr,rec);
+ // note that the off-by-one extension of rlen is deliberate to account for insertions
+ if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+
+ // structures to fake the normal test_cds machinery
+ hap_node_t root, node;
+ root.type = HAP_ROOT;
+ kstring_t *tref = &args->hap->tref, *tseq = &args->hap->tseq;
+
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
+ tscript_t *tr = cds->tr;
+ if ( !GF_is_coding(tr->type) ) continue;
+ ret = 1;
+
+ if ( !tr->ref )
+ {
+ tscript_init_ref(args, tr, chr);
+ tscript_splice_ref(tr);
+ khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards
+ }
+
+ sanity_check_ref(args, tr, rec);
+
+ kstring_t sref;
+ sref.s = tr->sref;
+ sref.l = tr->nsref;
+ sref.m = sref.l;
+
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue;
+
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+
+ int csq_type = node.csq;
+
+ // code repetition: it would be nice to reuse the code from hap_add_csq, needs have refactoring though
+ if ( node.type == HAP_SSS )
+ {
+ csq.type.type = csq_type;
+ csq_stage(args, &csq, rec);
+ }
+ else
+ {
+ kstring_t sseq;
+ sseq.m = sref.m - 2*N_REF_PAD + node.dlen;
+ sseq.s = node.seq;
+ int alen = sseq.l = strlen(sseq.s);
+ int fill = node.dlen%3 && alen ? 1 : 0; // see #1475227917
+ cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, fill);
+
+ sseq.m = sref.m - 2*N_REF_PAD;
+ sseq.s = sref.s + N_REF_PAD + node.sbeg;
+ sseq.l = node.rlen;
+ cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, fill);
+
+ // check for truncating stops
+ for (j=0; j<tref->l; j++)
+ if ( tref->s[j]=='*' ) break;
+ if ( j!=tref->l )
+ {
+ tref->l = j+1;
+ tref->s[j+1] = 0;
+ }
+ for (j=0; j<tseq->l; j++)
+ if ( tseq->s[j]=='*' ) break;
+ if ( j!=tseq->l )
+ {
+ tseq->l = j+1;
+ tseq->s[j+1] = 0;
+ }
+ if ( csq_type & CSQ_STOP_LOST )
+ {
+ if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] )
+ {
+ csq_type &= ~CSQ_STOP_LOST;
+ csq_type |= CSQ_STOP_RETAINED;
+ }
+ else if (tref->s[tref->l-1]!='*' )
+ {
+ // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
+ // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
+ if ( tseq->s[tseq->l-1] == '*' )
+ {
+ csq_type &= ~CSQ_STOP_GAINED;
+ csq_type |= CSQ_STOP_RETAINED;
+ }
+ else
+ csq_type |= CSQ_INCOMPLETE_CDS;
+ }
+ }
+ if ( csq_type & CSQ_START_LOST && tref->s[0]!='M' )
+ csq_type &= ~CSQ_START_LOST;
+ if ( node.dlen!=0 )
+ {
+ if ( node.dlen%3 )
+ csq_type |= CSQ_FRAMESHIFT_VARIANT;
+ else if ( node.dlen<0 )
+ csq_type |= CSQ_INFRAME_DELETION;
+ else
+ csq_type |= CSQ_INFRAME_INSERTION;
+ }
+ else
+ {
+ for (j=0; j<tref->l; j++)
+ if ( tref->s[j] != tseq->s[j] ) break;
+ if ( j==tref->l )
+ csq_type |= CSQ_SYNONYMOUS_VARIANT;
+ else if ( tref->s[j] == '*' )
+ csq_type |= CSQ_STOP_LOST;
+ else if ( tseq->s[j] == '*' )
+ csq_type |= CSQ_STOP_GAINED;
+ else
+ csq_type |= CSQ_MISSENSE_VARIANT;
+ }
+ if ( csq_type & CSQ_COMPOUND )
+ {
+ // create the aa variant string
+ kstring_t str = {0,0,0};
+ int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
+ int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
+ kputc_('|', &str);
+ kputw(aa_rbeg, &str);
+ kputs(tref->s, &str);
+ if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) )
+ {
+ kputc_('>', &str);
+ kputw(aa_sbeg, &str);
+ kputs(tseq->s, &str);
+ }
+ kputc_('|', &str);
+ kputw(rec->pos+1, &str);
+ kputs(node.var, &str);
+ csq.type.vstr = str;
+ csq.type.type = csq_type & CSQ_COMPOUND;
+ csq_stage(args, &csq, rec);
+
+ // all this only to clean vstr when vrec is flushed
+ if ( !tr->root )
+ tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ tr->root->ncsq_list++;
+ hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
+ csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
+ rm_csq->type.vstr = str;
+ }
+ if ( csq_type & ~CSQ_COMPOUND )
+ {
+ csq.type.type = csq_type & ~CSQ_COMPOUND;
+ csq.type.vstr.l = 0;
+ csq_stage(args, &csq, rec);
+ }
+ }
+ free(node.seq);
+ free(node.var);
+ }
+ }
+ return ret;
+}
+
+int test_cds(args_t *args, bcf1_t *rec)
+{
+ int i, ret = 0, hap_ret;
+ const char *chr = bcf_seqname(args->hdr,rec);
+ // note that the off-by-one extension of rlen is deliberate to account for insertions
+ if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
+ tscript_t *tr = cds->tr;
+ if ( !GF_is_coding(tr->type) ) continue;
+ ret = 1;
+ if ( !tr->root )
+ {
+ // initialize the transcript and its haplotype tree, fetch the reference sequence
+ tscript_init_ref(args, tr, chr);
+
+ tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid
+ tr->hap = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*));
+ for (i=0; i<tr->nhap; i++) tr->hap[i] = NULL;
+ tr->root->nend = tr->nhap;
+ tr->root->type = HAP_ROOT;
+
+ khp_insert(trhp, args->active_tr, &tr);
+ }
+
+ sanity_check_ref(args, tr, rec);
+
+ if ( args->phase==PHASE_DROP_GT )
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
+ hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
+ if ( (hap_ret=hap_init(args, parent, child, cds, rec, 1))!=0 )
+ {
+ // overlapping or intron variant, cannot apply
+ if ( hap_ret==1 )
+ {
+ if ( !args->quiet )
+ fprintf(pysam_stderr,"Warning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ if ( args->out )
+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ }
+ else ret = 1; // prevent reporting as intron in test_tscript
+ free(child);
+ continue;
+ }
+ parent->nend--;
+ parent->nchild = 1;
+ parent->mchild = 1;
+ parent->child = (hap_node_t**) malloc(sizeof(hap_node_t*));
+ parent->child[0] = child;
+ tr->hap[0] = child;
+ tr->hap[0]->nend = 1;
+ continue;
+ }
+
+ // apply the VCF variants and extend the haplotype tree
+ int j, ismpl, ihap, ngts = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
+ ngts /= bcf_hdr_nsamples(args->hdr);
+ if ( ngts!=1 && ngts!=2 )
+ {
+ if ( !args->quiet )
+ fprintf(pysam_stderr,"Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ if ( args->out )
+ fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ continue;
+ }
+ for (ismpl=0; ismpl<args->smpl->n; ismpl++)
+ {
+ int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
+ if ( gt[0]==bcf_gt_missing ) continue;
+
+ if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end )
+ {
+ if ( args->phase==PHASE_MERGE )
+ {
+ if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
+ }
+ if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
+ {
+ if ( args->phase==PHASE_REQUIRE )
+ error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
+ if ( args->phase==PHASE_SKIP )
+ continue;
+ if ( args->phase==PHASE_NON_REF )
+ {
+ if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
+ else if ( !bcf_gt_allele(gt[1]) ) gt[1] = gt[0];
+ }
+ }
+ }
+
+ for (ihap=0; ihap<ngts; ihap++)
+ {
+ if ( gt[ihap]==bcf_gt_missing || gt[ihap]==bcf_int32_vector_end ) continue;
+
+ i = 2*ismpl + ihap;
+
+ int ial = bcf_gt_allele(gt[ihap]);
+ if ( !ial ) continue;
+ assert( ial < rec->n_allele );
+ if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }
+
+ hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root;
+ if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 )
+ {
+ // this haplotype has been seen in another sample
+ tr->hap[i] = parent->child[ parent->cur_child[ial] ];
+ tr->hap[i]->nend++;
+ parent->nend--;
+ continue;
+ }
+
+ hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
+ if ( (hap_ret=hap_init(args, parent, child, cds, rec, ial))!=0 )
+ {
+ // overlapping or intron variant, cannot apply
+ if ( hap_ret==1 )
+ {
+ if ( !args->quiet )
+ fprintf(pysam_stderr,"Warning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n",
+ chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
+ if ( args->out )
+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n",
+ chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
+ }
+ free(child);
+ continue;
+ }
+
+ if ( parent->cur_rec!=rec )
+ {
+ hts_expand(int,rec->n_allele,parent->mcur_child,parent->cur_child);
+ for (j=0; j<rec->n_allele; j++) parent->cur_child[j] = -1;
+ parent->cur_rec = rec;
+ }
+
+ j = parent->nchild++;
+ hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child);
+ parent->cur_child[ial] = j;
+ parent->child[j] = child;
+ tr->hap[i] = child;
+ tr->hap[i]->nend++;
+ parent->nend--;
+ }
+ }
+ }
+ return ret;
+}
+
+void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
+{
+ // known issues: tab output leads to unsorted output. This is because
+ // coding haplotypes are printed in one go and buffering is not used
+ // with tab output. VCF output is OK though.
+ if ( csq_push(args, csq, rec)!=0 ) return; // the consequence already exists
+
+ int i,j,ngt = 0;
+ if ( args->phase!=PHASE_DROP_GT )
+ {
+ ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
+ if ( ngt>0 ) ngt /= bcf_hdr_nsamples(args->hdr);
+ }
+ if ( ngt<=0 )
+ {
+ if ( args->output_type==FT_TAB_TEXT )
+ csq_print_text(args, csq, -1,0);
+ return;
+ }
+ assert( ngt<=2 );
+
+ if ( args->output_type==FT_TAB_TEXT )
+ {
+ for (i=0; i<args->smpl->n; i++)
+ {
+ int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
+ for (j=0; j<ngt; j++)
+ {
+ if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
+ csq_print_text(args, csq, args->smpl->idx[i],j+1);
+ }
+ }
+ return;
+ }
+
+ vrec_t *vrec = csq->vrec;
+ for (i=0; i<args->smpl->n; i++)
+ {
+ int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
+ for (j=0; j<ngt; j++)
+ {
+ if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
+
+ int icsq = 2*csq->idx + j;
+ if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+ {
+ int ismpl = args->smpl->idx[i];
+ int print_warning = 1;
+ if ( args->quiet )
+ {
+ if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0;
+ args->ncsq_small_warned = 1;
+ }
+ if ( print_warning )
+ {
+ fprintf(pysam_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n",
+ args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1);
+ if ( args->quiet ) fprintf(pysam_stderr,"(This warning is printed only once)\n");
+ }
+ break;
+ }
+ if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
+ vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+ }
+ }
+}
+int test_utr(args_t *args, bcf1_t *rec)
+{
+ const char *chr = bcf_seqname(args->hdr,rec);
+ // note that the off-by-one extension of rlen is deliberate to account for insertions
+ if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+
+ splice_t splice;
+ splice_init(&splice, rec);
+
+ int i, ret = 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
+ tscript_t *tr = splice.tr = utr->tr;
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ splice.vcf.alt = rec->d.allele[i];
+ int splice_ret = splice_csq(args, &splice, utr->beg, utr->end);
+ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue;
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+ ret = 1;
+ }
+ }
+ assert(!splice.kref.s);
+ assert(!splice.kalt.s);
+ return ret;
+}
+int test_splice(args_t *args, bcf1_t *rec)
+{
+ const char *chr = bcf_seqname(args->hdr,rec);
+ if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
+
+ splice_t splice;
+ splice_init(&splice, rec);
+ splice.check_acceptor = splice.check_donor = 1;
+
+ int i, ret = 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*);
+ splice.tr = exon->tr;
+ if ( !splice.tr->ncds ) continue; // not a coding transcript, no interest in splice sites
+
+ splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1;
+ splice.check_region_end = splice.tr->end==exon->end ? 0 : 1;
+
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ splice.vcf.alt = rec->d.allele[i];
+ splice_csq(args, &splice, exon->beg, exon->end);
+ if ( splice.csq ) ret = 1;
+ }
+ }
+ free(splice.kref.s);
+ free(splice.kalt.s);
+ return ret;
+}
+int test_tscript(args_t *args, bcf1_t *rec)
+{
+ const char *chr = bcf_seqname(args->hdr,rec);
+ if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+
+ splice_t splice;
+ splice_init(&splice, rec);
+
+ int i, ret = 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ splice.vcf.alt = rec->d.allele[i];
+ int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
+ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+ ret = 1;
+ }
+ }
+ assert(!splice.kref.s);
+ assert(!splice.kalt.s);
+ return ret;
+}
+
+void process(args_t *args, bcf1_t **rec_ptr)
+{
+ if ( !rec_ptr )
+ {
+ hap_flush(args, REGIDX_MAX);
+ vbuf_flush(args);
+ return;
+ }
+
+ bcf1_t *rec = *rec_ptr;
+
+ int call_csq = 1;
+ if ( !rec->n_allele ) call_csq = 0; // no alternate allele
+ else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele
+ else if ( args->filter )
+ {
+ call_csq = filter_test(args->filter, rec, NULL);
+ if ( args->filter_logic==FLT_EXCLUDE ) call_csq = call_csq ? 0 : 1;
+ }
+ if ( !call_csq )
+ {
+ if ( !args->out_fh ) return; // not a VCF output
+ vbuf_push(args, rec_ptr);
+ vbuf_flush(args);
+ return;
+ }
+
+ if ( args->rid != rec->rid )
+ {
+ hap_flush(args, REGIDX_MAX);
+ vbuf_flush(args);
+ }
+ args->rid = rec->rid;
+ vbuf_push(args, rec_ptr);
+
+ int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec);
+ hit += test_utr(args, rec);
+ hit += test_splice(args, rec);
+ if ( !hit ) test_tscript(args, rec);
+
+ hap_flush(args, rec->pos-1);
+ vbuf_flush(args);
+
+ return;
+}
+
+const char *usage(void)
+{
+ return
+ "\n"
+ "About: Haplotype-aware consequence caller.\n"
+ "Usage: bcftools csq [options] in.vcf\n"
+ "\n"
+ "Required options:\n"
+ " -f, --fasta-ref <file> reference file in fasta format\n"
+ " -g, --gff-annot <file> gff3 annotation file\n"
+ "\n"
+ "CSQ options:\n"
+ " -c, --custom-tag <string> use this tag instead of the default BCSQ\n"
+ " -l, --local-csq localized predictions, consider only one VCF record at a time\n"
+ " -n, --ncsq <int> maximum number of consequences to consider per site [16]\n"
+ " -p, --phase <a|m|r|R|s> how to construct haplotypes and how to deal with unphased data: [r]\n"
+ " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
+ " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
+ " r: require phased GTs, throw an error on unphased het GTs\n"
+ " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
+ " s: skip unphased GTs\n"
+ "Options:\n"
+ " -e, --exclude <expr> exclude sites for which the expression is true\n"
+ " -i, --include <expr> select sites for which the expression is true\n"
+ " -o, --output <file> write output to a file [standard output]\n"
+ " -O, --output-type <b|u|z|v|t> b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
+ " v: uncompressed VCF, t: plain tab-delimited text output [v]\n"
+ " -q, --quiet suppress warning messages. Can be given two times for even less messages\n"
+ " -r, --regions <region> restrict to comma-separated list of regions\n"
+ " -R, --regions-file <file> restrict to regions listed in a file\n"
+ " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n"
+ " -S, --samples-file <file> samples to include\n"
+ " -t, --targets <region> similar to -r but streams rather than index-jumps\n"
+ " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n"
+ "\n"
+ "Example:\n"
+ " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
+ "\n"
+ " # GFF3 annotation files can be downloaded from Ensembl. e.g. for human:\n"
+ " ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n"
+ " ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/\n"
+ "\n";
+}
+
+int main_csq(int argc, char *argv[])
+{
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->output_type = FT_VCF;
+ args->bcsq_tag = "BCSQ";
+ args->ncsq_max = 2*16;
+
+ static struct option loptions[] =
+ {
+ {"help",0,0,'h'},
+ {"ncsq",1,0,'n'},
+ {"custom-tag",1,0,'c'},
+ {"local-csq",0,0,'l'},
+ {"gff-annot",1,0,'g'},
+ {"fasta-ref",1,0,'f'},
+ {"include",1,0,'i'},
+ {"exclude",1,0,'e'},
+ {"output",1,0,'o'},
+ {"output-type",1,NULL,'O'},
+ {"phase",1,0,'p'},
+ {"quiet",0,0,'q'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {0,0,0,0}
+ };
+ int c, targets_is_file = 0, regions_is_file = 0;
+ char *targets_list = NULL, *regions_list = NULL;
+ while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:",loptions,NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 'l': args->local_csq = 1; break;
+ case 'c': args->bcsq_tag = optarg; break;
+ case 'q': args->quiet++; break;
+ case 'p':
+ switch (optarg[0])
+ {
+ case 'a': args->phase = PHASE_AS_IS; break;
+ case 'm': args->phase = PHASE_MERGE; break;
+ case 'r': args->phase = PHASE_REQUIRE; break;
+ case 'R': args->phase = PHASE_NON_REF; break;
+ case 's': args->phase = PHASE_SKIP; break;
+ default: error("The -p code \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'f': args->fa_fname = optarg; break;
+ case 'g': args->gff_fname = optarg; break;
+ case 'n':
+ args->ncsq_max = 2 * atoi(optarg);
+ if ( args->ncsq_max <=0 ) error("Expected positive integer with -n, got %s\n", optarg);
+ break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 't': args->output_type = FT_TAB_TEXT; break;
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'r': regions_list = optarg; break;
+ case 'R': regions_list = optarg; regions_is_file = 1; break;
+ case 's': args->sample_list = optarg; break;
+ case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
+ case 't': targets_list = optarg; break;
+ case 'T': targets_list = optarg; targets_is_file = 1; break;
+ case 'h':
+ case '?': error("%s",usage());
+ default: error("The option not recognised: %s\n\n", optarg); break;
+ }
+ }
+ char *fname = NULL;
+ if ( optind==argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else error("%s", usage());
+ }
+ else fname = argv[optind];
+ if ( argc - optind>1 ) error("%s", usage());
+ if ( !args->fa_fname ) error("Missing the --fa-ref option\n");
+ if ( !args->gff_fname ) error("Missing the --gff option\n");
+ args->sr = bcf_sr_init();
+ if ( targets_list && bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", targets_list);
+ if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", regions_list);
+ if ( !bcf_sr_add_reader(args->sr, fname) )
+ error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->sr->errnum));
+ args->hdr = bcf_sr_get_header(args->sr,0);
+
+ init_data(args);
+ while ( bcf_sr_next_line(args->sr) )
+ {
+ process(args, &args->sr->readers[0].buffer[0]);
+ }
+ process(args,NULL);
+
+ destroy_data(args);
+ bcf_sr_destroy(args->sr);
+ free(args);
+
+ return 0;
+}
+
diff --git a/bcftools/filter.c b/bcftools/filter.c
index c56ae6d..463028f 100644
--- a/bcftools/filter.c
+++ b/bcftools/filter.c
@@ -24,6 +24,7 @@ THE SOFTWARE. */
#include <ctype.h>
#include <stdlib.h>
+#include <strings.h>
#include <errno.h>
#include <math.h>
#include <wordexp.h>
@@ -34,13 +35,37 @@ THE SOFTWARE. */
#include <htslib/hts_defs.h>
#include <htslib/vcfutils.h>
+#ifndef __FUNCTION__
+# define __FUNCTION__ __func__
+#endif
+
+uint64_t bcf_double_missing = 0x7ff0000000000001;
+uint64_t bcf_double_vector_end = 0x7ff0000000000002;
+static inline void bcf_double_set(double *ptr, uint64_t value)
+{
+ union { uint64_t i; double d; } u;
+ u.i = value;
+ *ptr = u.d;
+}
+static inline int bcf_double_test(double d, uint64_t value)
+{
+ union { uint64_t i; double d; } u;
+ u.d = d;
+ return u.i==value ? 1 : 0;
+}
+#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end)
+#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing)
+#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end)
+#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing)
+
+
typedef struct _token_t
{
// read-only values, same for all VCF lines
int tok_type; // one of the TOK_* keys below
char *key; // set only for string constants, otherwise NULL
char *tag; // for debugging and printout only, VCF tag name
- float threshold; // filtering threshold
+ double threshold; // filtering threshold
int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types
int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*])
void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
@@ -49,7 +74,7 @@ typedef struct _token_t
regex_t *regex; // precompiled regex for string comparison
// modified on filter evaluation at each VCF line
- float *values; // In case str_value is set, values[0] is one sample's string length
+ double *values; // In case str_value is set, values[0] is one sample's string length
char *str_value; // and values[0]*nsamples gives the total length;
int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
int pass_site; // -1 not applicable, 0 fails, >0 pass
@@ -67,7 +92,8 @@ struct _filter_t
int nfilters;
token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack
int32_t *tmpi;
- int max_unpack, mtmpi, nsamples;
+ float *tmpf;
+ int max_unpack, mtmpi, mtmpf, nsamples;
};
@@ -221,13 +247,15 @@ static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok)
tok->nvalues = 0;
else
{
- tok->values[0] = line->qual;
+ tok->values[0] = (double)line->qual;
tok->nvalues = 1;
}
}
static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok)
{
tok->values[0] = bcf_get_variant_types(line);
+ if ( !tok->values[0] ) tok->values[0] = 1; // mistake in htslib: VCF_* should start with 1
+ else tok->values[0] = ((int)tok->values[0]) << 1;
tok->nvalues = 1;
}
static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
@@ -272,6 +300,13 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
tok->str_value = NULL;
}
}
+static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+{
+ int a = (int)(atok->nvalues?atok->values[0]:atok->threshold);
+ int b = (int)(btok->nvalues?btok->values[0]:btok->threshold);
+ if ( op_type==TOK_LIKE ) return a&b ? 1 : 0;
+ return a&b ? 0 : 1;
+}
static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
{
int i;
@@ -316,7 +351,7 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin
}
/**
- * bcf_get_info_value() - get single INFO value, int or float
+ * bcf_get_info_value() - get single INFO value, int64_t or double
* @line: BCF line
* @info_id: tag ID, as returned by bcf_hdr_id2int
* @ivec: 0-based index to retrieve, -1 when single value is expected
@@ -336,8 +371,8 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value)
bcf_info_t *info = &line->d.info[j];
if ( info->len == 1 )
{
- if ( info->type==BCF_BT_FLOAT ) *((float*)value) = info->v1.f;
- else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int*)value) = info->v1.i;
+ if ( info->type==BCF_BT_FLOAT ) *((double*)value) = info->v1.f;
+ else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int64_t*)value) = info->v1.i;
return 1;
}
@@ -354,10 +389,10 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value)
return 1; \
}
switch (info->type) {
- case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int); break;
- case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int); break;
- case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int); break;
- case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), float); break;
+ case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int64_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break;
default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
@@ -374,14 +409,18 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok)
{
if ( tok->idx==-2 )
{
- int i, n = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
- tok->nvalues = n;
- hts_expand(float,n,tok->mvalues,tok->values);
- for (i=0; i<n; i++) tok->values[i] = flt->tmpi[i];
+ int i;
+ tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
+ if ( tok->nvalues<=0 ) tok->nvalues = 0;
+ else
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++) tok->values[i] = flt->tmpi[i];
+ }
}
else
{
- int32_t value;
+ int64_t value;
if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
tok->nvalues = 0;
else
@@ -396,12 +435,20 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
if ( tok->idx==-2 )
{
- tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues);
- if ( tok->nvalues<0 ) tok->nvalues = 0;
+ int i;
+ tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf);
+ if ( tok->nvalues<=0 ) tok->nvalues = 0;
+ else
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++)
+ if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[i]);
+ else tok->values[i] = flt->tmpf[i];
+ }
}
else
{
- float value;
+ double value;
if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
tok->nvalues = 0;
else
@@ -460,11 +507,11 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
else
{
int is_missing = 1;
- hts_expand(float,tok->nvalues,tok->mvalues,tok->values);
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
for (i=0; i<tok->nvalues; i++)
{
if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end )
- bcf_float_set_missing(tok->values[i]);
+ bcf_double_set_missing(tok->values[i]);
else
{
tok->values[i] = flt->tmpi[i];
@@ -490,20 +537,38 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
}
static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
- if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues))<=0 )
+ int i;
+ if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<=0 )
+ {
tok->nvalues = tok->nsamples = 0; // missing values
- else if ( tok->idx >= 0 )
+ }
+ else
{
- int i, nsmpl, nvals;
- nsmpl = bcf_hdr_nsamples(flt->hdr);
- nvals = tok->nvalues / nsmpl;
- if ( tok->idx >= nvals )
- tok->nsamples = tok->nvalues = 0; // the index is too big
- else
+ int is_missing = 1;
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++)
{
- for (i=0; i<nsmpl; i++)
- tok->values[i] = tok->values[i*nvals+tok->idx];
- tok->nsamples = tok->nvalues = nsmpl;
+ if ( bcf_float_is_missing(flt->tmpf[i]) || bcf_float_is_vector_end(flt->tmpf[i]) )
+ bcf_double_set_missing(tok->values[i]);
+ else
+ {
+ tok->values[i] = flt->tmpf[i];
+ is_missing = 0;
+ }
+ }
+ if ( is_missing ) tok->nvalues = 0;
+ else if ( tok->idx >= 0 )
+ {
+ int nsmpl = bcf_hdr_nsamples(flt->hdr);
+ int nvals = tok->nvalues / nsmpl;
+ if ( tok->idx >= nvals )
+ tok->nvalues = 0; // the index is too big
+ else
+ {
+ for (i=0; i<nsmpl; i++)
+ tok->values[i] = tok->values[i*nvals+tok->idx];
+ tok->nvalues = nsmpl;
+ }
}
}
tok->nsamples = tok->nvalues;
@@ -567,7 +632,7 @@ static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *to
tok->nvalues = tok->nsamples = 0;
return;
}
- int i, blen = 3, nsmpl = bcf_hdr_nsamples(flt->hdr);
+ int i, blen = 4, nsmpl = bcf_hdr_nsamples(flt->hdr);
kstring_t str;
gt_length_too_big:
@@ -576,29 +641,15 @@ gt_length_too_big:
{
int plen = str.l;
- #define BRANCH(type_t) { \
- type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
- if ( !(ptr[0]>>1) ) kputc('.',&str); \
- }
- switch (fmt->type) {
- case BCF_BT_INT8: BRANCH(int8_t); break;
- case BCF_BT_INT16: BRANCH(int16_t); break;
- case BCF_BT_INT32: BRANCH(int32_t); break;
- default: fprintf(stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break;
- }
- #undef BRANCH
-
- if ( plen==str.l )
+ bcf_format_gt(fmt, i, &str);
+ kputc_(0,&str);
+ if ( str.l - plen > blen )
{
- bcf_format_gt(fmt, i, &str);
- if ( str.l - plen > blen )
- {
- // too many alternate alleles or ploidy is too large, the genotype does not fit
- // three characters ("0/0" vs "10/10").
- tok->str_value = str.s;
- blen *= 2;
- goto gt_length_too_big;
- }
+ // too many alternate alleles or ploidy is too large, the genotype does not fit
+ // three characters ("0/0" vs "10/10").
+ tok->str_value = str.s;
+ blen *= 2;
+ goto gt_length_too_big;
}
plen = str.l - plen;
@@ -680,7 +731,7 @@ static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok)
}
else
{
- hts_expand(float,line->n_allele,tok->mvalues,tok->values);
+ hts_expand(double,line->n_allele,tok->mvalues,tok->values);
for (i=1; i<line->n_allele; i++)
tok->values[i-1] = flt->tmpi[i];
tok->nvalues = line->n_allele - 1;
@@ -706,7 +757,7 @@ static void filters_set_af(filter_t *flt, bcf1_t *line, token_t *tok)
if ( !tok->nvalues ) return;
int i, an = flt->tmpi[0];
for (i=0; i<tok->nvalues; i++)
- tok->values[i] /= (float)an;
+ tok->values[i] /= (double)an;
}
static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok)
{
@@ -715,18 +766,18 @@ static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok)
int i, an = flt->tmpi[0];
for (i=0; i<tok->nvalues; i++)
{
- tok->values[i] /= (float)an;
+ tok->values[i] /= (double)an;
if ( tok->values[i] > 0.5 ) tok->values[i] = 1 - tok->values[i];
}
}
static void set_max(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = -HUGE_VAL;
+ double val = -HUGE_VAL;
int i;
for (i=0; i<tok->nvalues; i++)
{
- if ( !bcf_float_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
+ if ( !bcf_double_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
}
tok->values[0] = val;
tok->nvalues = 1;
@@ -734,30 +785,30 @@ static void set_max(filter_t *flt, bcf1_t *line, token_t *tok)
}
static void set_min(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = HUGE_VAL;
+ double val = HUGE_VAL;
int i;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_float_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
+ if ( !bcf_double_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
tok->values[0] = val;
tok->nvalues = 1;
tok->nsamples = 0;
}
static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = 0;
+ double val = 0;
int i, n = 0;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
+ if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
tok->values[0] = n ? val / n : 0;
tok->nvalues = 1;
tok->nsamples = 0;
}
static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = 0;
+ double val = 0;
int i, n = 0;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
+ if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
tok->values[0] = val;
tok->nvalues = 1;
tok->nsamples = 0;
@@ -812,20 +863,20 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
{ \
for (i=0; i<(atok)->nvalues; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[i]) ) continue; \
- if ( bcf_float_is_missing((btok)->values[i]) ) { bcf_float_set_missing((atok)->values[i]); continue; } \
+ if ( bcf_double_is_missing((atok)->values[i]) ) continue; \
+ if ( bcf_double_is_missing((btok)->values[i]) ) { bcf_double_set_missing((atok)->values[i]); continue; } \
has_values = 1; \
(atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \
} \
} \
else if ( (btok)->nsamples ) \
{ \
- hts_expand(float,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
+ hts_expand(double,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
for (i=0; i<(btok)->nvalues; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[0]) || bcf_float_is_missing((btok)->values[i]) ) \
+ if ( bcf_double_is_missing((atok)->values[0]) || bcf_double_is_missing((btok)->values[i]) ) \
{ \
- bcf_float_set_missing((atok)->values[i]); \
+ bcf_double_set_missing((atok)->values[i]); \
continue; \
} \
has_values = 1; \
@@ -838,9 +889,9 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
{ \
for (i=0; i<(atok)->nvalues; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[i]) || bcf_float_is_missing((btok)->values[0]) ) \
+ if ( bcf_double_is_missing((atok)->values[i]) || bcf_double_is_missing((btok)->values[0]) ) \
{ \
- bcf_float_set_missing((atok)->values[i]); \
+ bcf_double_set_missing((atok)->values[i]); \
continue; \
} \
has_values = 1; \
@@ -921,10 +972,14 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
for (i=0; i<btok->nsamples; i++)
atok->pass_samples[i] = btok->pass_samples[i];
atok->nsamples = btok->nsamples;
+ atok->nvalues = 1;
return btok->pass_site;
}
if ( !btok->nvalues ) // missing value in b
+ {
+ btok->nvalues = 1;
return atok->pass_site;
+ }
if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site;
if ( !atok->nsamples )
@@ -978,6 +1033,7 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \
token_t *tok = (atok)->is_missing ? (btok) : (atok); \
(ret) = ( tok->nvalues CMP_OP 1 ) ? 0 : 1; \
+ tok->nvalues = 1; \
}
#define CMP_VECTORS(atok,btok,CMP_OP,ret) \
@@ -990,8 +1046,6 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
{ \
for (i=0; i<(atok)->nsamples; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
has_values = 1; \
if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
else (atok)->pass_samples[i] = 0; \
@@ -1000,34 +1054,26 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
} \
else if ( (atok)->nsamples ) \
{ \
- if ( bcf_float_is_missing((btok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
- else \
+ for (i=0; i<(atok)->nsamples; i++) \
{ \
- for (i=0; i<(atok)->nsamples; i++) \
- { \
- if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- has_values = 1; \
- if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
- } \
+ /*if ( bcf_double_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; }*/ \
+ has_values = 1; \
+ if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
} \
if ( !has_values ) (atok)->nvalues = 0; \
} \
else if ( (btok)->nsamples ) \
{ \
- if ( bcf_float_is_missing((atok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
- else \
+ for (i=0; i<(btok)->nsamples; i++) \
{ \
- for (i=0; i<(btok)->nsamples; i++) \
- { \
- if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- has_values = 1; \
- if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
- } \
- (atok)->nvalues = (btok)->nvalues; \
- (atok)->nsamples = (btok)->nsamples; \
+ if ( bcf_double_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
+ has_values = 1; \
+ if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
} \
+ (atok)->nvalues = (btok)->nvalues; \
+ (atok)->nsamples = (btok)->nsamples; \
if ( !has_values ) (atok)->nvalues = 0; \
} \
else if ( (atok)->idx==-2 || (btok)->idx==-2 ) \
@@ -1124,10 +1170,23 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log
}
return pass_site;
}
-static int regex_vector_strings(token_t *atok, token_t *btok)
+static int regex_vector_strings(token_t *atok, token_t *btok, int negate)
{
- int ret = regexec(btok->regex, atok->str_value, 0,NULL,0);
- return ret==0 ? 1 : 0;
+ int i, pass_site = 0;
+ if ( atok->nsamples )
+ {
+ for (i=0; i<atok->nsamples; i++)
+ {
+ char *ptr = atok->str_value + i*(int)atok->values[0];
+ atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1;
+ if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
+ pass_site |= atok->pass_samples[i];
+ }
+ return pass_site;
+ }
+ pass_site = regexec(btok->regex, atok->str_value, 0,NULL,0) ? 0 : 1;
+ if ( negate ) pass_site = pass_site ? 0 : 1;
+ return pass_site;
}
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
@@ -1143,7 +1202,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
int quote = str[0];
if ( str[len-1] != quote ) error("TODO: [%s]\n", filter->str);
tok->key = (char*) calloc(len-1,sizeof(char));
- hts_expand(float,1,tok->mvalues,tok->values);
+ hts_expand(double,1,tok->mvalues,tok->values);
tok->values[0] = len-2;
memcpy(tok->key,str+1,len-2);
tok->key[len-2] = 0;
@@ -1372,11 +1431,18 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
return 0;
}
- // is it a value?
+ // is it a value? Here we parse as integer/float separately and use strtof
+ // rather than strtod, because the more accurate double representation
+ // would invalidate floating point comparisons like QUAL=59.2, obtained via
+ // htslib/vcf parser
char *end;
- errno = 0;
- tok->threshold = strtod(tmp.s, &end);
- if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ tok->threshold = strtol(tmp.s, &end, 10); // integer?
+ if ( end - tmp.s != strlen(tmp.s) )
+ {
+ errno = 0;
+ tok->threshold = strtof(tmp.s, &end); // float?
+ if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ }
if ( tmp.s ) free(tmp.s);
return 0;
@@ -1511,11 +1577,11 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
// Look for j="." and k numeric type
int j = i-1, k = i-2;
if ( !out[j].is_str ) { k = i-1, j = i-2; }
- if ( out[k].hdr_id>0 && out[j].is_str && !strcmp(".",out[j].key) )
+ if ( out[k].hdr_id>0 && out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
{
int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
- if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); }
- if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); }
+ if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
+ if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
}
}
if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE )
@@ -1524,7 +1590,14 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
if ( !out[j].key )
error("Could not parse the expression, wrong value for regex operator: %s\n", filter->str);
out[j].regex = (regex_t *) malloc(sizeof(regex_t));
- if ( regcomp(out[j].regex, out[j].key, REG_ICASE|REG_NOSUB) )
+ int cflags = REG_NOSUB;
+ int len = strlen(out[j].key);
+ if ( len>2 && out[j].key[len-1]=='i' && out[j].key[len-2]=='/' && out[j].key[len-3]!='\\' )
+ {
+ out[j].key[len-2] = 0;
+ cflags |= REG_ICASE;
+ }
+ if ( regcomp(out[j].regex, out[j].key, cflags) )
error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str);
}
if ( out[i].tok_type!=TOK_VAL ) continue;
@@ -1532,41 +1605,47 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
if ( !strcmp(out[i].tag,"TYPE") )
{
if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
- int j = i+1;
- if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1;
- if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
- if ( !strcasecmp(out[j].key,"snp") || !strcasecmp(out[j].key,"snps") ) { out[j].threshold = VCF_SNP; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"indel") || !strcasecmp(out[j].key,"indels") ) { out[j].threshold = VCF_INDEL; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"mnp") || !strcasecmp(out[j].key,"mnps") ) { out[j].threshold = VCF_MNP; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"other") ) { out[j].threshold = VCF_OTHER; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"ref") ) { out[j].threshold = VCF_REF; out[j].is_str = 0; }
- else error("The type \"%s\" not recognised: %s\n", out[j].key, filter->str);
- out[j].tag = out[j].key; out[j].key = NULL;
- i = j;
+ int itok, ival;
+ if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1, itok = i + 1;
+ else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1, itok = i + 1;
+ else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) itok = i + 2, ival = i + 1;
+ else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) itok = i + 2, ival = i + 1;
+ else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( !strcasecmp(out[ival].key,"snp") || !strcasecmp(out[ival].key,"snps") ) { out[ival].threshold = VCF_SNP<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"indel") || !strcasecmp(out[ival].key,"indels") ) { out[ival].threshold = VCF_INDEL<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"mnp") || !strcasecmp(out[ival].key,"mnps") ) { out[ival].threshold = VCF_MNP<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"other") ) { out[ival].threshold = VCF_OTHER<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"bnd") ) { out[ival].threshold = VCF_BND<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; }
+ else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str);
+ if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and;
+ out[ival].tag = out[ival].key; out[ival].key = NULL;
+ i = itok;
continue;
}
if ( !strcmp(out[i].tag,"FILTER") )
{
if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
- int j = i+1;
- if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; // the expression has "value"=FILTER rather than FILTER="value"
- if ( out[j].tok_type==TOK_LIKE ) out[j].tok_type = TOK_EQ; // for FILTER, ~ and !~ work the same way as = and !=
- if ( out[j].tok_type==TOK_NLIKE ) out[j].tok_type = TOK_NE;
- if ( out[j+1].tok_type==TOK_LIKE ) out[j+1].tok_type = TOK_EQ;
- if ( out[j+1].tok_type==TOK_NLIKE ) out[j+1].tok_type = TOK_NE;
- if ( out[j].tok_type!=TOK_VAL || !out[j].key )
+ int itok = i, ival;
+ if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_EQ, ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NE, ival = i - 1;
+ else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i;
+ else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_EQ, ival = ++i;
+ else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NE, ival = ++i;
+ else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( out[ival].tok_type!=TOK_VAL || !out[ival].key )
error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
- if ( strcmp(".",out[j].key) )
+ if ( strcmp(".",out[ival].key) )
{
- out[j].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[j].key);
- if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[j].hdr_id) )
- error("The filter \"%s\" not present in the VCF header\n", out[j].key);
+ out[ival].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[ival].key);
+ if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[ival].hdr_id) )
+ error("The filter \"%s\" not present in the VCF header\n", out[ival].key);
}
else
- out[j].hdr_id = -1;
- out[j].tag = out[j].key; out[j].key = NULL;
- out[i].hdr_id = out[j].hdr_id;
- i = j;
+ out[ival].hdr_id = -1;
+ out[ival].tag = out[ival].key; out[ival].key = NULL;
+ out[itok].hdr_id = out[ival].hdr_id;
continue;
}
}
@@ -1579,7 +1658,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
else if ( out[i].tok_type==TOK_SUM ) { out[i].setter = set_sum; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_ABS ) { out[i].setter = set_abs; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_LEN ) { out[i].setter = set_strlen; out[i].tok_type = TOK_FUNC; }
- hts_expand0(float,1,out[i].mvalues,out[i].values);
+ hts_expand0(double,1,out[i].mvalues,out[i].values);
if ( filter->nsamples )
{
out[i].pass_samples = (uint8_t*)malloc(filter->nsamples);
@@ -1618,6 +1697,7 @@ void filter_destroy(filter_t *filter)
free(filter->flt_stack);
free(filter->str);
free(filter->tmpi);
+ free(filter->tmpf);
free(filter);
}
@@ -1704,7 +1784,9 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
}
int is_true = 0;
- if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
+ if ( filter->filters[i].comparator )
+ is_true = filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],filter->filters[i].tok_type,line);
+ else if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
{
int skip = 0;
if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1;
@@ -1746,10 +1828,7 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE )
{
if ( is_str==2 )
- {
- is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1]);
- if ( filter->filters[i].tok_type == TOK_NLIKE ) is_true = is_true ? 0 : 1;
- }
+ is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type == TOK_LIKE ? 0 : 1);
else
error("The regex operator can be used on strings only: %s\n", filter->str);
}
diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c
index 531339e..44046f2 100644
--- a/bcftools/filter.c.pysam.c
+++ b/bcftools/filter.c.pysam.c
@@ -26,6 +26,7 @@ THE SOFTWARE. */
#include <ctype.h>
#include <stdlib.h>
+#include <strings.h>
#include <errno.h>
#include <math.h>
#include <wordexp.h>
@@ -36,13 +37,37 @@ THE SOFTWARE. */
#include <htslib/hts_defs.h>
#include <htslib/vcfutils.h>
+#ifndef __FUNCTION__
+# define __FUNCTION__ __func__
+#endif
+
+uint64_t bcf_double_missing = 0x7ff0000000000001;
+uint64_t bcf_double_vector_end = 0x7ff0000000000002;
+static inline void bcf_double_set(double *ptr, uint64_t value)
+{
+ union { uint64_t i; double d; } u;
+ u.i = value;
+ *ptr = u.d;
+}
+static inline int bcf_double_test(double d, uint64_t value)
+{
+ union { uint64_t i; double d; } u;
+ u.d = d;
+ return u.i==value ? 1 : 0;
+}
+#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end)
+#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing)
+#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end)
+#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing)
+
+
typedef struct _token_t
{
// read-only values, same for all VCF lines
int tok_type; // one of the TOK_* keys below
char *key; // set only for string constants, otherwise NULL
char *tag; // for debugging and printout only, VCF tag name
- float threshold; // filtering threshold
+ double threshold; // filtering threshold
int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types
int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*])
void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
@@ -51,7 +76,7 @@ typedef struct _token_t
regex_t *regex; // precompiled regex for string comparison
// modified on filter evaluation at each VCF line
- float *values; // In case str_value is set, values[0] is one sample's string length
+ double *values; // In case str_value is set, values[0] is one sample's string length
char *str_value; // and values[0]*nsamples gives the total length;
int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
int pass_site; // -1 not applicable, 0 fails, >0 pass
@@ -69,7 +94,8 @@ struct _filter_t
int nfilters;
token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack
int32_t *tmpi;
- int max_unpack, mtmpi, nsamples;
+ float *tmpf;
+ int max_unpack, mtmpi, mtmpf, nsamples;
};
@@ -223,13 +249,15 @@ static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok)
tok->nvalues = 0;
else
{
- tok->values[0] = line->qual;
+ tok->values[0] = (double)line->qual;
tok->nvalues = 1;
}
}
static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok)
{
tok->values[0] = bcf_get_variant_types(line);
+ if ( !tok->values[0] ) tok->values[0] = 1; // mistake in htslib: VCF_* should start with 1
+ else tok->values[0] = ((int)tok->values[0]) << 1;
tok->nvalues = 1;
}
static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
@@ -274,6 +302,13 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
tok->str_value = NULL;
}
}
+static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+{
+ int a = (int)(atok->nvalues?atok->values[0]:atok->threshold);
+ int b = (int)(btok->nvalues?btok->values[0]:btok->threshold);
+ if ( op_type==TOK_LIKE ) return a&b ? 1 : 0;
+ return a&b ? 0 : 1;
+}
static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
{
int i;
@@ -318,7 +353,7 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin
}
/**
- * bcf_get_info_value() - get single INFO value, int or float
+ * bcf_get_info_value() - get single INFO value, int64_t or double
* @line: BCF line
* @info_id: tag ID, as returned by bcf_hdr_id2int
* @ivec: 0-based index to retrieve, -1 when single value is expected
@@ -338,8 +373,8 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value)
bcf_info_t *info = &line->d.info[j];
if ( info->len == 1 )
{
- if ( info->type==BCF_BT_FLOAT ) *((float*)value) = info->v1.f;
- else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int*)value) = info->v1.i;
+ if ( info->type==BCF_BT_FLOAT ) *((double*)value) = info->v1.f;
+ else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int64_t*)value) = info->v1.i;
return 1;
}
@@ -356,10 +391,10 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value)
return 1; \
}
switch (info->type) {
- case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int); break;
- case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int); break;
- case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int); break;
- case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), float); break;
+ case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int64_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break;
default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
@@ -376,14 +411,18 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok)
{
if ( tok->idx==-2 )
{
- int i, n = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
- tok->nvalues = n;
- hts_expand(float,n,tok->mvalues,tok->values);
- for (i=0; i<n; i++) tok->values[i] = flt->tmpi[i];
+ int i;
+ tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
+ if ( tok->nvalues<=0 ) tok->nvalues = 0;
+ else
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++) tok->values[i] = flt->tmpi[i];
+ }
}
else
{
- int32_t value;
+ int64_t value;
if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
tok->nvalues = 0;
else
@@ -398,12 +437,20 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
if ( tok->idx==-2 )
{
- tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues);
- if ( tok->nvalues<0 ) tok->nvalues = 0;
+ int i;
+ tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf);
+ if ( tok->nvalues<=0 ) tok->nvalues = 0;
+ else
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++)
+ if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[i]);
+ else tok->values[i] = flt->tmpf[i];
+ }
}
else
{
- float value;
+ double value;
if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
tok->nvalues = 0;
else
@@ -462,11 +509,11 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
else
{
int is_missing = 1;
- hts_expand(float,tok->nvalues,tok->mvalues,tok->values);
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
for (i=0; i<tok->nvalues; i++)
{
if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end )
- bcf_float_set_missing(tok->values[i]);
+ bcf_double_set_missing(tok->values[i]);
else
{
tok->values[i] = flt->tmpi[i];
@@ -492,20 +539,38 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
}
static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
- if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues))<=0 )
+ int i;
+ if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<=0 )
+ {
tok->nvalues = tok->nsamples = 0; // missing values
- else if ( tok->idx >= 0 )
+ }
+ else
{
- int i, nsmpl, nvals;
- nsmpl = bcf_hdr_nsamples(flt->hdr);
- nvals = tok->nvalues / nsmpl;
- if ( tok->idx >= nvals )
- tok->nsamples = tok->nvalues = 0; // the index is too big
- else
+ int is_missing = 1;
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++)
{
- for (i=0; i<nsmpl; i++)
- tok->values[i] = tok->values[i*nvals+tok->idx];
- tok->nsamples = tok->nvalues = nsmpl;
+ if ( bcf_float_is_missing(flt->tmpf[i]) || bcf_float_is_vector_end(flt->tmpf[i]) )
+ bcf_double_set_missing(tok->values[i]);
+ else
+ {
+ tok->values[i] = flt->tmpf[i];
+ is_missing = 0;
+ }
+ }
+ if ( is_missing ) tok->nvalues = 0;
+ else if ( tok->idx >= 0 )
+ {
+ int nsmpl = bcf_hdr_nsamples(flt->hdr);
+ int nvals = tok->nvalues / nsmpl;
+ if ( tok->idx >= nvals )
+ tok->nvalues = 0; // the index is too big
+ else
+ {
+ for (i=0; i<nsmpl; i++)
+ tok->values[i] = tok->values[i*nvals+tok->idx];
+ tok->nvalues = nsmpl;
+ }
}
}
tok->nsamples = tok->nvalues;
@@ -569,7 +634,7 @@ static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *to
tok->nvalues = tok->nsamples = 0;
return;
}
- int i, blen = 3, nsmpl = bcf_hdr_nsamples(flt->hdr);
+ int i, blen = 4, nsmpl = bcf_hdr_nsamples(flt->hdr);
kstring_t str;
gt_length_too_big:
@@ -578,29 +643,15 @@ gt_length_too_big:
{
int plen = str.l;
- #define BRANCH(type_t) { \
- type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
- if ( !(ptr[0]>>1) ) kputc('.',&str); \
- }
- switch (fmt->type) {
- case BCF_BT_INT8: BRANCH(int8_t); break;
- case BCF_BT_INT16: BRANCH(int16_t); break;
- case BCF_BT_INT32: BRANCH(int32_t); break;
- default: fprintf(pysam_stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break;
- }
- #undef BRANCH
-
- if ( plen==str.l )
+ bcf_format_gt(fmt, i, &str);
+ kputc_(0,&str);
+ if ( str.l - plen > blen )
{
- bcf_format_gt(fmt, i, &str);
- if ( str.l - plen > blen )
- {
- // too many alternate alleles or ploidy is too large, the genotype does not fit
- // three characters ("0/0" vs "10/10").
- tok->str_value = str.s;
- blen *= 2;
- goto gt_length_too_big;
- }
+ // too many alternate alleles or ploidy is too large, the genotype does not fit
+ // three characters ("0/0" vs "10/10").
+ tok->str_value = str.s;
+ blen *= 2;
+ goto gt_length_too_big;
}
plen = str.l - plen;
@@ -682,7 +733,7 @@ static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok)
}
else
{
- hts_expand(float,line->n_allele,tok->mvalues,tok->values);
+ hts_expand(double,line->n_allele,tok->mvalues,tok->values);
for (i=1; i<line->n_allele; i++)
tok->values[i-1] = flt->tmpi[i];
tok->nvalues = line->n_allele - 1;
@@ -708,7 +759,7 @@ static void filters_set_af(filter_t *flt, bcf1_t *line, token_t *tok)
if ( !tok->nvalues ) return;
int i, an = flt->tmpi[0];
for (i=0; i<tok->nvalues; i++)
- tok->values[i] /= (float)an;
+ tok->values[i] /= (double)an;
}
static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok)
{
@@ -717,18 +768,18 @@ static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok)
int i, an = flt->tmpi[0];
for (i=0; i<tok->nvalues; i++)
{
- tok->values[i] /= (float)an;
+ tok->values[i] /= (double)an;
if ( tok->values[i] > 0.5 ) tok->values[i] = 1 - tok->values[i];
}
}
static void set_max(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = -HUGE_VAL;
+ double val = -HUGE_VAL;
int i;
for (i=0; i<tok->nvalues; i++)
{
- if ( !bcf_float_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
+ if ( !bcf_double_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
}
tok->values[0] = val;
tok->nvalues = 1;
@@ -736,30 +787,30 @@ static void set_max(filter_t *flt, bcf1_t *line, token_t *tok)
}
static void set_min(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = HUGE_VAL;
+ double val = HUGE_VAL;
int i;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_float_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
+ if ( !bcf_double_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
tok->values[0] = val;
tok->nvalues = 1;
tok->nsamples = 0;
}
static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = 0;
+ double val = 0;
int i, n = 0;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
+ if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
tok->values[0] = n ? val / n : 0;
tok->nvalues = 1;
tok->nsamples = 0;
}
static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = 0;
+ double val = 0;
int i, n = 0;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
+ if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
tok->values[0] = val;
tok->nvalues = 1;
tok->nsamples = 0;
@@ -814,20 +865,20 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
{ \
for (i=0; i<(atok)->nvalues; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[i]) ) continue; \
- if ( bcf_float_is_missing((btok)->values[i]) ) { bcf_float_set_missing((atok)->values[i]); continue; } \
+ if ( bcf_double_is_missing((atok)->values[i]) ) continue; \
+ if ( bcf_double_is_missing((btok)->values[i]) ) { bcf_double_set_missing((atok)->values[i]); continue; } \
has_values = 1; \
(atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \
} \
} \
else if ( (btok)->nsamples ) \
{ \
- hts_expand(float,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
+ hts_expand(double,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
for (i=0; i<(btok)->nvalues; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[0]) || bcf_float_is_missing((btok)->values[i]) ) \
+ if ( bcf_double_is_missing((atok)->values[0]) || bcf_double_is_missing((btok)->values[i]) ) \
{ \
- bcf_float_set_missing((atok)->values[i]); \
+ bcf_double_set_missing((atok)->values[i]); \
continue; \
} \
has_values = 1; \
@@ -840,9 +891,9 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
{ \
for (i=0; i<(atok)->nvalues; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[i]) || bcf_float_is_missing((btok)->values[0]) ) \
+ if ( bcf_double_is_missing((atok)->values[i]) || bcf_double_is_missing((btok)->values[0]) ) \
{ \
- bcf_float_set_missing((atok)->values[i]); \
+ bcf_double_set_missing((atok)->values[i]); \
continue; \
} \
has_values = 1; \
@@ -923,10 +974,14 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
for (i=0; i<btok->nsamples; i++)
atok->pass_samples[i] = btok->pass_samples[i];
atok->nsamples = btok->nsamples;
+ atok->nvalues = 1;
return btok->pass_site;
}
if ( !btok->nvalues ) // missing value in b
+ {
+ btok->nvalues = 1;
return atok->pass_site;
+ }
if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site;
if ( !atok->nsamples )
@@ -980,6 +1035,7 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \
token_t *tok = (atok)->is_missing ? (btok) : (atok); \
(ret) = ( tok->nvalues CMP_OP 1 ) ? 0 : 1; \
+ tok->nvalues = 1; \
}
#define CMP_VECTORS(atok,btok,CMP_OP,ret) \
@@ -992,8 +1048,6 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
{ \
for (i=0; i<(atok)->nsamples; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
has_values = 1; \
if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
else (atok)->pass_samples[i] = 0; \
@@ -1002,34 +1056,26 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
} \
else if ( (atok)->nsamples ) \
{ \
- if ( bcf_float_is_missing((btok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
- else \
+ for (i=0; i<(atok)->nsamples; i++) \
{ \
- for (i=0; i<(atok)->nsamples; i++) \
- { \
- if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- has_values = 1; \
- if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
- } \
+ /*if ( bcf_double_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; }*/ \
+ has_values = 1; \
+ if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
} \
if ( !has_values ) (atok)->nvalues = 0; \
} \
else if ( (btok)->nsamples ) \
{ \
- if ( bcf_float_is_missing((atok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
- else \
+ for (i=0; i<(btok)->nsamples; i++) \
{ \
- for (i=0; i<(btok)->nsamples; i++) \
- { \
- if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- has_values = 1; \
- if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
- } \
- (atok)->nvalues = (btok)->nvalues; \
- (atok)->nsamples = (btok)->nsamples; \
+ if ( bcf_double_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
+ has_values = 1; \
+ if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
} \
+ (atok)->nvalues = (btok)->nvalues; \
+ (atok)->nsamples = (btok)->nsamples; \
if ( !has_values ) (atok)->nvalues = 0; \
} \
else if ( (atok)->idx==-2 || (btok)->idx==-2 ) \
@@ -1126,10 +1172,23 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log
}
return pass_site;
}
-static int regex_vector_strings(token_t *atok, token_t *btok)
+static int regex_vector_strings(token_t *atok, token_t *btok, int negate)
{
- int ret = regexec(btok->regex, atok->str_value, 0,NULL,0);
- return ret==0 ? 1 : 0;
+ int i, pass_site = 0;
+ if ( atok->nsamples )
+ {
+ for (i=0; i<atok->nsamples; i++)
+ {
+ char *ptr = atok->str_value + i*(int)atok->values[0];
+ atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1;
+ if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
+ pass_site |= atok->pass_samples[i];
+ }
+ return pass_site;
+ }
+ pass_site = regexec(btok->regex, atok->str_value, 0,NULL,0) ? 0 : 1;
+ if ( negate ) pass_site = pass_site ? 0 : 1;
+ return pass_site;
}
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
@@ -1145,7 +1204,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
int quote = str[0];
if ( str[len-1] != quote ) error("TODO: [%s]\n", filter->str);
tok->key = (char*) calloc(len-1,sizeof(char));
- hts_expand(float,1,tok->mvalues,tok->values);
+ hts_expand(double,1,tok->mvalues,tok->values);
tok->values[0] = len-2;
memcpy(tok->key,str+1,len-2);
tok->key[len-2] = 0;
@@ -1374,11 +1433,18 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
return 0;
}
- // is it a value?
+ // is it a value? Here we parse as integer/float separately and use strtof
+ // rather than strtod, because the more accurate double representation
+ // would invalidate floating point comparisons like QUAL=59.2, obtained via
+ // htslib/vcf parser
char *end;
- errno = 0;
- tok->threshold = strtod(tmp.s, &end);
- if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ tok->threshold = strtol(tmp.s, &end, 10); // integer?
+ if ( end - tmp.s != strlen(tmp.s) )
+ {
+ errno = 0;
+ tok->threshold = strtof(tmp.s, &end); // float?
+ if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ }
if ( tmp.s ) free(tmp.s);
return 0;
@@ -1513,11 +1579,11 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
// Look for j="." and k numeric type
int j = i-1, k = i-2;
if ( !out[j].is_str ) { k = i-1, j = i-2; }
- if ( out[k].hdr_id>0 && out[j].is_str && !strcmp(".",out[j].key) )
+ if ( out[k].hdr_id>0 && out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
{
int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
- if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); }
- if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); }
+ if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
+ if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
}
}
if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE )
@@ -1526,7 +1592,14 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
if ( !out[j].key )
error("Could not parse the expression, wrong value for regex operator: %s\n", filter->str);
out[j].regex = (regex_t *) malloc(sizeof(regex_t));
- if ( regcomp(out[j].regex, out[j].key, REG_ICASE|REG_NOSUB) )
+ int cflags = REG_NOSUB;
+ int len = strlen(out[j].key);
+ if ( len>2 && out[j].key[len-1]=='i' && out[j].key[len-2]=='/' && out[j].key[len-3]!='\\' )
+ {
+ out[j].key[len-2] = 0;
+ cflags |= REG_ICASE;
+ }
+ if ( regcomp(out[j].regex, out[j].key, cflags) )
error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str);
}
if ( out[i].tok_type!=TOK_VAL ) continue;
@@ -1534,41 +1607,47 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
if ( !strcmp(out[i].tag,"TYPE") )
{
if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
- int j = i+1;
- if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1;
- if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
- if ( !strcasecmp(out[j].key,"snp") || !strcasecmp(out[j].key,"snps") ) { out[j].threshold = VCF_SNP; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"indel") || !strcasecmp(out[j].key,"indels") ) { out[j].threshold = VCF_INDEL; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"mnp") || !strcasecmp(out[j].key,"mnps") ) { out[j].threshold = VCF_MNP; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"other") ) { out[j].threshold = VCF_OTHER; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"ref") ) { out[j].threshold = VCF_REF; out[j].is_str = 0; }
- else error("The type \"%s\" not recognised: %s\n", out[j].key, filter->str);
- out[j].tag = out[j].key; out[j].key = NULL;
- i = j;
+ int itok, ival;
+ if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1, itok = i + 1;
+ else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1, itok = i + 1;
+ else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) itok = i + 2, ival = i + 1;
+ else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) itok = i + 2, ival = i + 1;
+ else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( !strcasecmp(out[ival].key,"snp") || !strcasecmp(out[ival].key,"snps") ) { out[ival].threshold = VCF_SNP<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"indel") || !strcasecmp(out[ival].key,"indels") ) { out[ival].threshold = VCF_INDEL<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"mnp") || !strcasecmp(out[ival].key,"mnps") ) { out[ival].threshold = VCF_MNP<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"other") ) { out[ival].threshold = VCF_OTHER<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"bnd") ) { out[ival].threshold = VCF_BND<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; }
+ else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str);
+ if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and;
+ out[ival].tag = out[ival].key; out[ival].key = NULL;
+ i = itok;
continue;
}
if ( !strcmp(out[i].tag,"FILTER") )
{
if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
- int j = i+1;
- if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; // the expression has "value"=FILTER rather than FILTER="value"
- if ( out[j].tok_type==TOK_LIKE ) out[j].tok_type = TOK_EQ; // for FILTER, ~ and !~ work the same way as = and !=
- if ( out[j].tok_type==TOK_NLIKE ) out[j].tok_type = TOK_NE;
- if ( out[j+1].tok_type==TOK_LIKE ) out[j+1].tok_type = TOK_EQ;
- if ( out[j+1].tok_type==TOK_NLIKE ) out[j+1].tok_type = TOK_NE;
- if ( out[j].tok_type!=TOK_VAL || !out[j].key )
+ int itok = i, ival;
+ if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_EQ, ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NE, ival = i - 1;
+ else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i;
+ else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_EQ, ival = ++i;
+ else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NE, ival = ++i;
+ else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( out[ival].tok_type!=TOK_VAL || !out[ival].key )
error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
- if ( strcmp(".",out[j].key) )
+ if ( strcmp(".",out[ival].key) )
{
- out[j].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[j].key);
- if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[j].hdr_id) )
- error("The filter \"%s\" not present in the VCF header\n", out[j].key);
+ out[ival].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[ival].key);
+ if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[ival].hdr_id) )
+ error("The filter \"%s\" not present in the VCF header\n", out[ival].key);
}
else
- out[j].hdr_id = -1;
- out[j].tag = out[j].key; out[j].key = NULL;
- out[i].hdr_id = out[j].hdr_id;
- i = j;
+ out[ival].hdr_id = -1;
+ out[ival].tag = out[ival].key; out[ival].key = NULL;
+ out[itok].hdr_id = out[ival].hdr_id;
continue;
}
}
@@ -1581,7 +1660,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
else if ( out[i].tok_type==TOK_SUM ) { out[i].setter = set_sum; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_ABS ) { out[i].setter = set_abs; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_LEN ) { out[i].setter = set_strlen; out[i].tok_type = TOK_FUNC; }
- hts_expand0(float,1,out[i].mvalues,out[i].values);
+ hts_expand0(double,1,out[i].mvalues,out[i].values);
if ( filter->nsamples )
{
out[i].pass_samples = (uint8_t*)malloc(filter->nsamples);
@@ -1620,6 +1699,7 @@ void filter_destroy(filter_t *filter)
free(filter->flt_stack);
free(filter->str);
free(filter->tmpi);
+ free(filter->tmpf);
free(filter);
}
@@ -1706,7 +1786,9 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
}
int is_true = 0;
- if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
+ if ( filter->filters[i].comparator )
+ is_true = filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],filter->filters[i].tok_type,line);
+ else if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
{
int skip = 0;
if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1;
@@ -1748,10 +1830,7 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE )
{
if ( is_str==2 )
- {
- is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1]);
- if ( filter->filters[i].tok_type == TOK_NLIKE ) is_true = is_true ? 0 : 1;
- }
+ is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type == TOK_LIKE ? 0 : 1);
else
error("The regex operator can be used on strings only: %s\n", filter->str);
}
diff --git a/bcftools/hclust.c b/bcftools/hclust.c
new file mode 100644
index 0000000..692fa54
--- /dev/null
+++ b/bcftools/hclust.c
@@ -0,0 +1,400 @@
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <htslib/hts.h>
+#include <htslib/kstring.h>
+#include <stdlib.h>
+#include "bcftools.h"
+#include "hclust.h"
+
+typedef struct _node_t
+{
+ struct _node_t *akid, *bkid, *next, *prev, *parent;
+ int id, idx; // id: unique node id; idx: current index to pdist
+ float value; // max pairwise dist of elements within the node
+}
+node_t;
+
+struct _hclust_t
+{
+ int ndat, nclust; // ndat: number of elements (pdist matrix size); nclust: current number of clusters
+ float *pdist; // pairwise cluster distances, diagonal matrix accessed via the PDIST macro
+ node_t *first, *last; // clusters are maintained in a double-linked list
+ node_t **rmme; // convenience array to remove all allocated nodes at the end
+ int nrmme;
+ kstring_t str; // (for debugging) pointer to str.s is returned by create_dot()
+ char **dbg; // (for debugging) created by create_list() via set_threshold() and returned by explain()
+ int ndbg, mdbg;
+};
+
+node_t *append_node(hclust_t *clust, int idx)
+{
+ node_t *node = (node_t*) calloc(1,sizeof(node_t));
+
+ clust->nclust++;
+ node->id = clust->nrmme;
+ node->idx = idx;
+ if ( !clust->first )
+ {
+ clust->first = node;
+ clust->last = node;
+ }
+ else
+ {
+ node->prev = clust->last;
+ clust->last->next = node;
+ clust->last = node;
+ }
+
+ if ( clust->nrmme >= clust->ndat*2 ) error("hclust fixme: %d vs %d\n",clust->nrmme,clust->ndat);
+ clust->rmme[clust->nrmme++] = node;
+
+ return node;
+}
+void remove_node(hclust_t *clust, node_t *node)
+{
+ if ( node==clust->first ) clust->first = node->next;
+ if ( node==clust->last ) clust->last = node->prev;
+ if ( node->next ) node->next->prev = node->prev;
+ if ( node->prev ) node->prev->next = node->next;
+ clust->nclust--;
+}
+
+#if DEBUG
+void hclust_debug(hclust_t *clust)
+{
+ int i;
+ fprintf(stderr,"nrmme=%d nclust=%d\n", clust->nrmme,clust->nclust);
+ for (i=0; i<clust->nrmme; i++)
+ {
+ node_t *node = clust->rmme[i];
+ int akid = node->akid ? node->akid->id : -1;
+ int bkid = node->bkid ? node->bkid->id : -1;
+ int akidx = node->akid ? node->akid->idx : -1;
+ int bkidx = node->bkid ? node->bkid->idx : -1;
+ fprintf(stderr,"\t%d\t%d\t%f\t%d %d\t%d %d\n",node->id,node->idx,node->value,akid,bkid,akidx,bkidx);
+ }
+
+ int j;
+ for (i=1; i<clust->ndat; i++)
+ {
+ int active = 0;
+ node_t *node = clust->first;
+ while (node)
+ {
+ if ( node->idx==i ) { active=1; break; }
+ node = node->next;
+ }
+ fprintf(stderr,"%2d%c ",i,active?'*':' ');
+ for (j=0; j<i; j++)
+ {
+ if ( PDIST(clust->pdist,i,j)==9 )
+ fprintf(stderr," ----- ");
+ else
+ fprintf(stderr," %f", PDIST(clust->pdist,i,j));
+ }
+ fprintf(stderr,"\n");
+ }
+ for (j=0; j<clust->ndat-1; j++) fprintf(stderr," %6d ",j); fprintf(stderr,"\n");
+}
+#endif
+
+hclust_t *hclust_init(int n, float *pdist)
+{
+ hclust_t *clust = (hclust_t*) calloc(1,sizeof(hclust_t));
+ clust->ndat = n;
+ clust->pdist = pdist;
+ clust->rmme = (node_t**) calloc(n*2,sizeof(node_t*));
+
+ // init clusters
+ int i;
+ for (i=0; i<clust->ndat; i++) append_node(clust,i);
+
+ // build the tree
+ while ( clust->nclust>1 )
+ {
+ // find two clusters with minimum distance
+ float min_value = HUGE_VAL;
+ node_t *iclust = clust->first->next;
+ node_t *min_iclust = NULL, *min_jclust = NULL;
+ while ( iclust )
+ {
+ node_t *jclust = clust->first;
+ while ( jclust!=iclust )
+ {
+ float value = PDIST(clust->pdist,iclust->idx,jclust->idx);
+ if ( value < min_value )
+ {
+ min_value = value;
+ min_iclust = iclust;
+ min_jclust = jclust;
+ }
+ jclust = jclust->next;
+ }
+ iclust = iclust->next;
+ }
+ assert( min_iclust && min_jclust ); // pdist contains inf or nan, fix the caller
+ remove_node(clust,min_iclust);
+ remove_node(clust,min_jclust);
+
+ // update the pairwise distances. We keep the matrix and as we are moving up the
+ // tree, we use fewer columns/rows as the number of clusters decreases: we reuse
+ // i-th and leave j-th unused. Inter-cluster distance is defined as maximum distance
+ // between pairwise distances of elements within the cluster.
+ iclust = clust->first;
+ while ( iclust )
+ {
+ if ( PDIST(clust->pdist,iclust->idx,min_iclust->idx) < PDIST(clust->pdist,iclust->idx,min_jclust->idx) )
+ PDIST(clust->pdist,iclust->idx,min_iclust->idx) = PDIST(clust->pdist,iclust->idx,min_jclust->idx);
+ iclust = iclust->next;
+ }
+
+ node_t *node = append_node(clust,min_iclust->idx);
+ node->akid = min_iclust;
+ node->bkid = min_jclust;
+ node->value = min_value;
+ node->akid->parent = node;
+ node->bkid->parent = node;
+ }
+
+ return clust;
+}
+void hclust_destroy(hclust_t *clust)
+{
+ int i;
+ for (i=0; i<clust->nrmme; i++) free(clust->rmme[i]);
+ free(clust->rmme);
+ free(clust->dbg);
+ free(clust->str.s);
+ free(clust);
+}
+
+char *hclust_create_dot(hclust_t *clust, char **labels, float th)
+{
+ clust->str.l = 0;
+ ksprintf(&clust->str,"digraph myGraph {");
+
+ int i;
+ for (i=0; i<clust->nrmme; i++)
+ {
+ node_t *node = clust->rmme[i];
+ if ( node->value )
+ ksprintf(&clust->str,"\"%d\" [label=\"%f\"];", node->id,node->value);
+ else
+ ksprintf(&clust->str,"\"%d\" [label=\"%s\"];", node->id,labels[node->idx]);
+ }
+ for (i=0; i<clust->nrmme; i++)
+ {
+ node_t *node = clust->rmme[i];
+ if ( node->akid )
+ {
+ if ( node->value >= th && node->akid && node->akid->value < th )
+ ksprintf(&clust->str,"\"%d\" -> \"%d\" [color=\"#D43F3A\" penwidth=3];", node->id,node->akid->id);
+ else
+ ksprintf(&clust->str,"\"%d\" -> \"%d\";", node->id,node->akid->id);
+ }
+
+ if ( node->bkid )
+ {
+ if ( node->value >= th && node->bkid && node->bkid->value < th )
+ ksprintf(&clust->str,"\"%d\" -> \"%d\" [color=\"#D43F3A\" penwidth=3];", node->id,node->bkid->id);
+ else
+ ksprintf(&clust->str,"\"%d\" -> \"%d\";", node->id,node->bkid->id);
+ }
+ }
+ ksprintf(&clust->str,"};");
+ return clust->str.s;
+}
+char **hclust_explain(hclust_t *clust, int *nlines)
+{
+ clust->ndbg = 0;
+ char *beg = clust->str.s;
+ while ( *beg )
+ {
+ char *end = beg;
+ while ( *end && *end!='\n' ) end++;
+ clust->ndbg++;
+ hts_expand(char*,clust->ndbg,clust->mdbg,clust->dbg);
+ clust->dbg[clust->ndbg-1] = beg;
+ if ( !*end ) break;
+ *end = 0;
+ beg = end + 1;
+ }
+
+ *nlines = clust->ndbg;
+ return clust->dbg;
+}
+
+cluster_t *append_cluster(node_t *node, cluster_t *cluster, int *nclust, node_t **stack)
+{
+ (*nclust)++;
+ cluster = (cluster_t*) realloc(cluster,sizeof(cluster_t)*(*nclust));
+ cluster_t *clust = &cluster[*nclust-1];
+ clust->nmemb = 0;
+ clust->memb = NULL;
+ clust->dist = node->value;
+
+ int nstack = 1;
+ stack[0] = node;
+
+ while ( nstack )
+ {
+ node_t *node = stack[--nstack];
+ node_t *akid = node->akid;
+ node_t *bkid = node->bkid;
+ if ( node->akid )
+ {
+ stack[nstack++] = akid;
+ stack[nstack++] = bkid;
+ }
+ else
+ {
+ clust->nmemb++;
+ clust->memb = (int*) realloc(clust->memb,sizeof(int)*clust->nmemb);
+ clust->memb[clust->nmemb-1] = node->id;
+ }
+ }
+ return cluster;
+}
+
+int cmp_nodes(const void *a, const void *b)
+{
+ const node_t *an = *((const node_t**) a);
+ const node_t *bn = *((const node_t**) b);
+ if ( an->value < bn->value ) return -1;
+ if ( an->value > bn->value ) return 1;
+ return 0;
+}
+
+float calc_dev(node_t **dat, int n)
+{
+ float avg = 0, dev = 0;
+ int i;
+ for (i=0; i<n; i++) avg += dat[i]->value;
+ avg /= n;
+ for (i=0; i<n; i++) dev += (dat[i]->value - avg)*(dat[i]->value - avg);
+ return sqrt(dev/n);
+}
+
+/*
+ Heuristics to determine clustering cutoff: sort nodes by distance and
+ split into two groups by minimizing the standard deviation.
+ This works best when two elements from a single different sample are
+ included in the mix.
+ - min_inter_dist .. smaller values are always considered identical
+ - max_intra_dist .. larger values are always considered different
+ */
+float hclust_set_threshold(hclust_t *clust, float min_inter_dist, float max_intra_dist)
+{
+ node_t **dat = clust->rmme + clust->ndat;
+ int i, ndat = clust->nrmme - clust->ndat;
+
+ qsort(dat, ndat, sizeof(dat), cmp_nodes);
+
+ clust->str.l = 0;
+ float th, min_dev = HUGE_VAL;
+ int imin = -1;
+ for (i=0; i<ndat; i++)
+ {
+ float dev = 0;
+ if ( i>0 ) dev += calc_dev(dat,i);
+ if ( i+1<ndat ) dev += calc_dev(dat+i,ndat-i);
+ th = dat[i]->value;
+ ksprintf(&clust->str,"DEV\t%f\t%f\n",th,dev);
+ if ( min_dev > dev && th >= min_inter_dist ) { min_dev = dev; imin = i; }
+ }
+ if ( max_intra_dist > 0 )
+ th = max_intra_dist; // use fixed cutoff, the above was only for debugging output
+ else
+ {
+ // dynamic cutoff
+ max_intra_dist = fabs(max_intra_dist);
+ th = imin==-1 ? max_intra_dist : dat[imin]->value;
+ if ( th > max_intra_dist ) th = max_intra_dist;
+ }
+ ksprintf(&clust->str,"TH\t%f\n", th);
+ ksprintf(&clust->str,"MAX_DIST\t%f\n", dat[ndat-1]->value);
+ ksprintf(&clust->str,"MIN_INTER\t%f\n", min_inter_dist);
+ ksprintf(&clust->str,"MAX_INTRA\t%f\n", max_intra_dist);
+ return th;
+}
+
+cluster_t *hclust_create_list(hclust_t *clust, float min_inter_dist, float *max_intra_dist, int *nclust)
+{
+ float cutoff = *max_intra_dist = hclust_set_threshold(clust, min_inter_dist, *max_intra_dist);
+
+ node_t **stack = (node_t**) malloc(sizeof(node_t*)*clust->ndat);
+ node_t **tmp = (node_t**) malloc(sizeof(node_t*)*clust->ndat);
+ stack[0] = clust->first;
+ int nstack = 1;
+
+ cluster_t *cluster = NULL;
+ int ncluster = 0;
+
+ if ( stack[0]->value < cutoff )
+ {
+ // all values are within the limits - create a single cluster
+ cluster = append_cluster(stack[0], cluster, &ncluster, tmp);
+ nstack = 0;
+ }
+
+ while ( nstack )
+ {
+ node_t *node = stack[--nstack];
+ node_t *akid = node->akid;
+ node_t *bkid = node->bkid;
+ if ( !akid )
+ {
+ cluster = append_cluster(node, cluster, &ncluster, tmp);
+ continue;
+ }
+
+ if ( node->value >= cutoff && akid->value < cutoff )
+ cluster = append_cluster(akid, cluster, &ncluster, tmp);
+ else
+ stack[nstack++] = akid;
+
+ if ( node->value >= cutoff && bkid->value < cutoff )
+ cluster = append_cluster(bkid, cluster, &ncluster, tmp);
+ else
+ stack[nstack++] = bkid;
+ }
+
+ free(tmp);
+ free(stack);
+
+ *nclust = ncluster;
+ return cluster;
+}
+
+void hclust_destroy_list(cluster_t *clust, int nclust)
+{
+ int i;
+ for (i=0; i<nclust; i++) free(clust[i].memb);
+ free(clust);
+}
+
+
diff --git a/bcftools/hclust.c.pysam.c b/bcftools/hclust.c.pysam.c
new file mode 100644
index 0000000..d43ddcf
--- /dev/null
+++ b/bcftools/hclust.c.pysam.c
@@ -0,0 +1,402 @@
+#include "pysam.h"
+
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <htslib/hts.h>
+#include <htslib/kstring.h>
+#include <stdlib.h>
+#include "bcftools.h"
+#include "hclust.h"
+
+typedef struct _node_t
+{
+ struct _node_t *akid, *bkid, *next, *prev, *parent;
+ int id, idx; // id: unique node id; idx: current index to pdist
+ float value; // max pairwise dist of elements within the node
+}
+node_t;
+
+struct _hclust_t
+{
+ int ndat, nclust; // ndat: number of elements (pdist matrix size); nclust: current number of clusters
+ float *pdist; // pairwise cluster distances, diagonal matrix accessed via the PDIST macro
+ node_t *first, *last; // clusters are maintained in a double-linked list
+ node_t **rmme; // convenience array to remove all allocated nodes at the end
+ int nrmme;
+ kstring_t str; // (for debugging) pointer to str.s is returned by create_dot()
+ char **dbg; // (for debugging) created by create_list() via set_threshold() and returned by explain()
+ int ndbg, mdbg;
+};
+
+node_t *append_node(hclust_t *clust, int idx)
+{
+ node_t *node = (node_t*) calloc(1,sizeof(node_t));
+
+ clust->nclust++;
+ node->id = clust->nrmme;
+ node->idx = idx;
+ if ( !clust->first )
+ {
+ clust->first = node;
+ clust->last = node;
+ }
+ else
+ {
+ node->prev = clust->last;
+ clust->last->next = node;
+ clust->last = node;
+ }
+
+ if ( clust->nrmme >= clust->ndat*2 ) error("hclust fixme: %d vs %d\n",clust->nrmme,clust->ndat);
+ clust->rmme[clust->nrmme++] = node;
+
+ return node;
+}
+void remove_node(hclust_t *clust, node_t *node)
+{
+ if ( node==clust->first ) clust->first = node->next;
+ if ( node==clust->last ) clust->last = node->prev;
+ if ( node->next ) node->next->prev = node->prev;
+ if ( node->prev ) node->prev->next = node->next;
+ clust->nclust--;
+}
+
+#if DEBUG
+void hclust_debug(hclust_t *clust)
+{
+ int i;
+ fprintf(pysam_stderr,"nrmme=%d nclust=%d\n", clust->nrmme,clust->nclust);
+ for (i=0; i<clust->nrmme; i++)
+ {
+ node_t *node = clust->rmme[i];
+ int akid = node->akid ? node->akid->id : -1;
+ int bkid = node->bkid ? node->bkid->id : -1;
+ int akidx = node->akid ? node->akid->idx : -1;
+ int bkidx = node->bkid ? node->bkid->idx : -1;
+ fprintf(pysam_stderr,"\t%d\t%d\t%f\t%d %d\t%d %d\n",node->id,node->idx,node->value,akid,bkid,akidx,bkidx);
+ }
+
+ int j;
+ for (i=1; i<clust->ndat; i++)
+ {
+ int active = 0;
+ node_t *node = clust->first;
+ while (node)
+ {
+ if ( node->idx==i ) { active=1; break; }
+ node = node->next;
+ }
+ fprintf(pysam_stderr,"%2d%c ",i,active?'*':' ');
+ for (j=0; j<i; j++)
+ {
+ if ( PDIST(clust->pdist,i,j)==9 )
+ fprintf(pysam_stderr," ----- ");
+ else
+ fprintf(pysam_stderr," %f", PDIST(clust->pdist,i,j));
+ }
+ fprintf(pysam_stderr,"\n");
+ }
+ for (j=0; j<clust->ndat-1; j++) fprintf(pysam_stderr," %6d ",j); fprintf(pysam_stderr,"\n");
+}
+#endif
+
+hclust_t *hclust_init(int n, float *pdist)
+{
+ hclust_t *clust = (hclust_t*) calloc(1,sizeof(hclust_t));
+ clust->ndat = n;
+ clust->pdist = pdist;
+ clust->rmme = (node_t**) calloc(n*2,sizeof(node_t*));
+
+ // init clusters
+ int i;
+ for (i=0; i<clust->ndat; i++) append_node(clust,i);
+
+ // build the tree
+ while ( clust->nclust>1 )
+ {
+ // find two clusters with minimum distance
+ float min_value = HUGE_VAL;
+ node_t *iclust = clust->first->next;
+ node_t *min_iclust = NULL, *min_jclust = NULL;
+ while ( iclust )
+ {
+ node_t *jclust = clust->first;
+ while ( jclust!=iclust )
+ {
+ float value = PDIST(clust->pdist,iclust->idx,jclust->idx);
+ if ( value < min_value )
+ {
+ min_value = value;
+ min_iclust = iclust;
+ min_jclust = jclust;
+ }
+ jclust = jclust->next;
+ }
+ iclust = iclust->next;
+ }
+ assert( min_iclust && min_jclust ); // pdist contains inf or nan, fix the caller
+ remove_node(clust,min_iclust);
+ remove_node(clust,min_jclust);
+
+ // update the pairwise distances. We keep the matrix and as we are moving up the
+ // tree, we use fewer columns/rows as the number of clusters decreases: we reuse
+ // i-th and leave j-th unused. Inter-cluster distance is defined as maximum distance
+ // between pairwise distances of elements within the cluster.
+ iclust = clust->first;
+ while ( iclust )
+ {
+ if ( PDIST(clust->pdist,iclust->idx,min_iclust->idx) < PDIST(clust->pdist,iclust->idx,min_jclust->idx) )
+ PDIST(clust->pdist,iclust->idx,min_iclust->idx) = PDIST(clust->pdist,iclust->idx,min_jclust->idx);
+ iclust = iclust->next;
+ }
+
+ node_t *node = append_node(clust,min_iclust->idx);
+ node->akid = min_iclust;
+ node->bkid = min_jclust;
+ node->value = min_value;
+ node->akid->parent = node;
+ node->bkid->parent = node;
+ }
+
+ return clust;
+}
+void hclust_destroy(hclust_t *clust)
+{
+ int i;
+ for (i=0; i<clust->nrmme; i++) free(clust->rmme[i]);
+ free(clust->rmme);
+ free(clust->dbg);
+ free(clust->str.s);
+ free(clust);
+}
+
+char *hclust_create_dot(hclust_t *clust, char **labels, float th)
+{
+ clust->str.l = 0;
+ ksprintf(&clust->str,"digraph myGraph {");
+
+ int i;
+ for (i=0; i<clust->nrmme; i++)
+ {
+ node_t *node = clust->rmme[i];
+ if ( node->value )
+ ksprintf(&clust->str,"\"%d\" [label=\"%f\"];", node->id,node->value);
+ else
+ ksprintf(&clust->str,"\"%d\" [label=\"%s\"];", node->id,labels[node->idx]);
+ }
+ for (i=0; i<clust->nrmme; i++)
+ {
+ node_t *node = clust->rmme[i];
+ if ( node->akid )
+ {
+ if ( node->value >= th && node->akid && node->akid->value < th )
+ ksprintf(&clust->str,"\"%d\" -> \"%d\" [color=\"#D43F3A\" penwidth=3];", node->id,node->akid->id);
+ else
+ ksprintf(&clust->str,"\"%d\" -> \"%d\";", node->id,node->akid->id);
+ }
+
+ if ( node->bkid )
+ {
+ if ( node->value >= th && node->bkid && node->bkid->value < th )
+ ksprintf(&clust->str,"\"%d\" -> \"%d\" [color=\"#D43F3A\" penwidth=3];", node->id,node->bkid->id);
+ else
+ ksprintf(&clust->str,"\"%d\" -> \"%d\";", node->id,node->bkid->id);
+ }
+ }
+ ksprintf(&clust->str,"};");
+ return clust->str.s;
+}
+char **hclust_explain(hclust_t *clust, int *nlines)
+{
+ clust->ndbg = 0;
+ char *beg = clust->str.s;
+ while ( *beg )
+ {
+ char *end = beg;
+ while ( *end && *end!='\n' ) end++;
+ clust->ndbg++;
+ hts_expand(char*,clust->ndbg,clust->mdbg,clust->dbg);
+ clust->dbg[clust->ndbg-1] = beg;
+ if ( !*end ) break;
+ *end = 0;
+ beg = end + 1;
+ }
+
+ *nlines = clust->ndbg;
+ return clust->dbg;
+}
+
+cluster_t *append_cluster(node_t *node, cluster_t *cluster, int *nclust, node_t **stack)
+{
+ (*nclust)++;
+ cluster = (cluster_t*) realloc(cluster,sizeof(cluster_t)*(*nclust));
+ cluster_t *clust = &cluster[*nclust-1];
+ clust->nmemb = 0;
+ clust->memb = NULL;
+ clust->dist = node->value;
+
+ int nstack = 1;
+ stack[0] = node;
+
+ while ( nstack )
+ {
+ node_t *node = stack[--nstack];
+ node_t *akid = node->akid;
+ node_t *bkid = node->bkid;
+ if ( node->akid )
+ {
+ stack[nstack++] = akid;
+ stack[nstack++] = bkid;
+ }
+ else
+ {
+ clust->nmemb++;
+ clust->memb = (int*) realloc(clust->memb,sizeof(int)*clust->nmemb);
+ clust->memb[clust->nmemb-1] = node->id;
+ }
+ }
+ return cluster;
+}
+
+int cmp_nodes(const void *a, const void *b)
+{
+ const node_t *an = *((const node_t**) a);
+ const node_t *bn = *((const node_t**) b);
+ if ( an->value < bn->value ) return -1;
+ if ( an->value > bn->value ) return 1;
+ return 0;
+}
+
+float calc_dev(node_t **dat, int n)
+{
+ float avg = 0, dev = 0;
+ int i;
+ for (i=0; i<n; i++) avg += dat[i]->value;
+ avg /= n;
+ for (i=0; i<n; i++) dev += (dat[i]->value - avg)*(dat[i]->value - avg);
+ return sqrt(dev/n);
+}
+
+/*
+ Heuristics to determine clustering cutoff: sort nodes by distance and
+ split into two groups by minimizing the standard deviation.
+ This works best when two elements from a single different sample are
+ included in the mix.
+ - min_inter_dist .. smaller values are always considered identical
+ - max_intra_dist .. larger values are always considered different
+ */
+float hclust_set_threshold(hclust_t *clust, float min_inter_dist, float max_intra_dist)
+{
+ node_t **dat = clust->rmme + clust->ndat;
+ int i, ndat = clust->nrmme - clust->ndat;
+
+ qsort(dat, ndat, sizeof(dat), cmp_nodes);
+
+ clust->str.l = 0;
+ float th, min_dev = HUGE_VAL;
+ int imin = -1;
+ for (i=0; i<ndat; i++)
+ {
+ float dev = 0;
+ if ( i>0 ) dev += calc_dev(dat,i);
+ if ( i+1<ndat ) dev += calc_dev(dat+i,ndat-i);
+ th = dat[i]->value;
+ ksprintf(&clust->str,"DEV\t%f\t%f\n",th,dev);
+ if ( min_dev > dev && th >= min_inter_dist ) { min_dev = dev; imin = i; }
+ }
+ if ( max_intra_dist > 0 )
+ th = max_intra_dist; // use fixed cutoff, the above was only for debugging output
+ else
+ {
+ // dynamic cutoff
+ max_intra_dist = fabs(max_intra_dist);
+ th = imin==-1 ? max_intra_dist : dat[imin]->value;
+ if ( th > max_intra_dist ) th = max_intra_dist;
+ }
+ ksprintf(&clust->str,"TH\t%f\n", th);
+ ksprintf(&clust->str,"MAX_DIST\t%f\n", dat[ndat-1]->value);
+ ksprintf(&clust->str,"MIN_INTER\t%f\n", min_inter_dist);
+ ksprintf(&clust->str,"MAX_INTRA\t%f\n", max_intra_dist);
+ return th;
+}
+
+cluster_t *hclust_create_list(hclust_t *clust, float min_inter_dist, float *max_intra_dist, int *nclust)
+{
+ float cutoff = *max_intra_dist = hclust_set_threshold(clust, min_inter_dist, *max_intra_dist);
+
+ node_t **stack = (node_t**) malloc(sizeof(node_t*)*clust->ndat);
+ node_t **tmp = (node_t**) malloc(sizeof(node_t*)*clust->ndat);
+ stack[0] = clust->first;
+ int nstack = 1;
+
+ cluster_t *cluster = NULL;
+ int ncluster = 0;
+
+ if ( stack[0]->value < cutoff )
+ {
+ // all values are within the limits - create a single cluster
+ cluster = append_cluster(stack[0], cluster, &ncluster, tmp);
+ nstack = 0;
+ }
+
+ while ( nstack )
+ {
+ node_t *node = stack[--nstack];
+ node_t *akid = node->akid;
+ node_t *bkid = node->bkid;
+ if ( !akid )
+ {
+ cluster = append_cluster(node, cluster, &ncluster, tmp);
+ continue;
+ }
+
+ if ( node->value >= cutoff && akid->value < cutoff )
+ cluster = append_cluster(akid, cluster, &ncluster, tmp);
+ else
+ stack[nstack++] = akid;
+
+ if ( node->value >= cutoff && bkid->value < cutoff )
+ cluster = append_cluster(bkid, cluster, &ncluster, tmp);
+ else
+ stack[nstack++] = bkid;
+ }
+
+ free(tmp);
+ free(stack);
+
+ *nclust = ncluster;
+ return cluster;
+}
+
+void hclust_destroy_list(cluster_t *clust, int nclust)
+{
+ int i;
+ for (i=0; i<nclust; i++) free(clust[i].memb);
+ free(clust);
+}
+
+
diff --git a/bcftools/hclust.h b/bcftools/hclust.h
new file mode 100644
index 0000000..43d333f
--- /dev/null
+++ b/bcftools/hclust.h
@@ -0,0 +1,77 @@
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+/*
+ Simple hierarchical clustering
+*/
+
+#ifndef __HCLUST_H__
+#define __HCLUST_H__
+
+#include <stdio.h>
+
+typedef struct _hclust_t hclust_t;
+
+typedef struct
+{
+ float dist;
+ int nmemb, *memb;
+}
+cluster_t;
+
+#define PDIST(mat,a,b) (mat)[((a)>(b)?((a)*((a)-1)/2+(b)):((b)*((b)-1)/2+(a)))]
+
+/*
+ * hclust_init() - init and run clustering
+ * @n: number of elements
+ * @pdist: pairwise distances. The array will be modified by hclust and
+ * must exist until hclust_destroy() is called
+ */
+hclust_t *hclust_init(int n, float *pdist);
+void hclust_destroy(hclust_t *clust);
+
+/*
+ * hclust_create_list() - returns a list of clusters
+ * @min_inter_dist: minimum inter-cluster distance. If smaller, elements are considered
+ * homogenous, belonging to the same cluster.
+ * @max_intra_dist: maximum intra-cluster distance allowed. If smaller than 0,
+ * the threshold can be heuristically lowered, otherwise considered
+ * a fixed cutoff. The pointer will be filled to the cutoff actually used.
+ */
+cluster_t *hclust_create_list(hclust_t *clust, float min_inter_dist, float *max_intra_dist, int *nclust);
+void hclust_destroy_list(cluster_t *clust, int nclust);
+
+/*
+ * Access debugging data used in the decision making process. Note that this
+ * must be called immediately after hclust_create_list because other calls,
+ * such as hclust_create_dot(), invalidate the temporary data structures.
+ */
+char **hclust_explain(hclust_t *clust, int *nlines);
+
+char *hclust_create_dot(hclust_t *clust, char **labels, float th);
+
+#endif
+
diff --git a/bcftools/kheap.h b/bcftools/kheap.h
new file mode 100644
index 0000000..ac2f9f9
--- /dev/null
+++ b/bcftools/kheap.h
@@ -0,0 +1,171 @@
+/* The MIT License
+
+ Copyright (C) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+/*
+ Usage example:
+
+ #include "kheap.h"
+
+ // First we prepare the user data to store, in this example it is a
+ // struct with a single element "key", and a comparator function
+ // "is_smaller". In this example the comparator defines a min heap (as
+ // opposed to a max heap).
+ typedef struct
+ {
+ uint32_t key;
+ }
+ data_t;
+ static inline int is_smaller(data_t *a, data_t *b)
+ {
+ return a->key < b->key ? 1 : 0;
+ }
+ data_t data[3] = { {3}, {2}, {1} };
+
+
+ // Heap declaration, "mh" is an arbitrary string. The typedef is not
+ // required, it is just a convenience shortcut so that we can use
+ // "heap_t" instead of the generic "khp_mh_t" automatically created by
+ // the KHEAP_INIT macro.
+ KHEAP_INIT(mh, data_t, is_smaller)
+ typedef khp_mh_t heap_t;
+
+ // Initialize the heap, insert the test data, then retrieve them back,
+ // sorted. Multiple heaps with the same name "mh" can be created and
+ // used simultaneously, as long as they all use the same data type
+ // "data_t".
+ heap_t *heap = khp_init(mh);
+
+ for (int i=0; i<3; i++)
+ khp_insert(mh, heap, &data[i]);
+
+ while (heap->ndat)
+ {
+ printf("%d\n", heap->dat[0].pos);
+ khp_delete(mh, heap);
+ }
+
+ // Clean up
+ khp_destroy(mh, heap);
+
+*/
+
+#ifndef __KHEAP_H__
+#define __KHEAP_H__
+
+#include <stdlib.h>
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+
+#define __KHEAP_TYPE(name, kheap_t) \
+ typedef struct { \
+ int ndat, mdat; \
+ kheap_t *dat; \
+ kheap_t tmp; \
+ } khp_##name##_t;
+
+#define khp_parent(i) (((i)-1)/2)
+#define khp_lchild(i) (2*(i)+1)
+#define khp_rchild(i) (2*(i)+2)
+#define khp_swap(hp,i,j) { \
+ ((hp)->tmp) = ((hp)->dat[i]); \
+ ((hp)->dat[i]) = ((hp)->dat[j]); \
+ ((hp)->dat[j]) = ((hp)->tmp); \
+ }
+
+#define __KHEAP_IMPL(name, SCOPE, kheap_t, __cmp) \
+ SCOPE khp_##name##_t *khp_init_##name(void) \
+ { \
+ return (khp_##name##_t*)calloc(1, sizeof(khp_##name##_t)); \
+ } \
+ SCOPE void khp_destroy_##name(khp_##name##_t *heap) \
+ { \
+ if (heap) free(heap->dat); \
+ free(heap); \
+ } \
+ SCOPE int khp_insert_##name(khp_##name##_t *heap, kheap_t *dat) \
+ { \
+ heap->ndat++; \
+ if ( heap->ndat > heap->mdat ) \
+ { \
+ heap->mdat = heap->ndat; \
+ kroundup32(heap->mdat); \
+ heap->dat = (kheap_t*)realloc(heap->dat, heap->mdat*sizeof(kheap_t)); \
+ } \
+ int i = heap->ndat - 1; \
+ while ( i && __cmp(dat,&heap->dat[khp_parent(i)]) ) \
+ { \
+ heap->dat[i] = heap->dat[khp_parent(i)]; \
+ i = khp_parent(i); \
+ } \
+ heap->dat[i] = *dat; \
+ return i; \
+ } \
+ SCOPE void khp_heapify_##name(khp_##name##_t *heap, int i) \
+ { \
+/*todo: loop instead of a recursive function? */ \
+ int extreme = khp_lchild(i) < heap->ndat && __cmp(&heap->dat[khp_lchild(i)],&heap->dat[i]) ? khp_lchild(i) : i; \
+ if ( khp_rchild(i) < heap->ndat && __cmp(&heap->dat[khp_rchild(i)],&heap->dat[extreme]) ) extreme = khp_rchild(i); \
+ if ( extreme != i ) \
+ { \
+ khp_swap(heap,i,extreme); \
+ khp_heapify_##name(heap,extreme); \
+ } \
+ } \
+ SCOPE void khp_delete_##name(khp_##name##_t *heap) \
+ { \
+ if ( !heap || !heap->ndat ) return; \
+ heap->dat[0] = heap->dat[--heap->ndat]; \
+ khp_heapify_##name(heap, 0); \
+ } \
+
+#define KHEAP_INIT(name, kheap_t, __cmp) \
+ __KHEAP_TYPE(name, kheap_t) \
+ __KHEAP_IMPL(name, static kh_inline klib_unused, kheap_t, __cmp)
+
+#define khp_init(name) khp_init_##name()
+#define khp_destroy(name, heap) khp_destroy_##name(heap)
+#define khp_insert(name, heap, dat) khp_insert_##name(heap, dat)
+#define khp_delete(name, heap) khp_delete_##name(heap)
+
+#endif
diff --git a/bcftools/main.c b/bcftools/main.c
index 1892c1d..9350ff8 100644
--- a/bcftools/main.c
+++ b/bcftools/main.c
@@ -54,6 +54,8 @@ int main_polysomy(int argc, char *argv[]);
#endif
int main_plugin(int argc, char *argv[]);
int main_consensus(int argc, char *argv[]);
+int main_csq(int argc, char *argv[]);
+int bam_mpileup(int argc, char *argv[]);
typedef struct
{
@@ -140,6 +142,10 @@ static cmd_t cmds[] =
.alias = "cnv",
.help = "HMM CNV calling"
},
+ { .func = main_csq,
+ .alias = "csq",
+ .help = "call variation consequences"
+ },
{ .func = main_vcffilter,
.alias = "filter",
.help = "filter VCF/BCF files using fixed thresholds"
@@ -148,6 +154,10 @@ static cmd_t cmds[] =
.alias = "gtcheck",
.help = "check sample concordance, detect sample swaps and contamination"
},
+ { .func = bam_mpileup,
+ .alias = "mpileup",
+ .help = "multi-way pileup producing genotype likelihoods"
+ },
#if USE_GPL
{ .func = main_polysomy,
.alias = "polysomy",
diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c
index f578442..a2b4a99 100644
--- a/bcftools/main.c.pysam.c
+++ b/bcftools/main.c.pysam.c
@@ -56,6 +56,8 @@ int main_polysomy(int argc, char *argv[]);
#endif
int main_plugin(int argc, char *argv[]);
int main_consensus(int argc, char *argv[]);
+int main_csq(int argc, char *argv[]);
+int bam_mpileup(int argc, char *argv[]);
typedef struct
{
@@ -142,6 +144,10 @@ static cmd_t cmds[] =
.alias = "cnv",
.help = "HMM CNV calling"
},
+ { .func = main_csq,
+ .alias = "csq",
+ .help = "call variation consequences"
+ },
{ .func = main_vcffilter,
.alias = "filter",
.help = "filter VCF/BCF files using fixed thresholds"
@@ -150,6 +156,10 @@ static cmd_t cmds[] =
.alias = "gtcheck",
.help = "check sample concordance, detect sample swaps and contamination"
},
+ { .func = bam_mpileup,
+ .alias = "mpileup",
+ .help = "multi-way pileup producing genotype likelihoods"
+ },
#if USE_GPL
{ .func = main_polysomy,
.alias = "polysomy",
diff --git a/bcftools/mcall.c b/bcftools/mcall.c
index 495f849..7f7515f 100644
--- a/bcftools/mcall.c
+++ b/bcftools/mcall.c
@@ -1,6 +1,6 @@
/* mcall.c -- multiallelic and rare variant calling.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -107,6 +107,16 @@ int calc_Pkij(int fals, int mals, int kals, int fpl, int mpl, int kpl)
//
static void mcall_init_trios(call_t *call)
{
+ if ( call->prior_AN )
+ {
+ int id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AN);
+ if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AN);
+ if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AN);
+ id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AC);
+ if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AC);
+ if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AC);
+ }
+
// 23, 138, 478 possible diploid trio genotypes with 2, 3, 4 alleles
call->ntrio[FTYPE_222][2] = 15; call->ntrio[FTYPE_222][3] = 78; call->ntrio[FTYPE_222][4] = 250;
call->ntrio[FTYPE_121][2] = 8; call->ntrio[FTYPE_121][3] = 27; call->ntrio[FTYPE_121][4] = 64;
@@ -347,8 +357,7 @@ void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unse
break;
}
if ( PLs[j]==bcf_int32_missing ) break;
- assert( PLs[j]<256 );
- pdg[j] = pl2p[ PLs[j] ];
+ pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.);
sum += pdg[j];
}
@@ -367,8 +376,7 @@ void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unse
{
assert( PLs[j]!=bcf_int32_vector_end );
if ( PLs[j]==bcf_int32_missing ) PLs[j] = 255;
- assert( PLs[j]<256 );
- pdg[j] = pl2p[ PLs[j] ];
+ pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.);
sum += pdg[j];
}
}
@@ -539,19 +547,19 @@ float calc_HOB(int nref, int nalt, int nhets, int ndiploid)
/**
* log(sum_i exp(a_i))
*/
-static inline double logsumexp(double *vals, int nvals)
-{
- int i;
- double max_exp = vals[0];
- for (i=1; i<nvals; i++)
- if ( max_exp < vals[i] ) max_exp = vals[i];
-
- double sum = 0;
- for (i=0; i<nvals; i++)
- sum += exp(vals[i] - max_exp);
-
- return log(sum) + max_exp;
-}
+// static inline double logsumexp(double *vals, int nvals)
+// {
+// int i;
+// double max_exp = vals[0];
+// for (i=1; i<nvals; i++)
+// if ( max_exp < vals[i] ) max_exp = vals[i];
+
+// double sum = 0;
+// for (i=0; i<nvals; i++)
+// sum += exp(vals[i] - max_exp);
+
+// return log(sum) + max_exp;
+// }
/** log(exp(a)+exp(b)) */
static inline double logsumexp2(double a, double b)
{
@@ -562,9 +570,9 @@ static inline double logsumexp2(double a, double b)
}
// Macro to set the most likely alleles
-#define UPDATE_MAX_LKs(als) { \
+#define UPDATE_MAX_LKs(als,sum) { \
if ( max_lk<lk_tot ) { max_lk = lk_tot; max_als = (als); } \
- if ( lk_tot_set ) lk_sum = logsumexp2(lk_tot,lk_sum); \
+ if ( sum ) lk_sum = logsumexp2(lk_tot,lk_sum); \
}
#define SWAP(type_t,x,y) {type_t tmp; tmp = x; x = y; y = tmp; }
@@ -595,7 +603,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
}
if ( ia==0 ) ref_lk = lk_tot; // likelihood of 0/0 for all samples
else lk_tot += call->theta; // the prior
- UPDATE_MAX_LKs(1<<ia);
+ UPDATE_MAX_LKs(1<<ia, ia>0 && lk_tot_set);
}
// Two alleles
@@ -612,14 +620,16 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
int lk_tot_set = 0;
double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]);
double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]);
- double fab = 2*fa*fb; fa *= fa; fb *= fb;
+ double fa2 = fa*fa;
+ double fb2 = fb*fb;
+ double fab = 2*fa*fb;
int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
double *pdg = call->pdg;
for (isample=0; isample<nsmpl; isample++)
{
double val = 0;
if ( !call->ploidy || call->ploidy[isample]==2 )
- val = fa*pdg[iaa] + fb*pdg[ibb] + fab*pdg[iab];
+ val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab];
else if ( call->ploidy && call->ploidy[isample]==1 )
val = fa*pdg[iaa] + fb*pdg[ibb];
if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
@@ -627,7 +637,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
}
if ( ia!=0 ) lk_tot += call->theta; // the prior
if ( ib!=0 ) lk_tot += call->theta;
- UPDATE_MAX_LKs(1<<ia|1<<ib);
+ UPDATE_MAX_LKs(1<<ia|1<<ib, lk_tot_set);
}
}
}
@@ -652,7 +662,10 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
double fc = call->qsum[ic]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
- double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; fa *= fa; fb *= fb; fc *= fc;
+ double fa2 = fa*fa;
+ double fb2 = fb*fb;
+ double fc2 = fc*fc;
+ double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc;
int isample, icc = (ic+1)*(ic+2)/2-1;
int iac = iaa - ia + ic, ibc = ibb - ib + ic;
double *pdg = call->pdg;
@@ -660,7 +673,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
{
double val = 0;
if ( !call->ploidy || call->ploidy[isample]==2 )
- val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
+ val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
else if ( call->ploidy && call->ploidy[isample]==1 )
val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc];
if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
@@ -669,7 +682,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
if ( ia!=0 ) lk_tot += call->theta; // the prior
if ( ib!=0 ) lk_tot += call->theta; // the prior
if ( ic!=0 ) lk_tot += call->theta; // the prior
- UPDATE_MAX_LKs(1<<ia|1<<ib|1<<ic);
+ UPDATE_MAX_LKs(1<<ia|1<<ib|1<<ic, lk_tot_set);
}
}
}
@@ -780,7 +793,7 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
{
if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
int iaa = (ia+1)*(ia+2)/2-1; // PL index of the ia/ia genotype
- double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia];
+ double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia];
#if USE_PRIOR_FOR_GTS
if ( ia!=0 ) lk *= prior;
#endif
@@ -934,7 +947,7 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n
if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
int iaa = bcf_alleles2gt(ia,ia); // PL index of the ia/ia genotype
int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
- double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia];
+ double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia];
sum_lk += lk;
gls[idx] = lk;
if ( best_lk < lk )
@@ -1184,82 +1197,80 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in
void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
{
- int i, ret;
+ if ( nals==nout_als ) return;
+
+ int i,j, nret, size = sizeof(float);
+
+ void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point
+ int ntmp_ori = call->n_itmp, ntmp_new = call->mPLs;
- // at the moment we have DPR,AD,ADF,ADR all Number=R,Type=Integer,
- // so only dealing with these cases at the moment
+ // INFO fields
for (i=0; i<rec->n_info; i++)
{
bcf_info_t *info = &rec->d.info[i];
int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_INFO,info->key);
- if ( vlen!=BCF_VL_R ) continue;
- int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key);
- if ( type!=BCF_HT_INT ) continue;
+ if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
- ret = bcf_get_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), &call->itmp, &call->n_itmp);
- if ( ret>0 )
+ int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key);
+ const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key);
+ nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
+ if ( nret<=0 ) continue;
+
+ if ( nout_als==1 )
+ bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1); // has to be the REF, the order could not change
+ else
{
- assert( ret==nals );
- if ( out_als==1 )
- bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->itmp, 1);
- else
+ for (j=0; j<nals; j++)
{
- int j;
- for (j=0; j<nals; j++)
- {
- if ( call->als_map[j]==-1 ) continue; // to be dropped
- call->PLs[ call->als_map[j] ] = call->itmp[j]; // reusing PLs storage which is not used at this point
- }
- bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->PLs, nout_als);
+ int k = call->als_map[j];
+ if ( k==-1 ) continue; // to be dropped
+ memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size);
}
+ bcf_update_info_int32(call->hdr, rec, key, tmp_new, nout_als);
}
}
+ // FORMAT fields
for (i=0; i<rec->n_fmt; i++)
{
bcf_fmt_t *fmt = &rec->d.fmt[i];
int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id);
- if ( vlen!=BCF_VL_R ) continue;
+ if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
+
int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id);
- if ( type!=BCF_HT_INT ) continue;
+ const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id);
+ nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
+ if (nret<=0) continue;
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
- ret = bcf_get_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), &call->itmp, &call->n_itmp);
- if ( ret>0 )
- {
- int j, nsmpl = bcf_hdr_nsamples(call->hdr);
- int ndp = ret / nsmpl;
- assert( ndp==nals );
- if ( out_als==1 )
- {
- for (j=0; j<nsmpl; j++)
- call->PLs[j] = call->itmp[j*ndp];
+ assert( nret==nals*nsmpl );
- bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl);
- }
- else
+ for (j=0; j<nsmpl; j++)
+ {
+ char *ptr_src = (char *)tmp_ori + j*nals*size;
+ char *ptr_dst = (char *)tmp_new + j*nout_als*size;
+ int k;
+ for (k=0; k<nals; k++)
{
- int k;
- for (j=0; j<nsmpl; j++)
- {
- int32_t *dp_dst = call->PLs + j*nout_als;
- int32_t *dp_src = call->itmp + j*ndp;
- for (k=0; k<nals; k++)
- {
- if ( call->als_map[k]==-1 ) continue; // to be dropped
- dp_dst[ call->als_map[k] ] = dp_src[k]; // reusing PLs storage which is not used at this point
- }
- }
- bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl*nout_als);
+ int l = call->als_map[k];
+ if ( l==-1 ) continue; // to be dropped
+ memcpy(ptr_dst+size*l, ptr_src+size*k, size);
}
}
+ bcf_update_format_int32(call->hdr, rec, key, tmp_new, nout_als*nsmpl);
}
+
+ call->PLs = (int32_t*) tmp_new;
+ call->mPLs = ntmp_new;
+ call->itmp = (int32_t*) tmp_ori;
+ call->n_itmp = ntmp_ori;
}
// NB: in this function we temporarily use calls->als_map for a different
// purpose to store mapping from new (target) alleles to original alleles.
//
-static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
+static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
{
bcf_sr_regions_t *tgt = call->srs->targets;
if ( tgt->nals>5 ) error("Maximum accepted number of alleles is 5, got %d\n", tgt->nals);
@@ -1282,7 +1293,7 @@ static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
call->als[nals] = tgt->als[i];
j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]);
- if ( j+1==*unseen ) error("Cannot constrain to %s\n",tgt->als[i]);
+ if ( j+1==*unseen ) { fprintf(stderr,"fixme? Cannot constrain to %s\n",tgt->als[i]); return -1; }
if ( j>=0 )
{
@@ -1308,7 +1319,7 @@ static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
nals++;
}
- if ( !has_new && nals==rec->n_allele ) return;
+ if ( !has_new && nals==rec->n_allele ) return 0;
bcf_update_alleles(call->hdr, rec, (const char**)call->als, nals);
// create mapping from new PL to old PL
@@ -1360,6 +1371,7 @@ static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
bcf_update_info_float(call->hdr, rec, "QS", qsum, nals);
if ( *unseen ) *unseen = nals-1;
+ return 0;
}
@@ -1374,7 +1386,7 @@ int mcall(call_t *call, bcf1_t *rec)
int i, unseen = call->unseen;
// Force alleles when calling genotypes given alleles was requested
- if ( call->flag & CALL_CONSTR_ALLELES ) mcall_constrain_alleles(call, rec, &unseen);
+ if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2;
int nsmpl = bcf_hdr_nsamples(call->hdr);
int nals = rec->n_allele;
@@ -1395,7 +1407,7 @@ int mcall(call_t *call, bcf1_t *rec)
#if QS_FROM_PDG
estimate_qsum(call, rec);
#else
- // Get sum of qualities
+ // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes.
int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum);
if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1);
if ( nqs < nals )
@@ -1406,23 +1418,50 @@ int mcall(call_t *call, bcf1_t *rec)
hts_expand(float,nals,call->nqsum,call->qsum);
for (i=nqs; i<nals; i++) call->qsum[i] = 0;
}
- float qsum_tot = 0;
- for (i=0; i<nals; i++) qsum_tot += call->qsum[i];
- if ( !call->qsum[0] )
+
+ // If available, take into account reference panel AFs
+ if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 )
{
- // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value,
- // an equivalent of a single reference read.
- if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 )
- error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1);
- if ( call->itmp[0] )
+ int an = call->ac[0];
+ if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 )
{
- call->qsum[0] = 1.0 / call->itmp[0] / nsmpl;
- qsum_tot += call->qsum[0];
+ int ac0 = an; // number of alleles in the reference population
+ for (i=0; i<nals-1; i++)
+ {
+ if ( call->ac[i]==bcf_int32_vector_end ) break;
+ if ( call->ac[i]==bcf_int32_missing ) continue;
+ ac0 -= call->ac[i];
+ call->qsum[i+1] += call->ac[i]*0.5;
+ }
+ if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1);
+ call->qsum[0] += ac0*0.5;
+ for (i=0; i<nals; i++) call->qsum[i] /= nsmpl + 0.5*an;
}
}
+
+ float qsum_tot = 0;
+ for (i=0; i<nals; i++) qsum_tot += call->qsum[i];
+
+ // Is this still necessary??
+ //
+ // if (0&& !call->qsum[0] )
+ // {
+ // // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value,
+ // // an equivalent of a single reference read.
+ // if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 )
+ // error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1);
+ // if ( call->itmp[0] )
+ // {
+ // call->qsum[0] = 1.0 / call->itmp[0] / nsmpl;
+ // qsum_tot += call->qsum[0];
+ // }
+ // }
+
if ( qsum_tot ) for (i=0; i<nals; i++) call->qsum[i] /= qsum_tot;
#endif
+ bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
+
// Find the best combination of alleles
int out_als, nout;
if ( nals > 8*sizeof(out_als) )
@@ -1497,13 +1536,17 @@ int mcall(call_t *call, bcf1_t *rec)
if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1);
// Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set
- rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : fabs(-4.343*(call->ref_lk - call->lk_sum));
+ rec->qual = -4.343*(call->ref_lk - logsumexp2(call->lk_sum,call->ref_lk));
}
else
{
// Set the quality of a REF site
- rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : -4.343*log(1 - exp(call->ref_lk - call->lk_sum));
+ if ( call->lk_sum==-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior
+ rec->qual = call->theta ? -4.343*call->theta : 0;
+ else
+ rec->qual = -4.343*(call->lk_sum - logsumexp2(call->lk_sum,call->ref_lk));
}
+
if ( rec->qual>999 ) rec->qual = 999;
if ( rec->qual>50 ) rec->qual = rint(rec->qual);
@@ -1530,7 +1573,6 @@ int mcall(call_t *call, bcf1_t *rec)
}
bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag
- bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
return nout;
}
diff --git a/bcftools/mcall.c.pysam.c b/bcftools/mcall.c.pysam.c
index 29ed799..a315656 100644
--- a/bcftools/mcall.c.pysam.c
+++ b/bcftools/mcall.c.pysam.c
@@ -2,7 +2,7 @@
/* mcall.c -- multiallelic and rare variant calling.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -109,6 +109,16 @@ int calc_Pkij(int fals, int mals, int kals, int fpl, int mpl, int kpl)
//
static void mcall_init_trios(call_t *call)
{
+ if ( call->prior_AN )
+ {
+ int id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AN);
+ if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AN);
+ if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AN);
+ id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AC);
+ if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AC);
+ if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AC);
+ }
+
// 23, 138, 478 possible diploid trio genotypes with 2, 3, 4 alleles
call->ntrio[FTYPE_222][2] = 15; call->ntrio[FTYPE_222][3] = 78; call->ntrio[FTYPE_222][4] = 250;
call->ntrio[FTYPE_121][2] = 8; call->ntrio[FTYPE_121][3] = 27; call->ntrio[FTYPE_121][4] = 64;
@@ -349,8 +359,7 @@ void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unse
break;
}
if ( PLs[j]==bcf_int32_missing ) break;
- assert( PLs[j]<256 );
- pdg[j] = pl2p[ PLs[j] ];
+ pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.);
sum += pdg[j];
}
@@ -369,8 +378,7 @@ void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unse
{
assert( PLs[j]!=bcf_int32_vector_end );
if ( PLs[j]==bcf_int32_missing ) PLs[j] = 255;
- assert( PLs[j]<256 );
- pdg[j] = pl2p[ PLs[j] ];
+ pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.);
sum += pdg[j];
}
}
@@ -541,19 +549,19 @@ float calc_HOB(int nref, int nalt, int nhets, int ndiploid)
/**
* log(sum_i exp(a_i))
*/
-static inline double logsumexp(double *vals, int nvals)
-{
- int i;
- double max_exp = vals[0];
- for (i=1; i<nvals; i++)
- if ( max_exp < vals[i] ) max_exp = vals[i];
-
- double sum = 0;
- for (i=0; i<nvals; i++)
- sum += exp(vals[i] - max_exp);
-
- return log(sum) + max_exp;
-}
+// static inline double logsumexp(double *vals, int nvals)
+// {
+// int i;
+// double max_exp = vals[0];
+// for (i=1; i<nvals; i++)
+// if ( max_exp < vals[i] ) max_exp = vals[i];
+
+// double sum = 0;
+// for (i=0; i<nvals; i++)
+// sum += exp(vals[i] - max_exp);
+
+// return log(sum) + max_exp;
+// }
/** log(exp(a)+exp(b)) */
static inline double logsumexp2(double a, double b)
{
@@ -564,9 +572,9 @@ static inline double logsumexp2(double a, double b)
}
// Macro to set the most likely alleles
-#define UPDATE_MAX_LKs(als) { \
+#define UPDATE_MAX_LKs(als,sum) { \
if ( max_lk<lk_tot ) { max_lk = lk_tot; max_als = (als); } \
- if ( lk_tot_set ) lk_sum = logsumexp2(lk_tot,lk_sum); \
+ if ( sum ) lk_sum = logsumexp2(lk_tot,lk_sum); \
}
#define SWAP(type_t,x,y) {type_t tmp; tmp = x; x = y; y = tmp; }
@@ -597,7 +605,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
}
if ( ia==0 ) ref_lk = lk_tot; // likelihood of 0/0 for all samples
else lk_tot += call->theta; // the prior
- UPDATE_MAX_LKs(1<<ia);
+ UPDATE_MAX_LKs(1<<ia, ia>0 && lk_tot_set);
}
// Two alleles
@@ -614,14 +622,16 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
int lk_tot_set = 0;
double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]);
double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]);
- double fab = 2*fa*fb; fa *= fa; fb *= fb;
+ double fa2 = fa*fa;
+ double fb2 = fb*fb;
+ double fab = 2*fa*fb;
int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
double *pdg = call->pdg;
for (isample=0; isample<nsmpl; isample++)
{
double val = 0;
if ( !call->ploidy || call->ploidy[isample]==2 )
- val = fa*pdg[iaa] + fb*pdg[ibb] + fab*pdg[iab];
+ val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab];
else if ( call->ploidy && call->ploidy[isample]==1 )
val = fa*pdg[iaa] + fb*pdg[ibb];
if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
@@ -629,7 +639,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
}
if ( ia!=0 ) lk_tot += call->theta; // the prior
if ( ib!=0 ) lk_tot += call->theta;
- UPDATE_MAX_LKs(1<<ia|1<<ib);
+ UPDATE_MAX_LKs(1<<ia|1<<ib, lk_tot_set);
}
}
}
@@ -654,7 +664,10 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
double fc = call->qsum[ic]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
- double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; fa *= fa; fb *= fb; fc *= fc;
+ double fa2 = fa*fa;
+ double fb2 = fb*fb;
+ double fc2 = fc*fc;
+ double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc;
int isample, icc = (ic+1)*(ic+2)/2-1;
int iac = iaa - ia + ic, ibc = ibb - ib + ic;
double *pdg = call->pdg;
@@ -662,7 +675,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
{
double val = 0;
if ( !call->ploidy || call->ploidy[isample]==2 )
- val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
+ val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
else if ( call->ploidy && call->ploidy[isample]==1 )
val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc];
if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
@@ -671,7 +684,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
if ( ia!=0 ) lk_tot += call->theta; // the prior
if ( ib!=0 ) lk_tot += call->theta; // the prior
if ( ic!=0 ) lk_tot += call->theta; // the prior
- UPDATE_MAX_LKs(1<<ia|1<<ib|1<<ic);
+ UPDATE_MAX_LKs(1<<ia|1<<ib|1<<ic, lk_tot_set);
}
}
}
@@ -782,7 +795,7 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
{
if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
int iaa = (ia+1)*(ia+2)/2-1; // PL index of the ia/ia genotype
- double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia];
+ double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia];
#if USE_PRIOR_FOR_GTS
if ( ia!=0 ) lk *= prior;
#endif
@@ -936,7 +949,7 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n
if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
int iaa = bcf_alleles2gt(ia,ia); // PL index of the ia/ia genotype
int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
- double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia];
+ double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia];
sum_lk += lk;
gls[idx] = lk;
if ( best_lk < lk )
@@ -1186,82 +1199,80 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in
void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
{
- int i, ret;
+ if ( nals==nout_als ) return;
+
+ int i,j, nret, size = sizeof(float);
+
+ void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point
+ int ntmp_ori = call->n_itmp, ntmp_new = call->mPLs;
- // at the moment we have DPR,AD,ADF,ADR all Number=R,Type=Integer,
- // so only dealing with these cases at the moment
+ // INFO fields
for (i=0; i<rec->n_info; i++)
{
bcf_info_t *info = &rec->d.info[i];
int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_INFO,info->key);
- if ( vlen!=BCF_VL_R ) continue;
- int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key);
- if ( type!=BCF_HT_INT ) continue;
+ if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
- ret = bcf_get_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), &call->itmp, &call->n_itmp);
- if ( ret>0 )
+ int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key);
+ const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key);
+ nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
+ if ( nret<=0 ) continue;
+
+ if ( nout_als==1 )
+ bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1); // has to be the REF, the order could not change
+ else
{
- assert( ret==nals );
- if ( out_als==1 )
- bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->itmp, 1);
- else
+ for (j=0; j<nals; j++)
{
- int j;
- for (j=0; j<nals; j++)
- {
- if ( call->als_map[j]==-1 ) continue; // to be dropped
- call->PLs[ call->als_map[j] ] = call->itmp[j]; // reusing PLs storage which is not used at this point
- }
- bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->PLs, nout_als);
+ int k = call->als_map[j];
+ if ( k==-1 ) continue; // to be dropped
+ memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size);
}
+ bcf_update_info_int32(call->hdr, rec, key, tmp_new, nout_als);
}
}
+ // FORMAT fields
for (i=0; i<rec->n_fmt; i++)
{
bcf_fmt_t *fmt = &rec->d.fmt[i];
int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id);
- if ( vlen!=BCF_VL_R ) continue;
+ if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
+
int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id);
- if ( type!=BCF_HT_INT ) continue;
+ const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id);
+ nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
+ if (nret<=0) continue;
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
- ret = bcf_get_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), &call->itmp, &call->n_itmp);
- if ( ret>0 )
- {
- int j, nsmpl = bcf_hdr_nsamples(call->hdr);
- int ndp = ret / nsmpl;
- assert( ndp==nals );
- if ( out_als==1 )
- {
- for (j=0; j<nsmpl; j++)
- call->PLs[j] = call->itmp[j*ndp];
+ assert( nret==nals*nsmpl );
- bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl);
- }
- else
+ for (j=0; j<nsmpl; j++)
+ {
+ char *ptr_src = (char *)tmp_ori + j*nals*size;
+ char *ptr_dst = (char *)tmp_new + j*nout_als*size;
+ int k;
+ for (k=0; k<nals; k++)
{
- int k;
- for (j=0; j<nsmpl; j++)
- {
- int32_t *dp_dst = call->PLs + j*nout_als;
- int32_t *dp_src = call->itmp + j*ndp;
- for (k=0; k<nals; k++)
- {
- if ( call->als_map[k]==-1 ) continue; // to be dropped
- dp_dst[ call->als_map[k] ] = dp_src[k]; // reusing PLs storage which is not used at this point
- }
- }
- bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl*nout_als);
+ int l = call->als_map[k];
+ if ( l==-1 ) continue; // to be dropped
+ memcpy(ptr_dst+size*l, ptr_src+size*k, size);
}
}
+ bcf_update_format_int32(call->hdr, rec, key, tmp_new, nout_als*nsmpl);
}
+
+ call->PLs = (int32_t*) tmp_new;
+ call->mPLs = ntmp_new;
+ call->itmp = (int32_t*) tmp_ori;
+ call->n_itmp = ntmp_ori;
}
// NB: in this function we temporarily use calls->als_map for a different
// purpose to store mapping from new (target) alleles to original alleles.
//
-static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
+static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
{
bcf_sr_regions_t *tgt = call->srs->targets;
if ( tgt->nals>5 ) error("Maximum accepted number of alleles is 5, got %d\n", tgt->nals);
@@ -1284,7 +1295,7 @@ static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
call->als[nals] = tgt->als[i];
j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]);
- if ( j+1==*unseen ) error("Cannot constrain to %s\n",tgt->als[i]);
+ if ( j+1==*unseen ) { fprintf(pysam_stderr,"fixme? Cannot constrain to %s\n",tgt->als[i]); return -1; }
if ( j>=0 )
{
@@ -1310,7 +1321,7 @@ static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
nals++;
}
- if ( !has_new && nals==rec->n_allele ) return;
+ if ( !has_new && nals==rec->n_allele ) return 0;
bcf_update_alleles(call->hdr, rec, (const char**)call->als, nals);
// create mapping from new PL to old PL
@@ -1362,6 +1373,7 @@ static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
bcf_update_info_float(call->hdr, rec, "QS", qsum, nals);
if ( *unseen ) *unseen = nals-1;
+ return 0;
}
@@ -1376,7 +1388,7 @@ int mcall(call_t *call, bcf1_t *rec)
int i, unseen = call->unseen;
// Force alleles when calling genotypes given alleles was requested
- if ( call->flag & CALL_CONSTR_ALLELES ) mcall_constrain_alleles(call, rec, &unseen);
+ if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2;
int nsmpl = bcf_hdr_nsamples(call->hdr);
int nals = rec->n_allele;
@@ -1397,7 +1409,7 @@ int mcall(call_t *call, bcf1_t *rec)
#if QS_FROM_PDG
estimate_qsum(call, rec);
#else
- // Get sum of qualities
+ // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes.
int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum);
if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1);
if ( nqs < nals )
@@ -1408,23 +1420,50 @@ int mcall(call_t *call, bcf1_t *rec)
hts_expand(float,nals,call->nqsum,call->qsum);
for (i=nqs; i<nals; i++) call->qsum[i] = 0;
}
- float qsum_tot = 0;
- for (i=0; i<nals; i++) qsum_tot += call->qsum[i];
- if ( !call->qsum[0] )
+
+ // If available, take into account reference panel AFs
+ if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 )
{
- // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value,
- // an equivalent of a single reference read.
- if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 )
- error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1);
- if ( call->itmp[0] )
+ int an = call->ac[0];
+ if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 )
{
- call->qsum[0] = 1.0 / call->itmp[0] / nsmpl;
- qsum_tot += call->qsum[0];
+ int ac0 = an; // number of alleles in the reference population
+ for (i=0; i<nals-1; i++)
+ {
+ if ( call->ac[i]==bcf_int32_vector_end ) break;
+ if ( call->ac[i]==bcf_int32_missing ) continue;
+ ac0 -= call->ac[i];
+ call->qsum[i+1] += call->ac[i]*0.5;
+ }
+ if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1);
+ call->qsum[0] += ac0*0.5;
+ for (i=0; i<nals; i++) call->qsum[i] /= nsmpl + 0.5*an;
}
}
+
+ float qsum_tot = 0;
+ for (i=0; i<nals; i++) qsum_tot += call->qsum[i];
+
+ // Is this still necessary??
+ //
+ // if (0&& !call->qsum[0] )
+ // {
+ // // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value,
+ // // an equivalent of a single reference read.
+ // if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 )
+ // error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1);
+ // if ( call->itmp[0] )
+ // {
+ // call->qsum[0] = 1.0 / call->itmp[0] / nsmpl;
+ // qsum_tot += call->qsum[0];
+ // }
+ // }
+
if ( qsum_tot ) for (i=0; i<nals; i++) call->qsum[i] /= qsum_tot;
#endif
+ bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
+
// Find the best combination of alleles
int out_als, nout;
if ( nals > 8*sizeof(out_als) )
@@ -1499,13 +1538,17 @@ int mcall(call_t *call, bcf1_t *rec)
if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1);
// Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set
- rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : fabs(-4.343*(call->ref_lk - call->lk_sum));
+ rec->qual = -4.343*(call->ref_lk - logsumexp2(call->lk_sum,call->ref_lk));
}
else
{
// Set the quality of a REF site
- rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : -4.343*log(1 - exp(call->ref_lk - call->lk_sum));
+ if ( call->lk_sum==-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior
+ rec->qual = call->theta ? -4.343*call->theta : 0;
+ else
+ rec->qual = -4.343*(call->lk_sum - logsumexp2(call->lk_sum,call->ref_lk));
}
+
if ( rec->qual>999 ) rec->qual = 999;
if ( rec->qual>50 ) rec->qual = rint(rec->qual);
@@ -1532,7 +1575,6 @@ int mcall(call_t *call, bcf1_t *rec)
}
bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag
- bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
return nout;
}
diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c
new file mode 100644
index 0000000..ac37dd4
--- /dev/null
+++ b/bcftools/mpileup.c
@@ -0,0 +1,1110 @@
+/* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
+
+ Copyright (C) 2008-2017 Genome Research Ltd.
+ Portions copyright (C) 2009-2012 Broad Institute.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <string.h>
+#include <strings.h>
+#include <limits.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <getopt.h>
+#include <htslib/sam.h>
+#include <htslib/faidx.h>
+#include <htslib/kstring.h>
+#include <htslib/khash_str2int.h>
+#include <assert.h>
+#include "regidx.h"
+#include "bcftools.h"
+#include "bam2bcf.h"
+#include "bam_sample.h"
+#include "gvcf.h"
+
+#define MPLP_BCF 1
+#define MPLP_VCF (1<<1)
+#define MPLP_NO_COMP (1<<2)
+#define MPLP_NO_ORPHAN (1<<3)
+#define MPLP_REALN (1<<4)
+#define MPLP_NO_INDEL (1<<5)
+#define MPLP_REDO_BAQ (1<<6)
+#define MPLP_ILLUMINA13 (1<<7)
+#define MPLP_IGNORE_RG (1<<8)
+#define MPLP_PRINT_POS (1<<9)
+#define MPLP_PRINT_MAPQ (1<<10)
+#define MPLP_PER_SAMPLE (1<<11)
+#define MPLP_SMART_OVERLAPS (1<<12)
+
+typedef struct _mplp_aux_t mplp_aux_t;
+typedef struct _mplp_pileup_t mplp_pileup_t;
+
+// Data shared by all bam files
+typedef struct {
+ int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+ int rflag_require, rflag_filter, output_type;
+ int openQ, extQ, tandemQ, min_support; // for indels
+ double min_frac; // for indels
+ char *reg_fname, *pl_list, *fai_fname, *output_fname;
+ int reg_is_file, record_cmd_line, n_threads;
+ faidx_t *fai;
+ regidx_t *bed, *reg; // bed: skipping regions, reg: index-jump to regions
+ regitr_t *bed_itr, *reg_itr;
+ int bed_logic; // 1: include region, 0: exclude region
+ gvcf_t *gvcf;
+
+ // auxiliary structures for calling
+ bcf_callaux_t *bca;
+ bcf_callret1_t *bcr;
+ bcf_call_t bc;
+ bam_mplp_t iter;
+ mplp_aux_t **mplp_data;
+ int nfiles;
+ char **files;
+ mplp_pileup_t *gplp;
+ int *n_plp;
+ const bam_pileup1_t **plp;
+ bam_smpl_t *bsmpl;
+ kstring_t buf;
+ bcf1_t *bcf_rec;
+ htsFile *bcf_fp;
+ bcf_hdr_t *bcf_hdr;
+ int argc;
+ char **argv;
+} mplp_conf_t;
+
+typedef struct {
+ char *ref[2];
+ int ref_id[2];
+ int ref_len[2];
+} mplp_ref_t;
+
+#define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}}
+
+// Data specific to each bam file
+struct _mplp_aux_t {
+ samFile *fp;
+ hts_itr_t *iter;
+ bam_hdr_t *h;
+ mplp_ref_t *ref;
+ const mplp_conf_t *conf;
+ int bam_id;
+ hts_idx_t *idx; // maintained only with more than one -r regions
+};
+
+// Data passed to htslib/mpileup
+struct _mplp_pileup_t {
+ int n;
+ int *n_plp, *m_plp;
+ bam_pileup1_t **plp;
+};
+
+static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) {
+ mplp_ref_t *r = ma->ref;
+
+ //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]);
+
+ if (!r || !ma->conf->fai) {
+ *ref = NULL;
+ return 0;
+ }
+
+ // Do we need to reference count this so multiple mplp_aux_t can
+ // track which references are in use?
+ // For now we just cache the last two. Sufficient?
+ if (tid == r->ref_id[0]) {
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+ }
+ if (tid == r->ref_id[1]) {
+ // Last, swap over
+ int tmp;
+ tmp = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp;
+ tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp;
+
+ char *tc;
+ tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc;
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+ }
+
+ // New, so migrate to old and load new
+ free(r->ref[1]);
+ r->ref[1] = r->ref[0];
+ r->ref_id[1] = r->ref_id[0];
+ r->ref_len[1] = r->ref_len[0];
+
+ r->ref_id[0] = tid;
+ r->ref[0] = faidx_fetch_seq(ma->conf->fai,
+ ma->h->target_name[r->ref_id[0]],
+ 0,
+ INT_MAX,
+ &r->ref_len[0]);
+
+ if (!r->ref[0]) {
+ r->ref[0] = NULL;
+ r->ref_id[0] = -1;
+ r->ref_len[0] = 0;
+ *ref = NULL;
+ return 0;
+ }
+
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+}
+
+static int mplp_func(void *data, bam1_t *b)
+{
+ char *ref;
+ mplp_aux_t *ma = (mplp_aux_t*)data;
+ int ret, ref_len;
+ while (1)
+ {
+ int has_ref;
+ ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b);
+ if (ret < 0) break;
+ // The 'B' cigar operation is not part of the specification, considering as obsolete.
+ // bam_remove_B(b);
+ if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) continue; // exclude unmapped reads
+ if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) continue;
+ if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) continue;
+ if (ma->conf->bed)
+ {
+ // test overlap
+ regitr_t *itr = ma->conf->bed_itr;
+ int beg = b->core.pos, end = bam_endpos(b)-1;
+ int overlap = regidx_overlap(ma->conf->bed, ma->h->target_name[b->core.tid],beg,end, itr);
+ if ( !ma->conf->bed_logic && !overlap )
+ {
+ // exclude only reads which are fully contained in the region
+ while ( regitr_overlap(itr) )
+ {
+ if ( beg < itr->beg ) { overlap = 1; break; }
+ if ( end > itr->end ) { overlap = 1; break; }
+ }
+ }
+ if ( !overlap ) continue;
+ }
+ if ( bam_smpl_get_sample_id(ma->conf->bsmpl,ma->bam_id,b)<0 ) continue;
+ if (ma->conf->flag & MPLP_ILLUMINA13) {
+ int i;
+ uint8_t *qual = bam_get_qual(b);
+ for (i = 0; i < b->core.l_qseq; ++i)
+ qual[i] = qual[i] > 31? qual[i] - 31 : 0;
+ }
+
+ if (ma->conf->fai && b->core.tid >= 0) {
+ has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len);
+ if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence
+ fprintf(stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n",
+ __func__, b->core.pos, ref_len, b->core.tid);
+ continue;
+ }
+ } else {
+ has_ref = 0;
+ }
+
+ if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+ if (has_ref && ma->conf->capQ_thres > 10) {
+ int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
+ if (q < 0) continue; // skip
+ else if (b->core.qual > q) b->core.qual = q;
+ }
+ if (b->core.qual < ma->conf->min_mq) continue;
+ else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&BAM_FPAIRED) && !(b->core.flag&BAM_FPROPER_PAIR)) continue;
+
+ return ret;
+ };
+ return ret;
+}
+
+// Called once per new bam added to the pileup.
+// We cache sample information here so we don't have to keep recomputing this
+// on each and every pileup column.
+//
+// Cd is an arbitrary block of data we can write into, which ends up in
+// the pileup structures. We stash the sample ID there.
+static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+ mplp_aux_t *ma = (mplp_aux_t *)data;
+ cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b);
+ return 0;
+}
+
+static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, const bam_pileup1_t **plp)
+{
+ int i, j;
+ memset(m->n_plp, 0, m->n * sizeof(int));
+ for (i = 0; i < n; ++i) // iterate over all bams
+ {
+ for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position
+ {
+ const bam_pileup1_t *p = plp[i] + j;
+ int id = p->cd.i;
+ if (m->n_plp[id] == m->m_plp[id])
+ {
+ m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
+ m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
+ }
+ m->plp[id][m->n_plp[id]++] = *p;
+ }
+ }
+}
+
+static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bcf1_t *rec)
+{
+ if ( !conf->gvcf )
+ {
+ if ( rec ) bcf_write1(fp, hdr, rec);
+ return;
+ }
+
+ if ( !rec )
+ {
+ gvcf_write(conf->gvcf, fp, hdr, NULL, 0);
+ return;
+ }
+
+ int is_ref = 0;
+ if ( rec->n_allele==1 ) is_ref = 1;
+ else if ( rec->n_allele==2 )
+ {
+ // second allele is mpileup's X, not a variant
+ if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][1]=='*' && rec->d.allele[1][2]=='>' ) is_ref = 1;
+ }
+ rec = gvcf_write(conf->gvcf, fp, hdr, rec, is_ref);
+ if ( rec ) bcf_write1(fp,hdr,rec);
+}
+
+static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
+{
+ bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list
+
+ int ret, i, tid, pos, ref_len;
+ char *ref;
+
+ while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0)
+ {
+ if ( end && (pos<beg || pos>end) ) continue;
+ if ( conf->bed && tid >= 0 )
+ {
+ int overlap = regidx_overlap(conf->bed, hdr->target_name[tid], pos, pos, NULL);
+ if ( !conf->bed_logic ) overlap = overlap ? 0 : 1;
+ if ( !overlap ) continue;
+ }
+ mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
+
+ int total_depth, _ref0, ref16;
+ for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i];
+ group_smpl(conf->gplp, conf->bsmpl, conf->nfiles, conf->n_plp, conf->plp);
+ _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
+ ref16 = seq_nt16_table[_ref0];
+ bcf_callaux_clean(conf->bca, &conf->bc);
+ for (i = 0; i < conf->gplp->n; ++i)
+ bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], ref16, conf->bca, conf->bcr + i);
+ conf->bc.tid = tid; conf->bc.pos = pos;
+ bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc);
+ bcf_clear1(conf->bcf_rec);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0);
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
+
+ // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
+ // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
+ if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
+ && bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)
+ {
+ bcf_callaux_clean(conf->bca, &conf->bc);
+ for (i = 0; i < conf->gplp->n; ++i)
+ bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
+ if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
+ {
+ bcf_clear1(conf->bcf_rec);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
+ }
+ }
+ }
+ return 0;
+}
+
+static int mpileup(mplp_conf_t *conf)
+{
+ if (conf->nfiles == 0) {
+ fprintf(stderr,"[%s] no input file/data given\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ mplp_ref_t mp_ref = MPLP_REF_INIT;
+ conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t));
+ conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*));
+ conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*));
+ conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int));
+
+ // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index
+ // must be kept in the memory for the whole time which can be a problem with many bams.
+ // Therefore if none or only one region is requested, we initialize the bam iterator as
+ // before and free the index. Only when multiple regions are queried, we keep the index.
+ int nregs = 0;
+ if ( conf->reg_fname )
+ {
+ if ( conf->reg_is_file )
+ {
+ conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL);
+ if ( !conf->reg ) {
+ fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
+ exit(EXIT_FAILURE);
+ }
+ }
+ else
+ {
+ conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
+ if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) {
+ fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
+ exit(EXIT_FAILURE);
+ }
+ }
+ nregs = regidx_nregs(conf->reg);
+ conf->reg_itr = regitr_init(conf->reg);
+ regitr_loop(conf->reg_itr); // region iterator now positioned at the first region
+ }
+
+ // read the header of each file in the list and initialize data
+ // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least!
+ bam_hdr_t *hdr = NULL; // header of first file in input list
+ int i;
+ for (i = 0; i < conf->nfiles; ++i) {
+ bam_hdr_t *h_tmp;
+ conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t));
+ conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb");
+ if ( !conf->mplp_data[i]->fp )
+ {
+ fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
+ fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ exit(EXIT_FAILURE);
+ }
+ if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) {
+ fprintf(stderr, "[%s] failed to process %s: %s\n",
+ __func__, conf->fai_fname, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ conf->mplp_data[i]->conf = conf;
+ conf->mplp_data[i]->ref = &mp_ref;
+ h_tmp = sam_hdr_read(conf->mplp_data[i]->fp);
+ if ( !h_tmp ) {
+ fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet
+ conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]);
+ if ( conf->mplp_data[i]->bam_id<0 )
+ {
+ // no usable readgroups in this bam, it can be skipped
+ sam_close(conf->mplp_data[i]->fp);
+ free(conf->mplp_data[i]);
+ bam_hdr_destroy(h_tmp);
+ free(conf->files[i]);
+ if ( i+1<conf->nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1));
+ conf->nfiles--;
+ i--;
+ continue;
+ }
+ if (conf->reg) {
+ hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]);
+ if (idx == NULL) {
+ fprintf(stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ conf->buf.l = 0;
+ ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
+ conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s);
+ if ( !conf->mplp_data[i]->iter )
+ {
+ conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
+ if ( conf->mplp_data[i]->iter ) {
+ fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
+ exit(EXIT_FAILURE);
+ }
+ fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ if ( nregs==1 ) // no need to keep the index in memory
+ hts_idx_destroy(idx);
+ else
+ conf->mplp_data[i]->idx = idx;
+ }
+
+ if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */
+ else {
+ // FIXME: check consistency between h and h_tmp
+ bam_hdr_destroy(h_tmp);
+
+ // we store only the first file's header; it's (alleged to be)
+ // compatible with the i-th file's target_name lookup needs
+ conf->mplp_data[i]->h = hdr;
+ }
+ }
+ // allocate data storage proportionate to number of samples being studied sm->n
+ bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n);
+ conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int));
+ conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int));
+ conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));
+
+ fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
+ // write the VCF header
+ conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type));
+ if (conf->bcf_fp == NULL) {
+ fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads);
+
+ // BCF header creation
+ conf->bcf_hdr = bcf_hdr_init("w");
+ conf->buf.l = 0;
+
+ if (conf->record_cmd_line)
+ {
+ ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version());
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+
+ conf->buf.l = 0;
+ ksprintf(&conf->buf, "##bcftoolsCommand=mpileup");
+ for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]);
+ kputc('\n', &conf->buf);
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ }
+
+ if (conf->fai_fname)
+ {
+ conf->buf.l = 0;
+ ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname);
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ }
+
+ // Translate BAM @SQ tags to BCF ##contig tags
+ // todo: use/write new BAM header manipulation routines, fill also UR, M5
+ for (i=0; i<hdr->n_targets; i++)
+ {
+ conf->buf.l = 0;
+ ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>", hdr->target_name[i], hdr->target_len[i]);
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ }
+ conf->buf.l = 0;
+
+ bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
+#if CDF_MWU_TESTS
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
+#endif
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
+ if ( conf->fmt_flag&B2B_FMT_DP )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">");
+ if ( conf->fmt_flag&B2B_FMT_DV )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">");
+ if ( conf->fmt_flag&B2B_FMT_DPR )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
+ if ( conf->fmt_flag&B2B_INFO_DPR )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
+ if ( conf->fmt_flag&B2B_FMT_DP4 )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
+ if ( conf->fmt_flag&B2B_FMT_SP )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");
+ if ( conf->fmt_flag&B2B_FMT_AD )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">");
+ if ( conf->fmt_flag&B2B_FMT_ADF )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">");
+ if ( conf->fmt_flag&B2B_FMT_ADR )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">");
+ if ( conf->fmt_flag&B2B_INFO_AD )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">");
+ if ( conf->fmt_flag&B2B_INFO_ADF )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">");
+ if ( conf->fmt_flag&B2B_INFO_ADR )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">");
+ if ( conf->gvcf )
+ gvcf_update_header(conf->gvcf, conf->bcf_hdr);
+
+ int nsmpl;
+ const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl);
+ for (i=0; i<nsmpl; i++)
+ bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
+ bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr);
+
+ conf->bca = bcf_call_init(-1., conf->min_baseQ);
+ conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
+ conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
+ conf->bca->min_frac = conf->min_frac;
+ conf->bca->min_support = conf->min_support;
+ conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
+
+ conf->bc.bcf_hdr = conf->bcf_hdr;
+ conf->bc.n = nsmpl;
+ conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL));
+ if (conf->fmt_flag)
+ {
+ assert( sizeof(float)==sizeof(int32_t) );
+ conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4);
+ conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32
+ if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) )
+ {
+ // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
+ conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
+ conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
+ for (i=0; i<nsmpl; i++)
+ {
+ conf->bcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES;
+ conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES;
+ }
+ }
+ }
+
+ // init mpileup
+ conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data);
+ if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter);
+ if ( (double)conf->max_depth * conf->nfiles > 1<<20)
+ fprintf(stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles);
+ if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 )
+ fprintf(stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl);
+ bam_mplp_set_maxcnt(conf->iter, conf->max_depth);
+ conf->max_indel_depth = conf->max_indel_depth * nsmpl;
+ conf->bcf_rec = bcf_init1();
+ bam_mplp_constructor(conf->iter, pileup_constructor);
+
+ // Run mpileup for multiple regions
+ if ( nregs )
+ {
+ int ireg = 0;
+ do
+ {
+ // first region is already positioned
+ if ( ireg++ > 0 )
+ {
+ conf->buf.l = 0;
+ ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end);
+
+ for (i=0; i<conf->nfiles; i++)
+ {
+ hts_itr_destroy(conf->mplp_data[i]->iter);
+ conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s);
+ if ( !conf->mplp_data[i]->iter )
+ {
+ conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
+ if ( conf->mplp_data[i]->iter ) {
+ fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
+ exit(EXIT_FAILURE);
+ }
+ fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ bam_mplp_reset(conf->iter);
+ }
+ }
+ mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end);
+ }
+ while ( regitr_loop(conf->reg_itr) );
+ }
+ else
+ mpileup_reg(conf,0,0);
+
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL);
+
+ // clean up
+ free(conf->bc.tmp.s);
+ bcf_destroy1(conf->bcf_rec);
+ if (conf->bcf_fp)
+ {
+ hts_close(conf->bcf_fp);
+ bcf_hdr_destroy(conf->bcf_hdr);
+ bcf_call_destroy(conf->bca);
+ free(conf->bc.PL);
+ free(conf->bc.DP4);
+ free(conf->bc.ADR);
+ free(conf->bc.ADF);
+ free(conf->bc.fmt_arr);
+ free(conf->bcr);
+ }
+ if ( conf->gvcf ) gvcf_destroy(conf->gvcf);
+ free(conf->buf.s);
+ for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]);
+ free(conf->gplp->plp); free(conf->gplp->n_plp); free(conf->gplp->m_plp); free(conf->gplp);
+ bam_mplp_destroy(conf->iter);
+ bam_hdr_destroy(hdr);
+ for (i = 0; i < conf->nfiles; ++i) {
+ if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx);
+ sam_close(conf->mplp_data[i]->fp);
+ if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter);
+ free(conf->mplp_data[i]);
+ }
+ if ( conf->reg_itr ) regitr_destroy(conf->reg_itr);
+ free(conf->mplp_data); free(conf->plp); free(conf->n_plp);
+ free(mp_ref.ref[0]);
+ free(mp_ref.ref[1]);
+ return 0;
+}
+
+static int is_url(const char *s)
+{
+ static const char uri_scheme_chars[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
+ return s[strspn(s, uri_scheme_chars)] == ':';
+}
+
+#define MAX_PATH_LEN 1024
+int read_file_list(const char *file_list,int *n,char **argv[])
+{
+ char buf[MAX_PATH_LEN];
+ int len, nfiles = 0;
+ char **files = NULL;
+ struct stat sb;
+
+ *n = 0;
+ *argv = NULL;
+
+ FILE *fh = fopen(file_list,"r");
+ if ( !fh )
+ {
+ fprintf(stderr,"%s: %s\n", file_list,strerror(errno));
+ return 1;
+ }
+
+ files = (char**) calloc(nfiles,sizeof(char*));
+ nfiles = 0;
+ while ( fgets(buf,MAX_PATH_LEN,fh) )
+ {
+ // allow empty lines and trailing spaces
+ len = strlen(buf);
+ while ( len>0 && isspace(buf[len-1]) ) len--;
+ if ( !len ) continue;
+
+ // check sanity of the file list
+ buf[len] = 0;
+ if (! (is_url(buf) || stat(buf, &sb) == 0))
+ {
+ // no such file, check if it is safe to print its name
+ int i, safe_to_print = 1;
+ for (i=0; i<len; i++)
+ if (!isprint(buf[i])) { safe_to_print = 0; break; }
+ if ( safe_to_print )
+ fprintf(stderr,"The file list \"%s\" appears broken, could not locate: %s\n", file_list,buf);
+ else
+ fprintf(stderr,"Does the file \"%s\" really contain a list of files and do all exist?\n", file_list);
+ return 1;
+ }
+
+ nfiles++;
+ files = (char**) realloc(files,nfiles*sizeof(char*));
+ files[nfiles-1] = strdup(buf);
+ }
+ fclose(fh);
+ if ( !nfiles )
+ {
+ fprintf(stderr,"No files read from %s\n", file_list);
+ return 1;
+ }
+ *argv = files;
+ *n = nfiles;
+ return 0;
+}
+#undef MAX_PATH_LEN
+
+int parse_format_flag(const char *str)
+{
+ int i, flag = 0, n_tags;
+ char **tags = hts_readlist(str, 0, &n_tags);
+ for(i=0; i<n_tags; i++)
+ {
+ if ( !strcasecmp(tags[i],"DP") || !strcasecmp(tags[i],"FORMAT/DP") || !strcasecmp(tags[i],"FMT/DP") ) flag |= B2B_FMT_DP;
+ else if ( !strcasecmp(tags[i],"DV") || !strcasecmp(tags[i],"FORMAT/DV") || !strcasecmp(tags[i],"FMT/DV") ) { flag |= B2B_FMT_DV; fprintf(stderr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"SP") || !strcasecmp(tags[i],"FORMAT/SP") || !strcasecmp(tags[i],"FMT/SP") ) flag |= B2B_FMT_SP;
+ else if ( !strcasecmp(tags[i],"DP4") || !strcasecmp(tags[i],"FORMAT/DP4") || !strcasecmp(tags[i],"FMT/DP4") ) { flag |= B2B_FMT_DP4; fprintf(stderr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"DPR") || !strcasecmp(tags[i],"FORMAT/DPR") || !strcasecmp(tags[i],"FMT/DPR") ) { flag |= B2B_FMT_DPR; fprintf(stderr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(stderr, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD;
+ else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF;
+ else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
+ else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
+ else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
+ else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
+ else
+ {
+ fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
+ exit(EXIT_FAILURE);
+ }
+ free(tags[i]);
+ }
+ if (n_tags) free(tags);
+ return flag;
+}
+
+static void list_annotations(FILE *fp)
+{
+ fprintf(fp,
+"\n"
+"FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
+"\n"
+" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n"
+" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
+" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
+" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
+"\n"
+"INFO annotation tags available:\n"
+"\n"
+" INFO/AD .. Total allelic depth (Number=R,Type=Integer)\n"
+" INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n"
+" INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+"\n");
+}
+
+static void print_usage(FILE *fp, const mplp_conf_t *mplp)
+{
+ char *tmp_require = bam_flag2str(mplp->rflag_require);
+ char *tmp_filter = bam_flag2str(mplp->rflag_filter);
+
+ // Display usage information, formatted for the standard 80 columns.
+ // (The unusual string formatting here aids the readability of this
+ // source code in 80 columns, to the extent that's possible.)
+
+ fprintf(fp,
+"\n"
+"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
+"\n"
+"Input options:\n"
+" -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n"
+" -A, --count-orphans do not discard anomalous read pairs\n"
+" -b, --bam-list FILE list of input BAM filenames, one per line\n"
+" -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n"
+" -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n"
+" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+ fprintf(fp,
+" -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n"
+" -f, --fasta-ref FILE faidx indexed reference sequence file\n"
+" --no-reference do not require fasta reference file\n"
+" -G, --read-groups FILE select or exclude read groups listed in the file\n"
+" -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
+ fprintf(fp,
+" -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
+ fprintf(fp,
+" -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
+" -R, --regions-file FILE restrict to regions listed in a file\n"
+" --ignore-RG ignore RG tags (one BAM = one sample)\n"
+" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+ fprintf(fp,
+" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n"
+" [%s]\n", tmp_filter);
+ fprintf(fp,
+" -s, --samples LIST comma separated list of samples to include\n"
+" -S, --samples-file FILE file of samples to include\n"
+" -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
+" -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
+" -x, --ignore-overlaps disable read-pair overlap detection\n"
+"\n"
+"Output options:\n"
+" -a, --annotate LIST optional tags to output; '?' to list []\n"
+" -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n"
+" to minimum per-sample DP\n"
+" --no-version do not append version and command line to the header\n"
+" -o, --output FILE write output to FILE [standard output]\n"
+" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
+" 'z' compressed VCF; 'v' uncompressed VCF [v]\n"
+" --threads INT number of extra output compression threads [0]\n"
+"\n"
+"SNP/INDEL genotype likelihoods options:\n"
+" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
+ fprintf(fp,
+" -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac);
+ fprintf(fp,
+" -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
+ fprintf(fp,
+" -I, --skip-indels do not perform indel calling\n"
+" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
+ fprintf(fp,
+" -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
+ fprintf(fp,
+" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
+ fprintf(fp,
+" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
+" -P, --platforms STR comma separated list of platforms for indels [all]\n"
+"\n"
+"Notes: Assuming diploid individuals.\n"
+"\n");
+
+ free(tmp_require);
+ free(tmp_filter);
+}
+
+int bam_mpileup(int argc, char *argv[])
+{
+ int c;
+ const char *file_list = NULL;
+ char **fn = NULL;
+ int nfiles = 0, use_orphan = 0, noref = 0;
+ mplp_conf_t mplp;
+ memset(&mplp, 0, sizeof(mplp_conf_t));
+ mplp.min_baseQ = 13;
+ mplp.capQ_thres = 0;
+ mplp.max_depth = 250; mplp.max_indel_depth = 250;
+ mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
+ mplp.min_frac = 0.002; mplp.min_support = 1;
+ mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS;
+ mplp.argc = argc; mplp.argv = argv;
+ mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
+ mplp.output_fname = NULL;
+ mplp.output_type = FT_VCF;
+ mplp.record_cmd_line = 1;
+ mplp.n_threads = 0;
+ mplp.bsmpl = bam_smpl_init();
+
+ static const struct option lopts[] =
+ {
+ {"rf", required_argument, NULL, 1}, // require flag
+ {"ff", required_argument, NULL, 2}, // filter flag
+ {"incl-flags", required_argument, NULL, 1},
+ {"excl-flags", required_argument, NULL, 2},
+ {"output", required_argument, NULL, 3},
+ {"open-prob", required_argument, NULL, 4},
+ {"ignore-RG", no_argument, NULL, 5},
+ {"ignore-rg", no_argument, NULL, 5},
+ {"gvcf", required_argument, NULL, 'g'},
+ {"non-reference", no_argument, NULL, 7},
+ {"no-version", no_argument, NULL, 8},
+ {"threads",required_argument,NULL,9},
+ {"illumina1.3+", no_argument, NULL, '6'},
+ {"count-orphans", no_argument, NULL, 'A'},
+ {"bam-list", required_argument, NULL, 'b'},
+ {"no-BAQ", no_argument, NULL, 'B'},
+ {"no-baq", no_argument, NULL, 'B'},
+ {"adjust-MQ", required_argument, NULL, 'C'},
+ {"adjust-mq", required_argument, NULL, 'C'},
+ {"max-depth", required_argument, NULL, 'd'},
+ {"redo-BAQ", no_argument, NULL, 'E'},
+ {"redo-baq", no_argument, NULL, 'E'},
+ {"fasta-ref", required_argument, NULL, 'f'},
+ {"read-groups", required_argument, NULL, 'G'},
+ {"region", required_argument, NULL, 'r'},
+ {"regions", required_argument, NULL, 'r'},
+ {"regions-file", required_argument, NULL, 'R'},
+ {"targets", required_argument, NULL, 't'},
+ {"targets-file", required_argument, NULL, 'T'},
+ {"min-MQ", required_argument, NULL, 'q'},
+ {"min-mq", required_argument, NULL, 'q'},
+ {"min-BQ", required_argument, NULL, 'Q'},
+ {"min-bq", required_argument, NULL, 'Q'},
+ {"ignore-overlaps", no_argument, NULL, 'x'},
+ {"output-type", required_argument, NULL, 'O'},
+ {"samples", required_argument, NULL, 's'},
+ {"samples-file", required_argument, NULL, 'S'},
+ {"annotate", required_argument, NULL, 'a'},
+ {"ext-prob", required_argument, NULL, 'e'},
+ {"gap-frac", required_argument, NULL, 'F'},
+ {"tandem-qual", required_argument, NULL, 'h'},
+ {"skip-indels", no_argument, NULL, 'I'},
+ {"max-idepth", required_argument, NULL, 'L'},
+ {"min-ireads ", required_argument, NULL, 'm'},
+ {"per-sample-mF", no_argument, NULL, 'p'},
+ {"per-sample-mf", no_argument, NULL, 'p'},
+ {"platforms", required_argument, NULL, 'P'},
+ {NULL, 0, NULL, 0}
+ };
+ while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) {
+ switch (c) {
+ case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
+ case 1 :
+ mplp.rflag_require = bam_str2flag(optarg);
+ if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; }
+ break;
+ case 2 :
+ mplp.rflag_filter = bam_str2flag(optarg);
+ if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; }
+ break;
+ case 3 : mplp.output_fname = optarg; break;
+ case 4 : mplp.openQ = atoi(optarg); break;
+ case 5 : bam_smpl_ignore_readgroups(mplp.bsmpl); break;
+ case 'g':
+ mplp.gvcf = gvcf_init(optarg);
+ if ( !mplp.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
+ break;
+ case 'f':
+ mplp.fai = fai_load(optarg);
+ if (mplp.fai == NULL) return 1;
+ mplp.fai_fname = optarg;
+ break;
+ case 7 : noref = 1; break;
+ case 8 : mplp.record_cmd_line = 0; break;
+ case 9 : mplp.n_threads = strtol(optarg, 0, 0); break;
+ case 'd': mplp.max_depth = atoi(optarg); break;
+ case 'r': mplp.reg_fname = strdup(optarg); break;
+ case 'R': mplp.reg_fname = strdup(optarg); mplp.reg_is_file = 1; break;
+ case 't':
+ // In the original version the whole BAM was streamed which is inefficient
+ // with few BED intervals and big BAMs. Todo: devise a heuristic to determine
+ // best strategy, that is streaming or jumping.
+ if ( optarg[0]=='^' ) optarg++;
+ else mplp.bed_logic = 1;
+ mplp.bed = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL);
+ mplp.bed_itr = regitr_init(mplp.bed);
+ if ( regidx_insert_list(mplp.bed,optarg,',') !=0 )
+ {
+ fprintf(stderr,"Could not parse the targets: %s\n", optarg);
+ exit(EXIT_FAILURE);
+ }
+ break;
+ case 'T':
+ if ( optarg[0]=='^' ) optarg++;
+ else mplp.bed_logic = 1;
+ mplp.bed = regidx_init(optarg,NULL,NULL,0,NULL);
+ if (!mplp.bed) { fprintf(stderr, "bcftools mpileup: Could not read file \"%s\"", optarg); return 1; }
+ break;
+ case 'P': mplp.pl_list = strdup(optarg); break;
+ case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
+ case 'B': mplp.flag &= ~MPLP_REALN; break;
+ case 'I': mplp.flag |= MPLP_NO_INDEL; break;
+ case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
+ case '6': mplp.flag |= MPLP_ILLUMINA13; break;
+ case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break;
+ case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': mplp.output_type = FT_BCF_GZ; break;
+ case 'u': mplp.output_type = FT_BCF; break;
+ case 'z': mplp.output_type = FT_VCF_GZ; break;
+ case 'v': mplp.output_type = FT_VCF; break;
+ default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n", optarg);
+ }
+ break;
+ case 'C': mplp.capQ_thres = atoi(optarg); break;
+ case 'q': mplp.min_mq = atoi(optarg); break;
+ case 'Q': mplp.min_baseQ = atoi(optarg); break;
+ case 'b': file_list = optarg; break;
+ case 'o': {
+ char *end;
+ long value = strtol(optarg, &end, 10);
+ // Distinguish between -o INT and -o FILE (a bit of a hack!)
+ if (*end == '\0') mplp.openQ = value;
+ else mplp.output_fname = optarg;
+ }
+ break;
+ case 'e': mplp.extQ = atoi(optarg); break;
+ case 'h': mplp.tandemQ = atoi(optarg); break;
+ case 'A': use_orphan = 1; break;
+ case 'F': mplp.min_frac = atof(optarg); break;
+ case 'm': mplp.min_support = atoi(optarg); break;
+ case 'L': mplp.max_indel_depth = atoi(optarg); break;
+ case 'G': bam_smpl_add_readgroups(mplp.bsmpl, optarg, 1); break;
+ case 'a':
+ if (optarg[0]=='?') {
+ list_annotations(stderr);
+ return 1;
+ }
+ mplp.fmt_flag |= parse_format_flag(optarg);
+ break;
+ default:
+ fprintf(stderr,"Invalid option: '%c'\n", c);
+ return 1;
+ }
+ }
+
+ if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) )
+ {
+ fprintf(stderr,"[warning] The -t DP option is required with --gvcf, switching on.\n");
+ mplp.fmt_flag |= B2B_FMT_DP;
+ }
+ if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) )
+ {
+ if ( mplp.flag&MPLP_VCF )
+ {
+ if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF;
+ else mplp.output_type = FT_VCF_GZ;
+ }
+ else if ( mplp.flag&MPLP_BCF )
+ {
+ if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF;
+ else mplp.output_type = FT_BCF_GZ;
+ }
+ }
+ if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ )
+ {
+ fprintf(stderr,"Error: The -B option cannot be combined with -E\n");
+ return 1;
+ }
+ if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
+ if (argc == 1)
+ {
+ print_usage(stderr, &mplp);
+ return 1;
+ }
+ if (!mplp.fai && !noref) {
+ fprintf(stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n");
+ return 1;
+ }
+ int ret,i;
+ if (file_list)
+ {
+ if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
+ mplp.files = fn;
+ mplp.nfiles = nfiles;
+ }
+ else
+ {
+ mplp.nfiles = argc - optind;
+ mplp.files = (char**) malloc(mplp.nfiles*sizeof(char*));
+ for (i=0; i<mplp.nfiles; i++) mplp.files[i] = strdup(argv[optind+i]);
+ }
+ ret = mpileup(&mplp);
+
+ for (i=0; i<mplp.nfiles; i++) free(mplp.files[i]);
+ free(mplp.files);
+ free(mplp.reg_fname); free(mplp.pl_list);
+ if (mplp.fai) fai_destroy(mplp.fai);
+ if (mplp.bed)
+ {
+ regidx_destroy(mplp.bed);
+ regitr_destroy(mplp.bed_itr);
+ }
+ if (mplp.reg) regidx_destroy(mplp.reg);
+ bam_smpl_destroy(mplp.bsmpl);
+ return ret;
+}
diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c
new file mode 100644
index 0000000..6ef6838
--- /dev/null
+++ b/bcftools/mpileup.c.pysam.c
@@ -0,0 +1,1112 @@
+#include "pysam.h"
+
+/* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
+
+ Copyright (C) 2008-2017 Genome Research Ltd.
+ Portions copyright (C) 2009-2012 Broad Institute.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <string.h>
+#include <strings.h>
+#include <limits.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <getopt.h>
+#include <htslib/sam.h>
+#include <htslib/faidx.h>
+#include <htslib/kstring.h>
+#include <htslib/khash_str2int.h>
+#include <assert.h>
+#include "regidx.h"
+#include "bcftools.h"
+#include "bam2bcf.h"
+#include "bam_sample.h"
+#include "gvcf.h"
+
+#define MPLP_BCF 1
+#define MPLP_VCF (1<<1)
+#define MPLP_NO_COMP (1<<2)
+#define MPLP_NO_ORPHAN (1<<3)
+#define MPLP_REALN (1<<4)
+#define MPLP_NO_INDEL (1<<5)
+#define MPLP_REDO_BAQ (1<<6)
+#define MPLP_ILLUMINA13 (1<<7)
+#define MPLP_IGNORE_RG (1<<8)
+#define MPLP_PRINT_POS (1<<9)
+#define MPLP_PRINT_MAPQ (1<<10)
+#define MPLP_PER_SAMPLE (1<<11)
+#define MPLP_SMART_OVERLAPS (1<<12)
+
+typedef struct _mplp_aux_t mplp_aux_t;
+typedef struct _mplp_pileup_t mplp_pileup_t;
+
+// Data shared by all bam files
+typedef struct {
+ int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+ int rflag_require, rflag_filter, output_type;
+ int openQ, extQ, tandemQ, min_support; // for indels
+ double min_frac; // for indels
+ char *reg_fname, *pl_list, *fai_fname, *output_fname;
+ int reg_is_file, record_cmd_line, n_threads;
+ faidx_t *fai;
+ regidx_t *bed, *reg; // bed: skipping regions, reg: index-jump to regions
+ regitr_t *bed_itr, *reg_itr;
+ int bed_logic; // 1: include region, 0: exclude region
+ gvcf_t *gvcf;
+
+ // auxiliary structures for calling
+ bcf_callaux_t *bca;
+ bcf_callret1_t *bcr;
+ bcf_call_t bc;
+ bam_mplp_t iter;
+ mplp_aux_t **mplp_data;
+ int nfiles;
+ char **files;
+ mplp_pileup_t *gplp;
+ int *n_plp;
+ const bam_pileup1_t **plp;
+ bam_smpl_t *bsmpl;
+ kstring_t buf;
+ bcf1_t *bcf_rec;
+ htsFile *bcf_fp;
+ bcf_hdr_t *bcf_hdr;
+ int argc;
+ char **argv;
+} mplp_conf_t;
+
+typedef struct {
+ char *ref[2];
+ int ref_id[2];
+ int ref_len[2];
+} mplp_ref_t;
+
+#define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}}
+
+// Data specific to each bam file
+struct _mplp_aux_t {
+ samFile *fp;
+ hts_itr_t *iter;
+ bam_hdr_t *h;
+ mplp_ref_t *ref;
+ const mplp_conf_t *conf;
+ int bam_id;
+ hts_idx_t *idx; // maintained only with more than one -r regions
+};
+
+// Data passed to htslib/mpileup
+struct _mplp_pileup_t {
+ int n;
+ int *n_plp, *m_plp;
+ bam_pileup1_t **plp;
+};
+
+static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) {
+ mplp_ref_t *r = ma->ref;
+
+ //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]);
+
+ if (!r || !ma->conf->fai) {
+ *ref = NULL;
+ return 0;
+ }
+
+ // Do we need to reference count this so multiple mplp_aux_t can
+ // track which references are in use?
+ // For now we just cache the last two. Sufficient?
+ if (tid == r->ref_id[0]) {
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+ }
+ if (tid == r->ref_id[1]) {
+ // Last, swap over
+ int tmp;
+ tmp = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp;
+ tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp;
+
+ char *tc;
+ tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc;
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+ }
+
+ // New, so migrate to old and load new
+ free(r->ref[1]);
+ r->ref[1] = r->ref[0];
+ r->ref_id[1] = r->ref_id[0];
+ r->ref_len[1] = r->ref_len[0];
+
+ r->ref_id[0] = tid;
+ r->ref[0] = faidx_fetch_seq(ma->conf->fai,
+ ma->h->target_name[r->ref_id[0]],
+ 0,
+ INT_MAX,
+ &r->ref_len[0]);
+
+ if (!r->ref[0]) {
+ r->ref[0] = NULL;
+ r->ref_id[0] = -1;
+ r->ref_len[0] = 0;
+ *ref = NULL;
+ return 0;
+ }
+
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+}
+
+static int mplp_func(void *data, bam1_t *b)
+{
+ char *ref;
+ mplp_aux_t *ma = (mplp_aux_t*)data;
+ int ret, ref_len;
+ while (1)
+ {
+ int has_ref;
+ ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b);
+ if (ret < 0) break;
+ // The 'B' cigar operation is not part of the specification, considering as obsolete.
+ // bam_remove_B(b);
+ if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) continue; // exclude unmapped reads
+ if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) continue;
+ if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) continue;
+ if (ma->conf->bed)
+ {
+ // test overlap
+ regitr_t *itr = ma->conf->bed_itr;
+ int beg = b->core.pos, end = bam_endpos(b)-1;
+ int overlap = regidx_overlap(ma->conf->bed, ma->h->target_name[b->core.tid],beg,end, itr);
+ if ( !ma->conf->bed_logic && !overlap )
+ {
+ // exclude only reads which are fully contained in the region
+ while ( regitr_overlap(itr) )
+ {
+ if ( beg < itr->beg ) { overlap = 1; break; }
+ if ( end > itr->end ) { overlap = 1; break; }
+ }
+ }
+ if ( !overlap ) continue;
+ }
+ if ( bam_smpl_get_sample_id(ma->conf->bsmpl,ma->bam_id,b)<0 ) continue;
+ if (ma->conf->flag & MPLP_ILLUMINA13) {
+ int i;
+ uint8_t *qual = bam_get_qual(b);
+ for (i = 0; i < b->core.l_qseq; ++i)
+ qual[i] = qual[i] > 31? qual[i] - 31 : 0;
+ }
+
+ if (ma->conf->fai && b->core.tid >= 0) {
+ has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len);
+ if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence
+ fprintf(pysam_stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n",
+ __func__, b->core.pos, ref_len, b->core.tid);
+ continue;
+ }
+ } else {
+ has_ref = 0;
+ }
+
+ if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+ if (has_ref && ma->conf->capQ_thres > 10) {
+ int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
+ if (q < 0) continue; // skip
+ else if (b->core.qual > q) b->core.qual = q;
+ }
+ if (b->core.qual < ma->conf->min_mq) continue;
+ else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&BAM_FPAIRED) && !(b->core.flag&BAM_FPROPER_PAIR)) continue;
+
+ return ret;
+ };
+ return ret;
+}
+
+// Called once per new bam added to the pileup.
+// We cache sample information here so we don't have to keep recomputing this
+// on each and every pileup column.
+//
+// Cd is an arbitrary block of data we can write into, which ends up in
+// the pileup structures. We stash the sample ID there.
+static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+ mplp_aux_t *ma = (mplp_aux_t *)data;
+ cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b);
+ return 0;
+}
+
+static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, const bam_pileup1_t **plp)
+{
+ int i, j;
+ memset(m->n_plp, 0, m->n * sizeof(int));
+ for (i = 0; i < n; ++i) // iterate over all bams
+ {
+ for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position
+ {
+ const bam_pileup1_t *p = plp[i] + j;
+ int id = p->cd.i;
+ if (m->n_plp[id] == m->m_plp[id])
+ {
+ m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
+ m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
+ }
+ m->plp[id][m->n_plp[id]++] = *p;
+ }
+ }
+}
+
+static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bcf1_t *rec)
+{
+ if ( !conf->gvcf )
+ {
+ if ( rec ) bcf_write1(fp, hdr, rec);
+ return;
+ }
+
+ if ( !rec )
+ {
+ gvcf_write(conf->gvcf, fp, hdr, NULL, 0);
+ return;
+ }
+
+ int is_ref = 0;
+ if ( rec->n_allele==1 ) is_ref = 1;
+ else if ( rec->n_allele==2 )
+ {
+ // second allele is mpileup's X, not a variant
+ if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][1]=='*' && rec->d.allele[1][2]=='>' ) is_ref = 1;
+ }
+ rec = gvcf_write(conf->gvcf, fp, hdr, rec, is_ref);
+ if ( rec ) bcf_write1(fp,hdr,rec);
+}
+
+static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
+{
+ bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list
+
+ int ret, i, tid, pos, ref_len;
+ char *ref;
+
+ while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0)
+ {
+ if ( end && (pos<beg || pos>end) ) continue;
+ if ( conf->bed && tid >= 0 )
+ {
+ int overlap = regidx_overlap(conf->bed, hdr->target_name[tid], pos, pos, NULL);
+ if ( !conf->bed_logic ) overlap = overlap ? 0 : 1;
+ if ( !overlap ) continue;
+ }
+ mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
+
+ int total_depth, _ref0, ref16;
+ for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i];
+ group_smpl(conf->gplp, conf->bsmpl, conf->nfiles, conf->n_plp, conf->plp);
+ _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
+ ref16 = seq_nt16_table[_ref0];
+ bcf_callaux_clean(conf->bca, &conf->bc);
+ for (i = 0; i < conf->gplp->n; ++i)
+ bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], ref16, conf->bca, conf->bcr + i);
+ conf->bc.tid = tid; conf->bc.pos = pos;
+ bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc);
+ bcf_clear1(conf->bcf_rec);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0);
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
+
+ // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
+ // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
+ if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
+ && bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)
+ {
+ bcf_callaux_clean(conf->bca, &conf->bc);
+ for (i = 0; i < conf->gplp->n; ++i)
+ bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
+ if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
+ {
+ bcf_clear1(conf->bcf_rec);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
+ }
+ }
+ }
+ return 0;
+}
+
+static int mpileup(mplp_conf_t *conf)
+{
+ if (conf->nfiles == 0) {
+ fprintf(pysam_stderr,"[%s] no input file/data given\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ mplp_ref_t mp_ref = MPLP_REF_INIT;
+ conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t));
+ conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*));
+ conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*));
+ conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int));
+
+ // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index
+ // must be kept in the memory for the whole time which can be a problem with many bams.
+ // Therefore if none or only one region is requested, we initialize the bam iterator as
+ // before and free the index. Only when multiple regions are queried, we keep the index.
+ int nregs = 0;
+ if ( conf->reg_fname )
+ {
+ if ( conf->reg_is_file )
+ {
+ conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL);
+ if ( !conf->reg ) {
+ fprintf(pysam_stderr,"Could not parse the regions: %s\n", conf->reg_fname);
+ exit(EXIT_FAILURE);
+ }
+ }
+ else
+ {
+ conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
+ if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) {
+ fprintf(pysam_stderr,"Could not parse the regions: %s\n", conf->reg_fname);
+ exit(EXIT_FAILURE);
+ }
+ }
+ nregs = regidx_nregs(conf->reg);
+ conf->reg_itr = regitr_init(conf->reg);
+ regitr_loop(conf->reg_itr); // region iterator now positioned at the first region
+ }
+
+ // read the header of each file in the list and initialize data
+ // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least!
+ bam_hdr_t *hdr = NULL; // header of first file in input list
+ int i;
+ for (i = 0; i < conf->nfiles; ++i) {
+ bam_hdr_t *h_tmp;
+ conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t));
+ conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb");
+ if ( !conf->mplp_data[i]->fp )
+ {
+ fprintf(pysam_stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
+ fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ exit(EXIT_FAILURE);
+ }
+ if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) {
+ fprintf(pysam_stderr, "[%s] failed to process %s: %s\n",
+ __func__, conf->fai_fname, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ conf->mplp_data[i]->conf = conf;
+ conf->mplp_data[i]->ref = &mp_ref;
+ h_tmp = sam_hdr_read(conf->mplp_data[i]->fp);
+ if ( !h_tmp ) {
+ fprintf(pysam_stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet
+ conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]);
+ if ( conf->mplp_data[i]->bam_id<0 )
+ {
+ // no usable readgroups in this bam, it can be skipped
+ sam_close(conf->mplp_data[i]->fp);
+ free(conf->mplp_data[i]);
+ bam_hdr_destroy(h_tmp);
+ free(conf->files[i]);
+ if ( i+1<conf->nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1));
+ conf->nfiles--;
+ i--;
+ continue;
+ }
+ if (conf->reg) {
+ hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]);
+ if (idx == NULL) {
+ fprintf(pysam_stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ conf->buf.l = 0;
+ ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
+ conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s);
+ if ( !conf->mplp_data[i]->iter )
+ {
+ conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
+ if ( conf->mplp_data[i]->iter ) {
+ fprintf(pysam_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
+ exit(EXIT_FAILURE);
+ }
+ fprintf(pysam_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ if ( nregs==1 ) // no need to keep the index in memory
+ hts_idx_destroy(idx);
+ else
+ conf->mplp_data[i]->idx = idx;
+ }
+
+ if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */
+ else {
+ // FIXME: check consistency between h and h_tmp
+ bam_hdr_destroy(h_tmp);
+
+ // we store only the first file's header; it's (alleged to be)
+ // compatible with the i-th file's target_name lookup needs
+ conf->mplp_data[i]->h = hdr;
+ }
+ }
+ // allocate data storage proportionate to number of samples being studied sm->n
+ bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n);
+ conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int));
+ conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int));
+ conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));
+
+ fprintf(pysam_stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
+ // write the VCF header
+ conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type));
+ if (conf->bcf_fp == NULL) {
+ fprintf(pysam_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads);
+
+ // BCF header creation
+ conf->bcf_hdr = bcf_hdr_init("w");
+ conf->buf.l = 0;
+
+ if (conf->record_cmd_line)
+ {
+ ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version());
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+
+ conf->buf.l = 0;
+ ksprintf(&conf->buf, "##bcftoolsCommand=mpileup");
+ for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]);
+ kputc('\n', &conf->buf);
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ }
+
+ if (conf->fai_fname)
+ {
+ conf->buf.l = 0;
+ ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname);
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ }
+
+ // Translate BAM @SQ tags to BCF ##contig tags
+ // todo: use/write new BAM header manipulation routines, fill also UR, M5
+ for (i=0; i<hdr->n_targets; i++)
+ {
+ conf->buf.l = 0;
+ ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>", hdr->target_name[i], hdr->target_len[i]);
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ }
+ conf->buf.l = 0;
+
+ bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
+#if CDF_MWU_TESTS
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
+#endif
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
+ if ( conf->fmt_flag&B2B_FMT_DP )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">");
+ if ( conf->fmt_flag&B2B_FMT_DV )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">");
+ if ( conf->fmt_flag&B2B_FMT_DPR )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
+ if ( conf->fmt_flag&B2B_INFO_DPR )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
+ if ( conf->fmt_flag&B2B_FMT_DP4 )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
+ if ( conf->fmt_flag&B2B_FMT_SP )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");
+ if ( conf->fmt_flag&B2B_FMT_AD )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">");
+ if ( conf->fmt_flag&B2B_FMT_ADF )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">");
+ if ( conf->fmt_flag&B2B_FMT_ADR )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">");
+ if ( conf->fmt_flag&B2B_INFO_AD )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">");
+ if ( conf->fmt_flag&B2B_INFO_ADF )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">");
+ if ( conf->fmt_flag&B2B_INFO_ADR )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">");
+ if ( conf->gvcf )
+ gvcf_update_header(conf->gvcf, conf->bcf_hdr);
+
+ int nsmpl;
+ const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl);
+ for (i=0; i<nsmpl; i++)
+ bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
+ bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr);
+
+ conf->bca = bcf_call_init(-1., conf->min_baseQ);
+ conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
+ conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
+ conf->bca->min_frac = conf->min_frac;
+ conf->bca->min_support = conf->min_support;
+ conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
+
+ conf->bc.bcf_hdr = conf->bcf_hdr;
+ conf->bc.n = nsmpl;
+ conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL));
+ if (conf->fmt_flag)
+ {
+ assert( sizeof(float)==sizeof(int32_t) );
+ conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4);
+ conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32
+ if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) )
+ {
+ // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
+ conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
+ conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
+ for (i=0; i<nsmpl; i++)
+ {
+ conf->bcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES;
+ conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES;
+ }
+ }
+ }
+
+ // init mpileup
+ conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data);
+ if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter);
+ if ( (double)conf->max_depth * conf->nfiles > 1<<20)
+ fprintf(pysam_stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles);
+ if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 )
+ fprintf(pysam_stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl);
+ bam_mplp_set_maxcnt(conf->iter, conf->max_depth);
+ conf->max_indel_depth = conf->max_indel_depth * nsmpl;
+ conf->bcf_rec = bcf_init1();
+ bam_mplp_constructor(conf->iter, pileup_constructor);
+
+ // Run mpileup for multiple regions
+ if ( nregs )
+ {
+ int ireg = 0;
+ do
+ {
+ // first region is already positioned
+ if ( ireg++ > 0 )
+ {
+ conf->buf.l = 0;
+ ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end);
+
+ for (i=0; i<conf->nfiles; i++)
+ {
+ hts_itr_destroy(conf->mplp_data[i]->iter);
+ conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s);
+ if ( !conf->mplp_data[i]->iter )
+ {
+ conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
+ if ( conf->mplp_data[i]->iter ) {
+ fprintf(pysam_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
+ exit(EXIT_FAILURE);
+ }
+ fprintf(pysam_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ bam_mplp_reset(conf->iter);
+ }
+ }
+ mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end);
+ }
+ while ( regitr_loop(conf->reg_itr) );
+ }
+ else
+ mpileup_reg(conf,0,0);
+
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL);
+
+ // clean up
+ free(conf->bc.tmp.s);
+ bcf_destroy1(conf->bcf_rec);
+ if (conf->bcf_fp)
+ {
+ hts_close(conf->bcf_fp);
+ bcf_hdr_destroy(conf->bcf_hdr);
+ bcf_call_destroy(conf->bca);
+ free(conf->bc.PL);
+ free(conf->bc.DP4);
+ free(conf->bc.ADR);
+ free(conf->bc.ADF);
+ free(conf->bc.fmt_arr);
+ free(conf->bcr);
+ }
+ if ( conf->gvcf ) gvcf_destroy(conf->gvcf);
+ free(conf->buf.s);
+ for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]);
+ free(conf->gplp->plp); free(conf->gplp->n_plp); free(conf->gplp->m_plp); free(conf->gplp);
+ bam_mplp_destroy(conf->iter);
+ bam_hdr_destroy(hdr);
+ for (i = 0; i < conf->nfiles; ++i) {
+ if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx);
+ sam_close(conf->mplp_data[i]->fp);
+ if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter);
+ free(conf->mplp_data[i]);
+ }
+ if ( conf->reg_itr ) regitr_destroy(conf->reg_itr);
+ free(conf->mplp_data); free(conf->plp); free(conf->n_plp);
+ free(mp_ref.ref[0]);
+ free(mp_ref.ref[1]);
+ return 0;
+}
+
+static int is_url(const char *s)
+{
+ static const char uri_scheme_chars[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
+ return s[strspn(s, uri_scheme_chars)] == ':';
+}
+
+#define MAX_PATH_LEN 1024
+int read_file_list(const char *file_list,int *n,char **argv[])
+{
+ char buf[MAX_PATH_LEN];
+ int len, nfiles = 0;
+ char **files = NULL;
+ struct stat sb;
+
+ *n = 0;
+ *argv = NULL;
+
+ FILE *fh = fopen(file_list,"r");
+ if ( !fh )
+ {
+ fprintf(pysam_stderr,"%s: %s\n", file_list,strerror(errno));
+ return 1;
+ }
+
+ files = (char**) calloc(nfiles,sizeof(char*));
+ nfiles = 0;
+ while ( fgets(buf,MAX_PATH_LEN,fh) )
+ {
+ // allow empty lines and trailing spaces
+ len = strlen(buf);
+ while ( len>0 && isspace(buf[len-1]) ) len--;
+ if ( !len ) continue;
+
+ // check sanity of the file list
+ buf[len] = 0;
+ if (! (is_url(buf) || stat(buf, &sb) == 0))
+ {
+ // no such file, check if it is safe to print its name
+ int i, safe_to_print = 1;
+ for (i=0; i<len; i++)
+ if (!isprint(buf[i])) { safe_to_print = 0; break; }
+ if ( safe_to_print )
+ fprintf(pysam_stderr,"The file list \"%s\" appears broken, could not locate: %s\n", file_list,buf);
+ else
+ fprintf(pysam_stderr,"Does the file \"%s\" really contain a list of files and do all exist?\n", file_list);
+ return 1;
+ }
+
+ nfiles++;
+ files = (char**) realloc(files,nfiles*sizeof(char*));
+ files[nfiles-1] = strdup(buf);
+ }
+ fclose(fh);
+ if ( !nfiles )
+ {
+ fprintf(pysam_stderr,"No files read from %s\n", file_list);
+ return 1;
+ }
+ *argv = files;
+ *n = nfiles;
+ return 0;
+}
+#undef MAX_PATH_LEN
+
+int parse_format_flag(const char *str)
+{
+ int i, flag = 0, n_tags;
+ char **tags = hts_readlist(str, 0, &n_tags);
+ for(i=0; i<n_tags; i++)
+ {
+ if ( !strcasecmp(tags[i],"DP") || !strcasecmp(tags[i],"FORMAT/DP") || !strcasecmp(tags[i],"FMT/DP") ) flag |= B2B_FMT_DP;
+ else if ( !strcasecmp(tags[i],"DV") || !strcasecmp(tags[i],"FORMAT/DV") || !strcasecmp(tags[i],"FMT/DV") ) { flag |= B2B_FMT_DV; fprintf(pysam_stderr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"SP") || !strcasecmp(tags[i],"FORMAT/SP") || !strcasecmp(tags[i],"FMT/SP") ) flag |= B2B_FMT_SP;
+ else if ( !strcasecmp(tags[i],"DP4") || !strcasecmp(tags[i],"FORMAT/DP4") || !strcasecmp(tags[i],"FMT/DP4") ) { flag |= B2B_FMT_DP4; fprintf(pysam_stderr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"DPR") || !strcasecmp(tags[i],"FORMAT/DPR") || !strcasecmp(tags[i],"FMT/DPR") ) { flag |= B2B_FMT_DPR; fprintf(pysam_stderr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(pysam_stderr, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD;
+ else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF;
+ else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
+ else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
+ else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
+ else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
+ else
+ {
+ fprintf(pysam_stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
+ exit(EXIT_FAILURE);
+ }
+ free(tags[i]);
+ }
+ if (n_tags) free(tags);
+ return flag;
+}
+
+static void list_annotations(FILE *fp)
+{
+ fprintf(fp,
+"\n"
+"FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
+"\n"
+" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n"
+" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
+" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
+" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
+"\n"
+"INFO annotation tags available:\n"
+"\n"
+" INFO/AD .. Total allelic depth (Number=R,Type=Integer)\n"
+" INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n"
+" INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+"\n");
+}
+
+static void print_usage(FILE *fp, const mplp_conf_t *mplp)
+{
+ char *tmp_require = bam_flag2str(mplp->rflag_require);
+ char *tmp_filter = bam_flag2str(mplp->rflag_filter);
+
+ // Display usage information, formatted for the standard 80 columns.
+ // (The unusual string formatting here aids the readability of this
+ // source code in 80 columns, to the extent that's possible.)
+
+ fprintf(fp,
+"\n"
+"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
+"\n"
+"Input options:\n"
+" -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n"
+" -A, --count-orphans do not discard anomalous read pairs\n"
+" -b, --bam-list FILE list of input BAM filenames, one per line\n"
+" -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n"
+" -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n"
+" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+ fprintf(fp,
+" -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n"
+" -f, --fasta-ref FILE faidx indexed reference sequence file\n"
+" --no-reference do not require fasta reference file\n"
+" -G, --read-groups FILE select or exclude read groups listed in the file\n"
+" -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
+ fprintf(fp,
+" -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
+ fprintf(fp,
+" -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
+" -R, --regions-file FILE restrict to regions listed in a file\n"
+" --ignore-RG ignore RG tags (one BAM = one sample)\n"
+" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+ fprintf(fp,
+" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n"
+" [%s]\n", tmp_filter);
+ fprintf(fp,
+" -s, --samples LIST comma separated list of samples to include\n"
+" -S, --samples-file FILE file of samples to include\n"
+" -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
+" -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
+" -x, --ignore-overlaps disable read-pair overlap detection\n"
+"\n"
+"Output options:\n"
+" -a, --annotate LIST optional tags to output; '?' to list []\n"
+" -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n"
+" to minimum per-sample DP\n"
+" --no-version do not append version and command line to the header\n"
+" -o, --output FILE write output to FILE [standard output]\n"
+" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
+" 'z' compressed VCF; 'v' uncompressed VCF [v]\n"
+" --threads INT number of extra output compression threads [0]\n"
+"\n"
+"SNP/INDEL genotype likelihoods options:\n"
+" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
+ fprintf(fp,
+" -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac);
+ fprintf(fp,
+" -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
+ fprintf(fp,
+" -I, --skip-indels do not perform indel calling\n"
+" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
+ fprintf(fp,
+" -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
+ fprintf(fp,
+" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
+ fprintf(fp,
+" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
+" -P, --platforms STR comma separated list of platforms for indels [all]\n"
+"\n"
+"Notes: Assuming diploid individuals.\n"
+"\n");
+
+ free(tmp_require);
+ free(tmp_filter);
+}
+
+int bam_mpileup(int argc, char *argv[])
+{
+ int c;
+ const char *file_list = NULL;
+ char **fn = NULL;
+ int nfiles = 0, use_orphan = 0, noref = 0;
+ mplp_conf_t mplp;
+ memset(&mplp, 0, sizeof(mplp_conf_t));
+ mplp.min_baseQ = 13;
+ mplp.capQ_thres = 0;
+ mplp.max_depth = 250; mplp.max_indel_depth = 250;
+ mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
+ mplp.min_frac = 0.002; mplp.min_support = 1;
+ mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS;
+ mplp.argc = argc; mplp.argv = argv;
+ mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
+ mplp.output_fname = NULL;
+ mplp.output_type = FT_VCF;
+ mplp.record_cmd_line = 1;
+ mplp.n_threads = 0;
+ mplp.bsmpl = bam_smpl_init();
+
+ static const struct option lopts[] =
+ {
+ {"rf", required_argument, NULL, 1}, // require flag
+ {"ff", required_argument, NULL, 2}, // filter flag
+ {"incl-flags", required_argument, NULL, 1},
+ {"excl-flags", required_argument, NULL, 2},
+ {"output", required_argument, NULL, 3},
+ {"open-prob", required_argument, NULL, 4},
+ {"ignore-RG", no_argument, NULL, 5},
+ {"ignore-rg", no_argument, NULL, 5},
+ {"gvcf", required_argument, NULL, 'g'},
+ {"non-reference", no_argument, NULL, 7},
+ {"no-version", no_argument, NULL, 8},
+ {"threads",required_argument,NULL,9},
+ {"illumina1.3+", no_argument, NULL, '6'},
+ {"count-orphans", no_argument, NULL, 'A'},
+ {"bam-list", required_argument, NULL, 'b'},
+ {"no-BAQ", no_argument, NULL, 'B'},
+ {"no-baq", no_argument, NULL, 'B'},
+ {"adjust-MQ", required_argument, NULL, 'C'},
+ {"adjust-mq", required_argument, NULL, 'C'},
+ {"max-depth", required_argument, NULL, 'd'},
+ {"redo-BAQ", no_argument, NULL, 'E'},
+ {"redo-baq", no_argument, NULL, 'E'},
+ {"fasta-ref", required_argument, NULL, 'f'},
+ {"read-groups", required_argument, NULL, 'G'},
+ {"region", required_argument, NULL, 'r'},
+ {"regions", required_argument, NULL, 'r'},
+ {"regions-file", required_argument, NULL, 'R'},
+ {"targets", required_argument, NULL, 't'},
+ {"targets-file", required_argument, NULL, 'T'},
+ {"min-MQ", required_argument, NULL, 'q'},
+ {"min-mq", required_argument, NULL, 'q'},
+ {"min-BQ", required_argument, NULL, 'Q'},
+ {"min-bq", required_argument, NULL, 'Q'},
+ {"ignore-overlaps", no_argument, NULL, 'x'},
+ {"output-type", required_argument, NULL, 'O'},
+ {"samples", required_argument, NULL, 's'},
+ {"samples-file", required_argument, NULL, 'S'},
+ {"annotate", required_argument, NULL, 'a'},
+ {"ext-prob", required_argument, NULL, 'e'},
+ {"gap-frac", required_argument, NULL, 'F'},
+ {"tandem-qual", required_argument, NULL, 'h'},
+ {"skip-indels", no_argument, NULL, 'I'},
+ {"max-idepth", required_argument, NULL, 'L'},
+ {"min-ireads ", required_argument, NULL, 'm'},
+ {"per-sample-mF", no_argument, NULL, 'p'},
+ {"per-sample-mf", no_argument, NULL, 'p'},
+ {"platforms", required_argument, NULL, 'P'},
+ {NULL, 0, NULL, 0}
+ };
+ while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) {
+ switch (c) {
+ case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
+ case 1 :
+ mplp.rflag_require = bam_str2flag(optarg);
+ if ( mplp.rflag_require<0 ) { fprintf(pysam_stderr,"Could not parse --rf %s\n", optarg); return 1; }
+ break;
+ case 2 :
+ mplp.rflag_filter = bam_str2flag(optarg);
+ if ( mplp.rflag_filter<0 ) { fprintf(pysam_stderr,"Could not parse --ff %s\n", optarg); return 1; }
+ break;
+ case 3 : mplp.output_fname = optarg; break;
+ case 4 : mplp.openQ = atoi(optarg); break;
+ case 5 : bam_smpl_ignore_readgroups(mplp.bsmpl); break;
+ case 'g':
+ mplp.gvcf = gvcf_init(optarg);
+ if ( !mplp.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
+ break;
+ case 'f':
+ mplp.fai = fai_load(optarg);
+ if (mplp.fai == NULL) return 1;
+ mplp.fai_fname = optarg;
+ break;
+ case 7 : noref = 1; break;
+ case 8 : mplp.record_cmd_line = 0; break;
+ case 9 : mplp.n_threads = strtol(optarg, 0, 0); break;
+ case 'd': mplp.max_depth = atoi(optarg); break;
+ case 'r': mplp.reg_fname = strdup(optarg); break;
+ case 'R': mplp.reg_fname = strdup(optarg); mplp.reg_is_file = 1; break;
+ case 't':
+ // In the original version the whole BAM was streamed which is inefficient
+ // with few BED intervals and big BAMs. Todo: devise a heuristic to determine
+ // best strategy, that is streaming or jumping.
+ if ( optarg[0]=='^' ) optarg++;
+ else mplp.bed_logic = 1;
+ mplp.bed = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL);
+ mplp.bed_itr = regitr_init(mplp.bed);
+ if ( regidx_insert_list(mplp.bed,optarg,',') !=0 )
+ {
+ fprintf(pysam_stderr,"Could not parse the targets: %s\n", optarg);
+ exit(EXIT_FAILURE);
+ }
+ break;
+ case 'T':
+ if ( optarg[0]=='^' ) optarg++;
+ else mplp.bed_logic = 1;
+ mplp.bed = regidx_init(optarg,NULL,NULL,0,NULL);
+ if (!mplp.bed) { fprintf(pysam_stderr, "bcftools mpileup: Could not read file \"%s\"", optarg); return 1; }
+ break;
+ case 'P': mplp.pl_list = strdup(optarg); break;
+ case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
+ case 'B': mplp.flag &= ~MPLP_REALN; break;
+ case 'I': mplp.flag |= MPLP_NO_INDEL; break;
+ case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
+ case '6': mplp.flag |= MPLP_ILLUMINA13; break;
+ case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break;
+ case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': mplp.output_type = FT_BCF_GZ; break;
+ case 'u': mplp.output_type = FT_BCF; break;
+ case 'z': mplp.output_type = FT_VCF_GZ; break;
+ case 'v': mplp.output_type = FT_VCF; break;
+ default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n", optarg);
+ }
+ break;
+ case 'C': mplp.capQ_thres = atoi(optarg); break;
+ case 'q': mplp.min_mq = atoi(optarg); break;
+ case 'Q': mplp.min_baseQ = atoi(optarg); break;
+ case 'b': file_list = optarg; break;
+ case 'o': {
+ char *end;
+ long value = strtol(optarg, &end, 10);
+ // Distinguish between -o INT and -o FILE (a bit of a hack!)
+ if (*end == '\0') mplp.openQ = value;
+ else mplp.output_fname = optarg;
+ }
+ break;
+ case 'e': mplp.extQ = atoi(optarg); break;
+ case 'h': mplp.tandemQ = atoi(optarg); break;
+ case 'A': use_orphan = 1; break;
+ case 'F': mplp.min_frac = atof(optarg); break;
+ case 'm': mplp.min_support = atoi(optarg); break;
+ case 'L': mplp.max_indel_depth = atoi(optarg); break;
+ case 'G': bam_smpl_add_readgroups(mplp.bsmpl, optarg, 1); break;
+ case 'a':
+ if (optarg[0]=='?') {
+ list_annotations(pysam_stderr);
+ return 1;
+ }
+ mplp.fmt_flag |= parse_format_flag(optarg);
+ break;
+ default:
+ fprintf(pysam_stderr,"Invalid option: '%c'\n", c);
+ return 1;
+ }
+ }
+
+ if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) )
+ {
+ fprintf(pysam_stderr,"[warning] The -t DP option is required with --gvcf, switching on.\n");
+ mplp.fmt_flag |= B2B_FMT_DP;
+ }
+ if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) )
+ {
+ if ( mplp.flag&MPLP_VCF )
+ {
+ if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF;
+ else mplp.output_type = FT_VCF_GZ;
+ }
+ else if ( mplp.flag&MPLP_BCF )
+ {
+ if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF;
+ else mplp.output_type = FT_BCF_GZ;
+ }
+ }
+ if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ )
+ {
+ fprintf(pysam_stderr,"Error: The -B option cannot be combined with -E\n");
+ return 1;
+ }
+ if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
+ if (argc == 1)
+ {
+ print_usage(pysam_stderr, &mplp);
+ return 1;
+ }
+ if (!mplp.fai && !noref) {
+ fprintf(pysam_stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n");
+ return 1;
+ }
+ int ret,i;
+ if (file_list)
+ {
+ if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
+ mplp.files = fn;
+ mplp.nfiles = nfiles;
+ }
+ else
+ {
+ mplp.nfiles = argc - optind;
+ mplp.files = (char**) malloc(mplp.nfiles*sizeof(char*));
+ for (i=0; i<mplp.nfiles; i++) mplp.files[i] = strdup(argv[optind+i]);
+ }
+ ret = mpileup(&mplp);
+
+ for (i=0; i<mplp.nfiles; i++) free(mplp.files[i]);
+ free(mplp.files);
+ free(mplp.reg_fname); free(mplp.pl_list);
+ if (mplp.fai) fai_destroy(mplp.fai);
+ if (mplp.bed)
+ {
+ regidx_destroy(mplp.bed);
+ regitr_destroy(mplp.bed_itr);
+ }
+ if (mplp.reg) regidx_destroy(mplp.reg);
+ bam_smpl_destroy(mplp.bsmpl);
+ return ret;
+}
diff --git a/bcftools/mw.h b/bcftools/mw.h
new file mode 100644
index 0000000..3e68cbf
--- /dev/null
+++ b/bcftools/mw.h
@@ -0,0 +1,1944 @@
+/* mw.h -- a table of precomputed Mann Whitney coefficients (for bam2bcf.c)
+
+ The MIT License
+
+ Copyright (C) 2016 Genome Research Ltd.
+
+ Author: James Bonfield <jkb at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+// Code to build this table is below
+#ifdef BUILD_MW
+#include <stdio.h>
+
+double mann_whitney_1947(int n, int m, int U)
+{
+ if (U<0) return 0;
+ if (n==0||m==0) return U==0 ? 1 : 0;
+ return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U);
+}
+
+int main(void) {
+ int i, j, k;
+ printf("static double mw[6][6][50] = // [2-7][2-7][0-49]\n{\n");
+ for (i = 2; i < 8; i++) {
+ printf(" {\n");
+ for (j = 2; j < 8; j++) {
+ printf(" {\n");
+ for (k = 0; k < 50; k++) {
+ printf(" %.17f,\n", mann_whitney_1947(i,j,k));
+ }
+ printf(" },\n");
+ }
+ printf(" },\n");
+ }
+ printf("};\n");
+ return 0;
+}
+#endif
+
+static double mw[6][6][50] = // [2-7][2-7][0-49]
+{
+ {
+ {
+ 0.16666666666666666,
+ 0.16666666666666666,
+ 0.33333333333333331,
+ 0.16666666666666666,
+ 0.16666666666666666,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.09999999999999999,
+ 0.09999999999999999,
+ 0.19999999999999998,
+ 0.20000000000000001,
+ 0.20000000000000001,
+ 0.10000000000000001,
+ 0.10000000000000001,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.06666666666666665,
+ 0.06666666666666665,
+ 0.13333333333333330,
+ 0.13333333333333333,
+ 0.20000000000000001,
+ 0.13333333333333333,
+ 0.13333333333333333,
+ 0.06666666666666667,
+ 0.06666666666666667,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.04761904761904761,
+ 0.04761904761904761,
+ 0.09523809523809522,
+ 0.09523809523809523,
+ 0.14285714285714288,
+ 0.14285714285714285,
+ 0.14285714285714285,
+ 0.09523809523809523,
+ 0.09523809523809523,
+ 0.04761904761904762,
+ 0.04761904761904762,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.03571428571428571,
+ 0.03571428571428571,
+ 0.07142857142857141,
+ 0.07142857142857142,
+ 0.10714285714285715,
+ 0.10714285714285714,
+ 0.14285714285714285,
+ 0.10714285714285715,
+ 0.10714285714285715,
+ 0.07142857142857144,
+ 0.07142857142857142,
+ 0.03571428571428571,
+ 0.03571428571428571,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.02777777777777777,
+ 0.02777777777777777,
+ 0.05555555555555555,
+ 0.05555555555555555,
+ 0.08333333333333334,
+ 0.08333333333333333,
+ 0.11111111111111110,
+ 0.11111111111111113,
+ 0.11111111111111113,
+ 0.08333333333333334,
+ 0.08333333333333334,
+ 0.05555555555555556,
+ 0.05555555555555555,
+ 0.02777777777777778,
+ 0.02777777777777778,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ },
+ {
+ {
+ 0.10000000000000001,
+ 0.10000000000000001,
+ 0.20000000000000001,
+ 0.20000000000000001,
+ 0.19999999999999998,
+ 0.09999999999999999,
+ 0.09999999999999999,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.05000000000000000,
+ 0.05000000000000000,
+ 0.10000000000000001,
+ 0.14999999999999999,
+ 0.14999999999999999,
+ 0.14999999999999999,
+ 0.14999999999999999,
+ 0.10000000000000001,
+ 0.05000000000000000,
+ 0.05000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.02857142857142857,
+ 0.02857142857142857,
+ 0.05714285714285714,
+ 0.08571428571428570,
+ 0.11428571428571427,
+ 0.11428571428571427,
+ 0.14285714285714282,
+ 0.11428571428571428,
+ 0.11428571428571428,
+ 0.08571428571428572,
+ 0.05714285714285714,
+ 0.02857142857142857,
+ 0.02857142857142857,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.01785714285714286,
+ 0.01785714285714286,
+ 0.03571428571428571,
+ 0.05357142857142856,
+ 0.07142857142857142,
+ 0.08928571428571427,
+ 0.10714285714285711,
+ 0.10714285714285712,
+ 0.10714285714285714,
+ 0.10714285714285715,
+ 0.08928571428571427,
+ 0.07142857142857142,
+ 0.05357142857142857,
+ 0.03571428571428571,
+ 0.01785714285714286,
+ 0.01785714285714286,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.01190476190476190,
+ 0.01190476190476190,
+ 0.02380952380952381,
+ 0.03571428571428571,
+ 0.04761904761904762,
+ 0.05952380952380951,
+ 0.08333333333333330,
+ 0.08333333333333331,
+ 0.09523809523809523,
+ 0.09523809523809523,
+ 0.09523809523809523,
+ 0.08333333333333333,
+ 0.08333333333333333,
+ 0.05952380952380952,
+ 0.04761904761904762,
+ 0.03571428571428571,
+ 0.02380952380952381,
+ 0.01190476190476190,
+ 0.01190476190476190,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00833333333333333,
+ 0.00833333333333333,
+ 0.01666666666666666,
+ 0.02499999999999999,
+ 0.03333333333333333,
+ 0.04166666666666666,
+ 0.05833333333333331,
+ 0.06666666666666665,
+ 0.07499999999999998,
+ 0.08333333333333331,
+ 0.08333333333333331,
+ 0.08333333333333333,
+ 0.08333333333333333,
+ 0.07500000000000000,
+ 0.06666666666666667,
+ 0.05833333333333333,
+ 0.04166666666666666,
+ 0.03333333333333333,
+ 0.02500000000000000,
+ 0.01666666666666667,
+ 0.00833333333333333,
+ 0.00833333333333333,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ },
+ {
+ {
+ 0.06666666666666667,
+ 0.06666666666666667,
+ 0.13333333333333333,
+ 0.13333333333333333,
+ 0.20000000000000001,
+ 0.13333333333333333,
+ 0.13333333333333330,
+ 0.06666666666666665,
+ 0.06666666666666665,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.02857142857142857,
+ 0.02857142857142857,
+ 0.05714285714285714,
+ 0.08571428571428572,
+ 0.11428571428571428,
+ 0.11428571428571428,
+ 0.14285714285714282,
+ 0.11428571428571427,
+ 0.11428571428571427,
+ 0.08571428571428570,
+ 0.05714285714285714,
+ 0.02857142857142857,
+ 0.02857142857142857,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.01428571428571429,
+ 0.01428571428571429,
+ 0.02857142857142857,
+ 0.04285714285714286,
+ 0.07142857142857142,
+ 0.07142857142857142,
+ 0.09999999999999998,
+ 0.09999999999999998,
+ 0.11428571428571427,
+ 0.09999999999999998,
+ 0.09999999999999998,
+ 0.07142857142857142,
+ 0.07142857142857142,
+ 0.04285714285714286,
+ 0.02857142857142857,
+ 0.01428571428571429,
+ 0.01428571428571429,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00793650793650794,
+ 0.00793650793650794,
+ 0.01587301587301587,
+ 0.02380952380952381,
+ 0.03968253968253968,
+ 0.04761904761904762,
+ 0.06349206349206349,
+ 0.07142857142857142,
+ 0.08730158730158730,
+ 0.08730158730158730,
+ 0.09523809523809522,
+ 0.08730158730158728,
+ 0.08730158730158730,
+ 0.07142857142857142,
+ 0.06349206349206349,
+ 0.04761904761904761,
+ 0.03968253968253968,
+ 0.02380952380952381,
+ 0.01587301587301587,
+ 0.00793650793650794,
+ 0.00793650793650794,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00476190476190476,
+ 0.00476190476190476,
+ 0.00952380952380952,
+ 0.01428571428571429,
+ 0.02380952380952381,
+ 0.02857142857142857,
+ 0.04285714285714286,
+ 0.04761904761904762,
+ 0.06190476190476190,
+ 0.06666666666666665,
+ 0.07619047619047617,
+ 0.07619047619047617,
+ 0.08571428571428569,
+ 0.07619047619047617,
+ 0.07619047619047620,
+ 0.06666666666666667,
+ 0.06190476190476191,
+ 0.04761904761904762,
+ 0.04285714285714286,
+ 0.02857142857142857,
+ 0.02380952380952381,
+ 0.01428571428571429,
+ 0.00952380952380952,
+ 0.00476190476190476,
+ 0.00476190476190476,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00303030303030303,
+ 0.00303030303030303,
+ 0.00606060606060606,
+ 0.00909090909090909,
+ 0.01515151515151515,
+ 0.01818181818181818,
+ 0.02727272727272727,
+ 0.03333333333333333,
+ 0.04242424242424242,
+ 0.04848484848484847,
+ 0.05757575757575756,
+ 0.06060606060606059,
+ 0.06969696969696967,
+ 0.06969696969696967,
+ 0.07272727272727272,
+ 0.06969696969696969,
+ 0.06969696969696970,
+ 0.06060606060606059,
+ 0.05757575757575757,
+ 0.04848484848484848,
+ 0.04242424242424242,
+ 0.03333333333333333,
+ 0.02727272727272727,
+ 0.01818181818181818,
+ 0.01515151515151515,
+ 0.00909090909090909,
+ 0.00606060606060606,
+ 0.00303030303030303,
+ 0.00303030303030303,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ },
+ {
+ {
+ 0.04761904761904762,
+ 0.04761904761904762,
+ 0.09523809523809523,
+ 0.09523809523809523,
+ 0.14285714285714285,
+ 0.14285714285714285,
+ 0.14285714285714288,
+ 0.09523809523809523,
+ 0.09523809523809522,
+ 0.04761904761904761,
+ 0.04761904761904761,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.01785714285714286,
+ 0.01785714285714286,
+ 0.03571428571428571,
+ 0.05357142857142857,
+ 0.07142857142857142,
+ 0.08928571428571427,
+ 0.10714285714285715,
+ 0.10714285714285714,
+ 0.10714285714285712,
+ 0.10714285714285711,
+ 0.08928571428571427,
+ 0.07142857142857142,
+ 0.05357142857142856,
+ 0.03571428571428571,
+ 0.01785714285714286,
+ 0.01785714285714286,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00793650793650794,
+ 0.00793650793650794,
+ 0.01587301587301587,
+ 0.02380952380952381,
+ 0.03968253968253968,
+ 0.04761904761904761,
+ 0.06349206349206349,
+ 0.07142857142857142,
+ 0.08730158730158730,
+ 0.08730158730158728,
+ 0.09523809523809522,
+ 0.08730158730158730,
+ 0.08730158730158730,
+ 0.07142857142857142,
+ 0.06349206349206349,
+ 0.04761904761904762,
+ 0.03968253968253968,
+ 0.02380952380952381,
+ 0.01587301587301587,
+ 0.00793650793650794,
+ 0.00793650793650794,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00396825396825397,
+ 0.00396825396825397,
+ 0.00793650793650794,
+ 0.01190476190476190,
+ 0.01984126984126984,
+ 0.02777777777777777,
+ 0.03571428571428571,
+ 0.04365079365079365,
+ 0.05555555555555555,
+ 0.06349206349206349,
+ 0.07142857142857142,
+ 0.07539682539682539,
+ 0.07936507936507936,
+ 0.07936507936507936,
+ 0.07539682539682539,
+ 0.07142857142857142,
+ 0.06349206349206349,
+ 0.05555555555555555,
+ 0.04365079365079365,
+ 0.03571428571428571,
+ 0.02777777777777777,
+ 0.01984126984126984,
+ 0.01190476190476190,
+ 0.00793650793650794,
+ 0.00396825396825397,
+ 0.00396825396825397,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00216450216450216,
+ 0.00216450216450216,
+ 0.00432900432900433,
+ 0.00649350649350649,
+ 0.01082251082251082,
+ 0.01515151515151515,
+ 0.02164502164502164,
+ 0.02597402597402597,
+ 0.03463203463203463,
+ 0.04112554112554112,
+ 0.04978354978354978,
+ 0.05411255411255411,
+ 0.06277056277056275,
+ 0.06493506493506493,
+ 0.06926406926406925,
+ 0.06926406926406925,
+ 0.06926406926406925,
+ 0.06493506493506492,
+ 0.06277056277056275,
+ 0.05411255411255410,
+ 0.04978354978354978,
+ 0.04112554112554112,
+ 0.03463203463203463,
+ 0.02597402597402597,
+ 0.02164502164502164,
+ 0.01515151515151515,
+ 0.01082251082251082,
+ 0.00649350649350649,
+ 0.00432900432900433,
+ 0.00216450216450216,
+ 0.00216450216450216,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00126262626262626,
+ 0.00126262626262626,
+ 0.00252525252525253,
+ 0.00378787878787879,
+ 0.00631313131313131,
+ 0.00883838383838384,
+ 0.01262626262626262,
+ 0.01641414141414141,
+ 0.02146464646464646,
+ 0.02651515151515151,
+ 0.03282828282828283,
+ 0.03787878787878787,
+ 0.04419191919191919,
+ 0.04924242424242424,
+ 0.05429292929292929,
+ 0.05808080808080808,
+ 0.06060606060606059,
+ 0.06186868686868686,
+ 0.06186868686868686,
+ 0.06060606060606059,
+ 0.05808080808080807,
+ 0.05429292929292930,
+ 0.04924242424242424,
+ 0.04419191919191920,
+ 0.03787878787878787,
+ 0.03282828282828282,
+ 0.02651515151515152,
+ 0.02146464646464646,
+ 0.01641414141414142,
+ 0.01262626262626263,
+ 0.00883838383838384,
+ 0.00631313131313131,
+ 0.00378787878787879,
+ 0.00252525252525253,
+ 0.00126262626262626,
+ 0.00126262626262626,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ },
+ {
+ {
+ 0.03571428571428571,
+ 0.03571428571428571,
+ 0.07142857142857142,
+ 0.07142857142857144,
+ 0.10714285714285715,
+ 0.10714285714285715,
+ 0.14285714285714285,
+ 0.10714285714285714,
+ 0.10714285714285715,
+ 0.07142857142857142,
+ 0.07142857142857141,
+ 0.03571428571428571,
+ 0.03571428571428571,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.01190476190476190,
+ 0.01190476190476190,
+ 0.02380952380952381,
+ 0.03571428571428571,
+ 0.04761904761904762,
+ 0.05952380952380952,
+ 0.08333333333333333,
+ 0.08333333333333333,
+ 0.09523809523809523,
+ 0.09523809523809523,
+ 0.09523809523809523,
+ 0.08333333333333331,
+ 0.08333333333333330,
+ 0.05952380952380951,
+ 0.04761904761904762,
+ 0.03571428571428571,
+ 0.02380952380952381,
+ 0.01190476190476190,
+ 0.01190476190476190,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00476190476190476,
+ 0.00476190476190476,
+ 0.00952380952380952,
+ 0.01428571428571429,
+ 0.02380952380952381,
+ 0.02857142857142857,
+ 0.04285714285714286,
+ 0.04761904761904762,
+ 0.06190476190476191,
+ 0.06666666666666667,
+ 0.07619047619047620,
+ 0.07619047619047617,
+ 0.08571428571428569,
+ 0.07619047619047617,
+ 0.07619047619047617,
+ 0.06666666666666665,
+ 0.06190476190476190,
+ 0.04761904761904762,
+ 0.04285714285714286,
+ 0.02857142857142857,
+ 0.02380952380952381,
+ 0.01428571428571429,
+ 0.00952380952380952,
+ 0.00476190476190476,
+ 0.00476190476190476,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00216450216450216,
+ 0.00216450216450216,
+ 0.00432900432900433,
+ 0.00649350649350649,
+ 0.01082251082251082,
+ 0.01515151515151515,
+ 0.02164502164502164,
+ 0.02597402597402597,
+ 0.03463203463203463,
+ 0.04112554112554112,
+ 0.04978354978354978,
+ 0.05411255411255410,
+ 0.06277056277056275,
+ 0.06493506493506492,
+ 0.06926406926406925,
+ 0.06926406926406925,
+ 0.06926406926406925,
+ 0.06493506493506493,
+ 0.06277056277056275,
+ 0.05411255411255411,
+ 0.04978354978354978,
+ 0.04112554112554112,
+ 0.03463203463203463,
+ 0.02597402597402597,
+ 0.02164502164502164,
+ 0.01515151515151515,
+ 0.01082251082251082,
+ 0.00649350649350649,
+ 0.00432900432900433,
+ 0.00216450216450216,
+ 0.00216450216450216,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00108225108225108,
+ 0.00108225108225108,
+ 0.00216450216450216,
+ 0.00324675324675325,
+ 0.00541125541125541,
+ 0.00757575757575758,
+ 0.01190476190476190,
+ 0.01406926406926407,
+ 0.01948051948051948,
+ 0.02380952380952381,
+ 0.03030303030303030,
+ 0.03463203463203463,
+ 0.04220779220779219,
+ 0.04545454545454544,
+ 0.05194805194805194,
+ 0.05519480519480519,
+ 0.05952380952380951,
+ 0.05952380952380952,
+ 0.06277056277056275,
+ 0.05952380952380952,
+ 0.05952380952380951,
+ 0.05519480519480519,
+ 0.05194805194805194,
+ 0.04545454545454544,
+ 0.04220779220779219,
+ 0.03463203463203463,
+ 0.03030303030303030,
+ 0.02380952380952381,
+ 0.01948051948051948,
+ 0.01406926406926407,
+ 0.01190476190476190,
+ 0.00757575757575758,
+ 0.00541125541125541,
+ 0.00324675324675325,
+ 0.00216450216450216,
+ 0.00108225108225108,
+ 0.00108225108225108,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00058275058275058,
+ 0.00058275058275058,
+ 0.00116550116550117,
+ 0.00174825174825175,
+ 0.00291375291375291,
+ 0.00407925407925408,
+ 0.00641025641025641,
+ 0.00815850815850816,
+ 0.01107226107226107,
+ 0.01398601398601398,
+ 0.01806526806526806,
+ 0.02156177156177156,
+ 0.02680652680652679,
+ 0.03030303030303030,
+ 0.03554778554778554,
+ 0.03962703962703962,
+ 0.04428904428904428,
+ 0.04720279720279720,
+ 0.05128205128205127,
+ 0.05244755244755244,
+ 0.05477855477855477,
+ 0.05477855477855477,
+ 0.05477855477855477,
+ 0.05244755244755243,
+ 0.05128205128205127,
+ 0.04720279720279720,
+ 0.04428904428904428,
+ 0.03962703962703962,
+ 0.03554778554778555,
+ 0.03030303030303030,
+ 0.02680652680652681,
+ 0.02156177156177156,
+ 0.01806526806526806,
+ 0.01398601398601399,
+ 0.01107226107226107,
+ 0.00815850815850816,
+ 0.00641025641025641,
+ 0.00407925407925408,
+ 0.00291375291375291,
+ 0.00174825174825175,
+ 0.00116550116550117,
+ 0.00058275058275058,
+ 0.00058275058275058,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ },
+ {
+ {
+ 0.02777777777777778,
+ 0.02777777777777778,
+ 0.05555555555555555,
+ 0.05555555555555556,
+ 0.08333333333333334,
+ 0.08333333333333334,
+ 0.11111111111111113,
+ 0.11111111111111113,
+ 0.11111111111111110,
+ 0.08333333333333333,
+ 0.08333333333333334,
+ 0.05555555555555555,
+ 0.05555555555555555,
+ 0.02777777777777777,
+ 0.02777777777777777,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00833333333333333,
+ 0.00833333333333333,
+ 0.01666666666666667,
+ 0.02500000000000000,
+ 0.03333333333333333,
+ 0.04166666666666666,
+ 0.05833333333333333,
+ 0.06666666666666667,
+ 0.07500000000000000,
+ 0.08333333333333333,
+ 0.08333333333333333,
+ 0.08333333333333331,
+ 0.08333333333333331,
+ 0.07499999999999998,
+ 0.06666666666666665,
+ 0.05833333333333331,
+ 0.04166666666666666,
+ 0.03333333333333333,
+ 0.02499999999999999,
+ 0.01666666666666666,
+ 0.00833333333333333,
+ 0.00833333333333333,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00303030303030303,
+ 0.00303030303030303,
+ 0.00606060606060606,
+ 0.00909090909090909,
+ 0.01515151515151515,
+ 0.01818181818181818,
+ 0.02727272727272727,
+ 0.03333333333333333,
+ 0.04242424242424242,
+ 0.04848484848484848,
+ 0.05757575757575757,
+ 0.06060606060606059,
+ 0.06969696969696970,
+ 0.06969696969696969,
+ 0.07272727272727272,
+ 0.06969696969696967,
+ 0.06969696969696967,
+ 0.06060606060606059,
+ 0.05757575757575756,
+ 0.04848484848484847,
+ 0.04242424242424242,
+ 0.03333333333333333,
+ 0.02727272727272727,
+ 0.01818181818181818,
+ 0.01515151515151515,
+ 0.00909090909090909,
+ 0.00606060606060606,
+ 0.00303030303030303,
+ 0.00303030303030303,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00126262626262626,
+ 0.00126262626262626,
+ 0.00252525252525253,
+ 0.00378787878787879,
+ 0.00631313131313131,
+ 0.00883838383838384,
+ 0.01262626262626263,
+ 0.01641414141414142,
+ 0.02146464646464646,
+ 0.02651515151515152,
+ 0.03282828282828282,
+ 0.03787878787878787,
+ 0.04419191919191920,
+ 0.04924242424242424,
+ 0.05429292929292930,
+ 0.05808080808080807,
+ 0.06060606060606059,
+ 0.06186868686868686,
+ 0.06186868686868686,
+ 0.06060606060606059,
+ 0.05808080808080808,
+ 0.05429292929292929,
+ 0.04924242424242424,
+ 0.04419191919191919,
+ 0.03787878787878787,
+ 0.03282828282828283,
+ 0.02651515151515151,
+ 0.02146464646464646,
+ 0.01641414141414141,
+ 0.01262626262626262,
+ 0.00883838383838384,
+ 0.00631313131313131,
+ 0.00378787878787879,
+ 0.00252525252525253,
+ 0.00126262626262626,
+ 0.00126262626262626,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00058275058275058,
+ 0.00058275058275058,
+ 0.00116550116550117,
+ 0.00174825174825175,
+ 0.00291375291375291,
+ 0.00407925407925408,
+ 0.00641025641025641,
+ 0.00815850815850816,
+ 0.01107226107226107,
+ 0.01398601398601399,
+ 0.01806526806526806,
+ 0.02156177156177156,
+ 0.02680652680652681,
+ 0.03030303030303030,
+ 0.03554778554778555,
+ 0.03962703962703962,
+ 0.04428904428904428,
+ 0.04720279720279720,
+ 0.05128205128205127,
+ 0.05244755244755243,
+ 0.05477855477855477,
+ 0.05477855477855477,
+ 0.05477855477855477,
+ 0.05244755244755244,
+ 0.05128205128205127,
+ 0.04720279720279720,
+ 0.04428904428904428,
+ 0.03962703962703962,
+ 0.03554778554778554,
+ 0.03030303030303030,
+ 0.02680652680652679,
+ 0.02156177156177156,
+ 0.01806526806526806,
+ 0.01398601398601398,
+ 0.01107226107226107,
+ 0.00815850815850816,
+ 0.00641025641025641,
+ 0.00407925407925408,
+ 0.00291375291375291,
+ 0.00174825174825175,
+ 0.00116550116550117,
+ 0.00058275058275058,
+ 0.00058275058275058,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00029137529137529,
+ 0.00029137529137529,
+ 0.00058275058275058,
+ 0.00087412587412587,
+ 0.00145687645687646,
+ 0.00203962703962704,
+ 0.00320512820512821,
+ 0.00437062937062937,
+ 0.00582750582750583,
+ 0.00757575757575758,
+ 0.00990675990675991,
+ 0.01223776223776224,
+ 0.01544289044289044,
+ 0.01835664335664336,
+ 0.02185314685314686,
+ 0.02534965034965035,
+ 0.02913752913752913,
+ 0.03263403263403263,
+ 0.03642191142191141,
+ 0.03962703962703962,
+ 0.04254079254079253,
+ 0.04516317016317015,
+ 0.04720279720279719,
+ 0.04836829836829836,
+ 0.04924242424242423,
+ 0.04924242424242423,
+ 0.04836829836829836,
+ 0.04720279720279719,
+ 0.04516317016317015,
+ 0.04254079254079253,
+ 0.03962703962703962,
+ 0.03642191142191141,
+ 0.03263403263403263,
+ 0.02913752913752913,
+ 0.02534965034965035,
+ 0.02185314685314686,
+ 0.01835664335664336,
+ 0.01544289044289044,
+ 0.01223776223776224,
+ 0.00990675990675991,
+ 0.00757575757575758,
+ 0.00582750582750583,
+ 0.00437062937062937,
+ 0.00320512820512821,
+ 0.00203962703962704,
+ 0.00145687645687646,
+ 0.00087412587412587,
+ 0.00058275058275058,
+ 0.00029137529137529,
+ 0.00029137529137529,
+ },
+ },
+};
diff --git a/bcftools/ploidy.c b/bcftools/ploidy.c
index 719e175..550ba87 100644
--- a/bcftools/ploidy.c
+++ b/bcftools/ploidy.c
@@ -1,4 +1,4 @@
-/*
+/*
Copyright (C) 2014-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -22,7 +22,6 @@
THE SOFTWARE.
*/
-#include <htslib/regidx.h>
#include <htslib/khash_str2int.h>
#include <htslib/kseq.h>
#include <htslib/hts.h>
@@ -35,6 +34,7 @@ struct _ploidy_t
int dflt, min, max; // ploidy: default, min and max (only explicitly listed)
int *sex2dflt;
regidx_t *idx;
+ regitr_t *itr;
void *sex2id;
char **id2sex;
kstring_t tmp_str;
@@ -52,7 +52,7 @@ regidx_t *ploidy_regions(ploidy_t *ploidy)
return ploidy->idx;
}
-int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
{
int i, ret;
ploidy_t *ploidy = (ploidy_t*) usr;
@@ -68,7 +68,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, v
else
{
// Fill CHR,FROM,TO
- ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL);
+ ret = regidx_parse_tab(line,chr_beg,chr_end,beg,end,NULL,NULL);
if ( ret!=0 ) return ret;
}
@@ -144,6 +144,7 @@ ploidy_t *ploidy_init(const char *fname, int dflt)
ploidy_destroy(pld);
return NULL;
}
+ pld->itr = regitr_init(pld->idx);
_set_defaults(pld,dflt);
return pld;
}
@@ -156,6 +157,7 @@ ploidy_t *ploidy_init_string(const char *str, int dflt)
pld->min = pld->max = -1;
pld->sex2id = khash_str2int_init();
pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
+ pld->itr = regitr_init(pld->idx);
kstring_t tmp = {0,0,0};
const char *ss = str;
@@ -170,7 +172,6 @@ ploidy_t *ploidy_init_string(const char *str, int dflt)
while ( *se && isspace(*se) ) se++;
ss = se;
}
- regidx_insert(pld->idx,NULL);
free(tmp.s);
_set_defaults(pld,dflt);
@@ -180,6 +181,7 @@ ploidy_t *ploidy_init_string(const char *str, int dflt)
void ploidy_destroy(ploidy_t *ploidy)
{
if ( ploidy->sex2id ) khash_str2int_destroy_free(ploidy->sex2id);
+ if ( ploidy->itr ) regitr_destroy(ploidy->itr);
if ( ploidy->idx ) regidx_destroy(ploidy->idx);
free(ploidy->id2sex);
free(ploidy->tmp_str.s);
@@ -189,8 +191,7 @@ void ploidy_destroy(ploidy_t *ploidy)
int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min, int *max)
{
- regitr_t itr;
- int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, &itr);
+ int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, ploidy->itr);
if ( !sex2ploidy && !min && !max ) return ret;
@@ -207,17 +208,16 @@ int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min
int _min = INT_MAX, _max = -1;
if ( sex2ploidy ) for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->dflt;
- while ( REGITR_OVERLAP(itr,pos,pos) )
+ while ( regitr_overlap(ploidy->itr) )
{
- int sex = REGITR_PAYLOAD(itr,sex_ploidy_t).sex;
- int pld = REGITR_PAYLOAD(itr,sex_ploidy_t).ploidy;
+ int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex;
+ int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy;
if ( pld!=ploidy->dflt )
{
if ( sex2ploidy ) sex2ploidy[ sex ] = pld;
if ( _min > pld ) _min = pld;
if ( _max < pld ) _max = pld;
}
- itr.i++;
}
if ( _max==-1 ) _max = _min = ploidy->dflt;
if ( max ) *max = _max;
diff --git a/bcftools/ploidy.c.pysam.c b/bcftools/ploidy.c.pysam.c
index d0468b9..2eb9bd8 100644
--- a/bcftools/ploidy.c.pysam.c
+++ b/bcftools/ploidy.c.pysam.c
@@ -1,6 +1,6 @@
#include "pysam.h"
-/*
+/*
Copyright (C) 2014-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -24,7 +24,6 @@
THE SOFTWARE.
*/
-#include <htslib/regidx.h>
#include <htslib/khash_str2int.h>
#include <htslib/kseq.h>
#include <htslib/hts.h>
@@ -37,6 +36,7 @@ struct _ploidy_t
int dflt, min, max; // ploidy: default, min and max (only explicitly listed)
int *sex2dflt;
regidx_t *idx;
+ regitr_t *itr;
void *sex2id;
char **id2sex;
kstring_t tmp_str;
@@ -54,7 +54,7 @@ regidx_t *ploidy_regions(ploidy_t *ploidy)
return ploidy->idx;
}
-int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
{
int i, ret;
ploidy_t *ploidy = (ploidy_t*) usr;
@@ -70,7 +70,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, v
else
{
// Fill CHR,FROM,TO
- ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL);
+ ret = regidx_parse_tab(line,chr_beg,chr_end,beg,end,NULL,NULL);
if ( ret!=0 ) return ret;
}
@@ -146,6 +146,7 @@ ploidy_t *ploidy_init(const char *fname, int dflt)
ploidy_destroy(pld);
return NULL;
}
+ pld->itr = regitr_init(pld->idx);
_set_defaults(pld,dflt);
return pld;
}
@@ -158,6 +159,7 @@ ploidy_t *ploidy_init_string(const char *str, int dflt)
pld->min = pld->max = -1;
pld->sex2id = khash_str2int_init();
pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
+ pld->itr = regitr_init(pld->idx);
kstring_t tmp = {0,0,0};
const char *ss = str;
@@ -172,7 +174,6 @@ ploidy_t *ploidy_init_string(const char *str, int dflt)
while ( *se && isspace(*se) ) se++;
ss = se;
}
- regidx_insert(pld->idx,NULL);
free(tmp.s);
_set_defaults(pld,dflt);
@@ -182,6 +183,7 @@ ploidy_t *ploidy_init_string(const char *str, int dflt)
void ploidy_destroy(ploidy_t *ploidy)
{
if ( ploidy->sex2id ) khash_str2int_destroy_free(ploidy->sex2id);
+ if ( ploidy->itr ) regitr_destroy(ploidy->itr);
if ( ploidy->idx ) regidx_destroy(ploidy->idx);
free(ploidy->id2sex);
free(ploidy->tmp_str.s);
@@ -191,8 +193,7 @@ void ploidy_destroy(ploidy_t *ploidy)
int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min, int *max)
{
- regitr_t itr;
- int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, &itr);
+ int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, ploidy->itr);
if ( !sex2ploidy && !min && !max ) return ret;
@@ -209,17 +210,16 @@ int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min
int _min = INT_MAX, _max = -1;
if ( sex2ploidy ) for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->dflt;
- while ( REGITR_OVERLAP(itr,pos,pos) )
+ while ( regitr_overlap(ploidy->itr) )
{
- int sex = REGITR_PAYLOAD(itr,sex_ploidy_t).sex;
- int pld = REGITR_PAYLOAD(itr,sex_ploidy_t).ploidy;
+ int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex;
+ int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy;
if ( pld!=ploidy->dflt )
{
if ( sex2ploidy ) sex2ploidy[ sex ] = pld;
if ( _min > pld ) _min = pld;
if ( _max < pld ) _max = pld;
}
- itr.i++;
}
if ( _max==-1 ) _max = _min = ploidy->dflt;
if ( max ) *max = _max;
diff --git a/bcftools/ploidy.h b/bcftools/ploidy.h
index 6deef73..1e7d2f7 100644
--- a/bcftools/ploidy.h
+++ b/bcftools/ploidy.h
@@ -55,7 +55,7 @@
#ifndef __PLOIDY_H__
#define __PLOIDY_H__
-#include <htslib/regidx.h>
+#include "regidx.h"
typedef struct _ploidy_t ploidy_t;
diff --git a/bcftools/prob1.c b/bcftools/prob1.c
index 8f4463f..954d43c 100644
--- a/bcftools/prob1.c
+++ b/bcftools/prob1.c
@@ -157,8 +157,9 @@ int test16(bcf1_t *b, anno16_t *a);
static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
{
int i, j;
- long *p, tmp;
- p = (long*) alloca(b->n_allele * sizeof(long));
+ long p_a[16], *p=p_a, tmp;
+ if (b->n_allele > 16)
+ p = (long*) malloc(b->n_allele * sizeof(long));
memset(p, 0, sizeof(long) * b->n_allele);
// Set P(D|g) for each sample and sum phread likelihoods across all samples to create lk
@@ -177,12 +178,14 @@ static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
tmp = p[j], p[j] = p[j-1], p[j-1] = tmp;
for (i = b->n_allele - 1; i >= 0; --i)
if ((p[i]&0xf) == 0) break;
+ if (p != p_a)
+ free(p);
return i;
}
-/* f0 is minor allele fraction */
-int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
+/* f0 is freq of the ref allele */
+int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k, int is_var)
{
double sum, g[3];
double max, f3[3], *pdg = ma->pdg + k * 3;
@@ -203,6 +206,7 @@ int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
g[i] /= sum;
if (g[i] > max) max = g[i], max_i = i;
}
+ if ( !is_var ) { max_i = 2; max = g[2]; } // force 0/0 genotype if the site is non-variant
max = 1. - max;
if (max < 1e-308) max = 1e-308;
q = (int)(-4.343 * log(max) + .499);
diff --git a/bcftools/prob1.c.pysam.c b/bcftools/prob1.c.pysam.c
index a59ec44..f4f4271 100644
--- a/bcftools/prob1.c.pysam.c
+++ b/bcftools/prob1.c.pysam.c
@@ -159,8 +159,9 @@ int test16(bcf1_t *b, anno16_t *a);
static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
{
int i, j;
- long *p, tmp;
- p = (long*) alloca(b->n_allele * sizeof(long));
+ long p_a[16], *p=p_a, tmp;
+ if (b->n_allele > 16)
+ p = (long*) malloc(b->n_allele * sizeof(long));
memset(p, 0, sizeof(long) * b->n_allele);
// Set P(D|g) for each sample and sum phread likelihoods across all samples to create lk
@@ -179,12 +180,14 @@ static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
tmp = p[j], p[j] = p[j-1], p[j-1] = tmp;
for (i = b->n_allele - 1; i >= 0; --i)
if ((p[i]&0xf) == 0) break;
+ if (p != p_a)
+ free(p);
return i;
}
-/* f0 is minor allele fraction */
-int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
+/* f0 is freq of the ref allele */
+int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k, int is_var)
{
double sum, g[3];
double max, f3[3], *pdg = ma->pdg + k * 3;
@@ -205,6 +208,7 @@ int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
g[i] /= sum;
if (g[i] > max) max = g[i], max_i = i;
}
+ if ( !is_var ) { max_i = 2; max = g[2]; } // force 0/0 genotype if the site is non-variant
max = 1. - max;
if (max < 1e-308) max = 1e-308;
q = (int)(-4.343 * log(max) + .499);
diff --git a/bcftools/prob1.h b/bcftools/prob1.h
index 1594d3f..a3d4b0d 100644
--- a/bcftools/prob1.h
+++ b/bcftools/prob1.h
@@ -78,7 +78,7 @@ extern "C" {
void bcf_p1_destroy(bcf_p1aux_t *ma);
void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma);
int bcf_p1_cal(call_t *call, bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst);
- int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k);
+ int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k, int is_var);
void bcf_p1_dump_afs(bcf_p1aux_t *ma);
int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn);
int bcf_p1_set_n1(bcf_p1aux_t *b, int n1);
diff --git a/bcftools/regidx.c b/bcftools/regidx.c
new file mode 100644
index 0000000..84646a8
--- /dev/null
+++ b/bcftools/regidx.c
@@ -0,0 +1,598 @@
+/*
+ Copyright (C) 2014-2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <strings.h>
+#include <htslib/hts.h>
+#include <htslib/kstring.h>
+#include <htslib/kseq.h>
+#include <htslib/khash_str2int.h>
+#include "regidx.h"
+
+#define MAX_COOR_0 REGIDX_MAX // CSI and hts_itr_query limit, 0-based
+
+#define iBIN(x) ((x)>>13)
+
+typedef struct
+{
+ uint32_t beg, end;
+}
+reg_t;
+
+typedef struct
+{
+ uint32_t pos, ireg; // y-coordinate and a pointer to reglist.reg and reglist.dat
+}
+pos_t;
+
+typedef struct _reglist_t reglist_t;
+
+typedef struct
+{
+ uint32_t beg, end, ireg; // query coordinates and the active region
+ regidx_t *ridx;
+ reglist_t *list;
+ int active;
+}
+_itr_t;
+
+// List of regions for one chromosome.
+struct _reglist_t
+{
+ uint32_t *idx, nidx; // index to list.reg+1
+ uint32_t nreg, mreg; // n:used, m:allocated
+ reg_t *reg; // regions
+ void *dat; // payload data
+ char *seq; // sequence name
+ int unsorted;
+
+};
+
+// Container of all sequences
+struct _regidx_t
+{
+ int nseq, mseq; // n:used, m:alloced
+ reglist_t *seq; // regions for each sequence
+ void *seq2regs; // hash for fast lookup from chr name to regions
+ char **seq_names;
+ regidx_free_f free; // function to free any data allocated by regidx_parse_f
+ regidx_parse_f parse; // parse one input line
+ void *usr; // user data to pass to regidx_parse_f
+ int payload_size;
+ void *payload; // temporary payload data set by regidx_parse_f (sequence is not known beforehand)
+ kstring_t str;
+};
+
+int regidx_seq_nregs(regidx_t *idx, const char *seq)
+{
+ int iseq;
+ if ( khash_str2int_get(idx->seq2regs, seq, &iseq)!=0 ) return 0; // no such sequence
+ return idx->seq[iseq].nreg;
+}
+
+int regidx_nregs(regidx_t *idx)
+{
+ int i, nreg = 0;
+ for (i=0; i<idx->nseq; i++) nreg += idx->seq[i].nreg;
+ return nreg;
+}
+
+char **regidx_seq_names(regidx_t *idx, int *n)
+{
+ *n = idx->nseq;
+ return idx->seq_names;
+}
+
+int regidx_insert_list(regidx_t *idx, char *line, char delim)
+{
+ kstring_t tmp = {0,0,0};
+ char *ss = line;
+ while ( *ss )
+ {
+ char *se = ss;
+ while ( *se && *se!=delim ) se++;
+ tmp.l = 0;
+ kputsn(ss, se-ss, &tmp);
+ if ( regidx_insert(idx,tmp.s) < 0 )
+ {
+ free(tmp.s);
+ return -1;
+ }
+ if ( !*se ) break;
+ ss = se+1;
+ }
+ free(tmp.s);
+ return 0;
+}
+
+static inline int cmp_regs(reg_t *a, reg_t *b)
+{
+ if ( a->beg < b->beg ) return -1;
+ if ( a->beg > b->beg ) return 1;
+ if ( a->end < b->end ) return 1; // longer intervals come first
+ if ( a->end > b->end ) return -1;
+ return 0;
+}
+static int cmp_reg_ptrs(const void *a, const void *b)
+{
+ return cmp_regs((reg_t*)a,(reg_t*)b);
+}
+static int cmp_reg_ptrs2(const void *a, const void *b)
+{
+ return cmp_regs(*((reg_t**)a),*((reg_t**)b));
+}
+
+inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload)
+{
+ if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0;
+ if ( end > MAX_COOR_0 ) end = MAX_COOR_0;
+
+ int rid;
+ idx->str.l = 0;
+ kputsn(chr_beg, chr_end-chr_beg+1, &idx->str);
+ if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 )
+ {
+ // new chromosome
+ idx->nseq++;
+ int m_prev = idx->mseq;
+ hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq);
+ hts_expand0(char*,idx->nseq,m_prev,idx->seq_names);
+ idx->seq_names[idx->nseq-1] = strdup(idx->str.s);
+ rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]);
+ }
+
+ reglist_t *list = &idx->seq[rid];
+ list->seq = idx->seq_names[rid];
+ list->nreg++;
+ int mreg = list->mreg;
+ hts_expand(reg_t,list->nreg,list->mreg,list->reg);
+ list->reg[list->nreg-1].beg = beg;
+ list->reg[list->nreg-1].end = end;
+ if ( idx->payload_size )
+ {
+ if ( mreg != list->mreg ) list->dat = realloc(list->dat,idx->payload_size*list->mreg);
+ memcpy((char *)list->dat + idx->payload_size*(list->nreg-1), payload, idx->payload_size);
+ }
+ if ( !list->unsorted && list->nreg>1 && cmp_regs(&list->reg[list->nreg-2],&list->reg[list->nreg-1])>0 ) list->unsorted = 1;
+ return 0;
+}
+
+int regidx_insert(regidx_t *idx, char *line)
+{
+ if ( !line ) return 0;
+ char *chr_from, *chr_to;
+ uint32_t beg,end;
+ int ret = idx->parse(line,&chr_from,&chr_to,&beg,&end,idx->payload,idx->usr);
+ if ( ret==-2 ) return -1; // error
+ if ( ret==-1 ) return 0; // skip the line
+ regidx_push(idx, chr_from,chr_to,beg,end,idx->payload);
+ return 0;
+}
+
+regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
+{
+ if ( !parser )
+ {
+ if ( !fname ) parser = regidx_parse_tab;
+ else
+ {
+ int len = strlen(fname);
+ if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) )
+ parser = regidx_parse_bed;
+ else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) )
+ parser = regidx_parse_bed;
+ else if ( len>=4 && !strcasecmp(".bed",fname+len-4) )
+ parser = regidx_parse_bed;
+ else
+ parser = regidx_parse_tab;
+ }
+ }
+
+ regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
+ idx->free = free_f;
+ idx->parse = parser;
+ idx->usr = usr_dat;
+ idx->seq2regs = khash_str2int_init();
+ idx->payload_size = payload_size;
+ if ( payload_size ) idx->payload = malloc(payload_size);
+
+ if ( !fname ) return idx;
+
+ kstring_t str = {0,0,0};
+
+ htsFile *fp = hts_open(fname,"r");
+ if ( !fp ) goto error;
+
+ while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+ {
+ if ( regidx_insert(idx, str.s) ) goto error;
+ }
+
+ free(str.s);
+ hts_close(fp);
+ return idx;
+
+error:
+ free(str.s);
+ if ( fp ) hts_close(fp);
+ regidx_destroy(idx);
+ return NULL;
+}
+
+void regidx_destroy(regidx_t *idx)
+{
+ int i, j;
+ for (i=0; i<idx->nseq; i++)
+ {
+ reglist_t *list = &idx->seq[i];
+ if ( idx->free )
+ {
+ for (j=0; j<list->nreg; j++)
+ idx->free((char *)list->dat + idx->payload_size*j);
+ }
+ free(list->dat);
+ free(list->reg);
+ free(list->idx);
+ }
+ free(idx->seq_names);
+ free(idx->seq);
+ free(idx->str.s);
+ free(idx->payload);
+ khash_str2int_destroy_free(idx->seq2regs);
+ free(idx);
+}
+
+int _reglist_build_index(regidx_t *regidx, reglist_t *list)
+{
+ int i;
+ if ( list->unsorted )
+ {
+ if ( !regidx->payload_size )
+ qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs);
+ else
+ {
+ reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg);
+ for (i=0; i<list->nreg; i++) ptr[i] = list->reg + i;
+ qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2);
+
+ void *tmp_dat = malloc(regidx->payload_size*list->nreg);
+ for (i=0; i<list->nreg; i++)
+ {
+ size_t iori = ptr[i] - list->reg;
+ memcpy((char *)tmp_dat+i*regidx->payload_size,
+ (char *)list->dat+iori*regidx->payload_size,
+ regidx->payload_size);
+ }
+ free(list->dat);
+ list->dat = tmp_dat;
+
+ reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg);
+ for (i=0; i<list->nreg; i++)
+ {
+ size_t iori = ptr[i] - list->reg;
+ tmp_reg[i] = list->reg[iori];
+ }
+ free(ptr);
+ free(list->reg);
+ list->reg = tmp_reg;
+ list->mreg = list->nreg;
+ }
+ list->unsorted = 0;
+ }
+
+ list->nidx = 0;
+ int j,k, midx = 0;
+ for (j=0; j<list->nreg; j++)
+ {
+ int ibeg = iBIN(list->reg[j].beg);
+ int iend = iBIN(list->reg[j].end);
+ if ( midx <= iend )
+ {
+ int old_midx = midx;
+ midx = iend + 1;
+ kroundup32(midx);
+ list->idx = (uint32_t*) realloc(list->idx, midx*sizeof(uint32_t));
+ memset(list->idx+old_midx, 0, sizeof(uint32_t)*(midx-old_midx));
+ }
+ if ( ibeg==iend )
+ {
+ if ( !list->idx[ibeg] ) list->idx[ibeg] = j + 1;
+ }
+ else
+ {
+ for (k=ibeg; k<=iend; k++)
+ if ( !list->idx[k] ) list->idx[k] = j + 1;
+ }
+ if ( list->nidx < iend+1 ) list->nidx = iend+1;
+ }
+
+ return 0;
+}
+
+int regidx_overlap(regidx_t *regidx, const char *chr, uint32_t beg, uint32_t end, regitr_t *regitr)
+{
+ if ( regitr ) regitr->seq = NULL;
+
+ int iseq, ireg;
+ if ( khash_str2int_get(regidx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence
+
+ reglist_t *list = ®idx->seq[iseq];
+ if ( !list->nreg ) return 0;
+
+ if ( list->nreg==1 )
+ {
+ if ( beg > list->reg[0].end ) return 0;
+ if ( end < list->reg[0].beg ) return 0;
+ ireg = 0;
+ }
+ else
+ {
+ if ( !list->idx )
+ _reglist_build_index(regidx,list);
+
+ int ibeg = iBIN(beg);
+ if ( ibeg >= list->nidx ) return 0; // beg is too big
+
+ // find a matching region
+ uint32_t i = list->idx[ibeg];
+ if ( !i )
+ {
+ int iend = iBIN(end);
+ if ( iend > list->nidx ) iend = list->nidx;
+ for (i=ibeg; i<iend; i++)
+ if ( list->idx[i] ) break;
+ if ( i==iend ) return 0;
+ i = list->idx[i];
+ }
+
+ for (ireg=i-1; ireg<list->nreg; ireg++)
+ {
+ if ( list->reg[ireg].beg > end ) return 0; // no match, past the query region
+ if ( list->reg[ireg].end >= beg && list->reg[ireg].beg <= end ) break; // found
+ }
+
+ if ( ireg >= list->nreg ) return 0; // no match
+ }
+
+ if ( !regitr ) return 1; // match, but no more info to save
+
+ // may need to iterate over the matching regions later
+ _itr_t *itr = (_itr_t*)regitr->itr;
+ itr->ridx = regidx;
+ itr->list = list;
+ itr->beg = beg;
+ itr->end = end;
+ itr->ireg = ireg;
+ itr->active = 0;
+
+ regitr->seq = list->seq;
+ regitr->beg = list->reg[ireg].beg;
+ regitr->end = list->reg[ireg].end;
+ if ( regidx->payload_size )
+ regitr->payload = (char *)list->dat + regidx->payload_size*ireg;
+
+ return 1;
+}
+
+int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ if ( !*se )
+ {
+ // just the chromosome name
+ *beg = 0;
+ *end = MAX_COOR_0;
+ return 0;
+ }
+
+ ss = se+1;
+ *beg = strtod(ss, &se);
+ if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+ ss = se+1;
+ *end = strtod(ss, &se) - 1;
+ if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+ return 0;
+}
+
+int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ if ( !*se )
+ {
+ // just the chromosome name
+ *beg = 0;
+ *end = MAX_COOR_0;
+ return 0;
+ }
+
+ ss = se+1;
+ *beg = strtod(ss, &se);
+ if ( ss==se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; }
+ if ( *beg==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
+ (*beg)--;
+
+ if ( !se[0] || !se[1] )
+ *end = *beg;
+ else
+ {
+ ss = se+1;
+ *end = strtod(ss, &se);
+ if ( ss==se ) *end = *beg;
+ else if ( *end==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
+ else (*end)--;
+ }
+ return 0;
+}
+
+int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && *se!=':' ) se++;
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ if ( !*se )
+ {
+ *beg = 0;
+ *end = MAX_COOR_0;
+ return 0;
+ }
+
+ ss = se+1;
+ *beg = strtod(ss, &se);
+ if ( ss==se ) { fprintf(stderr,"Could not parse reg line: %s\n", line); return -2; }
+ if ( *beg==0 ) { fprintf(stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; }
+ (*beg)--;
+
+ if ( !se[0] || !se[1] )
+ *end = se[0]=='-' ? MAX_COOR_0 : *beg;
+ else
+ {
+ ss = se+1;
+ *end = strtod(ss, &se);
+ if ( ss==se ) *end = *beg;
+ else if ( *end==0 ) { fprintf(stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; }
+ else (*end)--;
+ }
+ return 0;
+}
+
+regitr_t *regitr_init(regidx_t *regidx)
+{
+ regitr_t *regitr = (regitr_t*) calloc(1,sizeof(regitr_t));
+ regitr->itr = (_itr_t*) calloc(1,sizeof(_itr_t));
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ itr->ridx = regidx;
+ itr->list = NULL;
+ return regitr;
+}
+
+void regitr_reset(regidx_t *regidx, regitr_t *regitr)
+{
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ memset(itr,0,sizeof(_itr_t));
+ itr->ridx = regidx;
+}
+
+void regitr_destroy(regitr_t *regitr)
+{
+ free(regitr->itr);
+ free(regitr);
+}
+
+int regitr_overlap(regitr_t *regitr)
+{
+ if ( !regitr->seq ) return 0;
+
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ if ( !itr->active )
+ {
+ // is this the first call after regidx_overlap?
+ itr->active = 1;
+ itr->ireg++;
+ return 1;
+ }
+
+ reglist_t *list = itr->list;
+
+ int i;
+ for (i=itr->ireg; i<list->nreg; i++)
+ {
+ if ( list->reg[i].beg > itr->end ) return 0; // no match, past the query region
+ if ( list->reg[i].end >= itr->beg && list->reg[i].beg <= itr->end ) break; // found
+ }
+
+ if ( i >= list->nreg ) return 0; // no match
+
+ itr->ireg = i + 1;
+ regitr->seq = list->seq;
+ regitr->beg = list->reg[i].beg;
+ regitr->end = list->reg[i].end;
+ if ( itr->ridx->payload_size )
+ regitr->payload = (char *)list->dat + itr->ridx->payload_size*i;
+
+ return 1;
+}
+
+int regitr_loop(regitr_t *regitr)
+{
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ regidx_t *regidx = itr->ridx;
+
+ if ( !itr->list ) // first time here
+ {
+ itr->list = regidx->seq;
+ itr->ireg = 0;
+ }
+
+ size_t iseq = itr->list - regidx->seq;
+ if ( iseq >= regidx->nseq ) return 0;
+
+ if ( itr->ireg >= itr->list->nreg )
+ {
+ iseq++;
+ if ( iseq >= regidx->nseq ) return 0; // no more sequences, done
+ itr->ireg = 0;
+ itr->list = ®idx->seq[iseq];
+ }
+
+ regitr->seq = itr->list->seq;
+ regitr->beg = itr->list->reg[itr->ireg].beg;
+ regitr->end = itr->list->reg[itr->ireg].end;
+ if ( regidx->payload_size )
+ regitr->payload = (char *)itr->list->dat + regidx->payload_size*itr->ireg;
+ itr->ireg++;
+
+ return 1;
+}
+
+
+
diff --git a/bcftools/regidx.c.pysam.c b/bcftools/regidx.c.pysam.c
new file mode 100644
index 0000000..4d6dcda
--- /dev/null
+++ b/bcftools/regidx.c.pysam.c
@@ -0,0 +1,600 @@
+#include "pysam.h"
+
+/*
+ Copyright (C) 2014-2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <strings.h>
+#include <htslib/hts.h>
+#include <htslib/kstring.h>
+#include <htslib/kseq.h>
+#include <htslib/khash_str2int.h>
+#include "regidx.h"
+
+#define MAX_COOR_0 REGIDX_MAX // CSI and hts_itr_query limit, 0-based
+
+#define iBIN(x) ((x)>>13)
+
+typedef struct
+{
+ uint32_t beg, end;
+}
+reg_t;
+
+typedef struct
+{
+ uint32_t pos, ireg; // y-coordinate and a pointer to reglist.reg and reglist.dat
+}
+pos_t;
+
+typedef struct _reglist_t reglist_t;
+
+typedef struct
+{
+ uint32_t beg, end, ireg; // query coordinates and the active region
+ regidx_t *ridx;
+ reglist_t *list;
+ int active;
+}
+_itr_t;
+
+// List of regions for one chromosome.
+struct _reglist_t
+{
+ uint32_t *idx, nidx; // index to list.reg+1
+ uint32_t nreg, mreg; // n:used, m:allocated
+ reg_t *reg; // regions
+ void *dat; // payload data
+ char *seq; // sequence name
+ int unsorted;
+
+};
+
+// Container of all sequences
+struct _regidx_t
+{
+ int nseq, mseq; // n:used, m:alloced
+ reglist_t *seq; // regions for each sequence
+ void *seq2regs; // hash for fast lookup from chr name to regions
+ char **seq_names;
+ regidx_free_f free; // function to free any data allocated by regidx_parse_f
+ regidx_parse_f parse; // parse one input line
+ void *usr; // user data to pass to regidx_parse_f
+ int payload_size;
+ void *payload; // temporary payload data set by regidx_parse_f (sequence is not known beforehand)
+ kstring_t str;
+};
+
+int regidx_seq_nregs(regidx_t *idx, const char *seq)
+{
+ int iseq;
+ if ( khash_str2int_get(idx->seq2regs, seq, &iseq)!=0 ) return 0; // no such sequence
+ return idx->seq[iseq].nreg;
+}
+
+int regidx_nregs(regidx_t *idx)
+{
+ int i, nreg = 0;
+ for (i=0; i<idx->nseq; i++) nreg += idx->seq[i].nreg;
+ return nreg;
+}
+
+char **regidx_seq_names(regidx_t *idx, int *n)
+{
+ *n = idx->nseq;
+ return idx->seq_names;
+}
+
+int regidx_insert_list(regidx_t *idx, char *line, char delim)
+{
+ kstring_t tmp = {0,0,0};
+ char *ss = line;
+ while ( *ss )
+ {
+ char *se = ss;
+ while ( *se && *se!=delim ) se++;
+ tmp.l = 0;
+ kputsn(ss, se-ss, &tmp);
+ if ( regidx_insert(idx,tmp.s) < 0 )
+ {
+ free(tmp.s);
+ return -1;
+ }
+ if ( !*se ) break;
+ ss = se+1;
+ }
+ free(tmp.s);
+ return 0;
+}
+
+static inline int cmp_regs(reg_t *a, reg_t *b)
+{
+ if ( a->beg < b->beg ) return -1;
+ if ( a->beg > b->beg ) return 1;
+ if ( a->end < b->end ) return 1; // longer intervals come first
+ if ( a->end > b->end ) return -1;
+ return 0;
+}
+static int cmp_reg_ptrs(const void *a, const void *b)
+{
+ return cmp_regs((reg_t*)a,(reg_t*)b);
+}
+static int cmp_reg_ptrs2(const void *a, const void *b)
+{
+ return cmp_regs(*((reg_t**)a),*((reg_t**)b));
+}
+
+inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload)
+{
+ if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0;
+ if ( end > MAX_COOR_0 ) end = MAX_COOR_0;
+
+ int rid;
+ idx->str.l = 0;
+ kputsn(chr_beg, chr_end-chr_beg+1, &idx->str);
+ if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 )
+ {
+ // new chromosome
+ idx->nseq++;
+ int m_prev = idx->mseq;
+ hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq);
+ hts_expand0(char*,idx->nseq,m_prev,idx->seq_names);
+ idx->seq_names[idx->nseq-1] = strdup(idx->str.s);
+ rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]);
+ }
+
+ reglist_t *list = &idx->seq[rid];
+ list->seq = idx->seq_names[rid];
+ list->nreg++;
+ int mreg = list->mreg;
+ hts_expand(reg_t,list->nreg,list->mreg,list->reg);
+ list->reg[list->nreg-1].beg = beg;
+ list->reg[list->nreg-1].end = end;
+ if ( idx->payload_size )
+ {
+ if ( mreg != list->mreg ) list->dat = realloc(list->dat,idx->payload_size*list->mreg);
+ memcpy((char *)list->dat + idx->payload_size*(list->nreg-1), payload, idx->payload_size);
+ }
+ if ( !list->unsorted && list->nreg>1 && cmp_regs(&list->reg[list->nreg-2],&list->reg[list->nreg-1])>0 ) list->unsorted = 1;
+ return 0;
+}
+
+int regidx_insert(regidx_t *idx, char *line)
+{
+ if ( !line ) return 0;
+ char *chr_from, *chr_to;
+ uint32_t beg,end;
+ int ret = idx->parse(line,&chr_from,&chr_to,&beg,&end,idx->payload,idx->usr);
+ if ( ret==-2 ) return -1; // error
+ if ( ret==-1 ) return 0; // skip the line
+ regidx_push(idx, chr_from,chr_to,beg,end,idx->payload);
+ return 0;
+}
+
+regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
+{
+ if ( !parser )
+ {
+ if ( !fname ) parser = regidx_parse_tab;
+ else
+ {
+ int len = strlen(fname);
+ if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) )
+ parser = regidx_parse_bed;
+ else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) )
+ parser = regidx_parse_bed;
+ else if ( len>=4 && !strcasecmp(".bed",fname+len-4) )
+ parser = regidx_parse_bed;
+ else
+ parser = regidx_parse_tab;
+ }
+ }
+
+ regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
+ idx->free = free_f;
+ idx->parse = parser;
+ idx->usr = usr_dat;
+ idx->seq2regs = khash_str2int_init();
+ idx->payload_size = payload_size;
+ if ( payload_size ) idx->payload = malloc(payload_size);
+
+ if ( !fname ) return idx;
+
+ kstring_t str = {0,0,0};
+
+ htsFile *fp = hts_open(fname,"r");
+ if ( !fp ) goto error;
+
+ while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+ {
+ if ( regidx_insert(idx, str.s) ) goto error;
+ }
+
+ free(str.s);
+ hts_close(fp);
+ return idx;
+
+error:
+ free(str.s);
+ if ( fp ) hts_close(fp);
+ regidx_destroy(idx);
+ return NULL;
+}
+
+void regidx_destroy(regidx_t *idx)
+{
+ int i, j;
+ for (i=0; i<idx->nseq; i++)
+ {
+ reglist_t *list = &idx->seq[i];
+ if ( idx->free )
+ {
+ for (j=0; j<list->nreg; j++)
+ idx->free((char *)list->dat + idx->payload_size*j);
+ }
+ free(list->dat);
+ free(list->reg);
+ free(list->idx);
+ }
+ free(idx->seq_names);
+ free(idx->seq);
+ free(idx->str.s);
+ free(idx->payload);
+ khash_str2int_destroy_free(idx->seq2regs);
+ free(idx);
+}
+
+int _reglist_build_index(regidx_t *regidx, reglist_t *list)
+{
+ int i;
+ if ( list->unsorted )
+ {
+ if ( !regidx->payload_size )
+ qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs);
+ else
+ {
+ reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg);
+ for (i=0; i<list->nreg; i++) ptr[i] = list->reg + i;
+ qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2);
+
+ void *tmp_dat = malloc(regidx->payload_size*list->nreg);
+ for (i=0; i<list->nreg; i++)
+ {
+ size_t iori = ptr[i] - list->reg;
+ memcpy((char *)tmp_dat+i*regidx->payload_size,
+ (char *)list->dat+iori*regidx->payload_size,
+ regidx->payload_size);
+ }
+ free(list->dat);
+ list->dat = tmp_dat;
+
+ reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg);
+ for (i=0; i<list->nreg; i++)
+ {
+ size_t iori = ptr[i] - list->reg;
+ tmp_reg[i] = list->reg[iori];
+ }
+ free(ptr);
+ free(list->reg);
+ list->reg = tmp_reg;
+ list->mreg = list->nreg;
+ }
+ list->unsorted = 0;
+ }
+
+ list->nidx = 0;
+ int j,k, midx = 0;
+ for (j=0; j<list->nreg; j++)
+ {
+ int ibeg = iBIN(list->reg[j].beg);
+ int iend = iBIN(list->reg[j].end);
+ if ( midx <= iend )
+ {
+ int old_midx = midx;
+ midx = iend + 1;
+ kroundup32(midx);
+ list->idx = (uint32_t*) realloc(list->idx, midx*sizeof(uint32_t));
+ memset(list->idx+old_midx, 0, sizeof(uint32_t)*(midx-old_midx));
+ }
+ if ( ibeg==iend )
+ {
+ if ( !list->idx[ibeg] ) list->idx[ibeg] = j + 1;
+ }
+ else
+ {
+ for (k=ibeg; k<=iend; k++)
+ if ( !list->idx[k] ) list->idx[k] = j + 1;
+ }
+ if ( list->nidx < iend+1 ) list->nidx = iend+1;
+ }
+
+ return 0;
+}
+
+int regidx_overlap(regidx_t *regidx, const char *chr, uint32_t beg, uint32_t end, regitr_t *regitr)
+{
+ if ( regitr ) regitr->seq = NULL;
+
+ int iseq, ireg;
+ if ( khash_str2int_get(regidx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence
+
+ reglist_t *list = ®idx->seq[iseq];
+ if ( !list->nreg ) return 0;
+
+ if ( list->nreg==1 )
+ {
+ if ( beg > list->reg[0].end ) return 0;
+ if ( end < list->reg[0].beg ) return 0;
+ ireg = 0;
+ }
+ else
+ {
+ if ( !list->idx )
+ _reglist_build_index(regidx,list);
+
+ int ibeg = iBIN(beg);
+ if ( ibeg >= list->nidx ) return 0; // beg is too big
+
+ // find a matching region
+ uint32_t i = list->idx[ibeg];
+ if ( !i )
+ {
+ int iend = iBIN(end);
+ if ( iend > list->nidx ) iend = list->nidx;
+ for (i=ibeg; i<iend; i++)
+ if ( list->idx[i] ) break;
+ if ( i==iend ) return 0;
+ i = list->idx[i];
+ }
+
+ for (ireg=i-1; ireg<list->nreg; ireg++)
+ {
+ if ( list->reg[ireg].beg > end ) return 0; // no match, past the query region
+ if ( list->reg[ireg].end >= beg && list->reg[ireg].beg <= end ) break; // found
+ }
+
+ if ( ireg >= list->nreg ) return 0; // no match
+ }
+
+ if ( !regitr ) return 1; // match, but no more info to save
+
+ // may need to iterate over the matching regions later
+ _itr_t *itr = (_itr_t*)regitr->itr;
+ itr->ridx = regidx;
+ itr->list = list;
+ itr->beg = beg;
+ itr->end = end;
+ itr->ireg = ireg;
+ itr->active = 0;
+
+ regitr->seq = list->seq;
+ regitr->beg = list->reg[ireg].beg;
+ regitr->end = list->reg[ireg].end;
+ if ( regidx->payload_size )
+ regitr->payload = (char *)list->dat + regidx->payload_size*ireg;
+
+ return 1;
+}
+
+int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ if ( !*se )
+ {
+ // just the chromosome name
+ *beg = 0;
+ *end = MAX_COOR_0;
+ return 0;
+ }
+
+ ss = se+1;
+ *beg = strtod(ss, &se);
+ if ( ss==se ) { fprintf(pysam_stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+ ss = se+1;
+ *end = strtod(ss, &se) - 1;
+ if ( ss==se ) { fprintf(pysam_stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+ return 0;
+}
+
+int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ if ( !*se )
+ {
+ // just the chromosome name
+ *beg = 0;
+ *end = MAX_COOR_0;
+ return 0;
+ }
+
+ ss = se+1;
+ *beg = strtod(ss, &se);
+ if ( ss==se ) { fprintf(pysam_stderr,"Could not parse tab line: %s\n", line); return -2; }
+ if ( *beg==0 ) { fprintf(pysam_stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
+ (*beg)--;
+
+ if ( !se[0] || !se[1] )
+ *end = *beg;
+ else
+ {
+ ss = se+1;
+ *end = strtod(ss, &se);
+ if ( ss==se ) *end = *beg;
+ else if ( *end==0 ) { fprintf(pysam_stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
+ else (*end)--;
+ }
+ return 0;
+}
+
+int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && *se!=':' ) se++;
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ if ( !*se )
+ {
+ *beg = 0;
+ *end = MAX_COOR_0;
+ return 0;
+ }
+
+ ss = se+1;
+ *beg = strtod(ss, &se);
+ if ( ss==se ) { fprintf(pysam_stderr,"Could not parse reg line: %s\n", line); return -2; }
+ if ( *beg==0 ) { fprintf(pysam_stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; }
+ (*beg)--;
+
+ if ( !se[0] || !se[1] )
+ *end = se[0]=='-' ? MAX_COOR_0 : *beg;
+ else
+ {
+ ss = se+1;
+ *end = strtod(ss, &se);
+ if ( ss==se ) *end = *beg;
+ else if ( *end==0 ) { fprintf(pysam_stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; }
+ else (*end)--;
+ }
+ return 0;
+}
+
+regitr_t *regitr_init(regidx_t *regidx)
+{
+ regitr_t *regitr = (regitr_t*) calloc(1,sizeof(regitr_t));
+ regitr->itr = (_itr_t*) calloc(1,sizeof(_itr_t));
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ itr->ridx = regidx;
+ itr->list = NULL;
+ return regitr;
+}
+
+void regitr_reset(regidx_t *regidx, regitr_t *regitr)
+{
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ memset(itr,0,sizeof(_itr_t));
+ itr->ridx = regidx;
+}
+
+void regitr_destroy(regitr_t *regitr)
+{
+ free(regitr->itr);
+ free(regitr);
+}
+
+int regitr_overlap(regitr_t *regitr)
+{
+ if ( !regitr->seq ) return 0;
+
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ if ( !itr->active )
+ {
+ // is this the first call after regidx_overlap?
+ itr->active = 1;
+ itr->ireg++;
+ return 1;
+ }
+
+ reglist_t *list = itr->list;
+
+ int i;
+ for (i=itr->ireg; i<list->nreg; i++)
+ {
+ if ( list->reg[i].beg > itr->end ) return 0; // no match, past the query region
+ if ( list->reg[i].end >= itr->beg && list->reg[i].beg <= itr->end ) break; // found
+ }
+
+ if ( i >= list->nreg ) return 0; // no match
+
+ itr->ireg = i + 1;
+ regitr->seq = list->seq;
+ regitr->beg = list->reg[i].beg;
+ regitr->end = list->reg[i].end;
+ if ( itr->ridx->payload_size )
+ regitr->payload = (char *)list->dat + itr->ridx->payload_size*i;
+
+ return 1;
+}
+
+int regitr_loop(regitr_t *regitr)
+{
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ regidx_t *regidx = itr->ridx;
+
+ if ( !itr->list ) // first time here
+ {
+ itr->list = regidx->seq;
+ itr->ireg = 0;
+ }
+
+ size_t iseq = itr->list - regidx->seq;
+ if ( iseq >= regidx->nseq ) return 0;
+
+ if ( itr->ireg >= itr->list->nreg )
+ {
+ iseq++;
+ if ( iseq >= regidx->nseq ) return 0; // no more sequences, done
+ itr->ireg = 0;
+ itr->list = ®idx->seq[iseq];
+ }
+
+ regitr->seq = itr->list->seq;
+ regitr->beg = itr->list->reg[itr->ireg].beg;
+ regitr->end = itr->list->reg[itr->ireg].end;
+ if ( regidx->payload_size )
+ regitr->payload = (char *)itr->list->dat + regidx->payload_size*itr->ireg;
+ itr->ireg++;
+
+ return 1;
+}
+
+
+
diff --git a/bcftools/regidx.h b/bcftools/regidx.h
new file mode 100644
index 0000000..8e25fe1
--- /dev/null
+++ b/bcftools/regidx.h
@@ -0,0 +1,191 @@
+/*
+ Copyright (C) 2014-2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+/*
+ Region indexing with an optional payload.
+
+ Example of usage:
+
+ // Init the parser and print regions. In this example the payload is a
+ // pointer to a string. For the description of parse_custom and
+ // free_custom functions, see regidx_parse_f and regidx_free_f below,
+ // and for working example see test/test-regidx.c.
+ regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL);
+
+ // Query overlap with chr:from-to
+ regitr_t *itr = regitr_init(idx);
+ if ( regidx_overlap(idx, chr,from,to, itr) ) printf("There is an overlap!\n");
+
+ while ( regitr_overlap(itr) )
+ {
+ printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", from,to,
+ itr->beg, itr->end, regitr_payload(itr,char*));
+ }
+
+ regidx_destroy(idx);
+ regitr_destroy(itr);
+
+
+ Another example, loop over all regions:
+
+ regidx_t *idx = regidx_init(in_fname,NULL,NULL,0,NULL);
+ regitr_t *itr = regitr_init(idx);
+
+ while ( regitr_loop(itr) )
+ printf("chr=%s beg=%d end=%d\n", itr->seq, itr->beg, itr->end);
+
+ regidx_destroy(idx);
+ regitr_destroy(itr);
+*/
+
+#ifndef __REGIDX_H__
+#define __REGIDX_H__
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define REGIDX_MAX 2147483646 // maximum regidx coordinate (0-based)
+
+typedef struct _regidx_t regidx_t;
+typedef struct
+{
+ uint32_t beg,end;
+ void *payload;
+ char *seq;
+ void *itr;
+}
+regitr_t;
+
+#define regitr_payload(itr,type_t) (*((type_t*)(itr)->payload))
+
+/*
+ * regidx_parse_f - Function to parse one input line, such as regidx_parse_bed
+ * or regidx_parse_tab below. The function is expected to set `chr_from` and
+ * `chr_to` to point to first and last character of chromosome name and set
+ * coordinates `beg` and `end` (0-based, inclusive). If regidx_init() was
+ * called with non-zero payload_size, the `payload` points to a memory
+ * location of the payload_size and `usr` is the data passed to regidx_init().
+ * Any memory allocated by the function will be freed by regidx_free_f called
+ * by regidx_destroy().
+ *
+ * Return value: 0 on success, -1 to skip a record, -2 on fatal error.
+ */
+typedef int (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr);
+typedef void (*regidx_free_f)(void *payload);
+
+/*
+ * A note about the parsers:
+ * - leading spaces are ignored
+ * - lines starting with "#" are ignored
+ */
+int regidx_parse_bed(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); // CHROM or whitespace-sepatated CHROM,FROM,TO (0-based,right-open)
+int regidx_parse_tab(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); // CHROM or whitespace-separated CHROM,POS (1-based, inclusive)
+int regidx_parse_reg(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); // CHROM, CHROM:POS, CHROM:FROM-TO, CHROM:FROM- (1-based, inclusive)
+
+/*
+ * regidx_init() - creates new index
+ * @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert()
+ * @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL,
+ * the format will be autodected, currently either regidx_parse_tab (the default) or
+ * regidx_parse_bed (file must be named 'bed' or 'bed.gz') will be used. Note that
+ * the exact autodetection algorithm will change.
+ * @param freef: NULL or see description of regidx_parse_f
+ * @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f
+ * @param usr: optional user data passed to regidx_parse_f
+ *
+ * Returns index on success or NULL on error.
+ */
+regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr);
+
+/*
+ * regidx_destroy() - free memory allocated by regidx_init
+ */
+void regidx_destroy(regidx_t *idx);
+
+/*
+ * regidx_overlap() - check overlap of the location chr:from-to with regions
+ * @param beg,end: 0-based start, end coordinate (inclusive)
+ * @param itr: pointer to iterator, can be NULL if regidx_loop not needed
+ *
+ * Returns 0 if there is no overlap or 1 if overlap is found. The overlapping
+ * regions can be iterated as shown in the example above.
+ */
+int regidx_overlap(regidx_t *idx, const char *chr, uint32_t beg, uint32_t end, regitr_t *itr);
+
+/*
+ * regidx_insert() - add a new region.
+ * regidx_insert_list() - add new regions from a list
+ * regidx_push() - low level insertion of a new region
+ *
+ * Returns 0 on success or -1 on error.
+ */
+int regidx_insert(regidx_t *idx, char *line);
+int regidx_insert_list(regidx_t *idx, char *line, char delim);
+int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload);
+
+/*
+ * regidx_seq_names() - return list of all sequence names
+ */
+char **regidx_seq_names(regidx_t *idx, int *n);
+
+/*
+ * regidx_seq_nregs() - number of regions
+ * regidx_nregs() - total number of regions
+ */
+int regidx_seq_nregs(regidx_t *idx, const char *seq);
+int regidx_nregs(regidx_t *idx);
+
+/*
+ * regitr_init() - initialize an iterator. The idx parameter is required only
+ * with regitr_loop. If only regitr_overlap is called, NULL
+ * can be given.
+ *
+ * regitr_reset() - initialize an iterator for a repeated regitr_loop cycle.
+ * Not required with regitr_overlap.
+ */
+regitr_t *regitr_init(regidx_t *idx);
+void regitr_destroy(regitr_t *itr);
+void regitr_reset(regidx_t *idx, regitr_t *itr);
+
+/*
+ * regitr_overlap() - next overlapping region
+ * Returns 0 when done or 1 when itr is set to next region
+ */
+int regitr_overlap(regitr_t *itr);
+
+/*
+ * regitr_loop() - loop over all regions
+ * Returns 0 when done or 1 when itr is set to next region
+ */
+int regitr_loop(regitr_t *itr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/bcftools/smpl_ilist.c b/bcftools/smpl_ilist.c
new file mode 100644
index 0000000..c7fa913
--- /dev/null
+++ b/bcftools/smpl_ilist.c
@@ -0,0 +1,106 @@
+/*
+ Copyright (C) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include "bcftools.h"
+#include "smpl_ilist.h"
+
+void smpl_ilist_destroy(smpl_ilist_t *smpl)
+{
+ free(smpl->idx);
+ free(smpl);
+}
+
+smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags)
+{
+ smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t));
+
+ int i;
+ if ( !sample_list )
+ {
+ smpl->n = bcf_hdr_nsamples(hdr);
+ smpl->idx = (int*) malloc(sizeof(int)*smpl->n);
+ for (i=0; i<smpl->n; i++) smpl->idx[i] = i;
+ return smpl;
+ }
+
+ int nlist;
+ char **list = hts_readlist(sample_list[0]=='^'?sample_list+1:sample_list, is_file, &nlist);
+ if ( !list ) error("Could not parse %s\n", sample_list);
+
+ // preserve the VCF order
+ int *tmp = (int*)calloc(bcf_hdr_nsamples(hdr),sizeof(int));
+ for (i=0; i<nlist; i++)
+ {
+ int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, list[i]);
+ if ( idx>=0 )
+ {
+ tmp[idx] = 1;
+ smpl->n++;
+ }
+ else if ( flags&SMPL_STRICT )
+ error("No such sample: %s\n", list[i]);
+ }
+
+ if ( sample_list[0]=='^' ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n;
+ smpl->idx = (int*) malloc(sizeof(int)*smpl->n);
+
+ int j = 0;
+ if ( sample_list[0]!='^' )
+ {
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ if ( tmp[i] ) smpl->idx[j++] = i;
+ }
+ else
+ {
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ if ( !tmp[i] ) smpl->idx[j++] = i;
+ }
+
+ free(tmp);
+ for (i=0; i<nlist; i++) free(list[i]);
+ free(list);
+
+ return smpl;
+}
+
+smpl_ilist_t *smpl_ilist_map(bcf_hdr_t *hdr_a, bcf_hdr_t *hdr_b, int flags)
+{
+ if ( flags&SMPL_STRICT && bcf_hdr_nsamples(hdr_a)!=bcf_hdr_nsamples(hdr_b) )
+ error("Different number of samples: %d vs %d\n", bcf_hdr_nsamples(hdr_a),bcf_hdr_nsamples(hdr_b));
+
+ smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t));
+
+ int i;
+ smpl->n = bcf_hdr_nsamples(hdr_a);
+ smpl->idx = (int*) malloc(sizeof(int)*smpl->n);
+ for (i=0; i<smpl->n; i++)
+ {
+ const char *name = bcf_hdr_int2id(hdr_a, BCF_DT_SAMPLE, i);
+ smpl->idx[i] = bcf_hdr_id2int(hdr_b, BCF_DT_SAMPLE, name);
+ if ( flags&SMPL_STRICT && smpl->idx[i]<0 )
+ error("The sample %s is not present in the second file\n", name);
+ }
+ return smpl;
+}
+
diff --git a/bcftools/smpl_ilist.c.pysam.c b/bcftools/smpl_ilist.c.pysam.c
new file mode 100644
index 0000000..f52b8ce
--- /dev/null
+++ b/bcftools/smpl_ilist.c.pysam.c
@@ -0,0 +1,108 @@
+#include "pysam.h"
+
+/*
+ Copyright (C) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include "bcftools.h"
+#include "smpl_ilist.h"
+
+void smpl_ilist_destroy(smpl_ilist_t *smpl)
+{
+ free(smpl->idx);
+ free(smpl);
+}
+
+smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags)
+{
+ smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t));
+
+ int i;
+ if ( !sample_list )
+ {
+ smpl->n = bcf_hdr_nsamples(hdr);
+ smpl->idx = (int*) malloc(sizeof(int)*smpl->n);
+ for (i=0; i<smpl->n; i++) smpl->idx[i] = i;
+ return smpl;
+ }
+
+ int nlist;
+ char **list = hts_readlist(sample_list[0]=='^'?sample_list+1:sample_list, is_file, &nlist);
+ if ( !list ) error("Could not parse %s\n", sample_list);
+
+ // preserve the VCF order
+ int *tmp = (int*)calloc(bcf_hdr_nsamples(hdr),sizeof(int));
+ for (i=0; i<nlist; i++)
+ {
+ int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, list[i]);
+ if ( idx>=0 )
+ {
+ tmp[idx] = 1;
+ smpl->n++;
+ }
+ else if ( flags&SMPL_STRICT )
+ error("No such sample: %s\n", list[i]);
+ }
+
+ if ( sample_list[0]=='^' ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n;
+ smpl->idx = (int*) malloc(sizeof(int)*smpl->n);
+
+ int j = 0;
+ if ( sample_list[0]!='^' )
+ {
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ if ( tmp[i] ) smpl->idx[j++] = i;
+ }
+ else
+ {
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ if ( !tmp[i] ) smpl->idx[j++] = i;
+ }
+
+ free(tmp);
+ for (i=0; i<nlist; i++) free(list[i]);
+ free(list);
+
+ return smpl;
+}
+
+smpl_ilist_t *smpl_ilist_map(bcf_hdr_t *hdr_a, bcf_hdr_t *hdr_b, int flags)
+{
+ if ( flags&SMPL_STRICT && bcf_hdr_nsamples(hdr_a)!=bcf_hdr_nsamples(hdr_b) )
+ error("Different number of samples: %d vs %d\n", bcf_hdr_nsamples(hdr_a),bcf_hdr_nsamples(hdr_b));
+
+ smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t));
+
+ int i;
+ smpl->n = bcf_hdr_nsamples(hdr_a);
+ smpl->idx = (int*) malloc(sizeof(int)*smpl->n);
+ for (i=0; i<smpl->n; i++)
+ {
+ const char *name = bcf_hdr_int2id(hdr_a, BCF_DT_SAMPLE, i);
+ smpl->idx[i] = bcf_hdr_id2int(hdr_b, BCF_DT_SAMPLE, name);
+ if ( flags&SMPL_STRICT && smpl->idx[i]<0 )
+ error("The sample %s is not present in the second file\n", name);
+ }
+ return smpl;
+}
+
diff --git a/bcftools/smpl_ilist.h b/bcftools/smpl_ilist.h
new file mode 100644
index 0000000..7083cf2
--- /dev/null
+++ b/bcftools/smpl_ilist.h
@@ -0,0 +1,47 @@
+/*
+ Copyright (C) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+/*
+ Parse --samples and --samples-file
+*/
+
+#ifndef __SMPL_ILIST_H__
+#define __SMPL_ILIST_H__
+
+#include <htslib/vcf.h>
+
+#define SMPL_NONE 0 // flexible error recovery
+#define SMPL_STRICT 1 // samples must exist
+
+typedef struct
+{
+ int *idx; // index to bcf_hdr_t.samples
+ int n;
+}
+smpl_ilist_t;
+
+smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags);
+smpl_ilist_t *smpl_ilist_map(bcf_hdr_t *hdr_a, bcf_hdr_t *hdr_b, int flags);
+void smpl_ilist_destroy(smpl_ilist_t *smpl);
+
+#endif
diff --git a/bcftools/tabix.c b/bcftools/tabix.c
index 2f24b92..c1874c2 100644
--- a/bcftools/tabix.c
+++ b/bcftools/tabix.c
@@ -1,7 +1,7 @@
/* tabix.c -- tabix subcommand.
Copyright (C) 2012 Broad Institute.
- Copyright (C) 2013 Genome Research Ltd.
+ Copyright (C) 2013, 2016 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -32,8 +32,8 @@ THE SOFTWARE. */
int main_tabix(int argc, char *argv[])
{
- int c, min_shift = -1, is_force = 0, is_all = 0;
- tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
+ int c, min_shift = -1, is_force = 0, is_all = 0, detect = 1;
+ tbx_conf_t conf = tbx_conf_gff;
while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0)
if (c == '0') conf.preset |= TBX_UCSC;
else if (c == 'f') is_force = 1;
@@ -45,13 +45,14 @@ int main_tabix(int argc, char *argv[])
else if (c == 'c') conf.meta_char = *optarg;
else if (c == 'S') conf.line_skip = atoi(optarg);
else if (c == 'p') {
- if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
- else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
- else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
- else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
+ if (strcmp(optarg, "gff") == 0) conf = tbx_conf_gff;
+ else if (strcmp(optarg, "bed") == 0) conf = tbx_conf_bed;
+ else if (strcmp(optarg, "sam") == 0) conf = tbx_conf_sam;
+ else if (strcmp(optarg, "vcf") == 0) conf = tbx_conf_vcf;
else {
fprintf(stderr, "The type '%s' not recognised\n", optarg);
return 1;
+ detect = 0;
}
}
@@ -79,28 +80,29 @@ int main_tabix(int argc, char *argv[])
bgzf_close(fp);
free(s.s);
} else if (optind + 2 > argc) { // create index
- if ( !conf_ptr )
+ if ( detect )
{
// auto-detect file type by file name
int l = strlen(argv[optind]);
int strcasecmp(const char *s1, const char *s2);
- if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff;
- else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed;
- else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam;
- else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf;
+ if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf = tbx_conf_gff;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf = tbx_conf_bed;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf = tbx_conf_sam;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf = tbx_conf_vcf;
}
- if ( conf_ptr ) conf = *conf_ptr;
if (!is_force) {
char *fn;
FILE *fp;
- fn = (char*)alloca(strlen(argv[optind]) + 5);
+ fn = (char*)malloc(strlen(argv[optind]) + 5);
strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi");
if ((fp = fopen(fn, "rb")) != 0) {
fclose(fp);
+ free(fn);
fprintf(stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__);
return 1;
}
+ free(fn);
}
if ( tbx_index_build(argv[optind], min_shift, &conf) )
{
diff --git a/bcftools/tabix.c.pysam.c b/bcftools/tabix.c.pysam.c
index afa3619..b0c6e0e 100644
--- a/bcftools/tabix.c.pysam.c
+++ b/bcftools/tabix.c.pysam.c
@@ -3,7 +3,7 @@
/* tabix.c -- tabix subcommand.
Copyright (C) 2012 Broad Institute.
- Copyright (C) 2013 Genome Research Ltd.
+ Copyright (C) 2013, 2016 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -34,8 +34,8 @@ THE SOFTWARE. */
int main_tabix(int argc, char *argv[])
{
- int c, min_shift = -1, is_force = 0, is_all = 0;
- tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
+ int c, min_shift = -1, is_force = 0, is_all = 0, detect = 1;
+ tbx_conf_t conf = tbx_conf_gff;
while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0)
if (c == '0') conf.preset |= TBX_UCSC;
else if (c == 'f') is_force = 1;
@@ -47,13 +47,14 @@ int main_tabix(int argc, char *argv[])
else if (c == 'c') conf.meta_char = *optarg;
else if (c == 'S') conf.line_skip = atoi(optarg);
else if (c == 'p') {
- if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
- else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
- else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
- else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
+ if (strcmp(optarg, "gff") == 0) conf = tbx_conf_gff;
+ else if (strcmp(optarg, "bed") == 0) conf = tbx_conf_bed;
+ else if (strcmp(optarg, "sam") == 0) conf = tbx_conf_sam;
+ else if (strcmp(optarg, "vcf") == 0) conf = tbx_conf_vcf;
else {
fprintf(pysam_stderr, "The type '%s' not recognised\n", optarg);
return 1;
+ detect = 0;
}
}
@@ -81,28 +82,29 @@ int main_tabix(int argc, char *argv[])
bgzf_close(fp);
free(s.s);
} else if (optind + 2 > argc) { // create index
- if ( !conf_ptr )
+ if ( detect )
{
// auto-detect file type by file name
int l = strlen(argv[optind]);
int strcasecmp(const char *s1, const char *s2);
- if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff;
- else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed;
- else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam;
- else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf;
+ if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf = tbx_conf_gff;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf = tbx_conf_bed;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf = tbx_conf_sam;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf = tbx_conf_vcf;
}
- if ( conf_ptr ) conf = *conf_ptr;
if (!is_force) {
char *fn;
FILE *fp;
- fn = (char*)alloca(strlen(argv[optind]) + 5);
+ fn = (char*)malloc(strlen(argv[optind]) + 5);
strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi");
if ((fp = fopen(fn, "rb")) != 0) {
fclose(fp);
+ free(fn);
fprintf(pysam_stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__);
return 1;
}
+ free(fn);
}
if ( tbx_index_build(argv[optind], min_shift, &conf) )
{
diff --git a/bcftools/tsv2vcf.c b/bcftools/tsv2vcf.c
index 8826f18..2e1aa52 100644
--- a/bcftools/tsv2vcf.c
+++ b/bcftools/tsv2vcf.c
@@ -24,6 +24,7 @@
*/
#include <ctype.h>
+#include <strings.h>
#include "tsv2vcf.h"
tsv_t *tsv_init(const char *str)
diff --git a/bcftools/tsv2vcf.c.pysam.c b/bcftools/tsv2vcf.c.pysam.c
index 1da48d5..f5eff01 100644
--- a/bcftools/tsv2vcf.c.pysam.c
+++ b/bcftools/tsv2vcf.c.pysam.c
@@ -26,6 +26,7 @@
*/
#include <ctype.h>
+#include <strings.h>
#include "tsv2vcf.h"
tsv_t *tsv_init(const char *str)
diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c
index d5164f3..e6efda9 100644
--- a/bcftools/vcfannotate.c
+++ b/bcftools/vcfannotate.c
@@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
@@ -41,6 +42,7 @@ THE SOFTWARE. */
#include "vcmp.h"
#include "filter.h"
#include "convert.h"
+#include "smpl_ilist.h"
struct _args_t;
@@ -65,12 +67,12 @@ annot_line_t;
#define REPLACE_MISSING 0 // replace only missing values
#define REPLACE_ALL 1 // replace both missing and existing values
-#define REPLACE_EXISTING 2 // replace only if tgt is not missing
+#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing
#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise
typedef struct _annot_col_t
{
int icol, replace, number; // number: one of BCF_VL_* types
- char *hdr_key;
+ char *hdr_key_src, *hdr_key_dst;
int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*);
}
annot_col_t;
@@ -109,6 +111,7 @@ typedef struct _args_t
convert_t *set_ids;
int set_ids_replace;
+ int nsmpl_annot;
int *sample_map, nsample_map, sample_is_file; // map[idst] -> isrc
int mtmpi, mtmpf, mtmps;
int mtmpi2, mtmpf2, mtmps2;
@@ -155,6 +158,7 @@ void remove_info(args_t *args, bcf1_t *line, rm_tag_t *tag)
}
line->d.shared_dirty |= BCF1_DIRTY_INF;
inf->vptr = NULL;
+ inf->vptr_off = inf->vptr_len = 0;
}
}
void remove_info_tag(args_t *args, bcf1_t *line, rm_tag_t *tag)
@@ -187,6 +191,10 @@ void remove_format(args_t *args, bcf1_t *line, rm_tag_t *tag)
}
}
+#include "htslib/khash.h"
+KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
+typedef khash_t(vdict) vdict_t;
+
static void remove_hdr_lines(bcf_hdr_t *hdr, int type)
{
int i = 0, nrm = 0;
@@ -194,11 +202,18 @@ static void remove_hdr_lines(bcf_hdr_t *hdr, int type)
{
if ( hdr->hrec[i]->type!=type ) { i++; continue; }
bcf_hrec_t *hrec = hdr->hrec[i];
- if ( type==BCF_HL_FMT )
+ if ( type==BCF_HL_FMT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
{
// everything except FORMAT/GT
int id = bcf_hrec_find_key(hrec, "ID");
- if ( id>=0 && !strcmp(hrec->vals[id],"GT") ) { i++; continue; }
+ if ( id>=0 )
+ {
+ if ( type==BCF_HL_FMT && !strcmp(hrec->vals[id],"GT") ) { i++; continue; }
+ vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
+ khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[id]);
+ kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
+ kh_val(d, k).info[type] |= 0xf;
+ }
}
nrm++;
hdr->nhrec--;
@@ -453,7 +468,7 @@ static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
line->qual = strtod(str, &str);
if ( str == tab->cols[col->icol] )
- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
return 0;
}
static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -470,31 +485,31 @@ static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *
char *str = tab->cols[col->icol];
if ( str[0]=='.' && str[1]==0 ) return 0;
- if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,1);
- if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,0);
+ if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1);
+ if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0);
error("Could not parse %s at %s:%d .. [%s]\n", bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
return -1;
}
static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key,NULL,NULL);
- bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,flag);
+ int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key_src,NULL,NULL);
+ bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,flag);
return 0;
}
static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi)
{
if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
if ( !map ) error("REF alleles not compatible at %s:%d\n");
// fill in any missing values in the target VCF (or all, if not present)
- int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ntmpi2 < ndst ) hts_expand(int32_t,ndst,args->mtmpi2,args->tmpi2);
int i;
@@ -511,7 +526,7 @@ static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int
args->tmpi2[i] = args->tmpi[ map[i] ];
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst);
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
return 0;
}
static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -537,17 +552,17 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi);
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
return 0;
}
static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi);
+ int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi);
if ( ntmpi < 0 ) return 0; // nothing to add
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
@@ -555,26 +570,26 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi);
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
return 0;
}
static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
{
if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
if ( !map ) error("REF alleles not compatible at %s:%d\n");
// fill in any missing values in the target VCF (or all, if not present)
- int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ntmpf2 < ndst ) hts_expand(float,ndst,args->mtmpf2,args->tmpf2);
int i;
@@ -591,7 +606,7 @@ static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int
args->tmpf2[i] = args->tmpf[ map[i] ];
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst);
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
return 0;
}
static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -617,17 +632,17 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf);
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
return 0;
}
static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf);
+ int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf);
if ( ntmpf < 0 ) return 0; // nothing to add
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
@@ -635,11 +650,11 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf);
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
return 0;
}
int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
@@ -652,9 +667,9 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in
lsrc++;
}
if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
@@ -662,7 +677,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in
// fill in any missing values in the target VCF (or all, if not present)
int i, empty = 0, nstr, mstr = args->tmpks.m;
- nstr = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmpks.s, &mstr);
+ nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr);
args->tmpks.m = mstr;
if ( nstr<0 || (nstr==1 && args->tmpks.s[0]=='.' && args->tmpks.s[1]==0) )
{
@@ -695,7 +710,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in
int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i);
assert( ret==0 );
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmpks.s);
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
return 0;
}
static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -712,17 +727,17 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2);
+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps);
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
return 0;
}
static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmps,&args->mtmps);
+ int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
if ( ntmps < 0 ) return 0; // nothing to add
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
@@ -730,11 +745,11 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2);
+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps);
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
return 0;
}
static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -752,7 +767,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo
nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 ) // field not present in dst file
{
- if ( col->replace==REPLACE_EXISTING ) return 0;
+ if ( col->replace==REPLACE_NON_MISSING ) return 0;
hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
@@ -777,7 +792,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo
if ( args->sample_map[i]==-1 ) continue;
int32_t *src = args->tmpi + nsrc*args->sample_map[i];
int32_t *dst = args->tmpi2 + ndst*i;
- if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(dst[0]) ) continue;
+ if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(dst[0]) ) continue;
if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(dst[0]) ) continue;
for (j=0; j<nsrc; j++) dst[j] = src[j];
for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
@@ -793,7 +808,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo
int32_t *dst = args->tmpi3 + nsrc*i;
int keep_ori = 0;
if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
+ else if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
else if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
if ( keep_ori )
{
@@ -811,7 +826,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo
}
static int count_vals(annot_line_t *tab, int icol_beg, int icol_end)
{
- int i, nmax = 0;
+ int i, nmax = 1;
for (i=icol_beg; i<icol_end; i++)
{
char *str = tab->cols[i], *end = str;
@@ -831,298 +846,306 @@ static int count_vals(annot_line_t *tab, int icol_beg, int icol_end)
}
return nmax;
}
-static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
-{
- annot_line_t *tab = (annot_line_t*) data;
- int nsmpl = bcf_hdr_nsamples(args->hdr_out);
- assert( col->icol+nsmpl <= tab->ncols );
- int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
- assert( nvals>0 );
- hts_expand(int32_t,nvals*nsmpl,args->mtmpi,args->tmpi);
-
- int icol = col->icol, ismpl;
- for (ismpl=0; ismpl<nsmpl; ismpl++)
- {
- int32_t *ptr = args->tmpi + ismpl*nvals;
- int ival = 0;
-
- char *str = tab->cols[icol];
- while ( *str )
- {
- if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
- {
- ptr[ival++] = bcf_int32_missing;
- str += str[1] ? 2 : 1;
- continue;
- }
-
- char *end = str;
- ptr[ival] = strtol(str, &end, 10);
- if ( end==str )
- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
-
- ival++;
- str = *end ? end+1 : end;
- }
- while ( ival<nvals ) ptr[ival++] = bcf_int32_vector_end;
- icol++;
- }
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsmpl*nvals);
-}
-static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
-{
- annot_line_t *tab = (annot_line_t*) data;
- int nsmpl = bcf_hdr_nsamples(args->hdr_out);
- assert( col->icol+nsmpl <= tab->ncols );
- int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
- assert( nvals>0 );
- hts_expand(float,nvals*nsmpl,args->mtmpf,args->tmpf);
-
- int icol = col->icol, ismpl;
- for (ismpl=0; ismpl<nsmpl; ismpl++)
- {
- float *ptr = args->tmpf + ismpl*nvals;
- int ival = 0;
-
- char *str = tab->cols[icol];
- while ( *str )
- {
- if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
- {
- bcf_float_set_missing(ptr[ival]);
- ival++;
- str += str[1] ? 2 : 1;
- continue;
- }
-
- char *end = str;
- ptr[ival] = strtod(str, &end);
- if ( end==str )
- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
-
- ival++;
- str = *end ? end+1 : end;
- }
- while ( ival<nvals ) { bcf_float_set_vector_end(ptr[ival]); ival++; }
- icol++;
- }
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsmpl*nvals);
-}
-static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
-{
- annot_line_t *tab = (annot_line_t*) data;
- int nsmpl = bcf_hdr_nsamples(args->hdr_out);
- assert( col->icol+nsmpl <= tab->ncols );
-
- int i, max_len = 0;
- for (i=col->icol; i<col->icol+nsmpl; i++)
- {
- int len = strlen(tab->cols[i]);
- if ( max_len < len ) max_len = len;
- }
- hts_expand(char,max_len*nsmpl,args->mtmps,args->tmps);
-
- int icol = col->icol, ismpl;
- for (ismpl=0; ismpl<nsmpl; ismpl++)
- {
- char *ptr = args->tmps + ismpl*max_len;
- char *str = tab->cols[icol];
- i = 0;
- while ( str[i] )
- {
- ptr[i] = str[i];
- i++;
- }
- while ( i<max_len ) ptr[i++] = 0;
- icol++;
- }
- return bcf_update_format_char(args->hdr_out,line,col->hdr_key,args->tmps,nsmpl*max_len);
-}
-static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, int32_t *vals, int nvals)
{
- bcf1_t *rec = (bcf1_t*) data;
- int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi);
- if ( nsrc==-3 ) return 0; // the tag is not present
- if ( nsrc<=0 ) return 1; // error
-
if ( !args->sample_map )
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsrc);
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot);
- int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key,&args->tmpi2,&args->mtmpi2);
+ int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key_dst,&args->tmpi2,&args->mtmpi2);
if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
- nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 )
{
- if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present
- hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
+ if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- int32_t *dst = args->tmpi2 + nsrc*i;
+ int32_t *dst = args->tmpi2 + nvals*i;
if ( args->sample_map[i]==-1 )
{
dst[0] = bcf_int32_missing;
- for (j=1; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ for (j=1; j<nvals; j++) dst[j] = bcf_int32_vector_end;
}
else
{
- int32_t *src = args->tmpi + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ int32_t *src = vals + nvals*args->sample_map[i];
+ for (j=0; j<nvals; j++) dst[j] = src[j];
}
}
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nvals*bcf_hdr_nsamples(args->hdr_out));
}
- else if ( ndst >= nsrc )
+ else if ( ndst >= nvals )
{
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
- int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ int32_t *src = vals + nvals*args->sample_map[i];
int32_t *dst = args->tmpi2 + ndst*i;
- if ( col->replace==REPLACE_EXISTING && dst[0]==bcf_int32_missing ) continue;
- if ( col->replace==REPLACE_MISSING && dst[0]!=bcf_int32_missing ) continue;
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ // possible cases:
+ // in annot out
+ // x y x TAG,-TAG,=TAG .. REPLACE_ALL, REPLACE_NON_MISSING, SET_OR_APPEND
+ // x y y +TAG .. REPLACE_MISSING
+ // . y . =TAG .. SET_OR_APPEND
+ // . y y TAG,+TAG,-TAG .. REPLACE_ALL, REPLACE_MISSING, REPLACE_NON_MISSING
+ // x . x TAG,+TAG .. REPLACE_ALL, REPLACE_MISSING
+ // x . . -TAG .. REPLACE_NON_MISSING
+ if ( col->replace==REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; }
+ else if ( col->replace==REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; }
+ for (j=0; j<nvals; j++) dst[j] = src[j];
for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
}
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
}
- else // ndst < nsrc
+ else // ndst < nvals
{
- hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
+ hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- int32_t *ori = args->tmpi2 + ndst*i;
- int32_t *dst = args->tmpi3 + nsrc*i;
- int keep_ori = 0;
- if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_EXISTING && ori[0]==bcf_int32_missing ) keep_ori = 1;
- else if ( col->replace==REPLACE_MISSING && ori[0]!=bcf_int32_missing ) keep_ori = 1;
- if ( keep_ori )
+ int32_t *ann = vals + nvals*args->sample_map[i];
+ int32_t *ori = args->tmpi2 + ndst*i; // ori vcf line
+ int32_t *dst = args->tmpi3 + nvals*i; // expanded buffer
+ int use_new_ann = 1;
+ if ( args->sample_map[i]==-1 ) use_new_ann = 0;
+ else if ( col->replace==REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; }
+ if ( !use_new_ann )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
- for (; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ for (; j<nvals; j++) dst[j] = bcf_int32_vector_end;
}
else
- {
- int32_t *src = args->tmpi + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
- }
+ for (j=0; j<nvals; j++) dst[j] = ann[j];
}
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi3,nvals*bcf_hdr_nsamples(args->hdr_out));
}
}
-static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+static int core_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, float *vals, int nvals)
{
- bcf1_t *rec = (bcf1_t*) data;
- int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf);
- if ( nsrc==-3 ) return 0; // the tag is not present
- if ( nsrc<=0 ) return 1; // error
-
if ( !args->sample_map )
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsrc);
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot);
- int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key,&args->tmpf2,&args->mtmpf2);
+ int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key_dst,&args->tmpf2,&args->mtmpf2);
if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
- nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 )
{
- if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present
- hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
+ if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- float *dst = args->tmpf2 + nsrc*i;
+ float *dst = args->tmpf2 + nvals*i;
if ( args->sample_map[i]==-1 )
{
bcf_float_set_missing(dst[0]);
- for (j=1; j<nsrc; j++) bcf_float_set_vector_end(dst[j]);
+ for (j=1; j<nvals; j++) bcf_float_set_vector_end(dst[j]);
}
else
{
- float *src = args->tmpf + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ float *src = vals + nvals*args->sample_map[i];
+ for (j=0; j<nvals; j++) dst[j] = src[j];
}
}
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nvals*bcf_hdr_nsamples(args->hdr_out));
}
- else if ( ndst >= nsrc )
+ else if ( ndst >= nvals )
{
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
- float *src = args->tmpf + nsrc*args->sample_map[i];
+ float *src = vals + nvals*args->sample_map[i];
float *dst = args->tmpf2 + ndst*i;
- if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(dst[0]) ) continue;
- if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(dst[0]) ) continue;
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; }
+ else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; }
+ for (j=0; j<nvals; j++) dst[j] = src[j];
for (; j<ndst; j++) bcf_float_set_vector_end(dst[j]);
}
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out));
}
- else // ndst < nsrc
+ else // ndst < nvals
{
- hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3);
+ hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- float *ori = args->tmpf2 + ndst*i;
- float *dst = args->tmpf3 + nsrc*i;
- int keep_ori = 0;
- if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(ori[0]) ) keep_ori = 1;
- else if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(ori[0]) ) keep_ori = 1;
- if ( keep_ori )
+ float *ann = vals + nvals*args->sample_map[i];
+ float *ori = args->tmpf2 + ndst*i; // ori vcf line
+ float *dst = args->tmpf3 + nvals*i; // expanded buffer
+ int use_new_ann = 1;
+ if ( args->sample_map[i]==-1 ) use_new_ann = 0;
+ else if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; }
+ if ( !use_new_ann )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
- for (; j<nsrc; j++) bcf_float_set_vector_end(dst[j]);
+ for (; j<nvals; j++) bcf_float_set_vector_end(dst[j]);
}
else
- {
- float *src = args->tmpf + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
- }
+ for (j=0; j<nvals; j++) dst[j] = ann[j];
}
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf3,nvals*bcf_hdr_nsamples(args->hdr_out));
}
}
-static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+static int core_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, char **vals)
{
- bcf1_t *rec = (bcf1_t*) data;
- args->tmpp[0] = args->tmps;
- int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmpp,&args->mtmps);
- args->tmps = args->tmpp[0]; // tmps might be realloced
- if ( ret==-3 ) return 0; // the tag is not present
- if ( ret<=0 ) return 1; // error
-
if ( !args->sample_map )
- return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp,bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)vals,args->nsmpl_annot);
int i;
args->tmpp2[0] = args->tmps2;
- ret = bcf_get_format_string(args->hdr,line,col->hdr_key,&args->tmpp2,&args->mtmps2);
+ int ret = bcf_get_format_string(args->hdr,line,col->hdr_key_dst,&args->tmpp2,&args->mtmps2);
args->tmps2 = args->tmpp2[0]; // tmps2 might be realloced
+ int nsmpl = bcf_hdr_nsamples(args->hdr_out);
if ( ret<=0 ) // not present in dst
{
hts_expand(char,bcf_hdr_nsamples(args->hdr_out)*2,args->mtmps2,args->tmps2);
- for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ char *tmp = args->tmps2;
+ for (i=0; i<nsmpl; i++)
{
- args->tmps2[2*i] = '.';
- args->tmps2[2*i+1] = 0;
- args->tmpp2[i] = args->tmps2+2*i;
+ tmp[0] = '.';
+ tmp[1] = 0;
+ args->tmpp2[i] = tmp;
+ tmp += 2;
}
}
+ for (i=0; i<nsmpl; i++)
+ {
+ if ( args->sample_map[i]==-1 ) continue;
+ char **src = vals + args->sample_map[i];
+ char **dst = args->tmpp2 + i;
+
+ if ( col->replace==REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; }
+ else if ( col->replace==REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; }
+ *dst = *src;
+ }
+ return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)args->tmpp2,nsmpl);
+}
+static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( col->icol+args->nsmpl_annot > tab->ncols )
+ error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+ int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot);
+ hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi);
+
+ int icol = col->icol, ismpl;
+ for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
+ {
+ int32_t *ptr = args->tmpi + ismpl*nvals;
+ int ival = 0;
+
+ char *str = tab->cols[icol];
+ while ( *str )
+ {
+ if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
+ {
+ ptr[ival++] = bcf_int32_missing;
+ str += str[1] ? 2 : 1;
+ continue;
+ }
+
+ char *end = str;
+ ptr[ival] = strtol(str, &end, 10);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+
+ ival++;
+ str = *end ? end+1 : end;
+ }
+ while ( ival<nvals ) ptr[ival++] = bcf_int32_vector_end;
+ icol++;
+ }
+ return core_setter_format_int(args,line,col,args->tmpi,nvals);
+}
+static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( col->icol+args->nsmpl_annot > tab->ncols )
+ error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+ int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot);
+ hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf);
- for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ int icol = col->icol, ismpl;
+ for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
{
- int isrc = args->sample_map[i];
- if ( isrc==-1 ) continue;
- args->tmpp2[i] = args->tmpp[isrc];
+ float *ptr = args->tmpf + ismpl*nvals;
+ int ival = 0;
+
+ char *str = tab->cols[icol];
+ while ( *str )
+ {
+ if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
+ {
+ bcf_float_set_missing(ptr[ival]);
+ ival++;
+ str += str[1] ? 2 : 1;
+ continue;
+ }
+
+ char *end = str;
+ ptr[ival] = strtod(str, &end);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+
+ ival++;
+ str = *end ? end+1 : end;
+ }
+ while ( ival<nvals ) { bcf_float_set_vector_end(ptr[ival]); ival++; }
+ icol++;
}
- return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp2,bcf_hdr_nsamples(args->hdr_out));
+ return core_setter_format_real(args,line,col,args->tmpf,nvals);
}
-static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_samples)
+static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( col->icol+args->nsmpl_annot > tab->ncols )
+ error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+
+ int ismpl;
+ for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
+ args->tmpp[ismpl] = tab->cols[col->icol + ismpl];
+
+ return core_setter_format_str(args,line,col,args->tmpp);
+}
+static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+ return core_setter_format_int(args,line,col,args->tmpi,nsrc/bcf_hdr_nsamples(args->files->readers[1].header));
+}
+static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+ return core_setter_format_real(args,line,col,args->tmpf,nsrc/bcf_hdr_nsamples(args->files->readers[1].header));
+}
+
+static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ args->tmpp[0] = args->tmps;
+ int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpp,&args->mtmps);
+ args->tmps = args->tmpp[0]; // tmps might be realloced
+ if ( ret==-3 ) return 0; // the tag is not present
+ if ( ret<=0 ) return 1; // error
+ return core_setter_format_str(args,line,col,args->tmpp);
+}
+static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst)
{
int i;
if ( !args->sample_names )
{
+ args->nsmpl_annot = bcf_hdr_nsamples(dst);
+
+ // tab annotation file, expecting that all samples are present: sample map not needed
+ if ( !src ) return 0;
+
int nmatch = 0, order_ok = 1;
for (i=0; i<bcf_hdr_nsamples(src); i++)
{
@@ -1133,11 +1156,8 @@ static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_s
if ( i!=id ) order_ok = 0;
}
}
- if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok && !need_samples )
- return; // the same samples in both files
-
- if ( !nmatch ) error("No matching samples found in the source and the destination file\n");
- if ( nmatch!=bcf_hdr_nsamples(src) || nmatch!=bcf_hdr_nsamples(dst) ) fprintf(stderr,"%d sample(s) in common\n", nmatch);
+ if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok ) return 0; // not needed
+ if ( !nmatch ) return -1; // No matching samples found in the source and the destination file
args->nsample_map = bcf_hdr_nsamples(dst);
args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
@@ -1146,46 +1166,70 @@ static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_s
int id = bcf_hdr_id2int(src, BCF_DT_SAMPLE, dst->samples[i]);
args->sample_map[i] = id; // idst -> isrc, -1 if not present
}
- return;
+ return 1;
}
args->nsample_map = bcf_hdr_nsamples(dst);
args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
for (i=0; i<args->nsample_map; i++) args->sample_map[i] = -1;
- int nsamples = 0;
- char **samples = hts_readlist(args->sample_names, args->sample_is_file, &nsamples);
- for (i=0; i<nsamples; i++)
+ // possible todo: could do with smpl_ilist only
+ smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, SMPL_STRICT);
+ if ( !ilist || !ilist->n ) error("Could not parse: %s\n", args->sample_names);
+ char **samples = (char**) malloc(sizeof(char*)*ilist->n);
+ for (i=0; i<ilist->n; i++) samples[i] = strdup(dst->samples[i]);
+ args->nsmpl_annot = ilist->n;
+ smpl_ilist_destroy(ilist);
+ int need_sample_map = args->nsmpl_annot==bcf_hdr_nsamples(dst) ? 0 : 1;
+ if ( !src )
{
- int isrc, idst;
- char *ss = samples[i], *se = samples[i];
- while ( *se && !isspace(*se) ) se++;
- if ( !*se )
+ // tab annotation file
+ for (i=0; i<args->nsmpl_annot; i++)
+ {
+ int idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, samples[i]);
+ if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", samples[i]);
+ args->sample_map[idst] = i;
+ if ( idst!=i ) need_sample_map = 1;
+ }
+ }
+ else
+ {
+ // vcf annotation file
+ for (i=0; i<args->nsmpl_annot; i++)
{
- // only one sample name
+ int isrc, idst;
+ char *ss = samples[i], *se = samples[i];
+ while ( *se && !isspace(*se) ) se++;
+ if ( !*se )
+ {
+ // only one sample name
+ isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
+ if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
+ idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
+ if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
+ args->sample_map[idst] = isrc;
+ if ( idst!=isrc ) need_sample_map = 1;
+ continue;
+ }
+ *se = 0;
isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
+
+ ss = se+1;
+ while ( isspace(*ss) ) ss++;
+ se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
+
args->sample_map[idst] = isrc;
- continue;
+ if ( idst!=isrc ) need_sample_map = 1;
}
- *se = 0;
- isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
- if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
-
- ss = se+1;
- while ( isspace(*ss) ) ss++;
- se = ss;
- while ( *se && !isspace(*se) ) se++;
-
- idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
-
- args->sample_map[idst] = isrc;
}
- for (i=0; i<nsamples; i++) free(samples[i]);
+ for (i=0; i<args->nsmpl_annot; i++) free(samples[i]);
free(samples);
+ return need_sample_map;
}
static char *columns_complement(char *columns, void **skip_info, void **skip_fmt)
{
@@ -1247,8 +1291,27 @@ static char *columns_complement(char *columns, void **skip_info, void **skip_fmt
free(columns);
return str.s;
}
+static void bcf_hrec_format_rename(bcf_hrec_t *hrec, char *tag, kstring_t *str)
+{
+ int j, nout = 0;
+ ksprintf(str, "##%s=<", hrec->key);
+ for (j=0; j<hrec->nkeys; j++)
+ {
+ if ( !strcmp("IDX",hrec->keys[j]) ) continue;
+ if ( nout ) kputc(',',str);
+ if ( !strcmp("ID", hrec->keys[j]) )
+ ksprintf(str,"%s=%s", hrec->keys[j], tag);
+ else
+ ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]);
+ nout++;
+ }
+ ksprintf(str,">\n");
+}
static void init_columns(args_t *args)
{
+ int need_sample_map = 0;
+ int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr);
+
void *skip_fmt = NULL, *skip_info = NULL;
if ( args->tgts_is_vcf )
args->columns = columns_complement(args->columns, &skip_info, &skip_fmt);
@@ -1256,13 +1319,13 @@ static void init_columns(args_t *args)
kstring_t str = {0,0,0}, tmp = {0,0,0};
char *ss = args->columns, *se = ss;
args->ncols = 0;
- int icol = -1, has_fmt_str = 0, force_samples = -1;
+ int icol = -1, has_fmt_str = 0;
while ( *ss )
{
if ( *se && *se!=',' ) { se++; continue; }
int replace = REPLACE_ALL;
if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; }
- else if ( *ss=='-' ) { replace = REPLACE_EXISTING; ss++; }
+ else if ( *ss=='-' ) { replace = REPLACE_NON_MISSING; ss++; }
else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; }
icol++;
str.l = 0;
@@ -1276,23 +1339,25 @@ static void init_columns(args_t *args)
else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = icol;
else if ( !strcasecmp("ID",str.s) )
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
}
else if ( !strcasecmp("FILTER",str.s) )
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
if ( args->tgts_is_vcf )
{
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
@@ -1312,18 +1377,19 @@ static void init_columns(args_t *args)
}
else if ( !strcasecmp("QUAL",str.s) )
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
}
else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
int j;
@@ -1343,7 +1409,8 @@ static void init_columns(args_t *args)
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = -1;
col->replace = replace;
- col->hdr_key = strdup(hrec->vals[k]);
+ col->hdr_key_src = strdup(hrec->vals[k]);
+ col->hdr_key_dst = strdup(hrec->vals[k]);
col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
@@ -1358,8 +1425,7 @@ static void init_columns(args_t *args)
else if ( args->tgts_is_vcf && (!strcasecmp("FORMAT",str.s) || !strcasecmp("FMT",str.s)) ) // All FORMAT fields
{
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
- if ( force_samples<0 ) force_samples = replace;
- if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ need_sample_map = 1;
int j;
for (j=0; j<tgts_hdr->nhrec; j++)
{
@@ -1377,8 +1443,9 @@ static void init_columns(args_t *args)
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = -1;
col->replace = replace;
- col->hdr_key = strdup(hrec->vals[k]);
- if ( !strcasecmp("GT",col->hdr_key) ) col->setter = vcf_setter_format_gt;
+ col->hdr_key_src = strdup(hrec->vals[k]);
+ col->hdr_key_dst = strdup(hrec->vals[k]);
+ if ( !strcasecmp("GT",col->hdr_key_src) ) col->setter = vcf_setter_format_gt;
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
@@ -1391,18 +1458,27 @@ static void init_columns(args_t *args)
}
else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) )
{
- char *key = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
- if ( force_samples<0 ) force_samples = replace;
- if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ char *key_dst = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
+ char *key_src = strstr(key_dst,":=");
+ if ( key_src )
+ {
+ *key_src = 0;
+ key_src += 2;
+ if ( !strncasecmp("FORMAT/",key_src,7) ) key_src += 7;
+ else if ( !strncasecmp("FMT/",key_src,4) ) key_src += 4;
+ }
+ else
+ key_src = key_dst;
+ need_sample_map = 1;
if ( args->tgts_is_vcf )
{
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL);
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL);
tmp.l = 0;
- bcf_hrec_format(hrec, &tmp);
+ bcf_hrec_format_rename(hrec, key_dst, &tmp);
bcf_hdr_append(args->hdr_out, tmp.s);
bcf_hdr_sync(args->hdr_out);
}
- int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key);
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
@@ -1410,13 +1486,14 @@ static void init_columns(args_t *args)
if ( !args->tgts_is_vcf )
{
col->icol = icol;
- icol += bcf_hdr_nsamples(args->hdr_out) - 1;
+ icol += args->nsmpl_annot - 1;
}
else
col->icol = -1;
col->replace = replace;
- col->hdr_key = strdup(key);
- if ( !strcasecmp("GT",key) ) col->setter = vcf_setter_format_gt;
+ col->hdr_key_src = strdup(key_src);
+ col->hdr_key_dst = strdup(key_dst);
+ if ( !strcasecmp("GT",key_src) ) col->setter = vcf_setter_format_gt;
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
@@ -1428,24 +1505,33 @@ static void init_columns(args_t *args)
}
else
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
- if ( !strncasecmp("INFO/",str.s,5) ) { memmove(str.s,str.s+5,str.l-4); }
- int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s);
+ char *key_dst = !strncasecmp("INFO/",str.s,5) ? str.s + 5 : str.s;
+ char *key_src = strstr(key_dst,":=");
+ if ( key_src )
+ {
+ *key_src = 0;
+ key_src += 2;
+ if ( !strncasecmp("INFO/",key_src,5) ) key_src += 5;
+ }
+ else
+ key_src = key_dst;
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line
{
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", str.s, NULL);
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
if ( !hrec ) error("The tag \"%s\" is not defined in %s\n", str.s,args->files->readers[1].fname);
tmp.l = 0;
- bcf_hrec_format(hrec, &tmp);
+ bcf_hrec_format_rename(hrec, key_dst, &tmp);
bcf_hdr_append(args->hdr_out, tmp.s);
bcf_hdr_sync(args->hdr_out);
- hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s);
+ hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
}
else
- error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
+ error("The tag \"%s\" is not defined in %s\n", key_src, args->targets_fname);
assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
}
@@ -1453,7 +1539,8 @@ static void init_columns(args_t *args)
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(key_src);
+ col->hdr_key_dst = strdup(key_dst);
col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
@@ -1480,8 +1567,13 @@ static void init_columns(args_t *args)
args->tmpp = (char**)malloc(sizeof(char*)*n);
args->tmpp2 = (char**)malloc(sizeof(char*)*n);
}
- if ( force_samples>=0 && args->tgts_is_vcf )
- set_samples(args, args->files->readers[1].header, args->hdr, force_samples==REPLACE_ALL ? 0 : 1);
+ if ( !need_sample_map )
+ {
+ free(args->sample_map);
+ args->sample_map = NULL;
+ }
+ else if ( sample_map_ok<0 )
+ error("No matching samples in source and destination file?\n");
}
static void rename_chrs(args_t *args, char *fname)
@@ -1552,7 +1644,6 @@ static void init_data(args_t *args)
if ( args->mark_sites )
{
if ( !args->targets_fname ) error("The -a option not given\n");
- if ( args->tgts_is_vcf ) error("Apologies, this has not been implemented yet: -a is a VCF\n"); // very easy to add..
bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
}
@@ -1564,7 +1655,8 @@ static void init_data(args_t *args)
args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
- if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ if ( args->n_threads )
+ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
bcf_hdr_write(args->out_fh, args->hdr_out);
}
}
@@ -1577,7 +1669,10 @@ static void destroy_data(args_t *args)
if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
if (args->vcmp) vcmp_destroy(args->vcmp);
for (i=0; i<args->ncols; i++)
- free(args->cols[i].hdr_key);
+ {
+ free(args->cols[i].hdr_key_src);
+ free(args->cols[i].hdr_key_dst);
+ }
free(args->cols);
for (i=0; i<args->malines; i++)
{
@@ -1718,7 +1813,7 @@ static void annotate(args_t *args, bcf1_t *line)
// there is a matching line
for (j=0; j<args->ncols; j++)
if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) )
- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
}
@@ -1731,12 +1826,20 @@ static void annotate(args_t *args, bcf1_t *line)
bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,i<args->nalines?0:1);
}
}
- else if ( args->files->nreaders == 2 && bcf_sr_has_line(args->files,1) )
+ else if ( args->files->nreaders == 2 )
{
- bcf1_t *aline = bcf_sr_get_line(args->files,1);
- for (j=0; j<args->ncols; j++)
- if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ if ( bcf_sr_has_line(args->files,1) )
+ {
+ bcf1_t *aline = bcf_sr_get_line(args->files,1);
+ for (j=0; j<args->ncols; j++)
+ if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
+ error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+
+ if ( args->mark_sites )
+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,args->mark_sites_logic==MARK_LISTED ? 1 : 0);
+ }
+ else if ( args->mark_sites )
+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL, args->mark_sites_logic==MARK_UNLISTED ? 1 : 0);
}
if ( args->set_ids )
{
@@ -1761,6 +1864,7 @@ static void usage(args_t *args)
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -a, --annotations <file> VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n");
+ fprintf(stderr, " --collapse <string> matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
fprintf(stderr, " -c, --columns <list> list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
@@ -1793,7 +1897,7 @@ int main_vcfannotate(int argc, char *argv[])
args->record_cmd_line = 1;
args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1;
args->set_ids_replace = 1;
- int regions_is_file = 0;
+ int regions_is_file = 0, collapse = 0;
static struct option loptions[] =
{
@@ -1803,6 +1907,7 @@ int main_vcfannotate(int argc, char *argv[])
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
{"annotations",required_argument,NULL,'a'},
+ {"collapse",required_argument,NULL,2},
{"include",required_argument,NULL,'i'},
{"exclude",required_argument,NULL,'e'},
{"regions",required_argument,NULL,'r'},
@@ -1847,6 +1952,16 @@ int main_vcfannotate(int argc, char *argv[])
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
case 'h': args->header_fname = optarg; break;
case 1 : args->rename_chrs = optarg; break;
+ case 2 :
+ if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME;
+ else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE;
+ else error("The --collapse string \"%s\" not recognised.\n", optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case '?': usage(args); break;
@@ -1877,9 +1992,10 @@ int main_vcfannotate(int argc, char *argv[])
{
args->tgts_is_vcf = 1;
args->files->require_index = 1;
- args->files->collapse |= COLLAPSE_SOME;
+ args->files->collapse = collapse ? collapse : COLLAPSE_SOME;
}
}
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
init_data(args);
diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c
index ea8398c..09f76c2 100644
--- a/bcftools/vcfannotate.c.pysam.c
+++ b/bcftools/vcfannotate.c.pysam.c
@@ -25,6 +25,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
@@ -43,6 +44,7 @@ THE SOFTWARE. */
#include "vcmp.h"
#include "filter.h"
#include "convert.h"
+#include "smpl_ilist.h"
struct _args_t;
@@ -67,12 +69,12 @@ annot_line_t;
#define REPLACE_MISSING 0 // replace only missing values
#define REPLACE_ALL 1 // replace both missing and existing values
-#define REPLACE_EXISTING 2 // replace only if tgt is not missing
+#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing
#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise
typedef struct _annot_col_t
{
int icol, replace, number; // number: one of BCF_VL_* types
- char *hdr_key;
+ char *hdr_key_src, *hdr_key_dst;
int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*);
}
annot_col_t;
@@ -111,6 +113,7 @@ typedef struct _args_t
convert_t *set_ids;
int set_ids_replace;
+ int nsmpl_annot;
int *sample_map, nsample_map, sample_is_file; // map[idst] -> isrc
int mtmpi, mtmpf, mtmps;
int mtmpi2, mtmpf2, mtmps2;
@@ -157,6 +160,7 @@ void remove_info(args_t *args, bcf1_t *line, rm_tag_t *tag)
}
line->d.shared_dirty |= BCF1_DIRTY_INF;
inf->vptr = NULL;
+ inf->vptr_off = inf->vptr_len = 0;
}
}
void remove_info_tag(args_t *args, bcf1_t *line, rm_tag_t *tag)
@@ -189,6 +193,10 @@ void remove_format(args_t *args, bcf1_t *line, rm_tag_t *tag)
}
}
+#include "htslib/khash.h"
+KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
+typedef khash_t(vdict) vdict_t;
+
static void remove_hdr_lines(bcf_hdr_t *hdr, int type)
{
int i = 0, nrm = 0;
@@ -196,11 +204,18 @@ static void remove_hdr_lines(bcf_hdr_t *hdr, int type)
{
if ( hdr->hrec[i]->type!=type ) { i++; continue; }
bcf_hrec_t *hrec = hdr->hrec[i];
- if ( type==BCF_HL_FMT )
+ if ( type==BCF_HL_FMT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
{
// everything except FORMAT/GT
int id = bcf_hrec_find_key(hrec, "ID");
- if ( id>=0 && !strcmp(hrec->vals[id],"GT") ) { i++; continue; }
+ if ( id>=0 )
+ {
+ if ( type==BCF_HL_FMT && !strcmp(hrec->vals[id],"GT") ) { i++; continue; }
+ vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
+ khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[id]);
+ kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
+ kh_val(d, k).info[type] |= 0xf;
+ }
}
nrm++;
hdr->nhrec--;
@@ -455,7 +470,7 @@ static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
line->qual = strtod(str, &str);
if ( str == tab->cols[col->icol] )
- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
return 0;
}
static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -472,31 +487,31 @@ static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *
char *str = tab->cols[col->icol];
if ( str[0]=='.' && str[1]==0 ) return 0;
- if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,1);
- if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,0);
+ if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1);
+ if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0);
error("Could not parse %s at %s:%d .. [%s]\n", bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
return -1;
}
static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key,NULL,NULL);
- bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,flag);
+ int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key_src,NULL,NULL);
+ bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,flag);
return 0;
}
static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi)
{
if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
if ( !map ) error("REF alleles not compatible at %s:%d\n");
// fill in any missing values in the target VCF (or all, if not present)
- int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ntmpi2 < ndst ) hts_expand(int32_t,ndst,args->mtmpi2,args->tmpi2);
int i;
@@ -513,7 +528,7 @@ static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int
args->tmpi2[i] = args->tmpi[ map[i] ];
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst);
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
return 0;
}
static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -539,17 +554,17 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi);
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
return 0;
}
static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi);
+ int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi);
if ( ntmpi < 0 ) return 0; // nothing to add
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
@@ -557,26 +572,26 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi);
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
return 0;
}
static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
{
if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
if ( !map ) error("REF alleles not compatible at %s:%d\n");
// fill in any missing values in the target VCF (or all, if not present)
- int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ntmpf2 < ndst ) hts_expand(float,ndst,args->mtmpf2,args->tmpf2);
int i;
@@ -593,7 +608,7 @@ static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int
args->tmpf2[i] = args->tmpf[ map[i] ];
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst);
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
return 0;
}
static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -619,17 +634,17 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf);
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
return 0;
}
static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf);
+ int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf);
if ( ntmpf < 0 ) return 0; // nothing to add
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
@@ -637,11 +652,11 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf);
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
return 0;
}
int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
@@ -654,9 +669,9 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in
lsrc++;
}
if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
@@ -664,7 +679,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in
// fill in any missing values in the target VCF (or all, if not present)
int i, empty = 0, nstr, mstr = args->tmpks.m;
- nstr = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmpks.s, &mstr);
+ nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr);
args->tmpks.m = mstr;
if ( nstr<0 || (nstr==1 && args->tmpks.s[0]=='.' && args->tmpks.s[1]==0) )
{
@@ -697,7 +712,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in
int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i);
assert( ret==0 );
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmpks.s);
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
return 0;
}
static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -714,17 +729,17 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2);
+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps);
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
return 0;
}
static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmps,&args->mtmps);
+ int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
if ( ntmps < 0 ) return 0; // nothing to add
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
@@ -732,11 +747,11 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2);
+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps);
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
return 0;
}
static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -754,7 +769,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo
nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 ) // field not present in dst file
{
- if ( col->replace==REPLACE_EXISTING ) return 0;
+ if ( col->replace==REPLACE_NON_MISSING ) return 0;
hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
@@ -779,7 +794,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo
if ( args->sample_map[i]==-1 ) continue;
int32_t *src = args->tmpi + nsrc*args->sample_map[i];
int32_t *dst = args->tmpi2 + ndst*i;
- if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(dst[0]) ) continue;
+ if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(dst[0]) ) continue;
if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(dst[0]) ) continue;
for (j=0; j<nsrc; j++) dst[j] = src[j];
for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
@@ -795,7 +810,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo
int32_t *dst = args->tmpi3 + nsrc*i;
int keep_ori = 0;
if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
+ else if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
else if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
if ( keep_ori )
{
@@ -813,7 +828,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo
}
static int count_vals(annot_line_t *tab, int icol_beg, int icol_end)
{
- int i, nmax = 0;
+ int i, nmax = 1;
for (i=icol_beg; i<icol_end; i++)
{
char *str = tab->cols[i], *end = str;
@@ -833,298 +848,306 @@ static int count_vals(annot_line_t *tab, int icol_beg, int icol_end)
}
return nmax;
}
-static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
-{
- annot_line_t *tab = (annot_line_t*) data;
- int nsmpl = bcf_hdr_nsamples(args->hdr_out);
- assert( col->icol+nsmpl <= tab->ncols );
- int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
- assert( nvals>0 );
- hts_expand(int32_t,nvals*nsmpl,args->mtmpi,args->tmpi);
-
- int icol = col->icol, ismpl;
- for (ismpl=0; ismpl<nsmpl; ismpl++)
- {
- int32_t *ptr = args->tmpi + ismpl*nvals;
- int ival = 0;
-
- char *str = tab->cols[icol];
- while ( *str )
- {
- if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
- {
- ptr[ival++] = bcf_int32_missing;
- str += str[1] ? 2 : 1;
- continue;
- }
-
- char *end = str;
- ptr[ival] = strtol(str, &end, 10);
- if ( end==str )
- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
-
- ival++;
- str = *end ? end+1 : end;
- }
- while ( ival<nvals ) ptr[ival++] = bcf_int32_vector_end;
- icol++;
- }
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsmpl*nvals);
-}
-static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
-{
- annot_line_t *tab = (annot_line_t*) data;
- int nsmpl = bcf_hdr_nsamples(args->hdr_out);
- assert( col->icol+nsmpl <= tab->ncols );
- int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
- assert( nvals>0 );
- hts_expand(float,nvals*nsmpl,args->mtmpf,args->tmpf);
-
- int icol = col->icol, ismpl;
- for (ismpl=0; ismpl<nsmpl; ismpl++)
- {
- float *ptr = args->tmpf + ismpl*nvals;
- int ival = 0;
-
- char *str = tab->cols[icol];
- while ( *str )
- {
- if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
- {
- bcf_float_set_missing(ptr[ival]);
- ival++;
- str += str[1] ? 2 : 1;
- continue;
- }
-
- char *end = str;
- ptr[ival] = strtod(str, &end);
- if ( end==str )
- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
-
- ival++;
- str = *end ? end+1 : end;
- }
- while ( ival<nvals ) { bcf_float_set_vector_end(ptr[ival]); ival++; }
- icol++;
- }
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsmpl*nvals);
-}
-static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
-{
- annot_line_t *tab = (annot_line_t*) data;
- int nsmpl = bcf_hdr_nsamples(args->hdr_out);
- assert( col->icol+nsmpl <= tab->ncols );
-
- int i, max_len = 0;
- for (i=col->icol; i<col->icol+nsmpl; i++)
- {
- int len = strlen(tab->cols[i]);
- if ( max_len < len ) max_len = len;
- }
- hts_expand(char,max_len*nsmpl,args->mtmps,args->tmps);
-
- int icol = col->icol, ismpl;
- for (ismpl=0; ismpl<nsmpl; ismpl++)
- {
- char *ptr = args->tmps + ismpl*max_len;
- char *str = tab->cols[icol];
- i = 0;
- while ( str[i] )
- {
- ptr[i] = str[i];
- i++;
- }
- while ( i<max_len ) ptr[i++] = 0;
- icol++;
- }
- return bcf_update_format_char(args->hdr_out,line,col->hdr_key,args->tmps,nsmpl*max_len);
-}
-static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, int32_t *vals, int nvals)
{
- bcf1_t *rec = (bcf1_t*) data;
- int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi);
- if ( nsrc==-3 ) return 0; // the tag is not present
- if ( nsrc<=0 ) return 1; // error
-
if ( !args->sample_map )
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsrc);
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot);
- int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key,&args->tmpi2,&args->mtmpi2);
+ int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key_dst,&args->tmpi2,&args->mtmpi2);
if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
- nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 )
{
- if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present
- hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
+ if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- int32_t *dst = args->tmpi2 + nsrc*i;
+ int32_t *dst = args->tmpi2 + nvals*i;
if ( args->sample_map[i]==-1 )
{
dst[0] = bcf_int32_missing;
- for (j=1; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ for (j=1; j<nvals; j++) dst[j] = bcf_int32_vector_end;
}
else
{
- int32_t *src = args->tmpi + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ int32_t *src = vals + nvals*args->sample_map[i];
+ for (j=0; j<nvals; j++) dst[j] = src[j];
}
}
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nvals*bcf_hdr_nsamples(args->hdr_out));
}
- else if ( ndst >= nsrc )
+ else if ( ndst >= nvals )
{
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
- int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ int32_t *src = vals + nvals*args->sample_map[i];
int32_t *dst = args->tmpi2 + ndst*i;
- if ( col->replace==REPLACE_EXISTING && dst[0]==bcf_int32_missing ) continue;
- if ( col->replace==REPLACE_MISSING && dst[0]!=bcf_int32_missing ) continue;
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ // possible cases:
+ // in annot out
+ // x y x TAG,-TAG,=TAG .. REPLACE_ALL, REPLACE_NON_MISSING, SET_OR_APPEND
+ // x y y +TAG .. REPLACE_MISSING
+ // . y . =TAG .. SET_OR_APPEND
+ // . y y TAG,+TAG,-TAG .. REPLACE_ALL, REPLACE_MISSING, REPLACE_NON_MISSING
+ // x . x TAG,+TAG .. REPLACE_ALL, REPLACE_MISSING
+ // x . . -TAG .. REPLACE_NON_MISSING
+ if ( col->replace==REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; }
+ else if ( col->replace==REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; }
+ for (j=0; j<nvals; j++) dst[j] = src[j];
for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
}
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
}
- else // ndst < nsrc
+ else // ndst < nvals
{
- hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
+ hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- int32_t *ori = args->tmpi2 + ndst*i;
- int32_t *dst = args->tmpi3 + nsrc*i;
- int keep_ori = 0;
- if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_EXISTING && ori[0]==bcf_int32_missing ) keep_ori = 1;
- else if ( col->replace==REPLACE_MISSING && ori[0]!=bcf_int32_missing ) keep_ori = 1;
- if ( keep_ori )
+ int32_t *ann = vals + nvals*args->sample_map[i];
+ int32_t *ori = args->tmpi2 + ndst*i; // ori vcf line
+ int32_t *dst = args->tmpi3 + nvals*i; // expanded buffer
+ int use_new_ann = 1;
+ if ( args->sample_map[i]==-1 ) use_new_ann = 0;
+ else if ( col->replace==REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; }
+ if ( !use_new_ann )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
- for (; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ for (; j<nvals; j++) dst[j] = bcf_int32_vector_end;
}
else
- {
- int32_t *src = args->tmpi + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
- }
+ for (j=0; j<nvals; j++) dst[j] = ann[j];
}
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi3,nvals*bcf_hdr_nsamples(args->hdr_out));
}
}
-static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+static int core_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, float *vals, int nvals)
{
- bcf1_t *rec = (bcf1_t*) data;
- int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf);
- if ( nsrc==-3 ) return 0; // the tag is not present
- if ( nsrc<=0 ) return 1; // error
-
if ( !args->sample_map )
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsrc);
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot);
- int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key,&args->tmpf2,&args->mtmpf2);
+ int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key_dst,&args->tmpf2,&args->mtmpf2);
if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
- nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 )
{
- if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present
- hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
+ if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- float *dst = args->tmpf2 + nsrc*i;
+ float *dst = args->tmpf2 + nvals*i;
if ( args->sample_map[i]==-1 )
{
bcf_float_set_missing(dst[0]);
- for (j=1; j<nsrc; j++) bcf_float_set_vector_end(dst[j]);
+ for (j=1; j<nvals; j++) bcf_float_set_vector_end(dst[j]);
}
else
{
- float *src = args->tmpf + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ float *src = vals + nvals*args->sample_map[i];
+ for (j=0; j<nvals; j++) dst[j] = src[j];
}
}
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nvals*bcf_hdr_nsamples(args->hdr_out));
}
- else if ( ndst >= nsrc )
+ else if ( ndst >= nvals )
{
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
- float *src = args->tmpf + nsrc*args->sample_map[i];
+ float *src = vals + nvals*args->sample_map[i];
float *dst = args->tmpf2 + ndst*i;
- if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(dst[0]) ) continue;
- if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(dst[0]) ) continue;
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; }
+ else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; }
+ for (j=0; j<nvals; j++) dst[j] = src[j];
for (; j<ndst; j++) bcf_float_set_vector_end(dst[j]);
}
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out));
}
- else // ndst < nsrc
+ else // ndst < nvals
{
- hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3);
+ hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- float *ori = args->tmpf2 + ndst*i;
- float *dst = args->tmpf3 + nsrc*i;
- int keep_ori = 0;
- if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(ori[0]) ) keep_ori = 1;
- else if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(ori[0]) ) keep_ori = 1;
- if ( keep_ori )
+ float *ann = vals + nvals*args->sample_map[i];
+ float *ori = args->tmpf2 + ndst*i; // ori vcf line
+ float *dst = args->tmpf3 + nvals*i; // expanded buffer
+ int use_new_ann = 1;
+ if ( args->sample_map[i]==-1 ) use_new_ann = 0;
+ else if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; }
+ if ( !use_new_ann )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
- for (; j<nsrc; j++) bcf_float_set_vector_end(dst[j]);
+ for (; j<nvals; j++) bcf_float_set_vector_end(dst[j]);
}
else
- {
- float *src = args->tmpf + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
- }
+ for (j=0; j<nvals; j++) dst[j] = ann[j];
}
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf3,nvals*bcf_hdr_nsamples(args->hdr_out));
}
}
-static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+static int core_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, char **vals)
{
- bcf1_t *rec = (bcf1_t*) data;
- args->tmpp[0] = args->tmps;
- int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmpp,&args->mtmps);
- args->tmps = args->tmpp[0]; // tmps might be realloced
- if ( ret==-3 ) return 0; // the tag is not present
- if ( ret<=0 ) return 1; // error
-
if ( !args->sample_map )
- return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp,bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)vals,args->nsmpl_annot);
int i;
args->tmpp2[0] = args->tmps2;
- ret = bcf_get_format_string(args->hdr,line,col->hdr_key,&args->tmpp2,&args->mtmps2);
+ int ret = bcf_get_format_string(args->hdr,line,col->hdr_key_dst,&args->tmpp2,&args->mtmps2);
args->tmps2 = args->tmpp2[0]; // tmps2 might be realloced
+ int nsmpl = bcf_hdr_nsamples(args->hdr_out);
if ( ret<=0 ) // not present in dst
{
hts_expand(char,bcf_hdr_nsamples(args->hdr_out)*2,args->mtmps2,args->tmps2);
- for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ char *tmp = args->tmps2;
+ for (i=0; i<nsmpl; i++)
{
- args->tmps2[2*i] = '.';
- args->tmps2[2*i+1] = 0;
- args->tmpp2[i] = args->tmps2+2*i;
+ tmp[0] = '.';
+ tmp[1] = 0;
+ args->tmpp2[i] = tmp;
+ tmp += 2;
}
}
+ for (i=0; i<nsmpl; i++)
+ {
+ if ( args->sample_map[i]==-1 ) continue;
+ char **src = vals + args->sample_map[i];
+ char **dst = args->tmpp2 + i;
+
+ if ( col->replace==REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; }
+ else if ( col->replace==REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; }
+ *dst = *src;
+ }
+ return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)args->tmpp2,nsmpl);
+}
+static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( col->icol+args->nsmpl_annot > tab->ncols )
+ error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+ int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot);
+ hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi);
+
+ int icol = col->icol, ismpl;
+ for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
+ {
+ int32_t *ptr = args->tmpi + ismpl*nvals;
+ int ival = 0;
+
+ char *str = tab->cols[icol];
+ while ( *str )
+ {
+ if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
+ {
+ ptr[ival++] = bcf_int32_missing;
+ str += str[1] ? 2 : 1;
+ continue;
+ }
+
+ char *end = str;
+ ptr[ival] = strtol(str, &end, 10);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+
+ ival++;
+ str = *end ? end+1 : end;
+ }
+ while ( ival<nvals ) ptr[ival++] = bcf_int32_vector_end;
+ icol++;
+ }
+ return core_setter_format_int(args,line,col,args->tmpi,nvals);
+}
+static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( col->icol+args->nsmpl_annot > tab->ncols )
+ error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+ int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot);
+ hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf);
- for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ int icol = col->icol, ismpl;
+ for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
{
- int isrc = args->sample_map[i];
- if ( isrc==-1 ) continue;
- args->tmpp2[i] = args->tmpp[isrc];
+ float *ptr = args->tmpf + ismpl*nvals;
+ int ival = 0;
+
+ char *str = tab->cols[icol];
+ while ( *str )
+ {
+ if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
+ {
+ bcf_float_set_missing(ptr[ival]);
+ ival++;
+ str += str[1] ? 2 : 1;
+ continue;
+ }
+
+ char *end = str;
+ ptr[ival] = strtod(str, &end);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+
+ ival++;
+ str = *end ? end+1 : end;
+ }
+ while ( ival<nvals ) { bcf_float_set_vector_end(ptr[ival]); ival++; }
+ icol++;
}
- return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp2,bcf_hdr_nsamples(args->hdr_out));
+ return core_setter_format_real(args,line,col,args->tmpf,nvals);
}
-static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_samples)
+static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( col->icol+args->nsmpl_annot > tab->ncols )
+ error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+
+ int ismpl;
+ for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
+ args->tmpp[ismpl] = tab->cols[col->icol + ismpl];
+
+ return core_setter_format_str(args,line,col,args->tmpp);
+}
+static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+ return core_setter_format_int(args,line,col,args->tmpi,nsrc/bcf_hdr_nsamples(args->files->readers[1].header));
+}
+static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+ return core_setter_format_real(args,line,col,args->tmpf,nsrc/bcf_hdr_nsamples(args->files->readers[1].header));
+}
+
+static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ args->tmpp[0] = args->tmps;
+ int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpp,&args->mtmps);
+ args->tmps = args->tmpp[0]; // tmps might be realloced
+ if ( ret==-3 ) return 0; // the tag is not present
+ if ( ret<=0 ) return 1; // error
+ return core_setter_format_str(args,line,col,args->tmpp);
+}
+static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst)
{
int i;
if ( !args->sample_names )
{
+ args->nsmpl_annot = bcf_hdr_nsamples(dst);
+
+ // tab annotation file, expecting that all samples are present: sample map not needed
+ if ( !src ) return 0;
+
int nmatch = 0, order_ok = 1;
for (i=0; i<bcf_hdr_nsamples(src); i++)
{
@@ -1135,11 +1158,8 @@ static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_s
if ( i!=id ) order_ok = 0;
}
}
- if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok && !need_samples )
- return; // the same samples in both files
-
- if ( !nmatch ) error("No matching samples found in the source and the destination file\n");
- if ( nmatch!=bcf_hdr_nsamples(src) || nmatch!=bcf_hdr_nsamples(dst) ) fprintf(pysam_stderr,"%d sample(s) in common\n", nmatch);
+ if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok ) return 0; // not needed
+ if ( !nmatch ) return -1; // No matching samples found in the source and the destination file
args->nsample_map = bcf_hdr_nsamples(dst);
args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
@@ -1148,46 +1168,70 @@ static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_s
int id = bcf_hdr_id2int(src, BCF_DT_SAMPLE, dst->samples[i]);
args->sample_map[i] = id; // idst -> isrc, -1 if not present
}
- return;
+ return 1;
}
args->nsample_map = bcf_hdr_nsamples(dst);
args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
for (i=0; i<args->nsample_map; i++) args->sample_map[i] = -1;
- int nsamples = 0;
- char **samples = hts_readlist(args->sample_names, args->sample_is_file, &nsamples);
- for (i=0; i<nsamples; i++)
+ // possible todo: could do with smpl_ilist only
+ smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, SMPL_STRICT);
+ if ( !ilist || !ilist->n ) error("Could not parse: %s\n", args->sample_names);
+ char **samples = (char**) malloc(sizeof(char*)*ilist->n);
+ for (i=0; i<ilist->n; i++) samples[i] = strdup(dst->samples[i]);
+ args->nsmpl_annot = ilist->n;
+ smpl_ilist_destroy(ilist);
+ int need_sample_map = args->nsmpl_annot==bcf_hdr_nsamples(dst) ? 0 : 1;
+ if ( !src )
{
- int isrc, idst;
- char *ss = samples[i], *se = samples[i];
- while ( *se && !isspace(*se) ) se++;
- if ( !*se )
+ // tab annotation file
+ for (i=0; i<args->nsmpl_annot; i++)
+ {
+ int idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, samples[i]);
+ if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", samples[i]);
+ args->sample_map[idst] = i;
+ if ( idst!=i ) need_sample_map = 1;
+ }
+ }
+ else
+ {
+ // vcf annotation file
+ for (i=0; i<args->nsmpl_annot; i++)
{
- // only one sample name
+ int isrc, idst;
+ char *ss = samples[i], *se = samples[i];
+ while ( *se && !isspace(*se) ) se++;
+ if ( !*se )
+ {
+ // only one sample name
+ isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
+ if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
+ idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
+ if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
+ args->sample_map[idst] = isrc;
+ if ( idst!=isrc ) need_sample_map = 1;
+ continue;
+ }
+ *se = 0;
isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
+
+ ss = se+1;
+ while ( isspace(*ss) ) ss++;
+ se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
+
args->sample_map[idst] = isrc;
- continue;
+ if ( idst!=isrc ) need_sample_map = 1;
}
- *se = 0;
- isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
- if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
-
- ss = se+1;
- while ( isspace(*ss) ) ss++;
- se = ss;
- while ( *se && !isspace(*se) ) se++;
-
- idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
-
- args->sample_map[idst] = isrc;
}
- for (i=0; i<nsamples; i++) free(samples[i]);
+ for (i=0; i<args->nsmpl_annot; i++) free(samples[i]);
free(samples);
+ return need_sample_map;
}
static char *columns_complement(char *columns, void **skip_info, void **skip_fmt)
{
@@ -1249,8 +1293,27 @@ static char *columns_complement(char *columns, void **skip_info, void **skip_fmt
free(columns);
return str.s;
}
+static void bcf_hrec_format_rename(bcf_hrec_t *hrec, char *tag, kstring_t *str)
+{
+ int j, nout = 0;
+ ksprintf(str, "##%s=<", hrec->key);
+ for (j=0; j<hrec->nkeys; j++)
+ {
+ if ( !strcmp("IDX",hrec->keys[j]) ) continue;
+ if ( nout ) kputc(',',str);
+ if ( !strcmp("ID", hrec->keys[j]) )
+ ksprintf(str,"%s=%s", hrec->keys[j], tag);
+ else
+ ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]);
+ nout++;
+ }
+ ksprintf(str,">\n");
+}
static void init_columns(args_t *args)
{
+ int need_sample_map = 0;
+ int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr);
+
void *skip_fmt = NULL, *skip_info = NULL;
if ( args->tgts_is_vcf )
args->columns = columns_complement(args->columns, &skip_info, &skip_fmt);
@@ -1258,13 +1321,13 @@ static void init_columns(args_t *args)
kstring_t str = {0,0,0}, tmp = {0,0,0};
char *ss = args->columns, *se = ss;
args->ncols = 0;
- int icol = -1, has_fmt_str = 0, force_samples = -1;
+ int icol = -1, has_fmt_str = 0;
while ( *ss )
{
if ( *se && *se!=',' ) { se++; continue; }
int replace = REPLACE_ALL;
if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; }
- else if ( *ss=='-' ) { replace = REPLACE_EXISTING; ss++; }
+ else if ( *ss=='-' ) { replace = REPLACE_NON_MISSING; ss++; }
else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; }
icol++;
str.l = 0;
@@ -1278,23 +1341,25 @@ static void init_columns(args_t *args)
else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = icol;
else if ( !strcasecmp("ID",str.s) )
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
}
else if ( !strcasecmp("FILTER",str.s) )
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
if ( args->tgts_is_vcf )
{
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
@@ -1314,18 +1379,19 @@ static void init_columns(args_t *args)
}
else if ( !strcasecmp("QUAL",str.s) )
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
}
else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
int j;
@@ -1345,7 +1411,8 @@ static void init_columns(args_t *args)
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = -1;
col->replace = replace;
- col->hdr_key = strdup(hrec->vals[k]);
+ col->hdr_key_src = strdup(hrec->vals[k]);
+ col->hdr_key_dst = strdup(hrec->vals[k]);
col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
@@ -1360,8 +1427,7 @@ static void init_columns(args_t *args)
else if ( args->tgts_is_vcf && (!strcasecmp("FORMAT",str.s) || !strcasecmp("FMT",str.s)) ) // All FORMAT fields
{
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
- if ( force_samples<0 ) force_samples = replace;
- if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ need_sample_map = 1;
int j;
for (j=0; j<tgts_hdr->nhrec; j++)
{
@@ -1379,8 +1445,9 @@ static void init_columns(args_t *args)
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = -1;
col->replace = replace;
- col->hdr_key = strdup(hrec->vals[k]);
- if ( !strcasecmp("GT",col->hdr_key) ) col->setter = vcf_setter_format_gt;
+ col->hdr_key_src = strdup(hrec->vals[k]);
+ col->hdr_key_dst = strdup(hrec->vals[k]);
+ if ( !strcasecmp("GT",col->hdr_key_src) ) col->setter = vcf_setter_format_gt;
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
@@ -1393,18 +1460,27 @@ static void init_columns(args_t *args)
}
else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) )
{
- char *key = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
- if ( force_samples<0 ) force_samples = replace;
- if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ char *key_dst = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
+ char *key_src = strstr(key_dst,":=");
+ if ( key_src )
+ {
+ *key_src = 0;
+ key_src += 2;
+ if ( !strncasecmp("FORMAT/",key_src,7) ) key_src += 7;
+ else if ( !strncasecmp("FMT/",key_src,4) ) key_src += 4;
+ }
+ else
+ key_src = key_dst;
+ need_sample_map = 1;
if ( args->tgts_is_vcf )
{
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL);
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL);
tmp.l = 0;
- bcf_hrec_format(hrec, &tmp);
+ bcf_hrec_format_rename(hrec, key_dst, &tmp);
bcf_hdr_append(args->hdr_out, tmp.s);
bcf_hdr_sync(args->hdr_out);
}
- int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key);
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
@@ -1412,13 +1488,14 @@ static void init_columns(args_t *args)
if ( !args->tgts_is_vcf )
{
col->icol = icol;
- icol += bcf_hdr_nsamples(args->hdr_out) - 1;
+ icol += args->nsmpl_annot - 1;
}
else
col->icol = -1;
col->replace = replace;
- col->hdr_key = strdup(key);
- if ( !strcasecmp("GT",key) ) col->setter = vcf_setter_format_gt;
+ col->hdr_key_src = strdup(key_src);
+ col->hdr_key_dst = strdup(key_dst);
+ if ( !strcasecmp("GT",key_src) ) col->setter = vcf_setter_format_gt;
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
@@ -1430,24 +1507,33 @@ static void init_columns(args_t *args)
}
else
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
- if ( !strncasecmp("INFO/",str.s,5) ) { memmove(str.s,str.s+5,str.l-4); }
- int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s);
+ char *key_dst = !strncasecmp("INFO/",str.s,5) ? str.s + 5 : str.s;
+ char *key_src = strstr(key_dst,":=");
+ if ( key_src )
+ {
+ *key_src = 0;
+ key_src += 2;
+ if ( !strncasecmp("INFO/",key_src,5) ) key_src += 5;
+ }
+ else
+ key_src = key_dst;
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line
{
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", str.s, NULL);
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
if ( !hrec ) error("The tag \"%s\" is not defined in %s\n", str.s,args->files->readers[1].fname);
tmp.l = 0;
- bcf_hrec_format(hrec, &tmp);
+ bcf_hrec_format_rename(hrec, key_dst, &tmp);
bcf_hdr_append(args->hdr_out, tmp.s);
bcf_hdr_sync(args->hdr_out);
- hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s);
+ hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
}
else
- error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
+ error("The tag \"%s\" is not defined in %s\n", key_src, args->targets_fname);
assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
}
@@ -1455,7 +1541,8 @@ static void init_columns(args_t *args)
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(key_src);
+ col->hdr_key_dst = strdup(key_dst);
col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
@@ -1482,8 +1569,13 @@ static void init_columns(args_t *args)
args->tmpp = (char**)malloc(sizeof(char*)*n);
args->tmpp2 = (char**)malloc(sizeof(char*)*n);
}
- if ( force_samples>=0 && args->tgts_is_vcf )
- set_samples(args, args->files->readers[1].header, args->hdr, force_samples==REPLACE_ALL ? 0 : 1);
+ if ( !need_sample_map )
+ {
+ free(args->sample_map);
+ args->sample_map = NULL;
+ }
+ else if ( sample_map_ok<0 )
+ error("No matching samples in source and destination file?\n");
}
static void rename_chrs(args_t *args, char *fname)
@@ -1554,7 +1646,6 @@ static void init_data(args_t *args)
if ( args->mark_sites )
{
if ( !args->targets_fname ) error("The -a option not given\n");
- if ( args->tgts_is_vcf ) error("Apologies, this has not been implemented yet: -a is a VCF\n"); // very easy to add..
bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
}
@@ -1566,7 +1657,8 @@ static void init_data(args_t *args)
args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
- if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ if ( args->n_threads )
+ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
bcf_hdr_write(args->out_fh, args->hdr_out);
}
}
@@ -1579,7 +1671,10 @@ static void destroy_data(args_t *args)
if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
if (args->vcmp) vcmp_destroy(args->vcmp);
for (i=0; i<args->ncols; i++)
- free(args->cols[i].hdr_key);
+ {
+ free(args->cols[i].hdr_key_src);
+ free(args->cols[i].hdr_key_dst);
+ }
free(args->cols);
for (i=0; i<args->malines; i++)
{
@@ -1720,7 +1815,7 @@ static void annotate(args_t *args, bcf1_t *line)
// there is a matching line
for (j=0; j<args->ncols; j++)
if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) )
- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
}
@@ -1733,12 +1828,20 @@ static void annotate(args_t *args, bcf1_t *line)
bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,i<args->nalines?0:1);
}
}
- else if ( args->files->nreaders == 2 && bcf_sr_has_line(args->files,1) )
+ else if ( args->files->nreaders == 2 )
{
- bcf1_t *aline = bcf_sr_get_line(args->files,1);
- for (j=0; j<args->ncols; j++)
- if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ if ( bcf_sr_has_line(args->files,1) )
+ {
+ bcf1_t *aline = bcf_sr_get_line(args->files,1);
+ for (j=0; j<args->ncols; j++)
+ if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
+ error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+
+ if ( args->mark_sites )
+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,args->mark_sites_logic==MARK_LISTED ? 1 : 0);
+ }
+ else if ( args->mark_sites )
+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL, args->mark_sites_logic==MARK_UNLISTED ? 1 : 0);
}
if ( args->set_ids )
{
@@ -1763,6 +1866,7 @@ static void usage(args_t *args)
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "Options:\n");
fprintf(pysam_stderr, " -a, --annotations <file> VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n");
+ fprintf(pysam_stderr, " --collapse <string> matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
fprintf(pysam_stderr, " -c, --columns <list> list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
fprintf(pysam_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
fprintf(pysam_stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
@@ -1795,7 +1899,7 @@ int main_vcfannotate(int argc, char *argv[])
args->record_cmd_line = 1;
args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1;
args->set_ids_replace = 1;
- int regions_is_file = 0;
+ int regions_is_file = 0, collapse = 0;
static struct option loptions[] =
{
@@ -1805,6 +1909,7 @@ int main_vcfannotate(int argc, char *argv[])
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
{"annotations",required_argument,NULL,'a'},
+ {"collapse",required_argument,NULL,2},
{"include",required_argument,NULL,'i'},
{"exclude",required_argument,NULL,'e'},
{"regions",required_argument,NULL,'r'},
@@ -1849,6 +1954,16 @@ int main_vcfannotate(int argc, char *argv[])
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
case 'h': args->header_fname = optarg; break;
case 1 : args->rename_chrs = optarg; break;
+ case 2 :
+ if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME;
+ else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE;
+ else error("The --collapse string \"%s\" not recognised.\n", optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case '?': usage(args); break;
@@ -1879,9 +1994,10 @@ int main_vcfannotate(int argc, char *argv[])
{
args->tgts_is_vcf = 1;
args->files->require_index = 1;
- args->files->collapse |= COLLAPSE_SOME;
+ args->files->collapse = collapse ? collapse : COLLAPSE_SOME;
}
}
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
init_data(args);
diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c
index e5bbf11..00771f7 100644
--- a/bcftools/vcfcall.c
+++ b/bcftools/vcfcall.c
@@ -24,6 +24,7 @@ THE SOFTWARE. */
#include <stdarg.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
@@ -146,7 +147,7 @@ static ploidy_predef_t ploidy_predefs[] =
"* * * F 2\n"
},
{ .alias = "GRCh38",
- .about = "Human Genome reference assembly GRCh38 / hg38, plain chromosome naming (1,2,3,..)",
+ .about = "Human Genome reference assembly GRCh38 / hg38",
.ploidy =
"X 1 9999 M 1\n"
"X 2781480 155701381 M 1\n"
@@ -275,7 +276,7 @@ static void set_samples(args_t *args, const char *fn, int is_file)
args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting
args->sample2sex = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
- int dflt_sex_id = ploidy_add_sex(args->ploidy, "F");
+ int dflt_sex_id = ploidy_nsex(args->ploidy) - 1;
for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) args->sample2sex[i] = dflt_sex_id;
int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
@@ -294,6 +295,7 @@ static void set_samples(args_t *args, const char *fn, int is_file)
int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss);
if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
+ if ( old2new[ismpl] != -1 ) { fprintf(stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; }
ss = se+1;
while ( *ss && isspace(*ss) ) ss++;
@@ -411,18 +413,24 @@ static void init_data(args_t *args)
{
args->nsamples = bcf_hdr_nsamples(args->aux.hdr);
args->sample2sex = (int*) malloc(sizeof(int)*args->nsamples);
- for (i=0; i<args->nsamples; i++) args->sample2sex[i] = 0;
+ for (i=0; i<args->nsamples; i++) args->sample2sex[i] = args->nsex - 1;
}
}
if ( args->nsamples )
{
args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
- for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = 2;
- for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = 2;
+ for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = ploidy_max(args->ploidy);
+ for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = ploidy_max(args->ploidy);
+ for (i=0; i<args->nsamples; i++)
+ if ( args->sample2sex[i] >= args->nsex ) args->sample2sex[i] = args->nsex - 1;
}
- if ( args->gvcf )
+ if ( args->gvcf )
+ {
+ int id = bcf_hdr_id2int(args->aux.hdr,BCF_DT_ID,"DP");
+ if ( id<0 || !bcf_hdr_idinfo_exists(args->aux.hdr,BCF_HL_FMT,id) ) error("--gvcf output mode requires FORMAT/DP tag, which is not present in the input header\n");
gvcf_update_header(args->gvcf, args->aux.hdr);
+ }
if ( args->samples_map )
{
@@ -554,7 +562,6 @@ static void set_ploidy(args_t *args, bcf1_t *rec)
else
args->aux.ploidy[i] = args->sex2ploidy[args->sample2sex[i]];
}
-
int *tmp = args->sex2ploidy; args->sex2ploidy = args->sex2ploidy_prev; args->sex2ploidy_prev = tmp;
}
@@ -569,7 +576,10 @@ ploidy_t *init_ploidy(char *alias)
if ( !pld->alias )
{
- fprintf(stderr,"Predefined ploidies:\n");
+ fprintf(stderr,"\nPRE-DEFINED PLOIDY FILES\n\n");
+ fprintf(stderr," * Columns are: CHROM,FROM,TO,SEX,PLOIDY\n");
+ fprintf(stderr," * Coordinates are 1-based inclusive.\n");
+ fprintf(stderr," * A '*' means any value not otherwise defined.\n\n");
pld = ploidy_predefs;
while ( pld->alias )
{
@@ -618,6 +628,7 @@ static void usage(args_t *args)
fprintf(stderr, "Input/output options:\n");
fprintf(stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n");
fprintf(stderr, " -f, --format-fields <list> output format fields: GQ,GP (lowercase allowed) []\n");
+ fprintf(stderr, " -F, --prior-freqs <AN,AC> use prior allele frequencies\n");
fprintf(stderr, " -g, --gvcf <int>,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n");
fprintf(stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n");
fprintf(stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n");
@@ -630,7 +641,7 @@ static void usage(args_t *args)
fprintf(stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
fprintf(stderr, " -n, --novel-rate <float>,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
fprintf(stderr, " -p, --pval-threshold <float> variant if P(ref|D)<FLOAT with -c [0.5]\n");
- fprintf(stderr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity) [1.1e-3]\n");
+ fprintf(stderr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
// todo (and more)
// fprintf(stderr, "\nContrast calling and association test options:\n");
@@ -667,6 +678,7 @@ int main_vcfcall(int argc, char *argv[])
{
{"help",no_argument,NULL,'h'},
{"format-fields",required_argument,NULL,'f'},
+ {"prior-freqs",required_argument,NULL,'F'},
{"gvcf",required_argument,NULL,'g'},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
@@ -698,7 +710,7 @@ int main_vcfcall(int argc, char *argv[])
};
char *tmp = NULL;
- while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XY", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:", loptions, NULL)) >= 0)
{
switch (c)
{
@@ -713,6 +725,13 @@ int main_vcfcall(int argc, char *argv[])
case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method
case 'i': args.flag |= CF_INS_MISSED; break;
case 'v': args.aux.flag |= CALL_VARONLY; break;
+ case 'F':
+ args.aux.prior_AN = optarg;
+ args.aux.prior_AC = strchr(optarg,',');
+ if ( !args.aux.prior_AC ) error("Expected two tags with -F (e.g. AN,AC), got \"%s\"\n",optarg);
+ *args.aux.prior_AC = 0;
+ args.aux.prior_AC++;
+ break;
case 'g':
args.gvcf = gvcf_init(optarg);
if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
@@ -770,8 +789,8 @@ int main_vcfcall(int argc, char *argv[])
if ( !ploidy_fname && !ploidy )
{
- fprintf(stderr,"Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid\n");
- args.ploidy = ploidy_init_string("",2);
+ if ( !args.samples_is_file ) fprintf(stderr,"Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid\n");
+ args.ploidy = ploidy_init_string("* * * 0 0\n* * * 1 1\n* * * 2 2\n",2);
}
if ( !args.ploidy ) error("Could not initialize ploidy\n");
@@ -833,6 +852,7 @@ int main_vcfcall(int argc, char *argv[])
else
ret = ccall(&args.aux, bcf_rec);
if ( ret==-1 ) error("Something is wrong\n");
+ else if ( ret==-2 ) continue; // skip the site
// Normal output
if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue; // not a variant
diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c
index 8e59fd9..8e6721b 100644
--- a/bcftools/vcfcall.c.pysam.c
+++ b/bcftools/vcfcall.c.pysam.c
@@ -26,6 +26,7 @@ THE SOFTWARE. */
#include <stdarg.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
@@ -148,7 +149,7 @@ static ploidy_predef_t ploidy_predefs[] =
"* * * F 2\n"
},
{ .alias = "GRCh38",
- .about = "Human Genome reference assembly GRCh38 / hg38, plain chromosome naming (1,2,3,..)",
+ .about = "Human Genome reference assembly GRCh38 / hg38",
.ploidy =
"X 1 9999 M 1\n"
"X 2781480 155701381 M 1\n"
@@ -277,7 +278,7 @@ static void set_samples(args_t *args, const char *fn, int is_file)
args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting
args->sample2sex = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
- int dflt_sex_id = ploidy_add_sex(args->ploidy, "F");
+ int dflt_sex_id = ploidy_nsex(args->ploidy) - 1;
for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) args->sample2sex[i] = dflt_sex_id;
int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
@@ -296,6 +297,7 @@ static void set_samples(args_t *args, const char *fn, int is_file)
int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss);
if ( ismpl < 0 ) { fprintf(pysam_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
+ if ( old2new[ismpl] != -1 ) { fprintf(pysam_stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; }
ss = se+1;
while ( *ss && isspace(*ss) ) ss++;
@@ -413,18 +415,24 @@ static void init_data(args_t *args)
{
args->nsamples = bcf_hdr_nsamples(args->aux.hdr);
args->sample2sex = (int*) malloc(sizeof(int)*args->nsamples);
- for (i=0; i<args->nsamples; i++) args->sample2sex[i] = 0;
+ for (i=0; i<args->nsamples; i++) args->sample2sex[i] = args->nsex - 1;
}
}
if ( args->nsamples )
{
args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
- for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = 2;
- for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = 2;
+ for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = ploidy_max(args->ploidy);
+ for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = ploidy_max(args->ploidy);
+ for (i=0; i<args->nsamples; i++)
+ if ( args->sample2sex[i] >= args->nsex ) args->sample2sex[i] = args->nsex - 1;
}
- if ( args->gvcf )
+ if ( args->gvcf )
+ {
+ int id = bcf_hdr_id2int(args->aux.hdr,BCF_DT_ID,"DP");
+ if ( id<0 || !bcf_hdr_idinfo_exists(args->aux.hdr,BCF_HL_FMT,id) ) error("--gvcf output mode requires FORMAT/DP tag, which is not present in the input header\n");
gvcf_update_header(args->gvcf, args->aux.hdr);
+ }
if ( args->samples_map )
{
@@ -556,7 +564,6 @@ static void set_ploidy(args_t *args, bcf1_t *rec)
else
args->aux.ploidy[i] = args->sex2ploidy[args->sample2sex[i]];
}
-
int *tmp = args->sex2ploidy; args->sex2ploidy = args->sex2ploidy_prev; args->sex2ploidy_prev = tmp;
}
@@ -571,7 +578,10 @@ ploidy_t *init_ploidy(char *alias)
if ( !pld->alias )
{
- fprintf(pysam_stderr,"Predefined ploidies:\n");
+ fprintf(pysam_stderr,"\nPRE-DEFINED PLOIDY FILES\n\n");
+ fprintf(pysam_stderr," * Columns are: CHROM,FROM,TO,SEX,PLOIDY\n");
+ fprintf(pysam_stderr," * Coordinates are 1-based inclusive.\n");
+ fprintf(pysam_stderr," * A '*' means any value not otherwise defined.\n\n");
pld = ploidy_predefs;
while ( pld->alias )
{
@@ -620,6 +630,7 @@ static void usage(args_t *args)
fprintf(pysam_stderr, "Input/output options:\n");
fprintf(pysam_stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n");
fprintf(pysam_stderr, " -f, --format-fields <list> output format fields: GQ,GP (lowercase allowed) []\n");
+ fprintf(pysam_stderr, " -F, --prior-freqs <AN,AC> use prior allele frequencies\n");
fprintf(pysam_stderr, " -g, --gvcf <int>,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n");
fprintf(pysam_stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n");
fprintf(pysam_stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n");
@@ -632,7 +643,7 @@ static void usage(args_t *args)
fprintf(pysam_stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
fprintf(pysam_stderr, " -n, --novel-rate <float>,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
fprintf(pysam_stderr, " -p, --pval-threshold <float> variant if P(ref|D)<FLOAT with -c [0.5]\n");
- fprintf(pysam_stderr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity) [1.1e-3]\n");
+ fprintf(pysam_stderr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
// todo (and more)
// fprintf(pysam_stderr, "\nContrast calling and association test options:\n");
@@ -669,6 +680,7 @@ int main_vcfcall(int argc, char *argv[])
{
{"help",no_argument,NULL,'h'},
{"format-fields",required_argument,NULL,'f'},
+ {"prior-freqs",required_argument,NULL,'F'},
{"gvcf",required_argument,NULL,'g'},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
@@ -700,7 +712,7 @@ int main_vcfcall(int argc, char *argv[])
};
char *tmp = NULL;
- while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XY", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:", loptions, NULL)) >= 0)
{
switch (c)
{
@@ -715,6 +727,13 @@ int main_vcfcall(int argc, char *argv[])
case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method
case 'i': args.flag |= CF_INS_MISSED; break;
case 'v': args.aux.flag |= CALL_VARONLY; break;
+ case 'F':
+ args.aux.prior_AN = optarg;
+ args.aux.prior_AC = strchr(optarg,',');
+ if ( !args.aux.prior_AC ) error("Expected two tags with -F (e.g. AN,AC), got \"%s\"\n",optarg);
+ *args.aux.prior_AC = 0;
+ args.aux.prior_AC++;
+ break;
case 'g':
args.gvcf = gvcf_init(optarg);
if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
@@ -772,8 +791,8 @@ int main_vcfcall(int argc, char *argv[])
if ( !ploidy_fname && !ploidy )
{
- fprintf(pysam_stderr,"Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid\n");
- args.ploidy = ploidy_init_string("",2);
+ if ( !args.samples_is_file ) fprintf(pysam_stderr,"Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid\n");
+ args.ploidy = ploidy_init_string("* * * 0 0\n* * * 1 1\n* * * 2 2\n",2);
}
if ( !args.ploidy ) error("Could not initialize ploidy\n");
@@ -835,6 +854,7 @@ int main_vcfcall(int argc, char *argv[])
else
ret = ccall(&args.aux, bcf_rec);
if ( ret==-1 ) error("Something is wrong\n");
+ else if ( ret==-2 ) continue; // skip the site
// Normal output
if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue; // not a variant
diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c
index e4b9372..ffe71c4 100644
--- a/bcftools/vcfcnv.c
+++ b/bcftools/vcfcnv.c
@@ -266,17 +266,15 @@ static void init_data(args_t *args)
hmm_init_states(args->hmm, args->iprobs);
args->summary_fh = stdout;
- if ( args->output_dir )
+ init_sample_files(&args->query_sample, args->output_dir);
+ if ( args->control_sample.name )
{
- init_sample_files(&args->query_sample, args->output_dir);
- if ( args->control_sample.name )
- {
- init_sample_files(&args->control_sample, args->output_dir);
- args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir);
- }
- else
- args->summary_fh = NULL; // one sample only, no two-file summary
+ init_sample_files(&args->control_sample, args->output_dir);
+ args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir);
}
+ else
+ args->summary_fh = NULL; // one sample only, no two-file summary
+
int i;
FILE *fh = args->summary_fh ? args->summary_fh : args->query_sample.summary_fh;
@@ -295,6 +293,19 @@ static void init_data(args_t *args)
"# RG, Regions\t[2]Chromosome\t[3]Start\t[4]End\t[5]Copy number:%s\t[6]Quality\t[7]nSites\t[8]nHETs\n",
args->query_sample.name
);
+ if ( args->optimize_frac )
+ {
+ fprintf(args->query_sample.summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t[5]Cell fraction\t[6]BAF deviation\n");
+ if ( args->control_sample.name )
+ {
+ fprintf(args->control_sample.summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t[5]Cell fraction\t[6]BAF deviation\n");
+ fprintf(args->summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t"
+ "[5]Cell fraction:%s\t[6]Cell fraction:%s\t[7]BAF deviation:%s\t[8]BAF deviation:%s\n",
+ args->query_sample.name,args->control_sample.name,
+ args->query_sample.name,args->control_sample.name
+ );
+ }
+ }
}
char *msprintf(const char *fmt, ...);
@@ -556,6 +567,7 @@ static void destroy_data(args_t *args)
free(args->sites);
free(args->eprob);
free(args->tprob);
+ free(args->iprobs);
free(args->summary_fname);
free(args->nonref_afs);
free(args->query_sample.baf);
@@ -960,6 +972,20 @@ static void cnv_flush_viterbi(args_t *args)
if ( args->control_sample.name )
fprintf(stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2);
fprintf(stderr,"\n");
+
+ fprintf(args->query_sample.summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%f\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1,
+ args->query_sample.cell_frac,sqrt(args->query_sample.baf_dev2));
+ if ( args->control_sample.name )
+ {
+ fprintf(args->control_sample.summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%f\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1,
+ args->control_sample.cell_frac,sqrt(args->control_sample.baf_dev2));
+ fprintf(args->summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%.2f\t%f\t%f\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1,
+ args->query_sample.cell_frac, args->control_sample.cell_frac,
+ sqrt(args->query_sample.baf_dev2), sqrt(args->control_sample.baf_dev2));
+ }
}
set_emission_probs(args);
@@ -1351,7 +1377,7 @@ int main_vcfcnv(int argc, char *argv[])
else fname = argv[optind];
if ( !fname ) usage(args);
- if ( args->plot_th<=100 && !args->output_dir ) error("Expected -o option with -p\n");
+ if ( !args->output_dir ) error("Expected -o option\n");
if ( args->regions_list )
{
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c
index 10a00b9..1075ef1 100644
--- a/bcftools/vcfcnv.c.pysam.c
+++ b/bcftools/vcfcnv.c.pysam.c
@@ -268,17 +268,15 @@ static void init_data(args_t *args)
hmm_init_states(args->hmm, args->iprobs);
args->summary_fh = pysam_stdout;
- if ( args->output_dir )
+ init_sample_files(&args->query_sample, args->output_dir);
+ if ( args->control_sample.name )
{
- init_sample_files(&args->query_sample, args->output_dir);
- if ( args->control_sample.name )
- {
- init_sample_files(&args->control_sample, args->output_dir);
- args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir);
- }
- else
- args->summary_fh = NULL; // one sample only, no two-file summary
+ init_sample_files(&args->control_sample, args->output_dir);
+ args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir);
}
+ else
+ args->summary_fh = NULL; // one sample only, no two-file summary
+
int i;
FILE *fh = args->summary_fh ? args->summary_fh : args->query_sample.summary_fh;
@@ -297,6 +295,19 @@ static void init_data(args_t *args)
"# RG, Regions\t[2]Chromosome\t[3]Start\t[4]End\t[5]Copy number:%s\t[6]Quality\t[7]nSites\t[8]nHETs\n",
args->query_sample.name
);
+ if ( args->optimize_frac )
+ {
+ fprintf(args->query_sample.summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t[5]Cell fraction\t[6]BAF deviation\n");
+ if ( args->control_sample.name )
+ {
+ fprintf(args->control_sample.summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t[5]Cell fraction\t[6]BAF deviation\n");
+ fprintf(args->summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t"
+ "[5]Cell fraction:%s\t[6]Cell fraction:%s\t[7]BAF deviation:%s\t[8]BAF deviation:%s\n",
+ args->query_sample.name,args->control_sample.name,
+ args->query_sample.name,args->control_sample.name
+ );
+ }
+ }
}
char *msprintf(const char *fmt, ...);
@@ -558,6 +569,7 @@ static void destroy_data(args_t *args)
free(args->sites);
free(args->eprob);
free(args->tprob);
+ free(args->iprobs);
free(args->summary_fname);
free(args->nonref_afs);
free(args->query_sample.baf);
@@ -962,6 +974,20 @@ static void cnv_flush_viterbi(args_t *args)
if ( args->control_sample.name )
fprintf(pysam_stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2);
fprintf(pysam_stderr,"\n");
+
+ fprintf(args->query_sample.summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%f\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1,
+ args->query_sample.cell_frac,sqrt(args->query_sample.baf_dev2));
+ if ( args->control_sample.name )
+ {
+ fprintf(args->control_sample.summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%f\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1,
+ args->control_sample.cell_frac,sqrt(args->control_sample.baf_dev2));
+ fprintf(args->summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%.2f\t%f\t%f\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1,
+ args->query_sample.cell_frac, args->control_sample.cell_frac,
+ sqrt(args->query_sample.baf_dev2), sqrt(args->control_sample.baf_dev2));
+ }
}
set_emission_probs(args);
@@ -1353,7 +1379,7 @@ int main_vcfcnv(int argc, char *argv[])
else fname = argv[optind];
if ( !fname ) usage(args);
- if ( args->plot_th<=100 && !args->output_dir ) error("Expected -o option with -p\n");
+ if ( !args->output_dir ) error("Expected -o option\n");
if ( args->regions_list )
{
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c
index bd6a00a..3345c20 100644
--- a/bcftools/vcfconcat.c
+++ b/bcftools/vcfconcat.c
@@ -555,100 +555,138 @@ static void concat(args_t *args)
}
}
+int print_vcf_gz_header(BGZF *fp, BGZF *bgzf_out, int print_header, kstring_t *tmp)
+{
+ char *buffer = (char*) fp->uncompressed_block;
+
+ // Read the header and find the position of the data block
+ if ( buffer[0]!='#' ) error("Could not parse the header, expected '#', found '%c'\n", buffer[0]);
+
+ int nskip = 1; // end of the header in the current uncompressed block
+ while (1)
+ {
+ if ( buffer[nskip]=='\n' )
+ {
+ nskip++;
+ if ( nskip>=fp->block_length )
+ {
+ kputsn(buffer,nskip,tmp);
+ if ( bgzf_read_block(fp) != 0 ) return -1;
+ if ( !fp->block_length ) break;
+ nskip = 0;
+ }
+ // The header has finished
+ if ( buffer[nskip]!='#' )
+ {
+ kputsn(buffer,nskip,tmp);
+ break;
+ }
+ }
+ nskip++;
+ if ( nskip>=fp->block_length )
+ {
+ kputsn(buffer,fp->block_length,tmp);
+ if ( bgzf_read_block(fp) != 0 ) return -1;
+ if ( !fp->block_length ) break;
+ nskip = 0;
+ }
+ }
+ if ( print_header )
+ {
+ if ( bgzf_write(bgzf_out,tmp->s,tmp->l) != tmp->l ) error("Failed to write %d bytes\n", tmp->l);
+ tmp->l = 0;
+ }
+ return nskip;
+}
+
+static inline int unpackInt16(const uint8_t *buffer)
+{
+ return buffer[0] | buffer[1] << 8;
+}
+static int check_header(const uint8_t *header)
+{
+ if ( header[0] != 31 || header[1] != 139 || header[2] != 8 ) return -2;
+ return ((header[3] & 4) != 0
+ && unpackInt16((uint8_t*)&header[10]) == 6
+ && header[12] == 'B' && header[13] == 'C'
+ && unpackInt16((uint8_t*)&header[14]) == 2) ? 0 : -1;
+}
static void naive_concat(args_t *args)
{
// only compressed BCF atm
BGZF *bgzf_out = bgzf_open(args->output_fname,"w");;
- const size_t page_size = 32768;
- char *buf = (char*) malloc(page_size);
+ const size_t page_size = BGZF_MAX_BLOCK_SIZE;
+ uint8_t *buf = (uint8_t*) malloc(page_size);
kstring_t tmp = {0,0,0};
- int i;
+ int i, file_types = 0;
for (i=0; i<args->nfnames; i++)
{
htsFile *hts_fp = hts_open(args->fnames[i],"r");
if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
htsFormat type = *hts_get_format(hts_fp);
- if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
- if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
+ if ( type.compression!=bgzf )
+ error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n");
+ file_types |= type.format==vcf ? 1 : 2;
+ if ( file_types==3 )
+ error("The --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n");
BGZF *fp = hts_get_bgzfp(hts_fp);
if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));
- uint8_t magic[5];
- if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
- if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);
+ int nskip;
+ if ( type.format==bcf )
+ {
+ uint8_t magic[5];
+ if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);
- if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
- hts_expand(char,tmp.l,tmp.m,tmp.s);
- if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ hts_expand(char,tmp.l,tmp.m,tmp.s);
+ if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
- // write only the first header
- if ( i==0 )
+ // write only the first header
+ if ( i==0 )
+ {
+ if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
+ if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
+ if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname);
+ }
+ nskip = fp->block_offset;
+ }
+ else
{
- if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
- if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
- if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname);
+ nskip = print_vcf_gz_header(fp, bgzf_out, i==0?1:0, &tmp);
+ if ( nskip==-1 ) error("Error reading %s\n", args->fnames[i]);
}
// Output all non-header data that were read together with the header block
- int nskip = fp->block_offset;
if ( fp->block_length - nskip > 0 )
{
- if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
+ if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
}
if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);
// Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks
- ssize_t nread, ncached = 0, nwr;
- const int neof = 28;
- char cached[neof];
+ // The final bgzf eof block will be added by bgzf_close.
+ ssize_t nread, nblock, nwr;
+ const int nheader = 18, neof = 28;
+ const uint8_t *eof = (uint8_t*) "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
while (1)
{
- nread = bgzf_raw_read(fp, buf, page_size);
-
- // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends
- if ( nread<=0 ) break;
- if ( nread<=neof ) // last block
- {
- if ( ncached )
- {
- // flush the part of the cache that won't be needed
- nwr = bgzf_raw_write(bgzf_out, cached, nread);
- if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
-
- // make space in the cache so that we can append to the end
- if ( nread!=neof ) memmove(cached,cached+nread,neof-nread);
- }
-
- // fill the cache and check for eof outside this loop
- memcpy(cached+neof-nread,buf,nread);
- break;
- }
-
- // not the last block, flush the cache if full
- if ( ncached )
- {
- nwr = bgzf_raw_write(bgzf_out, cached, ncached);
- if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)ncached);
- ncached = 0;
- }
-
- // fill the cache
- nread -= neof;
- memcpy(cached,buf+nread,neof);
- ncached = neof;
-
+ nread = bgzf_raw_read(fp, buf, nheader);
+ if ( !nread ) break;
+ if ( nread != nheader || check_header(buf)!=0 ) error("Could not parse the header of a bgzf block: %s\n",args->fnames[i]);
+ nblock = unpackInt16(buf+16) + 1;
+ assert( nblock <= page_size && nblock >= nheader );
+ nread += bgzf_raw_read(fp, buf+nheader, nblock - nheader);
+ if ( nread!=nblock ) error("Could not read %d bytes: %s\n",nblock,args->fnames[i]);
+ if ( nread==neof && !memcmp(buf,eof,neof) ) continue;
nwr = bgzf_raw_write(bgzf_out, buf, nread);
- if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
- }
- if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) )
- {
- nwr = bgzf_raw_write(bgzf_out, cached, neof);
- if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)neof);
+ if ( nwr != nread ) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
}
if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
}
@@ -677,8 +715,8 @@ static void usage(args_t *args)
fprintf(stderr, " -D, --remove-duplicates Alias for -d none\n");
fprintf(stderr, " -f, --file-list <file> Read the list of files from a file.\n");
fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
- fprintf(stderr, " --no-version do not append version and command line to the header\n");
- fprintf(stderr, " -n, --naive Concatenate BCF files without recompression (dangerous, use with caution)\n");
+ fprintf(stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(stderr, " -n, --naive Concatenate files without recompression (dangerous, use with caution)\n");
fprintf(stderr, " -o, --output <file> Write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -q, --min-PQ <int> Break phase set if phasing quality is lower than <int> [30]\n");
diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c
index be2d6a2..4445a51 100644
--- a/bcftools/vcfconcat.c.pysam.c
+++ b/bcftools/vcfconcat.c.pysam.c
@@ -557,100 +557,138 @@ static void concat(args_t *args)
}
}
+int print_vcf_gz_header(BGZF *fp, BGZF *bgzf_out, int print_header, kstring_t *tmp)
+{
+ char *buffer = (char*) fp->uncompressed_block;
+
+ // Read the header and find the position of the data block
+ if ( buffer[0]!='#' ) error("Could not parse the header, expected '#', found '%c'\n", buffer[0]);
+
+ int nskip = 1; // end of the header in the current uncompressed block
+ while (1)
+ {
+ if ( buffer[nskip]=='\n' )
+ {
+ nskip++;
+ if ( nskip>=fp->block_length )
+ {
+ kputsn(buffer,nskip,tmp);
+ if ( bgzf_read_block(fp) != 0 ) return -1;
+ if ( !fp->block_length ) break;
+ nskip = 0;
+ }
+ // The header has finished
+ if ( buffer[nskip]!='#' )
+ {
+ kputsn(buffer,nskip,tmp);
+ break;
+ }
+ }
+ nskip++;
+ if ( nskip>=fp->block_length )
+ {
+ kputsn(buffer,fp->block_length,tmp);
+ if ( bgzf_read_block(fp) != 0 ) return -1;
+ if ( !fp->block_length ) break;
+ nskip = 0;
+ }
+ }
+ if ( print_header )
+ {
+ if ( bgzf_write(bgzf_out,tmp->s,tmp->l) != tmp->l ) error("Failed to write %d bytes\n", tmp->l);
+ tmp->l = 0;
+ }
+ return nskip;
+}
+
+static inline int unpackInt16(const uint8_t *buffer)
+{
+ return buffer[0] | buffer[1] << 8;
+}
+static int check_header(const uint8_t *header)
+{
+ if ( header[0] != 31 || header[1] != 139 || header[2] != 8 ) return -2;
+ return ((header[3] & 4) != 0
+ && unpackInt16((uint8_t*)&header[10]) == 6
+ && header[12] == 'B' && header[13] == 'C'
+ && unpackInt16((uint8_t*)&header[14]) == 2) ? 0 : -1;
+}
static void naive_concat(args_t *args)
{
// only compressed BCF atm
BGZF *bgzf_out = bgzf_open(args->output_fname,"w");;
- const size_t page_size = 32768;
- char *buf = (char*) malloc(page_size);
+ const size_t page_size = BGZF_MAX_BLOCK_SIZE;
+ uint8_t *buf = (uint8_t*) malloc(page_size);
kstring_t tmp = {0,0,0};
- int i;
+ int i, file_types = 0;
for (i=0; i<args->nfnames; i++)
{
htsFile *hts_fp = hts_open(args->fnames[i],"r");
if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
htsFormat type = *hts_get_format(hts_fp);
- if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
- if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
+ if ( type.compression!=bgzf )
+ error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n");
+ file_types |= type.format==vcf ? 1 : 2;
+ if ( file_types==3 )
+ error("The --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n");
BGZF *fp = hts_get_bgzfp(hts_fp);
if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));
- uint8_t magic[5];
- if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
- if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);
+ int nskip;
+ if ( type.format==bcf )
+ {
+ uint8_t magic[5];
+ if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);
- if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
- hts_expand(char,tmp.l,tmp.m,tmp.s);
- if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ hts_expand(char,tmp.l,tmp.m,tmp.s);
+ if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
- // write only the first header
- if ( i==0 )
+ // write only the first header
+ if ( i==0 )
+ {
+ if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
+ if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
+ if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname);
+ }
+ nskip = fp->block_offset;
+ }
+ else
{
- if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
- if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
- if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname);
+ nskip = print_vcf_gz_header(fp, bgzf_out, i==0?1:0, &tmp);
+ if ( nskip==-1 ) error("Error reading %s\n", args->fnames[i]);
}
// Output all non-header data that were read together with the header block
- int nskip = fp->block_offset;
if ( fp->block_length - nskip > 0 )
{
- if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
+ if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
}
if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);
// Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks
- ssize_t nread, ncached = 0, nwr;
- const int neof = 28;
- char cached[neof];
+ // The final bgzf eof block will be added by bgzf_close.
+ ssize_t nread, nblock, nwr;
+ const int nheader = 18, neof = 28;
+ const uint8_t *eof = (uint8_t*) "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
while (1)
{
- nread = bgzf_raw_read(fp, buf, page_size);
-
- // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends
- if ( nread<=0 ) break;
- if ( nread<=neof ) // last block
- {
- if ( ncached )
- {
- // flush the part of the cache that won't be needed
- nwr = bgzf_raw_write(bgzf_out, cached, nread);
- if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
-
- // make space in the cache so that we can append to the end
- if ( nread!=neof ) memmove(cached,cached+nread,neof-nread);
- }
-
- // fill the cache and check for eof outside this loop
- memcpy(cached+neof-nread,buf,nread);
- break;
- }
-
- // not the last block, flush the cache if full
- if ( ncached )
- {
- nwr = bgzf_raw_write(bgzf_out, cached, ncached);
- if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)ncached);
- ncached = 0;
- }
-
- // fill the cache
- nread -= neof;
- memcpy(cached,buf+nread,neof);
- ncached = neof;
-
+ nread = bgzf_raw_read(fp, buf, nheader);
+ if ( !nread ) break;
+ if ( nread != nheader || check_header(buf)!=0 ) error("Could not parse the header of a bgzf block: %s\n",args->fnames[i]);
+ nblock = unpackInt16(buf+16) + 1;
+ assert( nblock <= page_size && nblock >= nheader );
+ nread += bgzf_raw_read(fp, buf+nheader, nblock - nheader);
+ if ( nread!=nblock ) error("Could not read %d bytes: %s\n",nblock,args->fnames[i]);
+ if ( nread==neof && !memcmp(buf,eof,neof) ) continue;
nwr = bgzf_raw_write(bgzf_out, buf, nread);
- if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
- }
- if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) )
- {
- nwr = bgzf_raw_write(bgzf_out, cached, neof);
- if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)neof);
+ if ( nwr != nread ) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
}
if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
}
@@ -679,8 +717,8 @@ static void usage(args_t *args)
fprintf(pysam_stderr, " -D, --remove-duplicates Alias for -d none\n");
fprintf(pysam_stderr, " -f, --file-list <file> Read the list of files from a file.\n");
fprintf(pysam_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
- fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n");
- fprintf(pysam_stderr, " -n, --naive Concatenate BCF files without recompression (dangerous, use with caution)\n");
+ fprintf(pysam_stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(pysam_stderr, " -n, --naive Concatenate files without recompression (dangerous, use with caution)\n");
fprintf(pysam_stderr, " -o, --output <file> Write output to a file [standard output]\n");
fprintf(pysam_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(pysam_stderr, " -q, --min-PQ <int> Break phase set if phasing quality is lower than <int> [30]\n");
diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c
index 1e60d30..f650bea 100644
--- a/bcftools/vcfconvert.c
+++ b/bcftools/vcfconvert.c
@@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
@@ -65,7 +66,7 @@ struct _args_t
int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col;
int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
- char *outfname, *infname, *ref_fname;
+ char *outfname, *infname, *ref_fname, *sex_fname;
int argc, n_threads, record_cmd_line;
};
@@ -81,6 +82,9 @@ static void destroy_data(args_t *args)
static void open_vcf(args_t *args, const char *format_str)
{
args->files = bcf_sr_init();
+ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)!=0 )
+ error("Could not initialize --threads %d\n", args->n_threads);
+
if ( args->regions_list )
{
if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
@@ -129,9 +133,6 @@ static void open_vcf(args_t *args, const char *format_str)
}
if ( format_str ) args->convert = convert_init(args->header, samples, nsamples, format_str);
free(samples);
-
- if ( args->filter_str )
- args->filter = filter_init(args->header, args->filter_str);
}
static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
@@ -373,6 +374,7 @@ static void gensample_to_vcf(args_t *args)
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
+ if ( !samples ) error("Could not read %s\n", sample_fname);
for (i=2; i<nsamples; i++)
{
se = samples[i]; while ( *se && !isspace(*se) ) se++;
@@ -493,6 +495,7 @@ static void haplegendsample_to_vcf(args_t *args)
int i, nrows, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nrows);
+ if ( !samples ) error("Could not read %s\n", sample_fname);
nsamples = nrows - 1;
// sample_fname should contain a header line, so need to ignore first row
@@ -610,6 +613,7 @@ static void hapsample_to_vcf(args_t *args)
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
+ if ( !samples ) error("Could not read %s\n", sample_fname);
for (i=2; i<nsamples; i++)
{
se = samples[i]; while ( *se && !isspace(*se) ) se++;
@@ -654,6 +658,32 @@ static void hapsample_to_vcf(args_t *args)
fprintf(stderr,"Number of processed rows: \t%d\n", args->n.total);
}
+char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname)
+{
+ int i, nlines;
+ char *sample2sex = (char*) calloc(bcf_hdr_nsamples(hdr),1);
+ char **lines = hts_readlist(sex_fname, 1, &nlines);
+ if ( !lines ) error("Could not read %s\n", sex_fname);
+ for (i=0; i<nlines; i++)
+ {
+ char *se = lines[i]; while ( *se && !isspace(*se) ) se++;
+ char tmp = *se;
+ *se = 0;
+ int id = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, lines[i]);
+ *se = tmp;
+ if ( id<0 ) continue;
+ while ( *se && isspace(*se) ) se++;
+ if ( *se=='M' ) sample2sex[id] = '1';
+ else if ( *se=='F' ) sample2sex[id] = '2';
+ else error("Could not parse %s: %s\n", sex_fname,lines[i]);
+ }
+ for (i=0; i<nlines; i++) free(lines[i]);
+ free(lines);
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ if ( !sample2sex[i] ) error("Missing sex for sample %s in %s\n", bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, i),sex_fname);
+ return sample2sex;
+}
+
static void vcf_to_gensample(args_t *args)
{
kstring_t str = {0,0,0};
@@ -682,7 +712,7 @@ static void vcf_to_gensample(args_t *args)
char *gen_fname = NULL, *sample_fname = NULL;
str.l = 0;
kputs(args->outfname,&str);
- int n_files, i;
+ int n_files = 0, i;
char **files = hts_readlist(str.s, 0, &n_files);
if ( n_files==1 )
{
@@ -712,22 +742,30 @@ static void vcf_to_gensample(args_t *args)
if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname) {
+ if (sample_fname)
+ {
+ char *sample2sex = NULL;
+ if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
- kputs("ID_1 ID_2 missing\n0 0 0\n", &str);
+ kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
for (i=0; i<bcf_hdr_nsamples(args->header); i++)
{
str.l = 0;
- ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
+ if ( sample2sex )
+ ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]);
+ else
+ ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
}
if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
free(sample_fname);
+ free(sample2sex);
}
if (!gen_fname) {
if ( str.m ) free(str.s);
@@ -793,7 +831,7 @@ static void vcf_to_haplegendsample(args_t *args)
char *hap_fname = NULL, *legend_fname = NULL, *sample_fname = NULL;
str.l = 0;
kputs(args->outfname,&str);
- int n_files, i;
+ int n_files = 0, i;
char **files = hts_readlist(str.s, 0, &n_files);
if ( n_files==1 )
{
@@ -829,7 +867,11 @@ static void vcf_to_haplegendsample(args_t *args)
if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname) {
+ if (sample_fname)
+ {
+ char *sample2sex = NULL;
+ if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
@@ -839,12 +881,13 @@ static void vcf_to_haplegendsample(args_t *args)
for (i=0; i<bcf_hdr_nsamples(args->header); i++)
{
str.l = 0;
- ksprintf(&str, "%s %s %s 2\n", args->header->samples[i], args->header->samples[i], args->header->samples[i]);
+ ksprintf(&str, "%s %s %s %c\n", args->header->samples[i], args->header->samples[i], args->header->samples[i], sample2sex ? sample2sex[i] : '2');
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
}
if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
free(sample_fname);
+ free(sample2sex);
}
if (!hap_fname && !legend_fname) {
if ( str.m ) free(str.s);
@@ -853,6 +896,7 @@ static void vcf_to_haplegendsample(args_t *args)
// open haps and legend outputs
BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
+ if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize);
BGZF *lout = legend_fname ? bgzf_open(legend_fname, legend_compressed ? "wg" : "wu") : NULL;
if (legend_fname) {
str.l = 0;
@@ -940,7 +984,7 @@ static void vcf_to_hapsample(args_t *args)
char *hap_fname = NULL, *sample_fname = NULL;
str.l = 0;
kputs(args->outfname,&str);
- int n_files, i;
+ int n_files = 0, i;
char **files = hts_readlist(str.s, 0, &n_files);
if ( n_files==1 )
{
@@ -970,22 +1014,30 @@ static void vcf_to_hapsample(args_t *args)
if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname) {
+ if (sample_fname)
+ {
+ char *sample2sex = NULL;
+ if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
- kputs("ID_1 ID_2 missing\n0 0 0\n", &str);
+ kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
for (i=0; i<bcf_hdr_nsamples(args->header); i++)
{
str.l = 0;
- ksprintf(&str, "%s %s 0\n", args->header->samples[i], args->header->samples[i]);
+ if ( sample2sex )
+ ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]);
+ else
+ ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
}
if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
free(sample_fname);
+ free(sample2sex);
}
if (!hap_fname) {
if ( str.m ) free(str.s);
@@ -994,6 +1046,7 @@ static void vcf_to_hapsample(args_t *args)
// open haps output
BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
+ if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize);
int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0;
while ( bcf_sr_next_line(args->files) )
@@ -1256,9 +1309,30 @@ static void gvcf_to_vcf(args_t *args)
if ( !pass ) continue;
}
- if ( line->n_allele!=1 || !bcf_has_filter(hdr,line,"PASS") )
+ if (!bcf_has_filter(hdr,line,"PASS"))
+ {
+ bcf_write(out_fh,hdr,line);
+ continue;
+ }
+
+ // check if alleles compatible with being a gVCF record
+ int i, gallele = -1;
+ if (line->n_allele==1)
+ gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present)
+ else
+ {
+ if ( line->d.allele[1][0]!='<' ) continue;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF
+ if ( line->d.allele[i][1]=='X' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // old mpileup gVCF
+ if ( strcmp(line->d.allele[i],"<NON_REF>")==0 ) { gallele = i; break; } // GATK gVCF
+ }
+ }
+
+ // no gVCF compatible alleles
+ if (gallele<0)
{
- // Assuming that only ALT=. sites can be blocks and skipping sites which don't PASS
bcf_write(out_fh,hdr,line);
continue;
}
@@ -1266,7 +1340,7 @@ static void gvcf_to_vcf(args_t *args)
int nend = bcf_get_info_int32(hdr,line,"END",&itmp,&nitmp);
if ( nend!=1 )
{
- // No END lineord
+ // No INFO/END => not gVCF record
bcf_write(out_fh,hdr,line);
continue;
}
@@ -1277,10 +1351,9 @@ static void gvcf_to_vcf(args_t *args)
line->pos = pos;
char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len);
if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(hdr,line->rid), line->pos+1);
- // we have already checked above that there is only one allele,
- // so fine to just update alleles with the ref allele from the fasta
- bcf_update_alleles_str(hdr, line, &ref[0]);
+ strncpy(line->d.allele[0],ref,len);
bcf_write(out_fh,hdr,line);
+ free(ref);
}
}
free(itmp);
@@ -1316,6 +1389,7 @@ static void usage(void)
fprintf(stderr, " -g, --gensample <...> <prefix>|<gen-file>,<sample-file>\n");
fprintf(stderr, " --tag <string> tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
fprintf(stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n");
+ fprintf(stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
fprintf(stderr, "\n");
fprintf(stderr, "gVCF conversion:\n");
@@ -1326,12 +1400,14 @@ static void usage(void)
fprintf(stderr, " --hapsample2vcf <...> <prefix>|<haps-file>,<sample-file>\n");
fprintf(stderr, " --hapsample <...> <prefix>|<haps-file>,<sample-file>\n");
fprintf(stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
fprintf(stderr, "\n");
fprintf(stderr, "HAP/LEGEND/SAMPLE conversion:\n");
fprintf(stderr, " -H, --haplegendsample2vcf <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
fprintf(stderr, " -h, --haplegendsample <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
fprintf(stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
fprintf(stderr, "\n");
fprintf(stderr, "TSV conversion:\n");
@@ -1375,6 +1451,7 @@ int main_vcfconvert(int argc, char *argv[])
{"targets-file",required_argument,NULL,'T'},
{"samples",required_argument,NULL,'s'},
{"samples-file",required_argument,NULL,'S'},
+ {"sex",required_argument,NULL,11},
{"gensample",required_argument,NULL,'g'},
{"gensample2vcf",required_argument,NULL,'G'},
{"tag",required_argument,NULL,1},
@@ -1428,6 +1505,7 @@ int main_vcfconvert(int argc, char *argv[])
case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 10 : args->record_cmd_line = 0; break;
+ case 11 : args->sex_fname = optarg; break;
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
}
diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c
index 12333cc..4d3469c 100644
--- a/bcftools/vcfconvert.c.pysam.c
+++ b/bcftools/vcfconvert.c.pysam.c
@@ -25,6 +25,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
@@ -67,7 +68,7 @@ struct _args_t
int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col;
int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
- char *outfname, *infname, *ref_fname;
+ char *outfname, *infname, *ref_fname, *sex_fname;
int argc, n_threads, record_cmd_line;
};
@@ -83,6 +84,9 @@ static void destroy_data(args_t *args)
static void open_vcf(args_t *args, const char *format_str)
{
args->files = bcf_sr_init();
+ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)!=0 )
+ error("Could not initialize --threads %d\n", args->n_threads);
+
if ( args->regions_list )
{
if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
@@ -131,9 +135,6 @@ static void open_vcf(args_t *args, const char *format_str)
}
if ( format_str ) args->convert = convert_init(args->header, samples, nsamples, format_str);
free(samples);
-
- if ( args->filter_str )
- args->filter = filter_init(args->header, args->filter_str);
}
static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
@@ -375,6 +376,7 @@ static void gensample_to_vcf(args_t *args)
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
+ if ( !samples ) error("Could not read %s\n", sample_fname);
for (i=2; i<nsamples; i++)
{
se = samples[i]; while ( *se && !isspace(*se) ) se++;
@@ -495,6 +497,7 @@ static void haplegendsample_to_vcf(args_t *args)
int i, nrows, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nrows);
+ if ( !samples ) error("Could not read %s\n", sample_fname);
nsamples = nrows - 1;
// sample_fname should contain a header line, so need to ignore first row
@@ -612,6 +615,7 @@ static void hapsample_to_vcf(args_t *args)
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
+ if ( !samples ) error("Could not read %s\n", sample_fname);
for (i=2; i<nsamples; i++)
{
se = samples[i]; while ( *se && !isspace(*se) ) se++;
@@ -656,6 +660,32 @@ static void hapsample_to_vcf(args_t *args)
fprintf(pysam_stderr,"Number of processed rows: \t%d\n", args->n.total);
}
+char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname)
+{
+ int i, nlines;
+ char *sample2sex = (char*) calloc(bcf_hdr_nsamples(hdr),1);
+ char **lines = hts_readlist(sex_fname, 1, &nlines);
+ if ( !lines ) error("Could not read %s\n", sex_fname);
+ for (i=0; i<nlines; i++)
+ {
+ char *se = lines[i]; while ( *se && !isspace(*se) ) se++;
+ char tmp = *se;
+ *se = 0;
+ int id = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, lines[i]);
+ *se = tmp;
+ if ( id<0 ) continue;
+ while ( *se && isspace(*se) ) se++;
+ if ( *se=='M' ) sample2sex[id] = '1';
+ else if ( *se=='F' ) sample2sex[id] = '2';
+ else error("Could not parse %s: %s\n", sex_fname,lines[i]);
+ }
+ for (i=0; i<nlines; i++) free(lines[i]);
+ free(lines);
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ if ( !sample2sex[i] ) error("Missing sex for sample %s in %s\n", bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, i),sex_fname);
+ return sample2sex;
+}
+
static void vcf_to_gensample(args_t *args)
{
kstring_t str = {0,0,0};
@@ -684,7 +714,7 @@ static void vcf_to_gensample(args_t *args)
char *gen_fname = NULL, *sample_fname = NULL;
str.l = 0;
kputs(args->outfname,&str);
- int n_files, i;
+ int n_files = 0, i;
char **files = hts_readlist(str.s, 0, &n_files);
if ( n_files==1 )
{
@@ -714,22 +744,30 @@ static void vcf_to_gensample(args_t *args)
if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname) {
+ if (sample_fname)
+ {
+ char *sample2sex = NULL;
+ if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
- kputs("ID_1 ID_2 missing\n0 0 0\n", &str);
+ kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
for (i=0; i<bcf_hdr_nsamples(args->header); i++)
{
str.l = 0;
- ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
+ if ( sample2sex )
+ ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]);
+ else
+ ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
}
if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
free(sample_fname);
+ free(sample2sex);
}
if (!gen_fname) {
if ( str.m ) free(str.s);
@@ -795,7 +833,7 @@ static void vcf_to_haplegendsample(args_t *args)
char *hap_fname = NULL, *legend_fname = NULL, *sample_fname = NULL;
str.l = 0;
kputs(args->outfname,&str);
- int n_files, i;
+ int n_files = 0, i;
char **files = hts_readlist(str.s, 0, &n_files);
if ( n_files==1 )
{
@@ -831,7 +869,11 @@ static void vcf_to_haplegendsample(args_t *args)
if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname) {
+ if (sample_fname)
+ {
+ char *sample2sex = NULL;
+ if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
@@ -841,12 +883,13 @@ static void vcf_to_haplegendsample(args_t *args)
for (i=0; i<bcf_hdr_nsamples(args->header); i++)
{
str.l = 0;
- ksprintf(&str, "%s %s %s 2\n", args->header->samples[i], args->header->samples[i], args->header->samples[i]);
+ ksprintf(&str, "%s %s %s %c\n", args->header->samples[i], args->header->samples[i], args->header->samples[i], sample2sex ? sample2sex[i] : '2');
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
}
if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
free(sample_fname);
+ free(sample2sex);
}
if (!hap_fname && !legend_fname) {
if ( str.m ) free(str.s);
@@ -855,6 +898,7 @@ static void vcf_to_haplegendsample(args_t *args)
// open haps and legend outputs
BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
+ if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize);
BGZF *lout = legend_fname ? bgzf_open(legend_fname, legend_compressed ? "wg" : "wu") : NULL;
if (legend_fname) {
str.l = 0;
@@ -942,7 +986,7 @@ static void vcf_to_hapsample(args_t *args)
char *hap_fname = NULL, *sample_fname = NULL;
str.l = 0;
kputs(args->outfname,&str);
- int n_files, i;
+ int n_files = 0, i;
char **files = hts_readlist(str.s, 0, &n_files);
if ( n_files==1 )
{
@@ -972,22 +1016,30 @@ static void vcf_to_hapsample(args_t *args)
if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname) {
+ if (sample_fname)
+ {
+ char *sample2sex = NULL;
+ if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
- kputs("ID_1 ID_2 missing\n0 0 0\n", &str);
+ kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
for (i=0; i<bcf_hdr_nsamples(args->header); i++)
{
str.l = 0;
- ksprintf(&str, "%s %s 0\n", args->header->samples[i], args->header->samples[i]);
+ if ( sample2sex )
+ ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]);
+ else
+ ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
}
if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
free(sample_fname);
+ free(sample2sex);
}
if (!hap_fname) {
if ( str.m ) free(str.s);
@@ -996,6 +1048,7 @@ static void vcf_to_hapsample(args_t *args)
// open haps output
BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
+ if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize);
int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0;
while ( bcf_sr_next_line(args->files) )
@@ -1258,9 +1311,30 @@ static void gvcf_to_vcf(args_t *args)
if ( !pass ) continue;
}
- if ( line->n_allele!=1 || !bcf_has_filter(hdr,line,"PASS") )
+ if (!bcf_has_filter(hdr,line,"PASS"))
+ {
+ bcf_write(out_fh,hdr,line);
+ continue;
+ }
+
+ // check if alleles compatible with being a gVCF record
+ int i, gallele = -1;
+ if (line->n_allele==1)
+ gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present)
+ else
+ {
+ if ( line->d.allele[1][0]!='<' ) continue;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF
+ if ( line->d.allele[i][1]=='X' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // old mpileup gVCF
+ if ( strcmp(line->d.allele[i],"<NON_REF>")==0 ) { gallele = i; break; } // GATK gVCF
+ }
+ }
+
+ // no gVCF compatible alleles
+ if (gallele<0)
{
- // Assuming that only ALT=. sites can be blocks and skipping sites which don't PASS
bcf_write(out_fh,hdr,line);
continue;
}
@@ -1268,7 +1342,7 @@ static void gvcf_to_vcf(args_t *args)
int nend = bcf_get_info_int32(hdr,line,"END",&itmp,&nitmp);
if ( nend!=1 )
{
- // No END lineord
+ // No INFO/END => not gVCF record
bcf_write(out_fh,hdr,line);
continue;
}
@@ -1279,10 +1353,9 @@ static void gvcf_to_vcf(args_t *args)
line->pos = pos;
char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len);
if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(hdr,line->rid), line->pos+1);
- // we have already checked above that there is only one allele,
- // so fine to just update alleles with the ref allele from the fasta
- bcf_update_alleles_str(hdr, line, &ref[0]);
+ strncpy(line->d.allele[0],ref,len);
bcf_write(out_fh,hdr,line);
+ free(ref);
}
}
free(itmp);
@@ -1318,6 +1391,7 @@ static void usage(void)
fprintf(pysam_stderr, " -g, --gensample <...> <prefix>|<gen-file>,<sample-file>\n");
fprintf(pysam_stderr, " --tag <string> tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
fprintf(pysam_stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n");
+ fprintf(pysam_stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(pysam_stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "gVCF conversion:\n");
@@ -1328,12 +1402,14 @@ static void usage(void)
fprintf(pysam_stderr, " --hapsample2vcf <...> <prefix>|<haps-file>,<sample-file>\n");
fprintf(pysam_stderr, " --hapsample <...> <prefix>|<haps-file>,<sample-file>\n");
fprintf(pysam_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(pysam_stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(pysam_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "HAP/LEGEND/SAMPLE conversion:\n");
fprintf(pysam_stderr, " -H, --haplegendsample2vcf <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
fprintf(pysam_stderr, " -h, --haplegendsample <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
fprintf(pysam_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(pysam_stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(pysam_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "TSV conversion:\n");
@@ -1377,6 +1453,7 @@ int main_vcfconvert(int argc, char *argv[])
{"targets-file",required_argument,NULL,'T'},
{"samples",required_argument,NULL,'s'},
{"samples-file",required_argument,NULL,'S'},
+ {"sex",required_argument,NULL,11},
{"gensample",required_argument,NULL,'g'},
{"gensample2vcf",required_argument,NULL,'G'},
{"tag",required_argument,NULL,1},
@@ -1430,6 +1507,7 @@ int main_vcfconvert(int argc, char *argv[])
case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 10 : args->record_cmd_line = 0; break;
+ case 11 : args->sex_fname = optarg; break;
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
}
diff --git a/bcftools/vcffilter.c b/bcftools/vcffilter.c
index f979d77..c1b41f2 100644
--- a/bcftools/vcffilter.c
+++ b/bcftools/vcffilter.c
@@ -129,7 +129,8 @@ static void init_data(args_t *args)
if ( tmp.s ) kputs(" and ", &tmp);
kputs("\"IndelGap\"", &tmp);
}
- fprintf(stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
+ if ( strncmp(tmp.s+1,args->soft_filter,tmp.l-2) )
+ fprintf(stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
free(tmp.s);
}
diff --git a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c
index 58193da..e603bde 100644
--- a/bcftools/vcffilter.c.pysam.c
+++ b/bcftools/vcffilter.c.pysam.c
@@ -131,7 +131,8 @@ static void init_data(args_t *args)
if ( tmp.s ) kputs(" and ", &tmp);
kputs("\"IndelGap\"", &tmp);
}
- fprintf(pysam_stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
+ if ( strncmp(tmp.s+1,args->soft_filter,tmp.l-2) )
+ fprintf(pysam_stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
free(tmp.s);
}
diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c
index b741ef6..8835db3 100644
--- a/bcftools/vcfgtcheck.c
+++ b/bcftools/vcfgtcheck.c
@@ -35,7 +35,9 @@ THE SOFTWARE. */
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
+#include <inttypes.h>
#include "bcftools.h"
+#include "hclust.h"
typedef struct
{
@@ -43,10 +45,10 @@ typedef struct
bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF
int ntmp_arr, npl_arr;
int32_t *tmp_arr, *pl_arr;
- double *lks, *sites;
+ double *lks, *sites, min_inter_err, max_intra_err;
int *cnts, *dps, hom_only, cross_check, all_sites;
char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample;
- int argc, no_PLs;
+ int argc, no_PLs, narr, nsmpl;
}
args_t;
@@ -133,6 +135,7 @@ static void plot_check(args_t *args, char *target_sample, char *query_sample)
free(fname);
}
+#if 0
static void plot_cross_check(args_t *args)
{
char *fname;
@@ -214,6 +217,7 @@ static void plot_cross_check(args_t *args)
py_plot(fname);
free(fname);
}
+#endif
static void init_data(args_t *args)
{
@@ -230,14 +234,6 @@ static void init_data(args_t *args)
args->sites = (double*) calloc(nsamples,sizeof(double));
args->dps = (int*) calloc(nsamples,sizeof(int));
}
- else
- {
- int nsamples = bcf_hdr_nsamples(args->sm_hdr);
- int narr = (nsamples-1)*nsamples/2;
- args->lks = (double*) calloc(narr,sizeof(double));
- args->cnts = (int*) calloc(narr,sizeof(int));
- args->dps = (int*) calloc(narr,sizeof(int));
- }
}
static void destroy_data(args_t *args)
@@ -524,177 +520,181 @@ static void check_gt(args_t *args)
}
}
-static inline int is_hom_most_likely(int nals, int *pls)
+// static inline int is_hom_most_likely(int nals, int *pls)
+// {
+// int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
+// for (ia=1; ia<nals; ia++)
+// {
+// for (ib=0; ib<ia; ib++)
+// {
+// if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
+// idx++;
+// }
+// if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
+// idx++;
+// }
+// return min_is_hom;
+// }
+
+int process_GT(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
+{
+ int ngt = bcf_get_genotypes(args->sm_hdr, line, &args->tmp_arr, &args->ntmp_arr);
+
+ if ( ngt<=0 ) return 1; // GT not present
+ if ( ngt!=args->nsmpl*2 ) return 2; // not diploid
+ ngt /= args->nsmpl;
+
+ int i,j, idx = 0;
+ for (i=1; i<args->nsmpl; i++)
+ {
+ int32_t *a = args->tmp_arr + i*ngt;
+ if ( bcf_gt_is_missing(a[0]) || bcf_gt_is_missing(a[1]) || a[1]==bcf_int32_vector_end ) { idx+=i; continue; }
+ int agt = 1<<bcf_gt_allele(a[0]) | 1<<bcf_gt_allele(a[1]);
+
+ for (j=0; j<i; j++)
+ {
+ int32_t *b = args->tmp_arr + j*ngt;
+ if ( bcf_gt_is_missing(b[0]) || bcf_gt_is_missing(b[1]) || b[1]==bcf_int32_vector_end ) { idx++; continue; }
+ int bgt = 1<<bcf_gt_allele(b[0]) | 1<<bcf_gt_allele(b[1]);
+
+ ntot[idx]++;
+ if ( agt!=bgt ) ndif[idx]++;
+ idx++;
+ }
+ }
+ return 0;
+}
+int process_PL(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
{
- int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
- for (ia=1; ia<nals; ia++)
+ int npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr);
+
+ if ( npl<=0 ) return 1; // PL not present
+ npl /= args->nsmpl;
+
+ int i,j,k, idx = 0;
+ for (i=1; i<args->nsmpl; i++)
{
- for (ib=0; ib<ia; ib++)
+ int32_t *a = args->tmp_arr + i*npl;
+ int imin = -1;
+ for (k=0; k<npl; k++)
+ {
+ if ( a[k]==bcf_int32_vector_end ) break;
+ if ( a[k]==bcf_int32_missing ) continue;
+ if ( imin==-1 || a[imin] > a[k] ) imin = k;
+ }
+ if ( imin<0 ) { idx+=i; continue; }
+
+ for (j=0; j<i; j++)
{
- if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
+ int32_t *b = args->tmp_arr + j*npl;
+ int jmin = -1;
+ for (k=0; k<npl; k++)
+ {
+ if ( b[k]==bcf_int32_vector_end ) break;
+ if ( b[k]==bcf_int32_missing ) continue;
+ if ( jmin==-1 || b[jmin] > b[k] ) jmin = k;
+ }
+ if ( jmin<0 ) { idx++; continue; }
+
+ ntot[idx]++;
+ if ( imin!=jmin ) ndif[idx]++;
idx++;
}
- if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
- idx++;
}
- return min_is_hom;
+ return 0;
}
static void cross_check_gts(args_t *args)
{
- int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0;
- unsigned int *dp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)), *ndp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)); // this will overflow one day...
- int fake_pls = args->no_PLs, ignore_dp = 0;
-
- int i,j,k,idx, pl_warned = 0, dp_warned = 0;
- int32_t *dp_arr = NULL;
- int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL;
+ // Initialize things: check which tags are defined in the header, sample names etc.
if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
{
if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
- if ( !args->no_PLs )
+ if ( !args->no_PLs ) {
fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
- fake_pls = 1;
+ args->no_PLs = 99;
+ }
}
- if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1;
- FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
- print_header(args, fp);
- if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n");
+ args->nsmpl = bcf_hdr_nsamples(args->sm_hdr);
+ args->narr = (args->nsmpl-1)*args->nsmpl/2;
+
+ uint32_t *ndif = (uint32_t*) calloc(args->narr,4);
+ uint32_t *ntot = (uint32_t*) calloc(args->narr,4);
while ( bcf_sr_next_line(args->files) )
{
- bcf1_t *line = args->files->readers[0].buffer[0];
- bcf_unpack(line, BCF_UN_FMT);
-
- int npl;
- if ( !fake_pls )
- {
- npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr);
- if ( npl<=0 ) { pl_warned++; continue; }
- npl /= nsamples;
- }
- else
- npl = fake_PLs(args, args->sm_hdr, line);
- int mdp = 0;
- if ( !ignore_dp && (mdp=bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr)) <= 0 ) dp_warned++;
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
- if ( args->hom_only )
+ // use PLs unless no_PLs is set and GT exists
+ if ( args->no_PLs )
{
- for (i=0; i<nsamples; i++)
- is_hom[i] = is_hom_most_likely(line->n_allele, args->pl_arr+i*npl);
+ if ( process_GT(args,line,ntot,ndif)==0 ) continue;
}
-
- double sum = 0; int nsum = 0;
- idx = 0;
- for (i=0; i<nsamples; i++)
- {
- int *ipl = &args->pl_arr[i*npl];
- if ( *ipl==-1 ) { idx += i; continue; } // missing genotype
- if ( mdp>0 && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; }
- if ( args->hom_only && !is_hom[i] ) { idx += i; continue; }
-
- for (j=0; j<i; j++)
- {
- int *jpl = &args->pl_arr[j*npl];
- if ( *jpl==-1 ) { idx++; continue; } // missing genotype
- if ( mdp>0 && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; }
- if ( args->hom_only && !is_hom[j] ) { idx++; continue; }
-
- int min_pl = INT_MAX;
- for (k=0; k<npl; k++)
- {
- if ( ipl[k]==bcf_int32_missing || jpl[k]==bcf_int32_missing ) break;
- if ( ipl[k]==bcf_int32_vector_end || jpl[k]==bcf_int32_vector_end ) { k = npl; break; }
- if ( min_pl > ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k];
- }
- if ( k!=npl ) { idx++; continue; }
-
- if ( args->all_sites ) { sum += min_pl; nsum++; }
- args->lks[idx] += min_pl;
- args->cnts[idx]++;
-
- if ( mdp>0 )
- {
- args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j];
- dp[i] += dp_arr[i]; ndp[i]++;
- dp[j] += dp_arr[j]; ndp[j]++;
- }
- else
- {
- args->dps[idx]++;
- dp[i]++; ndp[i]++;
- dp[j]++; ndp[j]++;
- }
- idx++;
- }
- }
- if ( args->all_sites )
- fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0);
+ process_PL(args,line,ntot,ndif);
}
- if ( dp_arr ) free(dp_arr);
- if ( args->pl_arr ) free(args->pl_arr);
- if ( args->tmp_arr ) free(args->tmp_arr);
- if ( is_hom ) free(is_hom);
+
+ FILE *fp = stdout;
+ print_header(args, fp);
- if ( pl_warned ) fprintf(stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned);
- if ( dp_warned ) fprintf(stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned);
+ float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2);
- // Output samples sorted by average discordance
- double *score = (double*) calloc(nsamples,sizeof(double));
- args->sites = (double*) calloc(nsamples,sizeof(double));
- idx = 0;
- for (i=0; i<nsamples; i++)
+ // Output pairwise distances
+ fprintf(fp, "# ERR, error rate\t[2]Pairwise error rate\t[3]Number of sites compared\t[4]Sample i\t[5]Sample j\n");
+ int i,j, idx = 0;
+ for (i=0; i<args->nsmpl; i++)
{
for (j=0; j<i; j++)
{
- score[i] += args->lks[idx];
- score[j] += args->lks[idx];
- args->sites[i] += args->cnts[idx];
- args->sites[j] += args->cnts[idx];
+ float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10;
+ fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
+ PDIST(tmp,i,j) = err;
idx++;
}
}
- for (i=0; i<nsamples; i++)
- if ( args->sites[i] ) score[i] /= args->sites[i];
- double **p = (double**) malloc(sizeof(double*)*nsamples), avg_score = 0;
- for (i=0; i<nsamples; i++) p[i] = &score[i];
- qsort(p, nsamples, sizeof(int*), cmp_doubleptr);
- // The average discordance gives the number of differing sites in % with -G1
- fprintf(fp, "# [1]SM\t[2]Average Discordance\t[3]Average depth\t[4]Average number of sites\t[5]Sample\t[6]Sample ID\n");
- for (i=0; i<nsamples; i++)
+
+ // Cluster samples
+ int nlist;
+ float clust_max_err = args->max_intra_err;
+ hclust_t *clust = hclust_init(args->nsmpl,tmp);
+ cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist);
+ fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n");
+ for (i=0; i<nlist; i++)
{
- idx = p[i] - score;
- double adp = ndp[idx] ? (double)dp[idx]/ndp[idx] : 0;
- double nsites = args->sites[idx]/(nsamples-1);
- avg_score += score[idx];
- fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i);
+ fprintf(fp,"CLUSTER\t%f", list[i].dist);
+ for (j=0; j<list[i].nmemb; j++)
+ fprintf(fp,"\t%s",args->sm_hdr->samples[list[i].memb[j]]);
+ fprintf(fp,"\n");
}
-
- // // Overall score: maximum absolute deviation from the average score
- // fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n");
- // fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]); // idx still set
- free(p);
- free(score);
- free(dp);
- free(ndp);
-
- // Pairwise discordances
+ hclust_destroy_list(list,nlist);
+ // Debugging output: the cluster graph and data used for deciding
+ char **dbg = hclust_explain(clust,&nlist);
+ for (i=0; i<nlist; i++)
+ fprintf(fp,"DBG\t%s\n", dbg[i]);
+ fprintf(fp, "# TH, clustering threshold\t[2]Value\nTH\t%f\n",clust_max_err);
+ fprintf(fp, "# DOT\t[2]Cluster graph, visualize e.g. as \"this-output.txt | grep ^DOT | cut -f2- | dot -Tsvg -o graph.svg\"\n");
+ fprintf(fp, "DOT\t%s\n", hclust_create_dot(clust,args->sm_hdr->samples,clust_max_err));
+ hclust_destroy(clust);
+ free(tmp);
+
+
+ // Deprecated output for temporary backward compatibility
+ fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n");
fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
idx = 0;
- for (i=0; i<nsamples; i++)
+ for (i=0; i<args->nsmpl; i++)
{
for (j=0; j<i; j++)
{
- fprintf(fp, "CN\t%.0f\t%d\t%.2f\t%s\t%s\n", args->lks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0,
- args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
+ fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
idx++;
}
}
- fclose(fp);
- if ( args->plot )
- plot_cross_check(args);
+
+ free(ndif);
+ free(ntot);
+ free(args->tmp_arr);
}
static char *init_prefix(char *prefix)
@@ -713,6 +713,7 @@ static void usage(void)
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -a, --all-sites output comparison for all sites\n");
+ fprintf(stderr, " -c, --cluster <min,max> min inter- and max intra-sample error [0.23,-0.3]\n");
fprintf(stderr, " -g, --genotypes <file> genotypes to compare against\n");
fprintf(stderr, " -G, --GTs-only <int> use GTs, ignore PLs, using <int> for unseen genotypes [99]\n");
fprintf(stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n");
@@ -736,8 +737,16 @@ int main_vcfgtcheck(int argc, char *argv[])
char *regions = NULL, *targets = NULL;
int regions_is_file = 0, targets_is_file = 0;
+ // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
+ // - min_inter: pairs with smaller err value will be considered identical
+ // - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered
+ // different. If negative, the cutoff may be heuristically lowered
+ args->min_inter_err = 0.23;
+ args->max_intra_err = -0.3;
+
static struct option loptions[] =
{
+ {"cluster",1,0,'c'},
{"GTs-only",1,0,'G'},
{"all-sites",0,0,'a'},
{"homs-only",0,0,'H'},
@@ -753,8 +762,17 @@ int main_vcfgtcheck(int argc, char *argv[])
{0,0,0,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:c:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'c':
+ args->min_inter_err = strtod(optarg,&tmp);
+ if ( *tmp )
+ {
+ if ( *tmp!=',') error("Could not parse: -c %s\n", optarg);
+ args->max_intra_err = strtod(tmp+1,&tmp);
+ if ( *tmp ) error("Could not parse: -c %s\n", optarg);
+ }
+ break;
case 'G':
args->no_PLs = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg);
diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c
index 2f0a288..0bd6071 100644
--- a/bcftools/vcfgtcheck.c.pysam.c
+++ b/bcftools/vcfgtcheck.c.pysam.c
@@ -37,7 +37,9 @@ THE SOFTWARE. */
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
+#include <inttypes.h>
#include "bcftools.h"
+#include "hclust.h"
typedef struct
{
@@ -45,10 +47,10 @@ typedef struct
bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF
int ntmp_arr, npl_arr;
int32_t *tmp_arr, *pl_arr;
- double *lks, *sites;
+ double *lks, *sites, min_inter_err, max_intra_err;
int *cnts, *dps, hom_only, cross_check, all_sites;
char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample;
- int argc, no_PLs;
+ int argc, no_PLs, narr, nsmpl;
}
args_t;
@@ -135,6 +137,7 @@ static void plot_check(args_t *args, char *target_sample, char *query_sample)
free(fname);
}
+#if 0
static void plot_cross_check(args_t *args)
{
char *fname;
@@ -216,6 +219,7 @@ static void plot_cross_check(args_t *args)
py_plot(fname);
free(fname);
}
+#endif
static void init_data(args_t *args)
{
@@ -232,14 +236,6 @@ static void init_data(args_t *args)
args->sites = (double*) calloc(nsamples,sizeof(double));
args->dps = (int*) calloc(nsamples,sizeof(int));
}
- else
- {
- int nsamples = bcf_hdr_nsamples(args->sm_hdr);
- int narr = (nsamples-1)*nsamples/2;
- args->lks = (double*) calloc(narr,sizeof(double));
- args->cnts = (int*) calloc(narr,sizeof(int));
- args->dps = (int*) calloc(narr,sizeof(int));
- }
}
static void destroy_data(args_t *args)
@@ -526,177 +522,181 @@ static void check_gt(args_t *args)
}
}
-static inline int is_hom_most_likely(int nals, int *pls)
+// static inline int is_hom_most_likely(int nals, int *pls)
+// {
+// int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
+// for (ia=1; ia<nals; ia++)
+// {
+// for (ib=0; ib<ia; ib++)
+// {
+// if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
+// idx++;
+// }
+// if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
+// idx++;
+// }
+// return min_is_hom;
+// }
+
+int process_GT(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
+{
+ int ngt = bcf_get_genotypes(args->sm_hdr, line, &args->tmp_arr, &args->ntmp_arr);
+
+ if ( ngt<=0 ) return 1; // GT not present
+ if ( ngt!=args->nsmpl*2 ) return 2; // not diploid
+ ngt /= args->nsmpl;
+
+ int i,j, idx = 0;
+ for (i=1; i<args->nsmpl; i++)
+ {
+ int32_t *a = args->tmp_arr + i*ngt;
+ if ( bcf_gt_is_missing(a[0]) || bcf_gt_is_missing(a[1]) || a[1]==bcf_int32_vector_end ) { idx+=i; continue; }
+ int agt = 1<<bcf_gt_allele(a[0]) | 1<<bcf_gt_allele(a[1]);
+
+ for (j=0; j<i; j++)
+ {
+ int32_t *b = args->tmp_arr + j*ngt;
+ if ( bcf_gt_is_missing(b[0]) || bcf_gt_is_missing(b[1]) || b[1]==bcf_int32_vector_end ) { idx++; continue; }
+ int bgt = 1<<bcf_gt_allele(b[0]) | 1<<bcf_gt_allele(b[1]);
+
+ ntot[idx]++;
+ if ( agt!=bgt ) ndif[idx]++;
+ idx++;
+ }
+ }
+ return 0;
+}
+int process_PL(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
{
- int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
- for (ia=1; ia<nals; ia++)
+ int npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr);
+
+ if ( npl<=0 ) return 1; // PL not present
+ npl /= args->nsmpl;
+
+ int i,j,k, idx = 0;
+ for (i=1; i<args->nsmpl; i++)
{
- for (ib=0; ib<ia; ib++)
+ int32_t *a = args->tmp_arr + i*npl;
+ int imin = -1;
+ for (k=0; k<npl; k++)
+ {
+ if ( a[k]==bcf_int32_vector_end ) break;
+ if ( a[k]==bcf_int32_missing ) continue;
+ if ( imin==-1 || a[imin] > a[k] ) imin = k;
+ }
+ if ( imin<0 ) { idx+=i; continue; }
+
+ for (j=0; j<i; j++)
{
- if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
+ int32_t *b = args->tmp_arr + j*npl;
+ int jmin = -1;
+ for (k=0; k<npl; k++)
+ {
+ if ( b[k]==bcf_int32_vector_end ) break;
+ if ( b[k]==bcf_int32_missing ) continue;
+ if ( jmin==-1 || b[jmin] > b[k] ) jmin = k;
+ }
+ if ( jmin<0 ) { idx++; continue; }
+
+ ntot[idx]++;
+ if ( imin!=jmin ) ndif[idx]++;
idx++;
}
- if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
- idx++;
}
- return min_is_hom;
+ return 0;
}
static void cross_check_gts(args_t *args)
{
- int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0;
- unsigned int *dp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)), *ndp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)); // this will overflow one day...
- int fake_pls = args->no_PLs, ignore_dp = 0;
-
- int i,j,k,idx, pl_warned = 0, dp_warned = 0;
- int32_t *dp_arr = NULL;
- int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL;
+ // Initialize things: check which tags are defined in the header, sample names etc.
if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
{
if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
- if ( !args->no_PLs )
+ if ( !args->no_PLs ) {
fprintf(pysam_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
- fake_pls = 1;
+ args->no_PLs = 99;
+ }
}
- if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1;
- FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : pysam_stdout;
- print_header(args, fp);
- if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n");
+ args->nsmpl = bcf_hdr_nsamples(args->sm_hdr);
+ args->narr = (args->nsmpl-1)*args->nsmpl/2;
+
+ uint32_t *ndif = (uint32_t*) calloc(args->narr,4);
+ uint32_t *ntot = (uint32_t*) calloc(args->narr,4);
while ( bcf_sr_next_line(args->files) )
{
- bcf1_t *line = args->files->readers[0].buffer[0];
- bcf_unpack(line, BCF_UN_FMT);
-
- int npl;
- if ( !fake_pls )
- {
- npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr);
- if ( npl<=0 ) { pl_warned++; continue; }
- npl /= nsamples;
- }
- else
- npl = fake_PLs(args, args->sm_hdr, line);
- int mdp = 0;
- if ( !ignore_dp && (mdp=bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr)) <= 0 ) dp_warned++;
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
- if ( args->hom_only )
+ // use PLs unless no_PLs is set and GT exists
+ if ( args->no_PLs )
{
- for (i=0; i<nsamples; i++)
- is_hom[i] = is_hom_most_likely(line->n_allele, args->pl_arr+i*npl);
+ if ( process_GT(args,line,ntot,ndif)==0 ) continue;
}
-
- double sum = 0; int nsum = 0;
- idx = 0;
- for (i=0; i<nsamples; i++)
- {
- int *ipl = &args->pl_arr[i*npl];
- if ( *ipl==-1 ) { idx += i; continue; } // missing genotype
- if ( mdp>0 && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; }
- if ( args->hom_only && !is_hom[i] ) { idx += i; continue; }
-
- for (j=0; j<i; j++)
- {
- int *jpl = &args->pl_arr[j*npl];
- if ( *jpl==-1 ) { idx++; continue; } // missing genotype
- if ( mdp>0 && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; }
- if ( args->hom_only && !is_hom[j] ) { idx++; continue; }
-
- int min_pl = INT_MAX;
- for (k=0; k<npl; k++)
- {
- if ( ipl[k]==bcf_int32_missing || jpl[k]==bcf_int32_missing ) break;
- if ( ipl[k]==bcf_int32_vector_end || jpl[k]==bcf_int32_vector_end ) { k = npl; break; }
- if ( min_pl > ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k];
- }
- if ( k!=npl ) { idx++; continue; }
-
- if ( args->all_sites ) { sum += min_pl; nsum++; }
- args->lks[idx] += min_pl;
- args->cnts[idx]++;
-
- if ( mdp>0 )
- {
- args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j];
- dp[i] += dp_arr[i]; ndp[i]++;
- dp[j] += dp_arr[j]; ndp[j]++;
- }
- else
- {
- args->dps[idx]++;
- dp[i]++; ndp[i]++;
- dp[j]++; ndp[j]++;
- }
- idx++;
- }
- }
- if ( args->all_sites )
- fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0);
+ process_PL(args,line,ntot,ndif);
}
- if ( dp_arr ) free(dp_arr);
- if ( args->pl_arr ) free(args->pl_arr);
- if ( args->tmp_arr ) free(args->tmp_arr);
- if ( is_hom ) free(is_hom);
+
+ FILE *fp = pysam_stdout;
+ print_header(args, fp);
- if ( pl_warned ) fprintf(pysam_stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned);
- if ( dp_warned ) fprintf(pysam_stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned);
+ float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2);
- // Output samples sorted by average discordance
- double *score = (double*) calloc(nsamples,sizeof(double));
- args->sites = (double*) calloc(nsamples,sizeof(double));
- idx = 0;
- for (i=0; i<nsamples; i++)
+ // Output pairwise distances
+ fprintf(fp, "# ERR, error rate\t[2]Pairwise error rate\t[3]Number of sites compared\t[4]Sample i\t[5]Sample j\n");
+ int i,j, idx = 0;
+ for (i=0; i<args->nsmpl; i++)
{
for (j=0; j<i; j++)
{
- score[i] += args->lks[idx];
- score[j] += args->lks[idx];
- args->sites[i] += args->cnts[idx];
- args->sites[j] += args->cnts[idx];
+ float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10;
+ fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
+ PDIST(tmp,i,j) = err;
idx++;
}
}
- for (i=0; i<nsamples; i++)
- if ( args->sites[i] ) score[i] /= args->sites[i];
- double **p = (double**) malloc(sizeof(double*)*nsamples), avg_score = 0;
- for (i=0; i<nsamples; i++) p[i] = &score[i];
- qsort(p, nsamples, sizeof(int*), cmp_doubleptr);
- // The average discordance gives the number of differing sites in % with -G1
- fprintf(fp, "# [1]SM\t[2]Average Discordance\t[3]Average depth\t[4]Average number of sites\t[5]Sample\t[6]Sample ID\n");
- for (i=0; i<nsamples; i++)
+
+ // Cluster samples
+ int nlist;
+ float clust_max_err = args->max_intra_err;
+ hclust_t *clust = hclust_init(args->nsmpl,tmp);
+ cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist);
+ fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n");
+ for (i=0; i<nlist; i++)
{
- idx = p[i] - score;
- double adp = ndp[idx] ? (double)dp[idx]/ndp[idx] : 0;
- double nsites = args->sites[idx]/(nsamples-1);
- avg_score += score[idx];
- fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i);
+ fprintf(fp,"CLUSTER\t%f", list[i].dist);
+ for (j=0; j<list[i].nmemb; j++)
+ fprintf(fp,"\t%s",args->sm_hdr->samples[list[i].memb[j]]);
+ fprintf(fp,"\n");
}
-
- // // Overall score: maximum absolute deviation from the average score
- // fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n");
- // fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]); // idx still set
- free(p);
- free(score);
- free(dp);
- free(ndp);
-
- // Pairwise discordances
+ hclust_destroy_list(list,nlist);
+ // Debugging output: the cluster graph and data used for deciding
+ char **dbg = hclust_explain(clust,&nlist);
+ for (i=0; i<nlist; i++)
+ fprintf(fp,"DBG\t%s\n", dbg[i]);
+ fprintf(fp, "# TH, clustering threshold\t[2]Value\nTH\t%f\n",clust_max_err);
+ fprintf(fp, "# DOT\t[2]Cluster graph, visualize e.g. as \"this-output.txt | grep ^DOT | cut -f2- | dot -Tsvg -o graph.svg\"\n");
+ fprintf(fp, "DOT\t%s\n", hclust_create_dot(clust,args->sm_hdr->samples,clust_max_err));
+ hclust_destroy(clust);
+ free(tmp);
+
+
+ // Deprecated output for temporary backward compatibility
+ fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n");
fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
idx = 0;
- for (i=0; i<nsamples; i++)
+ for (i=0; i<args->nsmpl; i++)
{
for (j=0; j<i; j++)
{
- fprintf(fp, "CN\t%.0f\t%d\t%.2f\t%s\t%s\n", args->lks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0,
- args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
+ fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
idx++;
}
}
- fclose(fp);
- if ( args->plot )
- plot_cross_check(args);
+
+ free(ndif);
+ free(ntot);
+ free(args->tmp_arr);
}
static char *init_prefix(char *prefix)
@@ -715,6 +715,7 @@ static void usage(void)
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "Options:\n");
fprintf(pysam_stderr, " -a, --all-sites output comparison for all sites\n");
+ fprintf(pysam_stderr, " -c, --cluster <min,max> min inter- and max intra-sample error [0.23,-0.3]\n");
fprintf(pysam_stderr, " -g, --genotypes <file> genotypes to compare against\n");
fprintf(pysam_stderr, " -G, --GTs-only <int> use GTs, ignore PLs, using <int> for unseen genotypes [99]\n");
fprintf(pysam_stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n");
@@ -738,8 +739,16 @@ int main_vcfgtcheck(int argc, char *argv[])
char *regions = NULL, *targets = NULL;
int regions_is_file = 0, targets_is_file = 0;
+ // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
+ // - min_inter: pairs with smaller err value will be considered identical
+ // - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered
+ // different. If negative, the cutoff may be heuristically lowered
+ args->min_inter_err = 0.23;
+ args->max_intra_err = -0.3;
+
static struct option loptions[] =
{
+ {"cluster",1,0,'c'},
{"GTs-only",1,0,'G'},
{"all-sites",0,0,'a'},
{"homs-only",0,0,'H'},
@@ -755,8 +764,17 @@ int main_vcfgtcheck(int argc, char *argv[])
{0,0,0,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:c:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'c':
+ args->min_inter_err = strtod(optarg,&tmp);
+ if ( *tmp )
+ {
+ if ( *tmp!=',') error("Could not parse: -c %s\n", optarg);
+ args->max_intra_err = strtod(tmp+1,&tmp);
+ if ( *tmp ) error("Could not parse: -c %s\n", optarg);
+ }
+ break;
case 'G':
args->no_PLs = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg);
diff --git a/bcftools/vcfindex.c b/bcftools/vcfindex.c
index d1e9179..aa60fb2 100644
--- a/bcftools/vcfindex.c
+++ b/bcftools/vcfindex.c
@@ -1,4 +1,3 @@
-
/* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
Copyright (C) 2014-2016 Genome Research Ltd.
@@ -32,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */
#include <sys/stat.h>
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
+#include <htslib/kstring.h>
#include "bcftools.h"
#define BCF_LIDX_SHIFT 14
@@ -43,24 +43,22 @@ static void usage(void)
fprintf(stderr, "Usage: bcftools index [options] <in.bcf>|<in.vcf.gz>\n");
fprintf(stderr, "\n");
fprintf(stderr, "Indexing options:\n");
- fprintf(stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
- fprintf(stderr, " -f, --force overwrite index if it already exists\n");
- fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
- fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n");
+ fprintf(stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
+ fprintf(stderr, " -f, --force overwrite index if it already exists\n");
+ fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
+ fprintf(stderr, " -o, --output-file FILE optional output index file name\n");
+ fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n");
+ fprintf(stderr, " --threads sets the number of threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Stats options:\n");
fprintf(stderr, " -n, --nrecords print number of records based on existing index file\n");
- fprintf(stderr, " -s, --stats print per contig stats based on existing index file\n");
+ fprintf(stderr, " -s, --stats print per contig stats based on existing index file\n");
fprintf(stderr, "\n");
exit(1);
}
int vcf_index_stats(char *fname, int stats)
{
- char *fn_out = NULL;
- FILE *out;
- out = fn_out ? fopen(fn_out, "w") : stdout;
-
const char **seq;
int i, nseq;
tbx_t *tbx = NULL;
@@ -74,12 +72,12 @@ int vcf_index_stats(char *fname, int stats)
if ( hts_get_format(fp)->format==vcf )
{
tbx = tbx_index_load(fname);
- if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", fname); return 1; }
+ if ( !tbx ) { fprintf(stderr,"Could not load index for VCF: %s\n", fname); return 1; }
}
else if ( hts_get_format(fp)->format==bcf )
{
idx = bcf_index_load(fname);
- if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", fname); return 1; }
+ if ( !idx ) { fprintf(stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
}
else
{
@@ -97,7 +95,7 @@ int vcf_index_stats(char *fname, int stats)
if (stats&2 || !records) continue;
bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
- fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
+ printf("%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
}
if (!sum)
{
@@ -106,14 +104,13 @@ int vcf_index_stats(char *fname, int stats)
bcf1_t *rec = bcf_init1();
if (bcf_read1(fp, hdr, rec) >= 0)
{
- fprintf(stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname);
+ fprintf(stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname);
return 1;
}
bcf_destroy1(rec);
}
- if (stats&2) fprintf(out, "%" PRIu64 "\n", sum);
+ if (stats&2) printf("%" PRIu64 "\n", sum);
free(seq);
- fclose(out);
hts_close(fp);
bcf_hdr_destroy(hdr);
if (tbx)
@@ -125,8 +122,9 @@ int vcf_index_stats(char *fname, int stats)
int main_vcfindex(int argc, char *argv[])
{
- int c, force = 0, tbi = 0, stats = 0;
+ int c, force = 0, tbi = 0, stats = 0, n_threads = 0;
int min_shift = BCF_LIDX_SHIFT;
+ char *outfn = NULL;
static struct option loptions[] =
{
@@ -136,27 +134,33 @@ int main_vcfindex(int argc, char *argv[])
{"min-shift",required_argument,NULL,'m'},
{"stats",no_argument,NULL,'s'},
{"nrecords",no_argument,NULL,'n'},
+ {"threads",required_argument,NULL,9},
+ {"output-file",required_argument,NULL,'o'},
{NULL, 0, NULL, 0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "ctfm:sn", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "ctfm:sno:", loptions, NULL)) >= 0)
{
switch (c)
{
case 'c': tbi = 0; break;
case 't': tbi = 1; min_shift = 0; break;
case 'f': force = 1; break;
- case 'm':
+ case 'm':
min_shift = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg);
break;
case 's': stats |= 1; break;
case 'n': stats |= 2; break;
+ case 9:
+ n_threads = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
+ break;
+ case 'o': outfn = optarg; break;
default: usage();
}
}
- if ( optind==argc ) usage();
if (stats>2)
{
fprintf(stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
@@ -173,69 +177,48 @@ int main_vcfindex(int argc, char *argv[])
return 1;
}
- char *fname = argv[optind];
- if (stats) return vcf_index_stats(fname, stats);
-
- htsFile *fp = hts_open(fname,"r");
- if ( !fp ) error("Failed to read %s\n", fname);
- htsFormat type = *hts_get_format(fp);
- hts_close(fp);
-
- if ( (type.format!=bcf && type.format!=vcf) || type.compression!=bgzf )
+ char *fname = NULL;
+ if ( optind>=argc )
{
- fprintf(stderr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__);
- if ( type.compression!=bgzf )
- fprintf(stderr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__);
- return 1;
- }
- if (tbi && type.format==bcf)
- {
- fprintf(stderr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n");
- tbi = 0; min_shift = BCF_LIDX_SHIFT;
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage();
}
- if (min_shift == 0 && type.format==bcf)
- {
- fprintf(stderr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__);
- return 1;
- }
- if (!tbi && type.format==vcf && min_shift == 0)
+ else fname = argv[optind];
+ if (stats) return vcf_index_stats(fname, stats);
+
+ kstring_t idx_fname = {0,0,0};
+ if (outfn)
+ kputs(outfn,&idx_fname);
+ else
{
- fprintf(stderr, "[Warning] min-shift set to 0 for VCF file. Generating TBI file.\n");
- tbi = 1;
+ if (!strcmp(fname, "-")) { fprintf(stderr, "[E::%s] must specify an output path for index file when reading VCF/BCF from stdin\n", __func__); return 1; }
+ ksprintf(&idx_fname, "%s.%s", fname, tbi ? "tbi" : "csi");
}
-
if (!force)
{
// Before complaining about existing index, check if the VCF file isn't newer.
- char *idx_fname = (char*)alloca(strlen(fname) + 5);
- strcat(strcpy(idx_fname, fname), tbi ? ".tbi" : ".csi");
struct stat stat_tbi, stat_file;
- if ( stat(idx_fname, &stat_tbi)==0 )
+ if ( stat(idx_fname.s, &stat_tbi)==0 )
{
stat(fname, &stat_file);
if ( stat_file.st_mtime <= stat_tbi.st_mtime )
{
- fprintf(stderr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__);
+ fprintf(stderr,"[E::%s] the index file exists. Please use '-f' to overwrite %s\n", __func__, idx_fname.s);
+ free(idx_fname.s);
return 1;
}
}
}
- if (type.format==bcf)
- {
- if ( bcf_index_build(fname, min_shift) != 0 )
- {
- fprintf(stderr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname);
- return 1;
- }
- }
- else
- {
- if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 )
- {
- fprintf(stderr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname);
- return 1;
- }
+ int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads);
+ free(idx_fname.s);
+ if (ret != 0) {
+ if (ret == -2)
+ error("index: failed to open \"%s\"\n", fname);
+ else if (ret == -3)
+ error("index: \"%s\" is in a format that cannot be usefully indexed\n", fname);
+ else
+ error("index: failed to create index for \"%s\"\n", fname);
}
return 0;
}
diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c
index 479fc57..ff960b9 100644
--- a/bcftools/vcfindex.c.pysam.c
+++ b/bcftools/vcfindex.c.pysam.c
@@ -1,6 +1,5 @@
#include "pysam.h"
-
/* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
Copyright (C) 2014-2016 Genome Research Ltd.
@@ -34,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */
#include <sys/stat.h>
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
+#include <htslib/kstring.h>
#include "bcftools.h"
#define BCF_LIDX_SHIFT 14
@@ -45,24 +45,22 @@ static void usage(void)
fprintf(pysam_stderr, "Usage: bcftools index [options] <in.bcf>|<in.vcf.gz>\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "Indexing options:\n");
- fprintf(pysam_stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
- fprintf(pysam_stderr, " -f, --force overwrite index if it already exists\n");
- fprintf(pysam_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
- fprintf(pysam_stderr, " -t, --tbi generate TBI-format index for VCF files\n");
+ fprintf(pysam_stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
+ fprintf(pysam_stderr, " -f, --force overwrite index if it already exists\n");
+ fprintf(pysam_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
+ fprintf(pysam_stderr, " -o, --output-file FILE optional output index file name\n");
+ fprintf(pysam_stderr, " -t, --tbi generate TBI-format index for VCF files\n");
+ fprintf(pysam_stderr, " --threads sets the number of threads [0]\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "Stats options:\n");
fprintf(pysam_stderr, " -n, --nrecords print number of records based on existing index file\n");
- fprintf(pysam_stderr, " -s, --stats print per contig stats based on existing index file\n");
+ fprintf(pysam_stderr, " -s, --stats print per contig stats based on existing index file\n");
fprintf(pysam_stderr, "\n");
exit(1);
}
int vcf_index_stats(char *fname, int stats)
{
- char *fn_out = NULL;
- FILE *out;
- out = fn_out ? fopen(fn_out, "w") : pysam_stdout;
-
const char **seq;
int i, nseq;
tbx_t *tbx = NULL;
@@ -76,12 +74,12 @@ int vcf_index_stats(char *fname, int stats)
if ( hts_get_format(fp)->format==vcf )
{
tbx = tbx_index_load(fname);
- if ( !tbx ) { fprintf(pysam_stderr,"Could not load TBI index: %s\n", fname); return 1; }
+ if ( !tbx ) { fprintf(pysam_stderr,"Could not load index for VCF: %s\n", fname); return 1; }
}
else if ( hts_get_format(fp)->format==bcf )
{
idx = bcf_index_load(fname);
- if ( !idx ) { fprintf(pysam_stderr,"Could not load CSI index: %s\n", fname); return 1; }
+ if ( !idx ) { fprintf(pysam_stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
}
else
{
@@ -99,7 +97,7 @@ int vcf_index_stats(char *fname, int stats)
if (stats&2 || !records) continue;
bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
- fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
+ fprintf(pysam_stdout, "%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
}
if (!sum)
{
@@ -108,14 +106,13 @@ int vcf_index_stats(char *fname, int stats)
bcf1_t *rec = bcf_init1();
if (bcf_read1(fp, hdr, rec) >= 0)
{
- fprintf(pysam_stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname);
+ fprintf(pysam_stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname);
return 1;
}
bcf_destroy1(rec);
}
- if (stats&2) fprintf(out, "%" PRIu64 "\n", sum);
+ if (stats&2) fprintf(pysam_stdout, "%" PRIu64 "\n", sum);
free(seq);
- fclose(out);
hts_close(fp);
bcf_hdr_destroy(hdr);
if (tbx)
@@ -127,8 +124,9 @@ int vcf_index_stats(char *fname, int stats)
int main_vcfindex(int argc, char *argv[])
{
- int c, force = 0, tbi = 0, stats = 0;
+ int c, force = 0, tbi = 0, stats = 0, n_threads = 0;
int min_shift = BCF_LIDX_SHIFT;
+ char *outfn = NULL;
static struct option loptions[] =
{
@@ -138,27 +136,33 @@ int main_vcfindex(int argc, char *argv[])
{"min-shift",required_argument,NULL,'m'},
{"stats",no_argument,NULL,'s'},
{"nrecords",no_argument,NULL,'n'},
+ {"threads",required_argument,NULL,9},
+ {"output-file",required_argument,NULL,'o'},
{NULL, 0, NULL, 0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "ctfm:sn", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "ctfm:sno:", loptions, NULL)) >= 0)
{
switch (c)
{
case 'c': tbi = 0; break;
case 't': tbi = 1; min_shift = 0; break;
case 'f': force = 1; break;
- case 'm':
+ case 'm':
min_shift = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg);
break;
case 's': stats |= 1; break;
case 'n': stats |= 2; break;
+ case 9:
+ n_threads = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
+ break;
+ case 'o': outfn = optarg; break;
default: usage();
}
}
- if ( optind==argc ) usage();
if (stats>2)
{
fprintf(pysam_stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
@@ -175,69 +179,48 @@ int main_vcfindex(int argc, char *argv[])
return 1;
}
- char *fname = argv[optind];
- if (stats) return vcf_index_stats(fname, stats);
-
- htsFile *fp = hts_open(fname,"r");
- if ( !fp ) error("Failed to read %s\n", fname);
- htsFormat type = *hts_get_format(fp);
- hts_close(fp);
-
- if ( (type.format!=bcf && type.format!=vcf) || type.compression!=bgzf )
+ char *fname = NULL;
+ if ( optind>=argc )
{
- fprintf(pysam_stderr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__);
- if ( type.compression!=bgzf )
- fprintf(pysam_stderr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__);
- return 1;
- }
- if (tbi && type.format==bcf)
- {
- fprintf(pysam_stderr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n");
- tbi = 0; min_shift = BCF_LIDX_SHIFT;
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage();
}
- if (min_shift == 0 && type.format==bcf)
- {
- fprintf(pysam_stderr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__);
- return 1;
- }
- if (!tbi && type.format==vcf && min_shift == 0)
+ else fname = argv[optind];
+ if (stats) return vcf_index_stats(fname, stats);
+
+ kstring_t idx_fname = {0,0,0};
+ if (outfn)
+ kputs(outfn,&idx_fname);
+ else
{
- fprintf(pysam_stderr, "[Warning] min-shift set to 0 for VCF file. Generating TBI file.\n");
- tbi = 1;
+ if (!strcmp(fname, "-")) { fprintf(pysam_stderr, "[E::%s] must specify an output path for index file when reading VCF/BCF from stdin\n", __func__); return 1; }
+ ksprintf(&idx_fname, "%s.%s", fname, tbi ? "tbi" : "csi");
}
-
if (!force)
{
// Before complaining about existing index, check if the VCF file isn't newer.
- char *idx_fname = (char*)alloca(strlen(fname) + 5);
- strcat(strcpy(idx_fname, fname), tbi ? ".tbi" : ".csi");
struct stat stat_tbi, stat_file;
- if ( stat(idx_fname, &stat_tbi)==0 )
+ if ( stat(idx_fname.s, &stat_tbi)==0 )
{
stat(fname, &stat_file);
if ( stat_file.st_mtime <= stat_tbi.st_mtime )
{
- fprintf(pysam_stderr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__);
+ fprintf(pysam_stderr,"[E::%s] the index file exists. Please use '-f' to overwrite %s\n", __func__, idx_fname.s);
+ free(idx_fname.s);
return 1;
}
}
}
- if (type.format==bcf)
- {
- if ( bcf_index_build(fname, min_shift) != 0 )
- {
- fprintf(pysam_stderr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname);
- return 1;
- }
- }
- else
- {
- if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 )
- {
- fprintf(pysam_stderr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname);
- return 1;
- }
+ int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads);
+ free(idx_fname.s);
+ if (ret != 0) {
+ if (ret == -2)
+ error("index: failed to open \"%s\"\n", fname);
+ else if (ret == -3)
+ error("index: \"%s\" is in a format that cannot be usefully indexed\n", fname);
+ else
+ error("index: failed to create index for \"%s\"\n", fname);
}
return 0;
}
diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c
index 02fac6b..1aeb739 100644
--- a/bcftools/vcfmerge.c
+++ b/bcftools/vcfmerge.c
@@ -1,6 +1,6 @@
/* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -24,28 +24,39 @@ THE SOFTWARE. */
#include <stdio.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
+#include <htslib/faidx.h>
#include <math.h>
#include <ctype.h>
+#include <time.h>
#include "bcftools.h"
+#include "regidx.h"
#include "vcmp.h"
+#define DBG 0
+
#include <htslib/khash.h>
KHASH_MAP_INIT_STR(strdict, int)
typedef khash_t(strdict) strdict_t;
-#define SKIP_DONE 1
-#define SKIP_DIFF 2
+#define FLT_LOGIC_ADD 0
+#define FLT_LOGIC_REMOVE 1
+
+#define SKIP_DONE 1 // the record was processed
+#define SKIP_DIFF 2 // not compatible, merge later
#define IS_VL_G(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_G)
#define IS_VL_A(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_A)
#define IS_VL_R(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_R)
+#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
+
// For merging INFO Number=A,G,R tags
typedef struct
{
@@ -63,43 +74,61 @@ typedef struct _info_rule_t
void (*merger)(bcf_hdr_t *hdr, bcf1_t *line, struct _info_rule_t *rule);
int type; // one of BCF_HT_*
int block_size; // number of values in a block
+ int type_size; // size of the corresponding BCF_HT_* type
int nblocks; // number of blocks in nvals (the number of merged files)
int nvals, mvals; // used and total size of vals array
void *vals; // the info tag values
}
info_rule_t;
+typedef struct
+{
+ bcf1_t *line;
+ int end, active;
+}
+gvcf_aux_t;
+
// Auxiliary merge data for selecting the right combination
// of buffered records across multiple readers. maux1_t
// corresponds to one buffered line.
typedef struct
{
int skip;
- int *map; // mapping from input alleles to the output array
+ int *map; // mapping from input alleles to the array of output alleles (set by merge_alleles)
int mmap; // size of map array (only buffer[i].n_allele is actually used)
int als_differ;
}
maux1_t;
typedef struct
{
- int n; // number of readers
+ int rid; // current rid
+ int beg,end; // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush.
+ int cur; // current line or -1 if none
+ int npos; // number of unprocessed lines at this position
+ int mrec; // allocated size of buf
+ maux1_t *rec; // buffer to keep reader's lines
+ bcf1_t **lines; // source buffer: either gvcf or readers' buffer
+}
+buffer_t;
+typedef struct
+{
+ int n, pos, var_types; // number of readers, current position, currently available variant types
+ char *chr; // current chromosome
char **als, **out_als; // merged alleles (temp, may contain empty records) and merged alleles ready for output
int nals, mals, nout_als, mout_als; // size of the output array
int *cnt, ncnt; // number of records that refer to the alleles
- int *nbuf; // readers have buffers of varying lengths
int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases)
- int *flt, mflt, minf;
- bcf_info_t *inf;// out_line's INFO fields
bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to i*nreader+j, first row is reserved for GT
int nfmt_map; // number of rows in the fmt_map array
int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes
void *tmp_arr;
int ntmp_arr;
- maux1_t **d; // d[i][j] i-th reader, j-th buffer line
+ buffer_t *buf;
AGR_info_t *AGR_info;
int nAGR_info, mAGR_info;
bcf_srs_t *files;
- int *has_line; // which files are being merged
+ int gvcf_min, gvcf_break; // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present
+ gvcf_aux_t *gvcf; // buffer of gVCF lines
}
maux_t;
@@ -107,8 +136,11 @@ typedef struct
{
vcmp_t *vcmp;
maux_t *maux;
- int header_only, collapse, output_type, force_samples, merge_by_id;
+ regidx_t *regs; // apply regions only after the blocks are expanded
+ regitr_t *regs_itr;
+ int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref;
char *header_fname, *output_fname, *regions_list, *info_rules, *file_list;
+ faidx_t *gvcf_fai;
info_rule_t *rules;
int nrules;
strdict_t *tmph;
@@ -122,6 +154,14 @@ typedef struct
}
args_t;
+static bcf1_t *maux_get_line(args_t *args, int i)
+{
+ maux_t *ma = args->maux;
+ int ibuf = ma->buf[i].cur;
+ if ( ibuf >= 0 ) return ma->buf[i].lines[ibuf];
+ return NULL;
+}
+
static void info_rules_merge_sum(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
{
if ( !rule->nvals ) return;
@@ -247,6 +287,32 @@ static void info_rules_init(args_t *args)
if ( str.l ) kputc(',',&str);
kputs("DP4:sum",&str);
}
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "QS")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("QS:sum",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "MinDP")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("MinDP:min",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "I16")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("I16:sum",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "IDV")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("IDV:max",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "IMF")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("IMF:max",&str);
+ }
+
if ( !str.l ) return;
args->info_rules = str.s;
}
@@ -272,9 +338,12 @@ static void info_rules_init(args_t *args)
int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag);
if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The tag is not defined in the header: \"%s\"\n", rule->hdr_tag);
rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id);
- if ( rule->type!=BCF_HT_INT && rule->type!=BCF_HT_REAL && rule->type!=BCF_HT_STR ) error("The type is not supported: \"%s\"\n", rule->hdr_tag);
+ if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t);
+ else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float);
+ else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char);
+ else error("The type is not supported: \"%s\"\n", rule->hdr_tag);
- while ( *ss ) ss++; ss++;
+ ss = strchr(ss, '\0'); ss++;
if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag);
int is_join = 0;
@@ -300,7 +369,8 @@ static void info_rules_init(args_t *args)
error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag);
}
- while ( *ss ) ss++; ss++; n++;
+ ss = strchr(ss, '\0'); ss++;
+ n++;
}
free(str.s);
free(tmp);
@@ -326,8 +396,10 @@ static void info_rules_reset(args_t *args)
}
static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule, maux1_t *als, int var_len)
{
- int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &args->maux->ntmp_arr, rule->type);
+ int msize = args->maux->ntmp_arr / rule->type_size;
+ int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &msize, rule->type);
if ( ret<=0 ) error("FIXME: error parsing %s at %s:%d .. %d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1,ret);
+ args->maux->ntmp_arr = msize * rule->type_size;
rule->nblocks++;
@@ -345,7 +417,7 @@ static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, inf
int i, j;
if ( var_len==BCF_VL_A )
{
- assert( ret==line->n_allele-1 );
+ if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1);
args->maux->nagr_map = ret;
hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
// create mapping from source file ALT indexes to dst file indexes
@@ -354,7 +426,7 @@ static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, inf
}
else if ( var_len==BCF_VL_R )
{
- assert( ret==line->n_allele );
+ if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1);
args->maux->nagr_map = ret;
hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
for (i=0; i<ret; i++) args->maux->agr_map[i] = als->map[i];
@@ -556,6 +628,8 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb)
{
for (i=0; i<*nb; i++)
{
+ if ( b[i][0]=='<' ) continue; // symbolic allele, do not modify
+ if ( b[i][0]=='*' ) continue; // overlapping deletion (*), do not modify
int l = strlen(b[i]);
b[i] = (char*) realloc(b[i],l+rla-rlb+1);
memcpy(b[i]+l,a[0]+rlb,rla-rlb+1);
@@ -565,13 +639,15 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb)
// now check if the $a alleles are present and if not add them
for (i=1; i<na; i++)
{
+ int const_ai = 1;
char *ai;
- if ( rlb>rla ) // $a alleles need expanding
+ if ( rlb>rla && a[i][0]!='<' && a[i][0]!='*' ) // $a alleles need expanding and not a symbolic allele or *
{
int l = strlen(a[i]);
ai = (char*) malloc(l+rlb-rla+1);
memcpy(ai,a[i],l);
memcpy(ai+l,b[0]+rla,rlb-rla+1);
+ const_ai = 0;
}
else
ai = a[i];
@@ -582,42 +658,59 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb)
if ( j<*nb ) // $b already has the same allele
{
map[i] = j;
- if ( rlb>rla ) free(ai);
+ if ( !const_ai ) free(ai);
continue;
}
// new allele
map[i] = *nb;
- b[*nb] = rlb>rla ? ai : strdup(ai);
+ if ( b[*nb] ) free(b[*nb]);
+ b[*nb] = const_ai ? strdup(ai) : ai;
(*nb)++;
}
return b;
}
-maux_t *maux_init(bcf_srs_t *files)
+maux_t *maux_init(args_t *args)
{
+ bcf_srs_t *files = args->files;
maux_t *ma = (maux_t*) calloc(1,sizeof(maux_t));
ma->n = files->nreaders;
- ma->nbuf = (int *) calloc(ma->n,sizeof(int));
- ma->d = (maux1_t**) calloc(ma->n,sizeof(maux1_t*));
ma->files = files;
int i, n_smpl = 0;
for (i=0; i<ma->n; i++)
n_smpl += bcf_hdr_nsamples(files->readers[i].header);
+ if ( args->do_gvcf )
+ {
+ ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t));
+ for (i=0; i<ma->n; i++)
+ ma->gvcf[i].line = bcf_init1();
+ }
ma->smpl_ploidy = (int*) calloc(n_smpl,sizeof(int));
ma->smpl_nGsize = (int*) malloc(n_smpl*sizeof(int));
- ma->has_line = (int*) malloc(ma->n*sizeof(int));
+ ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t));
+ for (i=0; i<ma->n; i++)
+ ma->buf[i].rid = -1;
return ma;
}
void maux_destroy(maux_t *ma)
{
- int i;
+ int i,j;
+ for (i=0; i<ma->mals; i++)
+ {
+ free(ma->als[i]);
+ ma->als[i] = NULL;
+ }
for (i=0; i<ma->n; i++) // for each reader
{
- if ( !ma->d[i] ) continue;
- int j;
- for (j=0; j<ma->nbuf[i]; j++) // for each buffered line
- if ( ma->d[i][j].map ) free(ma->d[i][j].map);
- free(ma->d[i]);
+ for (j=0; j<ma->buf[i].mrec; j++) // for each buffered line
+ free(ma->buf[i].rec[j].map);
+ free(ma->buf[i].rec);
+ }
+ free(ma->buf);
+ if ( ma->gvcf )
+ {
+ for (i=0; i<ma->n; i++) bcf_destroy(ma->gvcf[i].line);
+ free(ma->gvcf);
}
for (i=0; i<ma->mAGR_info; i++)
free(ma->AGR_info[i].buf);
@@ -626,32 +719,69 @@ void maux_destroy(maux_t *ma)
if (ma->ntmp_arr) free(ma->tmp_arr);
if (ma->nfmt_map) free(ma->fmt_map);
// ma->inf freed in bcf_destroy1
- free(ma->d);
- free(ma->nbuf);
for (i=0; i<ma->mals; i++) free(ma->als[i]);
if (ma->mout_als) free(ma->out_als);
free(ma->als);
free(ma->cnt);
free(ma->smpl_ploidy);
free(ma->smpl_nGsize);
- free(ma->has_line);
+ free(ma->chr);
free(ma);
}
-void maux_expand1(maux_t *ma, int i)
+void maux_expand1(buffer_t *buf, int size)
{
- if ( ma->nbuf[i] <= ma->files->readers[i].nbuffer )
+ if ( buf->mrec < size )
{
- int n = ma->files->readers[i].nbuffer + 1;
- ma->d[i] = (maux1_t*) realloc(ma->d[i], sizeof(maux1_t)*n);
- memset(ma->d[i]+ma->nbuf[i],0,sizeof(maux1_t)*(n-ma->nbuf[i]));
- ma->nbuf[i] = n;
+ hts_expand0(maux1_t,size,buf->mrec,buf->rec);
+ buf->mrec = size;
}
}
void maux_reset(maux_t *ma)
{
- int i;
- for (i=0; i<ma->n; i++) maux_expand1(ma, i);
- for (i=1; i<ma->ncnt; i++) ma->cnt[i] = 0;
+ int i,j;
+ for (i=0; i<ma->n; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1);
+ for (i=0; i<ma->ncnt; i++) ma->cnt[i] = 0;
+ for (i=0; i<ma->mals; i++)
+ {
+ free(ma->als[i]);
+ ma->als[i] = NULL;
+ }
+ const char *chr = NULL;
+ ma->nals = 0;
+ ma->pos = -1;
+ for (i=0; i<ma->n; i++)
+ {
+ if ( !bcf_sr_has_line(ma->files,i) ) continue;
+ bcf1_t *line = bcf_sr_get_line(ma->files,i);
+ bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i);
+ chr = bcf_seqname(hdr,line);
+ ma->pos = line->pos;
+ break;
+ }
+ if ( chr )
+ {
+ free(ma->chr);
+ ma->chr = strdup(chr);
+ }
+ for (i=0; i<ma->n; i++)
+ {
+ bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i);
+ ma->buf[i].rid = bcf_hdr_name2id(hdr,chr);
+ ma->buf[i].beg = bcf_sr_has_line(ma->files,i) ? 0 : 1;
+ for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++)
+ {
+ ma->buf[i].rec[j].skip = 0;
+ bcf1_t *line = ma->files->readers[i].buffer[j];
+ if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break;
+ }
+ ma->buf[i].end = j;
+ ma->buf[i].cur = -1;
+ if ( ma->buf[i].beg < ma->buf[i].end )
+ {
+ ma->buf[i].lines = ma->files->readers[i].buffer;
+ if ( ma->gvcf ) ma->gvcf[i].active = 0; // gvcf block cannot overlap with the next record
+ }
+ }
}
void maux_debug(maux_t *ma, int ir, int ib)
{
@@ -684,16 +814,20 @@ void merge_chrom2qual(args_t *args, bcf1_t *out)
out->pos = -1;
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ bcf1_t *line = maux_get_line(args, i);
+ if ( !line ) continue;
+ bcf_unpack(line, BCF_UN_ALL);
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
- // alleles
+ // not all maux alleles are always used, mark the ones we'll need
int j;
for (j=1; j<line->n_allele; j++)
- al_idxs[ ma->d[i][0].map[j] ] = 1;
+ {
+ int irec = ma->buf[i].cur;
+ al_idxs[ ma->buf[i].rec[irec].map[j] ] = 1;
+ }
// position
if ( out->pos==-1 )
@@ -717,16 +851,15 @@ void merge_chrom2qual(args_t *args, bcf1_t *out)
}
// set QUAL to the max qual value. Not exactly correct, but good enough for now
- if ( !bcf_float_is_missing(files->readers[i].buffer[0]->qual) )
+ if ( !bcf_float_is_missing(line->qual) )
{
- if ( bcf_float_is_missing(out->qual) || out->qual < files->readers[i].buffer[0]->qual ) out->qual = files->readers[i].buffer[0]->qual;
+ if ( bcf_float_is_missing(out->qual) || out->qual < line->qual ) out->qual = line->qual;
}
}
// set ID
if ( !tmps->l ) kputs(".", tmps);
- if ( out->d.id ) free(out->d.id);
- out->d.id = strdup(tmps->s);
+ bcf_update_id(out_hdr, out, tmps->s);
// set alleles
ma->nout_als = 0;
@@ -740,10 +873,13 @@ void merge_chrom2qual(args_t *args, bcf1_t *out)
int ir, j;
for (ir=0; ir<files->nreaders; ir++)
{
- if ( !ma->has_line[ir] ) continue;
- bcf1_t *line = files->readers[ir].buffer[0];
+ bcf1_t *line = maux_get_line(args,ir);
+ if ( !line ) continue;
for (j=1; j<line->n_allele; j++)
- if ( ma->d[ir][0].map[j]==i ) ma->d[ir][0].map[j] = ma->nout_als;
+ {
+ int irec = ma->buf[ir].cur;
+ if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als;
+ }
}
}
// Expand the arrays and realloc the alleles string. Note that all alleles are in a single allocated block.
@@ -765,20 +901,36 @@ void merge_filter(args_t *args, bcf1_t *out)
bcf_hdr_t *out_hdr = args->out_hdr;
int i, ret;
+ if ( args->filter_logic == FLT_LOGIC_REMOVE )
+ {
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf1_t *line = maux_get_line(args, i);
+ if ( !line ) continue;
+ bcf_sr_t *reader = &files->readers[i];
+ bcf_hdr_t *hdr = reader->header;
+ if ( bcf_has_filter(hdr, line, "PASS") ) break;
+ }
+ if ( i<files->nreaders )
+ {
+ int flt_id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS");
+ bcf_add_filter(out_hdr, out, flt_id);
+ return;
+ }
+ }
+
khiter_t kitr;
strdict_t *tmph = args->tmph;
kh_clear(strdict, tmph);
- maux_t *ma = args->maux;
out->d.n_flt = 0;
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i]) continue;
+ bcf1_t *line = maux_get_line(args, i);
+ if ( !line ) continue;
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
- bcf_unpack(line, BCF_UN_ALL);
int k;
for (k=0; k<line->d.n_flt; k++)
@@ -789,8 +941,8 @@ void merge_filter(args_t *args, bcf1_t *out)
{
int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, flt);
if ( id==-1 ) error("Error: The filter is not defined in the header: %s\n", flt);
- hts_expand(int,out->d.n_flt+1,ma->mflt,ma->flt);
- ma->flt[out->d.n_flt] = id;
+ hts_expand(int,out->d.n_flt+1,out->d.m_flt,out->d.flt);
+ out->d.flt[out->d.n_flt] = id;
out->d.n_flt++;
kh_put(strdict, tmph, flt, &ret);
}
@@ -801,20 +953,17 @@ void merge_filter(args_t *args, bcf1_t *out)
{
int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS");
for (i=0; i<out->d.n_flt; i++)
- if ( ma->flt[i]==id ) break;
+ if ( out->d.flt[i]==id ) break;
if ( i<out->d.n_flt )
{
out->d.n_flt--;
- for (; i<out->d.n_flt; i++) ma->flt[i] = ma->flt[i+1];
+ for (; i<out->d.n_flt; i++) out->d.flt[i] = out->d.flt[i+1];
}
}
- out->d.flt = ma->flt;
}
static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *tmp_str)
{
- assert( !info->vptr_free );
-
uint8_t *ptr = info->vptr - info->vptr_off;
bcf_dec_typed_int1(ptr, &ptr);
@@ -833,8 +982,6 @@ static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *t
kputsn_(info->vptr, info->len << bcf_type_shift[info->type], tmp_str);
info->vptr = (uint8_t*) tmp_str->s + info->vptr_off;
- info->vptr_free = 1;
- line->d.shared_dirty |= BCF1_DIRTY_INF;
tmp_str->s = NULL;
tmp_str->m = 0;
tmp_str->l = 0;
@@ -1029,9 +1176,10 @@ void merge_info(args_t *args, bcf1_t *out)
info_rules_reset(args);
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ bcf1_t *line = maux_get_line(args,i);
+ if ( !line ) continue;
+ int irec = ma->buf[i].cur;
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
for (j=0; j<line->n_info; j++)
{
@@ -1050,7 +1198,7 @@ void merge_info(args_t *args, bcf1_t *out)
info_rule_t *rule = (info_rule_t*) bsearch(key, args->rules, args->nrules, sizeof(*args->rules), info_rules_comp_key);
if ( rule )
{
- maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->d[i][0] : NULL;
+ maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->buf[i].rec[irec] : NULL;
if ( info_rules_add_values(args, hdr, line, rule, als, len) ) continue;
}
}
@@ -1061,7 +1209,7 @@ void merge_info(args_t *args, bcf1_t *out)
{
if ( kitr == kh_end(tmph) )
{
- // first occurance in this reader, alloc arrays
+ // seeing this key for the first time
ma->nAGR_info++;
hts_expand0(AGR_info_t,ma->nAGR_info,ma->mAGR_info,ma->AGR_info);
kitr = kh_put(strdict, tmph, key, &ret);
@@ -1079,37 +1227,36 @@ void merge_info(args_t *args, bcf1_t *out)
kitr = kh_get(strdict, tmph, key);
int idx = kh_val(tmph, kitr);
if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%d\n", key,bcf_seqname(hdr,line),line->pos+1);
- merge_AGR_info_tag(hdr, line,inf,len,&ma->d[i][0],&ma->AGR_info[idx]);
+ merge_AGR_info_tag(hdr, line,inf,len,&ma->buf[i].rec[irec],&ma->AGR_info[idx]);
continue;
}
if ( kitr == kh_end(tmph) )
{
- hts_expand0(bcf_info_t,out->n_info+1,ma->minf,ma->inf);
- ma->inf[out->n_info].key = id;
- ma->inf[out->n_info].type = inf->type;
- ma->inf[out->n_info].len = inf->len;
- ma->inf[out->n_info].vptr = inf->vptr;
- ma->inf[out->n_info].v1.i = inf->v1.i;
- ma->inf[out->n_info].v1.f = inf->v1.f;
- ma->inf[out->n_info].vptr_off = inf->vptr_off;
- ma->inf[out->n_info].vptr_len = inf->vptr_len;
- ma->inf[out->n_info].vptr_free = inf->vptr_free;
+ // Seeing this key for the first time. Although quite hacky,
+ // this is faster than anything else given the data structures..
+
+ hts_expand0(bcf_info_t,out->n_info+1,out->d.m_info,out->d.info);
+ out->d.info[out->n_info].key = id;
+ out->d.info[out->n_info].type = inf->type;
+ out->d.info[out->n_info].len = inf->len;
+ out->d.info[out->n_info].v1.i = inf->v1.i;
+ out->d.info[out->n_info].v1.f = inf->v1.f;
+ out->d.info[out->n_info].vptr_off = inf->vptr_off;
+ out->d.info[out->n_info].vptr_len = inf->vptr_len;
+ out->d.info[out->n_info].vptr_free = 1;
+ out->d.info[out->n_info].vptr = (uint8_t*) malloc(inf->vptr_len+inf->vptr_off);
+ memcpy(out->d.info[out->n_info].vptr,inf->vptr-inf->vptr_off, inf->vptr_len+inf->vptr_off);
+ out->d.info[out->n_info].vptr += inf->vptr_off;
if ( (args->output_type & FT_BCF) && id!=bcf_hdr_id2int(hdr, BCF_DT_ID, key) )
- {
- // The existing packed info cannot be reused. Change the id.
- // Although quite hacky, it's faster than anything else given
- // the data structures
- bcf_info_set_id(out, &ma->inf[out->n_info], id, &args->tmps);
- }
+ bcf_info_set_id(out, &out->d.info[out->n_info], id, &args->tmps);
+ out->d.shared_dirty |= BCF1_DIRTY_INF;
out->n_info++;
kitr = kh_put(strdict, tmph, key, &ret);
kh_val(tmph,kitr) = -(out->n_info-1); // arbitrary negative value
}
}
}
- out->d.info = ma->inf;
- out->d.m_info = ma->minf;
for (i=0; i<args->nrules; i++)
args->rules[i].merger(args->out_hdr, out, &args->rules[i]);
for (i=0; i<ma->nAGR_info; i++)
@@ -1154,12 +1301,14 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
}
memset(ma->smpl_ploidy,0,nsamples*sizeof(int));
+ int default_gt = args->missing_to_ref ? bcf_gt_unphased(0) : bcf_gt_missing;
for (i=0; i<files->nreaders; i++)
{
bcf_sr_t *reader = &files->readers[i];
bcf_hdr_t *hdr = reader->header;
bcf_fmt_t *fmt_ori = fmt_map[i];
int32_t *tmp = (int32_t *) ma->tmp_arr + ismpl*nsize;
+ int irec = ma->buf[i].cur;
int j, k;
if ( !fmt_ori )
@@ -1167,7 +1316,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
// missing values: assume maximum ploidy
for (j=0; j<bcf_hdr_nsamples(hdr); j++)
{
- for (k=0; k<nsize; k++) { tmp[k] = 0; ma->smpl_ploidy[ismpl+j]++; }
+ for (k=0; k<nsize; k++) { tmp[k] = default_gt; ma->smpl_ploidy[ismpl+j]++; }
tmp += nsize;
}
ismpl += bcf_hdr_nsamples(hdr);
@@ -1176,7 +1325,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
#define BRANCH(type_t, vector_end) { \
type_t *p_ori = (type_t*) fmt_ori->p; \
- if ( !ma->d[i][0].als_differ ) \
+ if ( !ma->buf[i].rec[irec].als_differ ) \
{ \
/* the allele numbering is unchanged */ \
for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
@@ -1206,7 +1355,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
else \
{ \
int al = (p_ori[k]>>1) - 1; \
- al = al<=0 ? al + 1 : ma->d[i][0].map[al] + 1; \
+ al = al<=0 ? al + 1 : ma->buf[i].rec[irec].map[al] + 1; \
tmp[k] = (al << 1) | ((p_ori[k])&1); \
} \
} \
@@ -1239,7 +1388,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
int nsize = 0, length = BCF_VL_FIXED, type = -1;
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ if ( !maux_get_line(args,i) ) continue;
if ( !fmt_map[i] ) continue;
if ( !key ) key = files->readers[i].header->id[BCF_DT_ID][fmt_map[i]->id].key;
type = fmt_map[i]->type;
@@ -1277,10 +1426,12 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
bcf_sr_t *reader = &files->readers[i];
bcf_hdr_t *hdr = reader->header;
bcf_fmt_t *fmt_ori = fmt_map[i];
+ bcf1_t *line = maux_get_line(args, i);
+ int irec = ma->buf[i].cur;
if ( fmt_ori )
{
type = fmt_ori->type;
- int nals_ori = reader->buffer[0]->n_allele;
+ int nals_ori = line->n_allele;
if ( length==BCF_VL_G )
{
// if all fields are missing then n==1 is valid
@@ -1313,10 +1464,8 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
ismpl += bcf_hdr_nsamples(hdr); \
continue; \
} \
- assert( ma->has_line[i] ); \
- bcf1_t *line = reader->buffer[0]; \
src_type_t *src = (src_type_t*) fmt_ori->p; \
- if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->d[i][0].als_differ) ) \
+ if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->buf[i].rec[irec].als_differ) ) \
{ \
/* alleles unchanged, copy over */ \
for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
@@ -1358,7 +1507,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
int iori, inew; \
for (iori=0; iori<line->n_allele; iori++) \
{ \
- inew = ma->d[i][0].map[iori]; \
+ inew = ma->buf[i].rec[irec].map[iori]; \
src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + iori; \
tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
if ( src_is_vector_end ) break; \
@@ -1372,10 +1521,10 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
int iori,jori, inew,jnew; \
for (iori=0; iori<line->n_allele; iori++) \
{ \
- inew = ma->d[i][0].map[iori]; \
+ inew = ma->buf[i].rec[irec].map[iori]; \
for (jori=0; jori<=iori; jori++) \
{ \
- jnew = ma->d[i][0].map[jori]; \
+ jnew = ma->buf[i].rec[irec].map[jori]; \
int kori = iori*(iori+1)/2 + jori; \
int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \
src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + kori; \
@@ -1412,7 +1561,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
int iori,inew; \
for (iori=ifrom; iori<line->n_allele; iori++) \
{ \
- inew = ma->d[i][0].map[iori] - ifrom; \
+ inew = ma->buf[i].rec[irec].map[iori] - ifrom; \
tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
if ( src_is_vector_end ) break; \
if ( src_is_missing ) tgt_set_missing; \
@@ -1461,9 +1610,9 @@ void merge_format(args_t *args, bcf1_t *out)
int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ bcf1_t *line = maux_get_line(args,i);
+ if ( !line ) continue;
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
for (j=0; j<line->n_fmt; j++)
{
@@ -1495,9 +1644,10 @@ void merge_format(args_t *args, bcf1_t *out)
ma->fmt_map[ifmt*files->nreaders+i] = fmt;
}
// Check if the allele numbering must be changed
- for (j=1; j<reader->buffer[0]->n_allele; j++)
- if ( ma->d[i][0].map[j]!=j ) break;
- ma->d[i][0].als_differ = j==reader->buffer[0]->n_allele ? 0 : 1;
+ int irec = ma->buf[i].cur;
+ for (j=1; j<line->n_allele; j++)
+ if ( ma->buf[i].rec[irec].map[j]!=j ) break;
+ ma->buf[i].rec[irec].als_differ = j==line->n_allele ? 0 : 1;
}
out->n_sample = bcf_hdr_nsamples(out_hdr);
@@ -1505,203 +1655,383 @@ void merge_format(args_t *args, bcf1_t *out)
merge_GT(args, ma->fmt_map, out);
update_AN_AC(out_hdr, out);
- if ( out->d.info!=ma->inf )
- {
- // hacky, we rely on htslib internals: bcf_update_info() reallocated the info
- ma->inf = out->d.info;
- ma->minf = out->d.m_info;
- }
-
for (i=1; i<=max_ifmt; i++)
merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
out->d.indiv_dirty = 1;
}
-// The core merging function, one or none line from each reader
-void merge_line(args_t *args)
+void gvcf_set_alleles(args_t *args)
+{
+ int i,k;
+ bcf_srs_t *files = args->files;
+ maux_t *maux = args->maux;
+ gvcf_aux_t *gaux = maux->gvcf;
+ maux->nals = 0;
+
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ bcf1_t *line = maux_get_line(args, i);
+ int irec = maux->buf[i].cur;
+
+ hts_expand(int, line->n_allele, maux->buf[i].rec[irec].mmap, maux->buf[i].rec[irec].map);
+ if ( !maux->nals ) // first record, copy the alleles to the output
+ {
+ maux->nals = line->n_allele;
+ hts_expand0(char*, maux->nals, maux->mals, maux->als);
+ hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+ for (k=0; k<maux->nals; k++)
+ {
+ if ( maux->als[k] ) free(maux->als[k]);
+ maux->als[k] = strdup(line->d.allele[k]);
+ maux->buf[i].rec[irec].map[k] = k;
+ }
+ }
+ else
+ {
+ maux->als = merge_alleles(line->d.allele, line->n_allele, maux->buf[i].rec[irec].map, maux->als, &maux->nals, &maux->mals);
+ if ( !maux->als )
+ {
+ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i);
+ error("Failed to merge alleles at %s:%d\n",bcf_seqname(hdr,line),line->pos+1);
+ }
+ }
+ }
+}
+
+/*
+ Output staged gVCF blocks, end is the last position of the block. Assuming
+ gaux[i].active flags are set and maux_get_line returns correct lines.
+*/
+void gvcf_write_block(args_t *args, int start, int end)
{
+ int i;
+ maux_t *maux = args->maux;
+ gvcf_aux_t *gaux = maux->gvcf;
+ assert(gaux);
+
+ // Update POS
+ int min = INT_MAX;
+ char ref = 'N';
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ if ( ref=='N' && gaux[i].line->pos==start ) ref = gaux[i].line->d.allele[0][0];
+ gaux[i].line->pos = start;
+ }
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ if ( gaux[i].end < start )
+ {
+ gaux[i].active = 0;
+ maux->buf[i].cur = -1;
+ continue;
+ }
+ gaux[i].line->d.allele[0][0] = ref;
+ if ( min > gaux[i].end ) min = gaux[i].end;
+ }
+ // Check for valid gVCF blocks in this region
+ if ( min==INT_MAX )
+ {
+ assert(0);
+ maux->gvcf_min = 0;
+ return;
+ }
+
bcf1_t *out = args->out_line;
- bcf_clear1(out);
- out->unpacked = BCF_UN_ALL;
+ gvcf_set_alleles(args);
+
+ // Merge the staged lines
merge_chrom2qual(args, out);
merge_filter(args, out);
merge_info(args, out);
merge_format(args, out);
- bcf_write1(args->out_fh, args->out_hdr, out);
-}
+ if ( args->gvcf_fai && out->d.allele[0][0]=='N' )
+ {
+ int slen = 0;
+ char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen);
+ if (slen)
+ {
+ out->d.allele[0][0] = seq[0];
+ free(seq);
+ }
+ }
+ // Update END boundary
+ if ( end > start )
+ {
+ end++;
+ bcf_update_info_int32(args->out_hdr, out, "END", &end, 1);
+ }
+ else
+ bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0);
+ bcf_write1(args->out_fh, args->out_hdr, out);
+ bcf_clear1(out);
-void debug_buffers(FILE *fp, bcf_srs_t *files);
-void debug_buffer(FILE *fp, bcf_sr_t *reader);
-#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
+ // Inactivate blocks which do not extend beyond END and find new gvcf_min
+ min = INT_MAX;
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ if ( gaux[i].end < end )
+ {
+ gaux[i].active = 0;
+ maux->buf[i].cur = -1;
+ continue;
+ }
+ // next min END position bigger than the current one
+ if ( maux->gvcf_min < gaux[i].end+1 && min > gaux[i].end+1 ) min = gaux[i].end + 1;
+ }
+ maux->gvcf_min = min==INT_MAX ? 0 : min;
+}
-// Clean the reader's buffer to and make it ready for the next next_line() call.
-// Moves finished records (SKIP_DONE flag set) at the end of the buffer and put
-// the rest to the beggining. Then shorten the buffer so that the last element
-// points to the last unfinished record. There are two special cases: the last
-// line of the buffer typically has a different position and must stay at the
-// end; next, the first record of the buffer must be one of those already
-// printed, as it will be discarded by next_line().
-//
-void shake_buffer(maux_t *maux, int ir, int pos)
+/*
+ Flush staged gVCF blocks. Flush everything if there are no more lines
+ (done=1) or if there is a new chromosome. If still on the same chromosome,
+ all hanging blocks must be ended by creating new records:
+ A
+ 1 END=10
+ B
+ 3 END=7
+ C
+ 3 END=5
+ out
+ 1 END=2 A . .
+ 3 END=5 A B C
+ 6 END=7 A B .
+ 8 END=10 A . .
+
+*/
+void gvcf_flush(args_t *args, int done)
{
- bcf_sr_t *reader = &maux->files->readers[ir];
- maux1_t *m = maux->d[ir];
-
- if ( !reader->buffer ) return;
-
int i;
- // FILE *fp = stdout;
- // fprintf(fp,"<going to shake> nbuf=%d\t", reader->nbuffer); for (i=0; i<reader->nbuffer; i++) fprintf(fp," %d", skip[i]); fprintf(fp,"\n");
- // debug_buffer(fp,reader);
- // fprintf(fp,"--\n");
+ maux_t *maux = args->maux;
- int a = 1, b = reader->nbuffer;
- if ( reader->buffer[b]->pos != pos ) b--; // move the last line separately afterwards
+ if ( !maux->chr ) return; // first time here, nothing to flush
- while ( a<b )
+ int flush_until = INT_MAX;
+ if ( !done )
{
- if ( !(m[a].skip&SKIP_DONE) ) { a++; continue; }
- if ( m[b].skip&SKIP_DONE ) { b--; continue; }
- SWAP(bcf1_t*, reader->buffer[a], reader->buffer[b]);
- SWAP(maux1_t, m[a], m[b]);
- a++;
- b--;
- }
+ // Get current position and chromosome
+ for (i=0; i<maux->n; i++)
+ if ( bcf_sr_has_line(maux->files,i) ) break;
+ bcf1_t *line = bcf_sr_get_line(maux->files,i);
+ bcf_hdr_t *hdr = bcf_sr_get_header(maux->files,i);
- // position $a to the after the first unfinished record
- while ( a<=reader->nbuffer && !(m[a].skip&SKIP_DONE) ) a++;
+ if ( !strcmp(maux->chr,bcf_seqname(hdr,line)) ) flush_until = line->pos; // still on the same chr
+ }
- if ( a<reader->nbuffer )
+ // When called on a region, trim the blocks accordingly
+ int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos;
+ if ( args->regs )
{
- // there is a gap between the unfinished lines at the beggining and the
- // last line. The last line must be brought forward to fill the gap
- if ( reader->buffer[reader->nbuffer]->pos != pos )
+ int rstart = -1, rend = -1;
+ if ( regidx_overlap(args->regs,maux->chr,start,flush_until,args->regs_itr) )
{
- SWAP(bcf1_t*, reader->buffer[a], reader->buffer[reader->nbuffer]);
- SWAP(maux1_t, m[a], m[reader->nbuffer]);
- reader->nbuffer = a;
+ // In case there are multiple regions, we treat them as one
+ rstart = args->regs_itr->beg;
+ while ( regitr_overlap(args->regs_itr) ) rend = args->regs_itr->end;
}
+ if ( rstart > start ) start = rstart;
+ if ( rend < flush_until ) flush_until = rend+1;
}
- if ( !(m[0].skip&SKIP_DONE) && reader->buffer[0]->pos==pos )
+ // output all finished blocks
+ while ( maux->gvcf_min && start < flush_until )
{
- // the first record is unfinished, replace it with an empty line
- // from the end of the buffer or else next_line will remove it
- if ( reader->nbuffer + 1 >= maux->nbuf[ir] )
+ // does the block end before the new line or is it interrupted?
+ int tmp = maux->gvcf_min < flush_until ? maux->gvcf_min : flush_until;
+ if ( start > tmp-1 ) break;
+ gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based
+ start = tmp;
+ }
+}
+
+/*
+ Check incoming lines for new gVCF blocks, set pointer to the current source
+ buffer (gvcf or readers). In contrast to gvcf_flush, this function can be
+ called only after maux_reset as it relies on updated maux buffers.
+*/
+void gvcf_stage(args_t *args, int pos)
+{
+ maux_t *maux = args->maux;
+ gvcf_aux_t *gaux = maux->gvcf;
+ bcf_srs_t *files = args->files;
+ int32_t *end = (int32_t*) maux->tmp_arr;
+ int i, nend = maux->ntmp_arr / sizeof(int32_t);
+
+ maux->gvcf_break = -1;
+ maux->gvcf_min = INT_MAX;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( gaux[i].active )
{
- reader->nbuffer++;
- maux_expand1(maux, ir);
- reader->nbuffer--;
- m = maux->d[ir];
+ // gvcf block should not overlap with another record
+ if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1;
+ maux->buf[i].beg = 0;
+ maux->buf[i].end = 1;
+ maux->buf[i].cur = 0;
+ continue;
}
- if ( reader->nbuffer+1 >= reader->mbuffer )
- error("Uh, did not expect this: %d vs %d\n", reader->nbuffer,reader->mbuffer);
- if ( reader->buffer[reader->nbuffer]->pos!=pos )
+ // Does any of the lines have END set? It is enough to check only the
+ // first line, there should be no duplicate records with END in gVCF
+
+ if ( maux->buf[i].beg==maux->buf[i].end ) continue; // no new record
+
+ int irec = maux->buf[i].beg;
+ bcf_hdr_t *hdr = bcf_sr_get_header(files, i);
+ bcf1_t *line = args->files->readers[i].buffer[irec];
+ int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend);
+ if ( ret==1 )
{
- // 4way swap
- bcf1_t *tmp = reader->buffer[0];
- reader->buffer[0] = reader->buffer[reader->nbuffer+1];
- reader->buffer[reader->nbuffer+1] = reader->buffer[reader->nbuffer];
- reader->buffer[reader->nbuffer] = tmp;
- m[reader->nbuffer].skip = m[0].skip;
- m[reader->nbuffer+1].skip = SKIP_DIFF;
- reader->nbuffer++;
+ // END is set, this is a new gVCF block. Cache this line in gaux[i] and swap with
+ // an empty record: the gaux line must be kept until we reach its END.
+ gaux[i].active = 1;
+ gaux[i].end = end[0] - 1;
+ SWAP(bcf1_t*,args->files->readers[i].buffer[irec],gaux[i].line);
+ gaux[i].line->pos = pos;
+
+ maux->buf[i].lines = &gaux[i].line;
+ maux->buf[i].beg = 0;
+ maux->buf[i].end = 1;
+ maux->buf[i].cur = 0;
+
+ // Set the rid,pos of the swapped line in the buffer or else the
+ // synced reader will have a problem with the next line
+ //
+ args->files->readers[i].buffer[irec]->rid = maux->buf[i].rid;
+ args->files->readers[i].buffer[irec]->pos = maux->pos;
+
+ // Update block offsets
+ if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1;
}
else
- {
- SWAP(bcf1_t*, reader->buffer[0], reader->buffer[reader->nbuffer+1]);
- SWAP(maux1_t, m[0], m[reader->nbuffer+1]);
- }
+ maux->gvcf_break = line->pos; // must break the gvcf block
}
+ maux->ntmp_arr = nend * sizeof(int32_t);
+ maux->tmp_arr = end;
+ if ( maux->gvcf_min==INT_MAX ) maux->gvcf_min = 0;
+}
+
+
+void debug_buffers(FILE *fp, bcf_srs_t *files);
+void debug_buffer(FILE *fp, bcf_srs_t *files, int reader);
+
+/*
+ Flush all buffered and processed records with the same coordinate.
+ Note that synced reader discards buffer[0], so that needs to stay
+ untouched.
+*/
+void clean_buffer(args_t *args)
+{
+ maux_t *ma = args->maux;
+
+ int ir;
+ for (ir=0; ir<ma->n; ir++)
+ {
+ // Invalidate pointer to reader's buffer or else gvcf_flush will attempt
+ // to use the old lines via maux_get_line()
+ if ( ma->gvcf && !ma->gvcf[ir].active ) ma->buf[ir].cur = -1;
- // debug_buffer(fp,reader);
- // fprintf(fp,"<shaken>\t"); for (i=0; i<reader->nbuffer; i++) fprintf(fp," %d", skip[i]);
- // fprintf(fp,"\n\n");
+ bcf_sr_t *reader = bcf_sr_get_reader(args->files,ir);
+ if ( !reader->nbuffer ) continue; // nothing to clean
- // set position of finished buffer[0] line to -1, otherwise swapping may
- // bring it back after next_line()
- reader->buffer[0]->pos = -1;
+ bcf1_t **buf = reader->buffer;
+ if ( buf[1]->rid!=ma->buf[ir].rid || buf[1]->pos!=ma->pos ) continue; // nothing to flush
- // trim the buffer, remove finished lines from the end
- i = reader->nbuffer;
- while ( i>=1 && m[i--].skip&SKIP_DONE )
- reader->nbuffer--;
+ int a = 1, b = 2;
+ while ( b<=reader->nbuffer && buf[b]->rid==ma->buf[ir].rid && buf[b]->pos==ma->pos ) b++;
+ // b now points to the first line we want to preserve
+ while ( b<=reader->nbuffer )
+ {
+ SWAP(bcf1_t*, buf[a], buf[b]);
+ a++; b++;
+ }
+ reader->nbuffer -= b-a;
+ }
}
-void debug_maux(args_t *args, int pos, int var_type)
+void debug_maux(args_t *args)
{
bcf_srs_t *files = args->files;
maux_t *maux = args->maux;
int j,k,l;
- fprintf(stderr,"Alleles to merge at %d\n", pos+1);
+ fprintf(stderr,"Alleles to merge at %d, nals=%d\n", maux->pos+1,maux->nals);
for (j=0; j<files->nreaders; j++)
{
bcf_sr_t *reader = &files->readers[j];
+ buffer_t *buf = &maux->buf[j];
fprintf(stderr," reader %d: ", j);
- for (k=0; k<=reader->nbuffer; k++)
+ for (k=buf->beg; k<buf->end; k++)
{
- if ( maux->d[j][k].skip==SKIP_DONE ) continue;
+ if ( buf->rec[k].skip & SKIP_DONE ) continue;
bcf1_t *line = reader->buffer[k];
- if ( line->pos!=pos ) continue;
fprintf(stderr,"\t");
- if ( maux->d[j][k].skip ) fprintf(stderr,"["); // this record will not be merged in this round
+ if ( buf->rec[k].skip ) fprintf(stderr,"["); // this record will not be merged in this round
for (l=0; l<line->n_allele; l++)
fprintf(stderr,"%s%s", l==0?"":",", line->d.allele[l]);
- if ( maux->d[j][k].skip ) fprintf(stderr,"]");
+ if ( buf->rec[k].skip ) fprintf(stderr,"]");
}
fprintf(stderr,"\n");
}
fprintf(stderr," counts: ");
- for (j=0; j<maux->nals; j++) fprintf(stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); fprintf(stderr,"\n");
- for (j=0; j<files->nreaders; j++)
- {
- bcf_sr_t *reader = &files->readers[j];
- fprintf(stderr," out %d: ", j);
- for (k=0; k<=reader->nbuffer; k++)
- {
- if ( maux->d[j][k].skip==SKIP_DONE ) continue;
- bcf1_t *line = reader->buffer[k];
- if ( line->pos!=pos ) continue;
- if ( maux->d[j][k].skip ) continue;
- fprintf(stderr,"\t");
- for (l=0; l<line->n_allele; l++)
- fprintf(stderr,"%s%s", l==0?"":",", maux->als[maux->d[j][k].map[l]]);
- }
- fprintf(stderr,"\n");
- }
- fprintf(stderr,"\n");
+ for (j=0; j<maux->nals; j++) fprintf(stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]);
+ fprintf(stderr,"\n\n");
}
-// Determine which line should be merged from which reader: go through all
-// readers and all buffered lines, expand REF,ALT and try to match lines with
-// the same ALTs. A step towards output independent on input ordering of the
-// lines.
-void merge_buffer(args_t *args)
+
+/*
+ Determine which line should be merged from which reader: go through all
+ readers and all buffered lines, expand REF,ALT and try to match lines with
+ the same ALTs.
+ */
+int can_merge(args_t *args)
{
bcf_srs_t *files = args->files;
- int i, pos = -1, var_type = 0;
- char *id = NULL;
+ int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
maux_t *maux = args->maux;
- maux_reset(maux);
+ gvcf_aux_t *gaux = maux->gvcf;
+ char *id = NULL, ref = 'N';
+ maux->var_types = maux->nals = 0;
- // set the current position
+ int i,j,k, ntodo = 0;
for (i=0; i<files->nreaders; i++)
{
- if ( bcf_sr_has_line(files,i) )
+ buffer_t *buf = &maux->buf[i];
+
+ if ( gaux && gaux[i].active )
{
- bcf1_t *line = bcf_sr_get_line(files,i);
- pos = line->pos;
- var_type = bcf_get_variant_types(line);
- id = line->d.id;
- break;
+ // skip readers with active gvcf blocks
+ buf->rec[buf->beg].skip = SKIP_DIFF;
+ continue;
+ }
+ for (j=buf->beg; j<buf->end; j++)
+ {
+ if ( buf->rec[j].skip & SKIP_DONE ) continue;
+
+ buf->rec[j].skip = SKIP_DIFF;
+ ntodo++;
+
+ if ( args->merge_by_id )
+ id = buf->lines[j]->d.id;
+ else
+ {
+ int var_type = bcf_get_variant_types(buf->lines[j]);
+ maux->var_types |= var_type ? var_type<<1 : 1;
+ }
}
+
+ // for gvcf: find out REF at this position
+ if ( buf->beg < buf->end && ref=='N' )
+ ref = buf->lines[buf->beg]->d.allele[0][0];
}
+ if ( !ntodo ) return 0;
// In this loop we select from each reader compatible candidate lines.
// (i.e. SNPs or indels). Go through all files and all lines at this
@@ -1710,19 +2040,24 @@ void merge_buffer(args_t *args)
for (i=0; i<files->nreaders; i++)
{
bcf_sr_t *reader = &files->readers[i];
- if ( !reader->buffer ) continue;
- int j, k;
- for (j=0; j<=reader->nbuffer; j++)
+ buffer_t *buf = &maux->buf[i];
+
+ if ( gaux && gaux[i].active )
{
- bcf1_t *line = reader->buffer[j];
+ gaux[i].line->d.allele[0][0] = ref;
+ gaux[i].line->pos = maux->pos;
+ }
+
+ for (j=buf->beg; j<buf->end; j++)
+ {
+ if ( buf->rec[j].skip & SKIP_DONE ) continue;
+
+ bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
+
int line_type = bcf_get_variant_types(line);
+ line_type = line_type ? line_type<<1 : 1;
+
// select relevant lines
- maux->d[i][j].skip = SKIP_DIFF;
- if ( pos!=line->pos )
- {
- if ( j==0 ) maux->d[i][j].skip |= SKIP_DONE; // left from previous run, force to ignore
- continue;
- }
if ( args->merge_by_id )
{
if ( strcmp(id,line->d.id) ) continue;
@@ -1733,30 +2068,30 @@ void merge_buffer(args_t *args)
{
// All alleles of the tested record must be present in the
// selected maux record plus variant types must be the same
- if ( var_type!=line->d.var_type ) continue;
+ if ( (maux->var_types & line_type) != line_type ) continue;
if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible
for (k=1; k<line->n_allele; k++)
{
if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break;
}
- if ( k==line->n_allele ) continue; // no matching allele
+ if ( !(line_type&ref_mask) && k==line->n_allele ) continue; // not a REF-only site and there is no matching allele
}
if ( !(args->collapse&COLLAPSE_ANY) )
{
- int compatible = 0;
- if ( line_type==var_type ) compatible = 1;
- else if ( line_type==VCF_REF ) compatible = 1; // REF can go with anything
- else if ( var_type&VCF_SNP && line_type&VCF_SNP ) compatible = 1;
- else if ( var_type&VCF_INDEL && line_type&VCF_INDEL ) compatible = 1;
- else if ( var_type&VCF_MNP && line_type&VCF_MNP ) compatible = 1;
- else if ( var_type&VCF_SNP && line_type&VCF_MNP ) compatible = 1;
- else if ( var_type&VCF_MNP && line_type&VCF_SNP ) compatible = 1;
- if ( !compatible ) continue;
+ // Merge:
+ // - SNPs+SNPs+MNPs+REF if -m both,snps
+ // - indels+indels+REF if -m both,indels, REF only if SNPs are not present
+ // - SNPs come first
+ if ( line_type & indel_mask )
+ {
+ if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first
+ if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks
+ }
}
}
- maux->d[i][j].skip = 0;
+ buf->rec[j].skip = 0;
- hts_expand(int, line->n_allele, maux->d[i][j].mmap, maux->d[i][j].map);
+ hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map);
if ( !maux->nals ) // first record, copy the alleles to the output
{
maux->nals = line->n_allele;
@@ -1764,111 +2099,118 @@ void merge_buffer(args_t *args)
hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
for (k=0; k<maux->nals; k++)
{
+ free(maux->als[k]);
maux->als[k] = strdup(line->d.allele[k]);
- maux->d[i][j].map[k] = k;
+ buf->rec[j].map[k] = k;
maux->cnt[k] = 1;
}
- pos = line->pos;
continue;
}
-
// normalize alleles
- maux->als = merge_alleles(line->d.allele, line->n_allele, maux->d[i][j].map, maux->als, &maux->nals, &maux->mals);
- if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(bcf_sr_get_header(args->files,j),line),line->pos+1,reader->fname);
+ maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals);
+ if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(args->out_hdr,line),line->pos+1,reader->fname);
hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
for (k=1; k<line->n_allele; k++)
- maux->cnt[ maux->d[i][j].map[k] ]++; // how many times an allele appears in the files
+ maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files
maux->cnt[0]++;
}
}
+ return 1;
+}
- // debug_maux(args, pos, var_type);
+/*
+ Select records that have the same alleles; the input ordering of indels
+ must not matter. Multiple VCF lines can be emitted from this loop.
+ We expect only very few alleles and not many records with the same
+ position in the buffers, therefore the nested loops should not slow us
+ much.
+*/
+void stage_line(args_t *args)
+{
+ int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
+ bcf_srs_t *files = args->files;
+ maux_t *maux = args->maux;
- // Select records that have the same alleles; the input ordering of indels
- // must not matter. Multiple VCF lines can be emitted from this loop.
- // We expect only very few alleles and not many records with the same
- // position in the buffers, therefore the nested loops should not slow us
- // much.
- while (1)
+ // debug_maux(args);
+
+ // take the most frequent allele present in multiple files, REF is skipped
+ int i,j,k,icnt = 1;
+ for (i=2; i<maux->nals; i++)
+ if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
+
+ int nout = 0;
+ for (i=0; i<files->nreaders; i++)
{
- // take the most frequent allele present in multiple files
- int icnt = 0;
- for (i=1; i<maux->nals; i++)
- if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
- if ( maux->cnt[icnt]<0 ) break;
+ buffer_t *buf = &maux->buf[i];
+ buf->cur = -1;
+ if ( buf->beg >= buf->end ) continue; // no lines in the buffer
- int nmask = 0;
- for (i=0; i<files->nreaders; i++)
+ // find lines with the same allele
+ for (j=buf->beg; j<buf->end; j++)
{
- maux->has_line[i] = 0;
+ if ( buf->rec[j].skip ) continue; // done or not compatible
+ if ( args->merge_by_id ) break;
+ if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break; // REF-only record
- bcf_sr_t *reader = &files->readers[i];
- if ( !reader->buffer ) continue;
+ for (k=0; k<buf->lines[j]->n_allele; k++)
+ if ( icnt==buf->rec[j].map[k] ) break;
- // find lines with the same allele
- int j;
- for (j=0; j<=reader->nbuffer; j++)
- {
- if ( maux->d[i][j].skip ) continue;
- int k;
- for (k=0; k<reader->buffer[j]->n_allele; k++)
- if ( icnt==maux->d[i][j].map[k] ) break;
- if ( k<reader->buffer[j]->n_allele ) break;
- }
- if ( j>reader->nbuffer )
- {
- // no matching allele found in this file
- if ( args->collapse==COLLAPSE_NONE ) continue;
+ if ( k<buf->lines[j]->n_allele ) break;
+ }
+ if ( j>=buf->end )
+ {
+ // no matching allele found in this file
+ if ( args->collapse==COLLAPSE_NONE ) continue;
- for (j=0; j<=reader->nbuffer; j++)
+ for (j=buf->beg; j<buf->end; j++)
+ {
+ if ( buf->rec[j].skip ) continue; // done or not compatible
+ if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged
+ int line_type = bcf_get_variant_types(buf->lines[j]);
+ if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( line_type==VCF_REF )
{
- if ( maux->d[i][j].skip ) continue;
- if ( args->collapse&COLLAPSE_ANY ) break;
- int line_type = bcf_get_variant_types(reader->buffer[j]);
- if ( var_type&VCF_SNP && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( var_type&VCF_INDEL && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- if ( line_type==VCF_REF )
- {
- if ( var_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( var_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- }
- else if ( var_type==VCF_REF )
- {
- if ( line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- }
+ if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( maux->var_types&ref_mask ) break;
}
- }
- if ( j<=reader->nbuffer )
- {
- // found a suitable line for merging, place it at the beggining
- if ( j>0 )
+ else if ( maux->var_types&ref_mask )
{
- SWAP(bcf1_t*, reader->buffer[0], reader->buffer[j]);
- SWAP(maux1_t, maux->d[i][0], maux->d[i][j]);
+ if ( line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
}
- // mark as finished so that it's ignored next time
- maux->d[i][0].skip |= SKIP_DONE;
- maux->has_line[i] = 1;
- nmask++;
}
}
- if ( !nmask ) break; // done, no more lines suitable for merging found
- merge_line(args); // merge and output the line
- maux->cnt[icnt] = -1; // do not pick this allele again, mark it as finished
+ if ( j<buf->end )
+ {
+ // found a suitable line for merging
+ buf->cur = j;
+
+ // mark as finished so that it's ignored next time
+ buf->rec[j].skip = SKIP_DONE;
+ nout++;
+ }
}
+ assert( nout );
+}
- // clean the alleles
- for (i=0; i<maux->nals; i++)
+void merge_line(args_t *args)
+{
+ if ( args->regs )
{
- free(maux->als[i]);
- maux->als[i] = 0;
+ if ( !regidx_overlap(args->regs,args->maux->chr,args->maux->pos,args->maux->pos,NULL) ) return;
}
- maux->nals = 0;
- // get the buffers ready for the next next_line() call
- for (i=0; i<files->nreaders; i++)
- shake_buffer(maux, i, pos);
+ bcf1_t *out = args->out_line;
+ merge_chrom2qual(args, out);
+ merge_filter(args, out);
+ merge_info(args, out);
+ if ( args->do_gvcf )
+ bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0);
+ merge_format(args, out);
+ bcf_write1(args->out_fh, args->out_hdr, out);
+ bcf_clear1(out);
}
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd)
@@ -1887,6 +2229,8 @@ void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *c
else
ksprintf(&str, " %s", argv[i]);
}
+ kputs("; Date=", &str);
+ time_t tm; time(&tm); kputs(ctime(&tm), &str);
kputc('\n', &str);
bcf_hdr_append(hdr,str.s);
free(str.s);
@@ -1898,7 +2242,7 @@ void merge_vcf(args_t *args)
{
args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
- if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads);
args->out_hdr = bcf_hdr_init("w");
if ( args->header_fname )
@@ -1928,14 +2272,32 @@ void merge_vcf(args_t *args)
}
if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init();
- args->maux = maux_init(args->files);
+ args->maux = maux_init(args);
args->out_line = bcf_init1();
args->tmph = kh_init(strdict);
- int ret;
- while ( (ret=bcf_sr_next_line(args->files)) )
+
+ while ( bcf_sr_next_line(args->files) )
{
- merge_buffer(args);
+ // output cached gVCF blocks which end before the new record
+ if ( args->do_gvcf )
+ gvcf_flush(args,0);
+
+ maux_reset(args->maux);
+
+ // determine which of the new records are gvcf blocks
+ if ( args->do_gvcf )
+ gvcf_stage(args, args->maux->pos);
+
+ while ( can_merge(args) )
+ {
+ stage_line(args);
+ merge_line(args);
+ }
+ clean_buffer(args);
}
+ if ( args->do_gvcf )
+ gvcf_flush(args,1);
+
info_rules_destroy(args);
maux_destroy(args->maux);
bcf_hdr_destroy(args->out_hdr);
@@ -1958,7 +2320,10 @@ static void usage(void)
fprintf(stderr, " --force-samples resolve duplicate sample names\n");
fprintf(stderr, " --print-header print only the merged header and exit\n");
fprintf(stderr, " --use-header <file> use the provided header\n");
+ fprintf(stderr, " -0 --missing-to-ref assume genotypes at missing sites are 0/0\n");
fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(stderr, " -F, --filter-logic <x|+> remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
+ fprintf(stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
fprintf(stderr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
fprintf(stderr, " -l, --file-list <file> read file names from the file\n");
fprintf(stderr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
@@ -1989,7 +2354,9 @@ int main_vcfmerge(int argc, char *argv[])
{
{"help",no_argument,NULL,'h'},
{"merge",required_argument,NULL,'m'},
+ {"gvcf",required_argument,NULL,'g'},
{"file-list",required_argument,NULL,'l'},
+ {"missing-to-ref",no_argument,NULL,'0'},
{"apply-filters",required_argument,NULL,'f'},
{"use-header",required_argument,NULL,1},
{"print-header",no_argument,NULL,2},
@@ -2001,10 +2368,25 @@ int main_vcfmerge(int argc, char *argv[])
{"regions-file",required_argument,NULL,'R'},
{"info-rules",required_argument,NULL,'i'},
{"no-version",no_argument,NULL,8},
+ {"filter-logic",required_argument,NULL,'F'},
{NULL,0,NULL,0}
};
- while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0",loptions,NULL)) >= 0) {
switch (c) {
+ case 'F':
+ if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD;
+ else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE;
+ else error("Filter logic not recognised: %s\n", optarg);
+ break;
+ case '0': args->missing_to_ref = 1; break;
+ case 'g':
+ args->do_gvcf = 1;
+ if ( strcmp("-",optarg) )
+ {
+ args->gvcf_fai = fai_load(optarg);
+ if ( !args->gvcf_fai ) error("Failed to load the fai index: %s\n", optarg);
+ }
+ break;
case 'l': args->file_list = optarg; break;
case 'i': args->info_rules = optarg; break;
case 'o': args->output_fname = optarg; break;
@@ -2045,9 +2427,23 @@ int main_vcfmerge(int argc, char *argv[])
if ( argc-optind<2 && !args->file_list ) usage();
args->files->require_index = 1;
- if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
- error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ if ( regions_is_file )
+ args->regs = regidx_init(args->regions_list,NULL,NULL,sizeof(char*),NULL);
+ else
+ {
+ args->regs = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
+ if ( regidx_insert_list(args->regs,args->regions_list,',') !=0 ) error("Could not parse the regions: %s\n", args->regions_list);
+ regidx_insert(args->regs,NULL);
+ }
+ if ( !args->regs ) error("Could not parse the regions: %s\n", args->regions_list);
+ args->regs_itr = regitr_init(args->regs);
+ }
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
while (optind<argc)
{
if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
@@ -2065,6 +2461,9 @@ int main_vcfmerge(int argc, char *argv[])
}
merge_vcf(args);
bcf_sr_destroy(args->files);
+ if ( args->regs ) regidx_destroy(args->regs);
+ if ( args->regs_itr ) regitr_destroy(args->regs_itr);
+ if ( args->gvcf_fai ) fai_destroy(args->gvcf_fai);
free(args);
return 0;
}
diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c
index daac458..db9aff5 100644
--- a/bcftools/vcfmerge.c.pysam.c
+++ b/bcftools/vcfmerge.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -26,28 +26,39 @@ THE SOFTWARE. */
#include <stdio.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
+#include <htslib/faidx.h>
#include <math.h>
#include <ctype.h>
+#include <time.h>
#include "bcftools.h"
+#include "regidx.h"
#include "vcmp.h"
+#define DBG 0
+
#include <htslib/khash.h>
KHASH_MAP_INIT_STR(strdict, int)
typedef khash_t(strdict) strdict_t;
-#define SKIP_DONE 1
-#define SKIP_DIFF 2
+#define FLT_LOGIC_ADD 0
+#define FLT_LOGIC_REMOVE 1
+
+#define SKIP_DONE 1 // the record was processed
+#define SKIP_DIFF 2 // not compatible, merge later
#define IS_VL_G(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_G)
#define IS_VL_A(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_A)
#define IS_VL_R(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_R)
+#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
+
// For merging INFO Number=A,G,R tags
typedef struct
{
@@ -65,43 +76,61 @@ typedef struct _info_rule_t
void (*merger)(bcf_hdr_t *hdr, bcf1_t *line, struct _info_rule_t *rule);
int type; // one of BCF_HT_*
int block_size; // number of values in a block
+ int type_size; // size of the corresponding BCF_HT_* type
int nblocks; // number of blocks in nvals (the number of merged files)
int nvals, mvals; // used and total size of vals array
void *vals; // the info tag values
}
info_rule_t;
+typedef struct
+{
+ bcf1_t *line;
+ int end, active;
+}
+gvcf_aux_t;
+
// Auxiliary merge data for selecting the right combination
// of buffered records across multiple readers. maux1_t
// corresponds to one buffered line.
typedef struct
{
int skip;
- int *map; // mapping from input alleles to the output array
+ int *map; // mapping from input alleles to the array of output alleles (set by merge_alleles)
int mmap; // size of map array (only buffer[i].n_allele is actually used)
int als_differ;
}
maux1_t;
typedef struct
{
- int n; // number of readers
+ int rid; // current rid
+ int beg,end; // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush.
+ int cur; // current line or -1 if none
+ int npos; // number of unprocessed lines at this position
+ int mrec; // allocated size of buf
+ maux1_t *rec; // buffer to keep reader's lines
+ bcf1_t **lines; // source buffer: either gvcf or readers' buffer
+}
+buffer_t;
+typedef struct
+{
+ int n, pos, var_types; // number of readers, current position, currently available variant types
+ char *chr; // current chromosome
char **als, **out_als; // merged alleles (temp, may contain empty records) and merged alleles ready for output
int nals, mals, nout_als, mout_als; // size of the output array
int *cnt, ncnt; // number of records that refer to the alleles
- int *nbuf; // readers have buffers of varying lengths
int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases)
- int *flt, mflt, minf;
- bcf_info_t *inf;// out_line's INFO fields
bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to i*nreader+j, first row is reserved for GT
int nfmt_map; // number of rows in the fmt_map array
int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes
void *tmp_arr;
int ntmp_arr;
- maux1_t **d; // d[i][j] i-th reader, j-th buffer line
+ buffer_t *buf;
AGR_info_t *AGR_info;
int nAGR_info, mAGR_info;
bcf_srs_t *files;
- int *has_line; // which files are being merged
+ int gvcf_min, gvcf_break; // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present
+ gvcf_aux_t *gvcf; // buffer of gVCF lines
}
maux_t;
@@ -109,8 +138,11 @@ typedef struct
{
vcmp_t *vcmp;
maux_t *maux;
- int header_only, collapse, output_type, force_samples, merge_by_id;
+ regidx_t *regs; // apply regions only after the blocks are expanded
+ regitr_t *regs_itr;
+ int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref;
char *header_fname, *output_fname, *regions_list, *info_rules, *file_list;
+ faidx_t *gvcf_fai;
info_rule_t *rules;
int nrules;
strdict_t *tmph;
@@ -124,6 +156,14 @@ typedef struct
}
args_t;
+static bcf1_t *maux_get_line(args_t *args, int i)
+{
+ maux_t *ma = args->maux;
+ int ibuf = ma->buf[i].cur;
+ if ( ibuf >= 0 ) return ma->buf[i].lines[ibuf];
+ return NULL;
+}
+
static void info_rules_merge_sum(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
{
if ( !rule->nvals ) return;
@@ -249,6 +289,32 @@ static void info_rules_init(args_t *args)
if ( str.l ) kputc(',',&str);
kputs("DP4:sum",&str);
}
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "QS")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("QS:sum",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "MinDP")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("MinDP:min",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "I16")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("I16:sum",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "IDV")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("IDV:max",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "IMF")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("IMF:max",&str);
+ }
+
if ( !str.l ) return;
args->info_rules = str.s;
}
@@ -274,9 +340,12 @@ static void info_rules_init(args_t *args)
int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag);
if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The tag is not defined in the header: \"%s\"\n", rule->hdr_tag);
rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id);
- if ( rule->type!=BCF_HT_INT && rule->type!=BCF_HT_REAL && rule->type!=BCF_HT_STR ) error("The type is not supported: \"%s\"\n", rule->hdr_tag);
+ if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t);
+ else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float);
+ else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char);
+ else error("The type is not supported: \"%s\"\n", rule->hdr_tag);
- while ( *ss ) ss++; ss++;
+ ss = strchr(ss, '\0'); ss++;
if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag);
int is_join = 0;
@@ -302,7 +371,8 @@ static void info_rules_init(args_t *args)
error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag);
}
- while ( *ss ) ss++; ss++; n++;
+ ss = strchr(ss, '\0'); ss++;
+ n++;
}
free(str.s);
free(tmp);
@@ -328,8 +398,10 @@ static void info_rules_reset(args_t *args)
}
static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule, maux1_t *als, int var_len)
{
- int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &args->maux->ntmp_arr, rule->type);
+ int msize = args->maux->ntmp_arr / rule->type_size;
+ int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &msize, rule->type);
if ( ret<=0 ) error("FIXME: error parsing %s at %s:%d .. %d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1,ret);
+ args->maux->ntmp_arr = msize * rule->type_size;
rule->nblocks++;
@@ -347,7 +419,7 @@ static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, inf
int i, j;
if ( var_len==BCF_VL_A )
{
- assert( ret==line->n_allele-1 );
+ if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1);
args->maux->nagr_map = ret;
hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
// create mapping from source file ALT indexes to dst file indexes
@@ -356,7 +428,7 @@ static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, inf
}
else if ( var_len==BCF_VL_R )
{
- assert( ret==line->n_allele );
+ if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1);
args->maux->nagr_map = ret;
hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
for (i=0; i<ret; i++) args->maux->agr_map[i] = als->map[i];
@@ -558,6 +630,8 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb)
{
for (i=0; i<*nb; i++)
{
+ if ( b[i][0]=='<' ) continue; // symbolic allele, do not modify
+ if ( b[i][0]=='*' ) continue; // overlapping deletion (*), do not modify
int l = strlen(b[i]);
b[i] = (char*) realloc(b[i],l+rla-rlb+1);
memcpy(b[i]+l,a[0]+rlb,rla-rlb+1);
@@ -567,13 +641,15 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb)
// now check if the $a alleles are present and if not add them
for (i=1; i<na; i++)
{
+ int const_ai = 1;
char *ai;
- if ( rlb>rla ) // $a alleles need expanding
+ if ( rlb>rla && a[i][0]!='<' && a[i][0]!='*' ) // $a alleles need expanding and not a symbolic allele or *
{
int l = strlen(a[i]);
ai = (char*) malloc(l+rlb-rla+1);
memcpy(ai,a[i],l);
memcpy(ai+l,b[0]+rla,rlb-rla+1);
+ const_ai = 0;
}
else
ai = a[i];
@@ -584,42 +660,59 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb)
if ( j<*nb ) // $b already has the same allele
{
map[i] = j;
- if ( rlb>rla ) free(ai);
+ if ( !const_ai ) free(ai);
continue;
}
// new allele
map[i] = *nb;
- b[*nb] = rlb>rla ? ai : strdup(ai);
+ if ( b[*nb] ) free(b[*nb]);
+ b[*nb] = const_ai ? strdup(ai) : ai;
(*nb)++;
}
return b;
}
-maux_t *maux_init(bcf_srs_t *files)
+maux_t *maux_init(args_t *args)
{
+ bcf_srs_t *files = args->files;
maux_t *ma = (maux_t*) calloc(1,sizeof(maux_t));
ma->n = files->nreaders;
- ma->nbuf = (int *) calloc(ma->n,sizeof(int));
- ma->d = (maux1_t**) calloc(ma->n,sizeof(maux1_t*));
ma->files = files;
int i, n_smpl = 0;
for (i=0; i<ma->n; i++)
n_smpl += bcf_hdr_nsamples(files->readers[i].header);
+ if ( args->do_gvcf )
+ {
+ ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t));
+ for (i=0; i<ma->n; i++)
+ ma->gvcf[i].line = bcf_init1();
+ }
ma->smpl_ploidy = (int*) calloc(n_smpl,sizeof(int));
ma->smpl_nGsize = (int*) malloc(n_smpl*sizeof(int));
- ma->has_line = (int*) malloc(ma->n*sizeof(int));
+ ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t));
+ for (i=0; i<ma->n; i++)
+ ma->buf[i].rid = -1;
return ma;
}
void maux_destroy(maux_t *ma)
{
- int i;
+ int i,j;
+ for (i=0; i<ma->mals; i++)
+ {
+ free(ma->als[i]);
+ ma->als[i] = NULL;
+ }
for (i=0; i<ma->n; i++) // for each reader
{
- if ( !ma->d[i] ) continue;
- int j;
- for (j=0; j<ma->nbuf[i]; j++) // for each buffered line
- if ( ma->d[i][j].map ) free(ma->d[i][j].map);
- free(ma->d[i]);
+ for (j=0; j<ma->buf[i].mrec; j++) // for each buffered line
+ free(ma->buf[i].rec[j].map);
+ free(ma->buf[i].rec);
+ }
+ free(ma->buf);
+ if ( ma->gvcf )
+ {
+ for (i=0; i<ma->n; i++) bcf_destroy(ma->gvcf[i].line);
+ free(ma->gvcf);
}
for (i=0; i<ma->mAGR_info; i++)
free(ma->AGR_info[i].buf);
@@ -628,32 +721,69 @@ void maux_destroy(maux_t *ma)
if (ma->ntmp_arr) free(ma->tmp_arr);
if (ma->nfmt_map) free(ma->fmt_map);
// ma->inf freed in bcf_destroy1
- free(ma->d);
- free(ma->nbuf);
for (i=0; i<ma->mals; i++) free(ma->als[i]);
if (ma->mout_als) free(ma->out_als);
free(ma->als);
free(ma->cnt);
free(ma->smpl_ploidy);
free(ma->smpl_nGsize);
- free(ma->has_line);
+ free(ma->chr);
free(ma);
}
-void maux_expand1(maux_t *ma, int i)
+void maux_expand1(buffer_t *buf, int size)
{
- if ( ma->nbuf[i] <= ma->files->readers[i].nbuffer )
+ if ( buf->mrec < size )
{
- int n = ma->files->readers[i].nbuffer + 1;
- ma->d[i] = (maux1_t*) realloc(ma->d[i], sizeof(maux1_t)*n);
- memset(ma->d[i]+ma->nbuf[i],0,sizeof(maux1_t)*(n-ma->nbuf[i]));
- ma->nbuf[i] = n;
+ hts_expand0(maux1_t,size,buf->mrec,buf->rec);
+ buf->mrec = size;
}
}
void maux_reset(maux_t *ma)
{
- int i;
- for (i=0; i<ma->n; i++) maux_expand1(ma, i);
- for (i=1; i<ma->ncnt; i++) ma->cnt[i] = 0;
+ int i,j;
+ for (i=0; i<ma->n; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1);
+ for (i=0; i<ma->ncnt; i++) ma->cnt[i] = 0;
+ for (i=0; i<ma->mals; i++)
+ {
+ free(ma->als[i]);
+ ma->als[i] = NULL;
+ }
+ const char *chr = NULL;
+ ma->nals = 0;
+ ma->pos = -1;
+ for (i=0; i<ma->n; i++)
+ {
+ if ( !bcf_sr_has_line(ma->files,i) ) continue;
+ bcf1_t *line = bcf_sr_get_line(ma->files,i);
+ bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i);
+ chr = bcf_seqname(hdr,line);
+ ma->pos = line->pos;
+ break;
+ }
+ if ( chr )
+ {
+ free(ma->chr);
+ ma->chr = strdup(chr);
+ }
+ for (i=0; i<ma->n; i++)
+ {
+ bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i);
+ ma->buf[i].rid = bcf_hdr_name2id(hdr,chr);
+ ma->buf[i].beg = bcf_sr_has_line(ma->files,i) ? 0 : 1;
+ for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++)
+ {
+ ma->buf[i].rec[j].skip = 0;
+ bcf1_t *line = ma->files->readers[i].buffer[j];
+ if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break;
+ }
+ ma->buf[i].end = j;
+ ma->buf[i].cur = -1;
+ if ( ma->buf[i].beg < ma->buf[i].end )
+ {
+ ma->buf[i].lines = ma->files->readers[i].buffer;
+ if ( ma->gvcf ) ma->gvcf[i].active = 0; // gvcf block cannot overlap with the next record
+ }
+ }
}
void maux_debug(maux_t *ma, int ir, int ib)
{
@@ -686,16 +816,20 @@ void merge_chrom2qual(args_t *args, bcf1_t *out)
out->pos = -1;
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ bcf1_t *line = maux_get_line(args, i);
+ if ( !line ) continue;
+ bcf_unpack(line, BCF_UN_ALL);
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
- // alleles
+ // not all maux alleles are always used, mark the ones we'll need
int j;
for (j=1; j<line->n_allele; j++)
- al_idxs[ ma->d[i][0].map[j] ] = 1;
+ {
+ int irec = ma->buf[i].cur;
+ al_idxs[ ma->buf[i].rec[irec].map[j] ] = 1;
+ }
// position
if ( out->pos==-1 )
@@ -719,16 +853,15 @@ void merge_chrom2qual(args_t *args, bcf1_t *out)
}
// set QUAL to the max qual value. Not exactly correct, but good enough for now
- if ( !bcf_float_is_missing(files->readers[i].buffer[0]->qual) )
+ if ( !bcf_float_is_missing(line->qual) )
{
- if ( bcf_float_is_missing(out->qual) || out->qual < files->readers[i].buffer[0]->qual ) out->qual = files->readers[i].buffer[0]->qual;
+ if ( bcf_float_is_missing(out->qual) || out->qual < line->qual ) out->qual = line->qual;
}
}
// set ID
if ( !tmps->l ) kputs(".", tmps);
- if ( out->d.id ) free(out->d.id);
- out->d.id = strdup(tmps->s);
+ bcf_update_id(out_hdr, out, tmps->s);
// set alleles
ma->nout_als = 0;
@@ -742,10 +875,13 @@ void merge_chrom2qual(args_t *args, bcf1_t *out)
int ir, j;
for (ir=0; ir<files->nreaders; ir++)
{
- if ( !ma->has_line[ir] ) continue;
- bcf1_t *line = files->readers[ir].buffer[0];
+ bcf1_t *line = maux_get_line(args,ir);
+ if ( !line ) continue;
for (j=1; j<line->n_allele; j++)
- if ( ma->d[ir][0].map[j]==i ) ma->d[ir][0].map[j] = ma->nout_als;
+ {
+ int irec = ma->buf[ir].cur;
+ if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als;
+ }
}
}
// Expand the arrays and realloc the alleles string. Note that all alleles are in a single allocated block.
@@ -767,20 +903,36 @@ void merge_filter(args_t *args, bcf1_t *out)
bcf_hdr_t *out_hdr = args->out_hdr;
int i, ret;
+ if ( args->filter_logic == FLT_LOGIC_REMOVE )
+ {
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf1_t *line = maux_get_line(args, i);
+ if ( !line ) continue;
+ bcf_sr_t *reader = &files->readers[i];
+ bcf_hdr_t *hdr = reader->header;
+ if ( bcf_has_filter(hdr, line, "PASS") ) break;
+ }
+ if ( i<files->nreaders )
+ {
+ int flt_id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS");
+ bcf_add_filter(out_hdr, out, flt_id);
+ return;
+ }
+ }
+
khiter_t kitr;
strdict_t *tmph = args->tmph;
kh_clear(strdict, tmph);
- maux_t *ma = args->maux;
out->d.n_flt = 0;
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i]) continue;
+ bcf1_t *line = maux_get_line(args, i);
+ if ( !line ) continue;
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
- bcf_unpack(line, BCF_UN_ALL);
int k;
for (k=0; k<line->d.n_flt; k++)
@@ -791,8 +943,8 @@ void merge_filter(args_t *args, bcf1_t *out)
{
int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, flt);
if ( id==-1 ) error("Error: The filter is not defined in the header: %s\n", flt);
- hts_expand(int,out->d.n_flt+1,ma->mflt,ma->flt);
- ma->flt[out->d.n_flt] = id;
+ hts_expand(int,out->d.n_flt+1,out->d.m_flt,out->d.flt);
+ out->d.flt[out->d.n_flt] = id;
out->d.n_flt++;
kh_put(strdict, tmph, flt, &ret);
}
@@ -803,20 +955,17 @@ void merge_filter(args_t *args, bcf1_t *out)
{
int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS");
for (i=0; i<out->d.n_flt; i++)
- if ( ma->flt[i]==id ) break;
+ if ( out->d.flt[i]==id ) break;
if ( i<out->d.n_flt )
{
out->d.n_flt--;
- for (; i<out->d.n_flt; i++) ma->flt[i] = ma->flt[i+1];
+ for (; i<out->d.n_flt; i++) out->d.flt[i] = out->d.flt[i+1];
}
}
- out->d.flt = ma->flt;
}
static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *tmp_str)
{
- assert( !info->vptr_free );
-
uint8_t *ptr = info->vptr - info->vptr_off;
bcf_dec_typed_int1(ptr, &ptr);
@@ -835,8 +984,6 @@ static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *t
kputsn_(info->vptr, info->len << bcf_type_shift[info->type], tmp_str);
info->vptr = (uint8_t*) tmp_str->s + info->vptr_off;
- info->vptr_free = 1;
- line->d.shared_dirty |= BCF1_DIRTY_INF;
tmp_str->s = NULL;
tmp_str->m = 0;
tmp_str->l = 0;
@@ -1031,9 +1178,10 @@ void merge_info(args_t *args, bcf1_t *out)
info_rules_reset(args);
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ bcf1_t *line = maux_get_line(args,i);
+ if ( !line ) continue;
+ int irec = ma->buf[i].cur;
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
for (j=0; j<line->n_info; j++)
{
@@ -1052,7 +1200,7 @@ void merge_info(args_t *args, bcf1_t *out)
info_rule_t *rule = (info_rule_t*) bsearch(key, args->rules, args->nrules, sizeof(*args->rules), info_rules_comp_key);
if ( rule )
{
- maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->d[i][0] : NULL;
+ maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->buf[i].rec[irec] : NULL;
if ( info_rules_add_values(args, hdr, line, rule, als, len) ) continue;
}
}
@@ -1063,7 +1211,7 @@ void merge_info(args_t *args, bcf1_t *out)
{
if ( kitr == kh_end(tmph) )
{
- // first occurance in this reader, alloc arrays
+ // seeing this key for the first time
ma->nAGR_info++;
hts_expand0(AGR_info_t,ma->nAGR_info,ma->mAGR_info,ma->AGR_info);
kitr = kh_put(strdict, tmph, key, &ret);
@@ -1081,37 +1229,36 @@ void merge_info(args_t *args, bcf1_t *out)
kitr = kh_get(strdict, tmph, key);
int idx = kh_val(tmph, kitr);
if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%d\n", key,bcf_seqname(hdr,line),line->pos+1);
- merge_AGR_info_tag(hdr, line,inf,len,&ma->d[i][0],&ma->AGR_info[idx]);
+ merge_AGR_info_tag(hdr, line,inf,len,&ma->buf[i].rec[irec],&ma->AGR_info[idx]);
continue;
}
if ( kitr == kh_end(tmph) )
{
- hts_expand0(bcf_info_t,out->n_info+1,ma->minf,ma->inf);
- ma->inf[out->n_info].key = id;
- ma->inf[out->n_info].type = inf->type;
- ma->inf[out->n_info].len = inf->len;
- ma->inf[out->n_info].vptr = inf->vptr;
- ma->inf[out->n_info].v1.i = inf->v1.i;
- ma->inf[out->n_info].v1.f = inf->v1.f;
- ma->inf[out->n_info].vptr_off = inf->vptr_off;
- ma->inf[out->n_info].vptr_len = inf->vptr_len;
- ma->inf[out->n_info].vptr_free = inf->vptr_free;
+ // Seeing this key for the first time. Although quite hacky,
+ // this is faster than anything else given the data structures..
+
+ hts_expand0(bcf_info_t,out->n_info+1,out->d.m_info,out->d.info);
+ out->d.info[out->n_info].key = id;
+ out->d.info[out->n_info].type = inf->type;
+ out->d.info[out->n_info].len = inf->len;
+ out->d.info[out->n_info].v1.i = inf->v1.i;
+ out->d.info[out->n_info].v1.f = inf->v1.f;
+ out->d.info[out->n_info].vptr_off = inf->vptr_off;
+ out->d.info[out->n_info].vptr_len = inf->vptr_len;
+ out->d.info[out->n_info].vptr_free = 1;
+ out->d.info[out->n_info].vptr = (uint8_t*) malloc(inf->vptr_len+inf->vptr_off);
+ memcpy(out->d.info[out->n_info].vptr,inf->vptr-inf->vptr_off, inf->vptr_len+inf->vptr_off);
+ out->d.info[out->n_info].vptr += inf->vptr_off;
if ( (args->output_type & FT_BCF) && id!=bcf_hdr_id2int(hdr, BCF_DT_ID, key) )
- {
- // The existing packed info cannot be reused. Change the id.
- // Although quite hacky, it's faster than anything else given
- // the data structures
- bcf_info_set_id(out, &ma->inf[out->n_info], id, &args->tmps);
- }
+ bcf_info_set_id(out, &out->d.info[out->n_info], id, &args->tmps);
+ out->d.shared_dirty |= BCF1_DIRTY_INF;
out->n_info++;
kitr = kh_put(strdict, tmph, key, &ret);
kh_val(tmph,kitr) = -(out->n_info-1); // arbitrary negative value
}
}
}
- out->d.info = ma->inf;
- out->d.m_info = ma->minf;
for (i=0; i<args->nrules; i++)
args->rules[i].merger(args->out_hdr, out, &args->rules[i]);
for (i=0; i<ma->nAGR_info; i++)
@@ -1156,12 +1303,14 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
}
memset(ma->smpl_ploidy,0,nsamples*sizeof(int));
+ int default_gt = args->missing_to_ref ? bcf_gt_unphased(0) : bcf_gt_missing;
for (i=0; i<files->nreaders; i++)
{
bcf_sr_t *reader = &files->readers[i];
bcf_hdr_t *hdr = reader->header;
bcf_fmt_t *fmt_ori = fmt_map[i];
int32_t *tmp = (int32_t *) ma->tmp_arr + ismpl*nsize;
+ int irec = ma->buf[i].cur;
int j, k;
if ( !fmt_ori )
@@ -1169,7 +1318,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
// missing values: assume maximum ploidy
for (j=0; j<bcf_hdr_nsamples(hdr); j++)
{
- for (k=0; k<nsize; k++) { tmp[k] = 0; ma->smpl_ploidy[ismpl+j]++; }
+ for (k=0; k<nsize; k++) { tmp[k] = default_gt; ma->smpl_ploidy[ismpl+j]++; }
tmp += nsize;
}
ismpl += bcf_hdr_nsamples(hdr);
@@ -1178,7 +1327,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
#define BRANCH(type_t, vector_end) { \
type_t *p_ori = (type_t*) fmt_ori->p; \
- if ( !ma->d[i][0].als_differ ) \
+ if ( !ma->buf[i].rec[irec].als_differ ) \
{ \
/* the allele numbering is unchanged */ \
for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
@@ -1208,7 +1357,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
else \
{ \
int al = (p_ori[k]>>1) - 1; \
- al = al<=0 ? al + 1 : ma->d[i][0].map[al] + 1; \
+ al = al<=0 ? al + 1 : ma->buf[i].rec[irec].map[al] + 1; \
tmp[k] = (al << 1) | ((p_ori[k])&1); \
} \
} \
@@ -1241,7 +1390,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
int nsize = 0, length = BCF_VL_FIXED, type = -1;
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ if ( !maux_get_line(args,i) ) continue;
if ( !fmt_map[i] ) continue;
if ( !key ) key = files->readers[i].header->id[BCF_DT_ID][fmt_map[i]->id].key;
type = fmt_map[i]->type;
@@ -1279,10 +1428,12 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
bcf_sr_t *reader = &files->readers[i];
bcf_hdr_t *hdr = reader->header;
bcf_fmt_t *fmt_ori = fmt_map[i];
+ bcf1_t *line = maux_get_line(args, i);
+ int irec = ma->buf[i].cur;
if ( fmt_ori )
{
type = fmt_ori->type;
- int nals_ori = reader->buffer[0]->n_allele;
+ int nals_ori = line->n_allele;
if ( length==BCF_VL_G )
{
// if all fields are missing then n==1 is valid
@@ -1315,10 +1466,8 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
ismpl += bcf_hdr_nsamples(hdr); \
continue; \
} \
- assert( ma->has_line[i] ); \
- bcf1_t *line = reader->buffer[0]; \
src_type_t *src = (src_type_t*) fmt_ori->p; \
- if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->d[i][0].als_differ) ) \
+ if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->buf[i].rec[irec].als_differ) ) \
{ \
/* alleles unchanged, copy over */ \
for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
@@ -1360,7 +1509,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
int iori, inew; \
for (iori=0; iori<line->n_allele; iori++) \
{ \
- inew = ma->d[i][0].map[iori]; \
+ inew = ma->buf[i].rec[irec].map[iori]; \
src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + iori; \
tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
if ( src_is_vector_end ) break; \
@@ -1374,10 +1523,10 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
int iori,jori, inew,jnew; \
for (iori=0; iori<line->n_allele; iori++) \
{ \
- inew = ma->d[i][0].map[iori]; \
+ inew = ma->buf[i].rec[irec].map[iori]; \
for (jori=0; jori<=iori; jori++) \
{ \
- jnew = ma->d[i][0].map[jori]; \
+ jnew = ma->buf[i].rec[irec].map[jori]; \
int kori = iori*(iori+1)/2 + jori; \
int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \
src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + kori; \
@@ -1414,7 +1563,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
int iori,inew; \
for (iori=ifrom; iori<line->n_allele; iori++) \
{ \
- inew = ma->d[i][0].map[iori] - ifrom; \
+ inew = ma->buf[i].rec[irec].map[iori] - ifrom; \
tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
if ( src_is_vector_end ) break; \
if ( src_is_missing ) tgt_set_missing; \
@@ -1463,9 +1612,9 @@ void merge_format(args_t *args, bcf1_t *out)
int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ bcf1_t *line = maux_get_line(args,i);
+ if ( !line ) continue;
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
for (j=0; j<line->n_fmt; j++)
{
@@ -1497,9 +1646,10 @@ void merge_format(args_t *args, bcf1_t *out)
ma->fmt_map[ifmt*files->nreaders+i] = fmt;
}
// Check if the allele numbering must be changed
- for (j=1; j<reader->buffer[0]->n_allele; j++)
- if ( ma->d[i][0].map[j]!=j ) break;
- ma->d[i][0].als_differ = j==reader->buffer[0]->n_allele ? 0 : 1;
+ int irec = ma->buf[i].cur;
+ for (j=1; j<line->n_allele; j++)
+ if ( ma->buf[i].rec[irec].map[j]!=j ) break;
+ ma->buf[i].rec[irec].als_differ = j==line->n_allele ? 0 : 1;
}
out->n_sample = bcf_hdr_nsamples(out_hdr);
@@ -1507,203 +1657,383 @@ void merge_format(args_t *args, bcf1_t *out)
merge_GT(args, ma->fmt_map, out);
update_AN_AC(out_hdr, out);
- if ( out->d.info!=ma->inf )
- {
- // hacky, we rely on htslib internals: bcf_update_info() reallocated the info
- ma->inf = out->d.info;
- ma->minf = out->d.m_info;
- }
-
for (i=1; i<=max_ifmt; i++)
merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
out->d.indiv_dirty = 1;
}
-// The core merging function, one or none line from each reader
-void merge_line(args_t *args)
+void gvcf_set_alleles(args_t *args)
+{
+ int i,k;
+ bcf_srs_t *files = args->files;
+ maux_t *maux = args->maux;
+ gvcf_aux_t *gaux = maux->gvcf;
+ maux->nals = 0;
+
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ bcf1_t *line = maux_get_line(args, i);
+ int irec = maux->buf[i].cur;
+
+ hts_expand(int, line->n_allele, maux->buf[i].rec[irec].mmap, maux->buf[i].rec[irec].map);
+ if ( !maux->nals ) // first record, copy the alleles to the output
+ {
+ maux->nals = line->n_allele;
+ hts_expand0(char*, maux->nals, maux->mals, maux->als);
+ hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+ for (k=0; k<maux->nals; k++)
+ {
+ if ( maux->als[k] ) free(maux->als[k]);
+ maux->als[k] = strdup(line->d.allele[k]);
+ maux->buf[i].rec[irec].map[k] = k;
+ }
+ }
+ else
+ {
+ maux->als = merge_alleles(line->d.allele, line->n_allele, maux->buf[i].rec[irec].map, maux->als, &maux->nals, &maux->mals);
+ if ( !maux->als )
+ {
+ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i);
+ error("Failed to merge alleles at %s:%d\n",bcf_seqname(hdr,line),line->pos+1);
+ }
+ }
+ }
+}
+
+/*
+ Output staged gVCF blocks, end is the last position of the block. Assuming
+ gaux[i].active flags are set and maux_get_line returns correct lines.
+*/
+void gvcf_write_block(args_t *args, int start, int end)
{
+ int i;
+ maux_t *maux = args->maux;
+ gvcf_aux_t *gaux = maux->gvcf;
+ assert(gaux);
+
+ // Update POS
+ int min = INT_MAX;
+ char ref = 'N';
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ if ( ref=='N' && gaux[i].line->pos==start ) ref = gaux[i].line->d.allele[0][0];
+ gaux[i].line->pos = start;
+ }
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ if ( gaux[i].end < start )
+ {
+ gaux[i].active = 0;
+ maux->buf[i].cur = -1;
+ continue;
+ }
+ gaux[i].line->d.allele[0][0] = ref;
+ if ( min > gaux[i].end ) min = gaux[i].end;
+ }
+ // Check for valid gVCF blocks in this region
+ if ( min==INT_MAX )
+ {
+ assert(0);
+ maux->gvcf_min = 0;
+ return;
+ }
+
bcf1_t *out = args->out_line;
- bcf_clear1(out);
- out->unpacked = BCF_UN_ALL;
+ gvcf_set_alleles(args);
+
+ // Merge the staged lines
merge_chrom2qual(args, out);
merge_filter(args, out);
merge_info(args, out);
merge_format(args, out);
- bcf_write1(args->out_fh, args->out_hdr, out);
-}
+ if ( args->gvcf_fai && out->d.allele[0][0]=='N' )
+ {
+ int slen = 0;
+ char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen);
+ if (slen)
+ {
+ out->d.allele[0][0] = seq[0];
+ free(seq);
+ }
+ }
+ // Update END boundary
+ if ( end > start )
+ {
+ end++;
+ bcf_update_info_int32(args->out_hdr, out, "END", &end, 1);
+ }
+ else
+ bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0);
+ bcf_write1(args->out_fh, args->out_hdr, out);
+ bcf_clear1(out);
-void debug_buffers(FILE *fp, bcf_srs_t *files);
-void debug_buffer(FILE *fp, bcf_sr_t *reader);
-#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
+ // Inactivate blocks which do not extend beyond END and find new gvcf_min
+ min = INT_MAX;
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ if ( gaux[i].end < end )
+ {
+ gaux[i].active = 0;
+ maux->buf[i].cur = -1;
+ continue;
+ }
+ // next min END position bigger than the current one
+ if ( maux->gvcf_min < gaux[i].end+1 && min > gaux[i].end+1 ) min = gaux[i].end + 1;
+ }
+ maux->gvcf_min = min==INT_MAX ? 0 : min;
+}
-// Clean the reader's buffer to and make it ready for the next next_line() call.
-// Moves finished records (SKIP_DONE flag set) at the end of the buffer and put
-// the rest to the beggining. Then shorten the buffer so that the last element
-// points to the last unfinished record. There are two special cases: the last
-// line of the buffer typically has a different position and must stay at the
-// end; next, the first record of the buffer must be one of those already
-// printed, as it will be discarded by next_line().
-//
-void shake_buffer(maux_t *maux, int ir, int pos)
+/*
+ Flush staged gVCF blocks. Flush everything if there are no more lines
+ (done=1) or if there is a new chromosome. If still on the same chromosome,
+ all hanging blocks must be ended by creating new records:
+ A
+ 1 END=10
+ B
+ 3 END=7
+ C
+ 3 END=5
+ out
+ 1 END=2 A . .
+ 3 END=5 A B C
+ 6 END=7 A B .
+ 8 END=10 A . .
+
+*/
+void gvcf_flush(args_t *args, int done)
{
- bcf_sr_t *reader = &maux->files->readers[ir];
- maux1_t *m = maux->d[ir];
-
- if ( !reader->buffer ) return;
-
int i;
- // FILE *fp = pysam_stdout;
- // fprintf(fp,"<going to shake> nbuf=%d\t", reader->nbuffer); for (i=0; i<reader->nbuffer; i++) fprintf(fp," %d", skip[i]); fprintf(fp,"\n");
- // debug_buffer(fp,reader);
- // fprintf(fp,"--\n");
+ maux_t *maux = args->maux;
- int a = 1, b = reader->nbuffer;
- if ( reader->buffer[b]->pos != pos ) b--; // move the last line separately afterwards
+ if ( !maux->chr ) return; // first time here, nothing to flush
- while ( a<b )
+ int flush_until = INT_MAX;
+ if ( !done )
{
- if ( !(m[a].skip&SKIP_DONE) ) { a++; continue; }
- if ( m[b].skip&SKIP_DONE ) { b--; continue; }
- SWAP(bcf1_t*, reader->buffer[a], reader->buffer[b]);
- SWAP(maux1_t, m[a], m[b]);
- a++;
- b--;
- }
+ // Get current position and chromosome
+ for (i=0; i<maux->n; i++)
+ if ( bcf_sr_has_line(maux->files,i) ) break;
+ bcf1_t *line = bcf_sr_get_line(maux->files,i);
+ bcf_hdr_t *hdr = bcf_sr_get_header(maux->files,i);
- // position $a to the after the first unfinished record
- while ( a<=reader->nbuffer && !(m[a].skip&SKIP_DONE) ) a++;
+ if ( !strcmp(maux->chr,bcf_seqname(hdr,line)) ) flush_until = line->pos; // still on the same chr
+ }
- if ( a<reader->nbuffer )
+ // When called on a region, trim the blocks accordingly
+ int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos;
+ if ( args->regs )
{
- // there is a gap between the unfinished lines at the beggining and the
- // last line. The last line must be brought forward to fill the gap
- if ( reader->buffer[reader->nbuffer]->pos != pos )
+ int rstart = -1, rend = -1;
+ if ( regidx_overlap(args->regs,maux->chr,start,flush_until,args->regs_itr) )
{
- SWAP(bcf1_t*, reader->buffer[a], reader->buffer[reader->nbuffer]);
- SWAP(maux1_t, m[a], m[reader->nbuffer]);
- reader->nbuffer = a;
+ // In case there are multiple regions, we treat them as one
+ rstart = args->regs_itr->beg;
+ while ( regitr_overlap(args->regs_itr) ) rend = args->regs_itr->end;
}
+ if ( rstart > start ) start = rstart;
+ if ( rend < flush_until ) flush_until = rend+1;
}
- if ( !(m[0].skip&SKIP_DONE) && reader->buffer[0]->pos==pos )
+ // output all finished blocks
+ while ( maux->gvcf_min && start < flush_until )
{
- // the first record is unfinished, replace it with an empty line
- // from the end of the buffer or else next_line will remove it
- if ( reader->nbuffer + 1 >= maux->nbuf[ir] )
+ // does the block end before the new line or is it interrupted?
+ int tmp = maux->gvcf_min < flush_until ? maux->gvcf_min : flush_until;
+ if ( start > tmp-1 ) break;
+ gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based
+ start = tmp;
+ }
+}
+
+/*
+ Check incoming lines for new gVCF blocks, set pointer to the current source
+ buffer (gvcf or readers). In contrast to gvcf_flush, this function can be
+ called only after maux_reset as it relies on updated maux buffers.
+*/
+void gvcf_stage(args_t *args, int pos)
+{
+ maux_t *maux = args->maux;
+ gvcf_aux_t *gaux = maux->gvcf;
+ bcf_srs_t *files = args->files;
+ int32_t *end = (int32_t*) maux->tmp_arr;
+ int i, nend = maux->ntmp_arr / sizeof(int32_t);
+
+ maux->gvcf_break = -1;
+ maux->gvcf_min = INT_MAX;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( gaux[i].active )
{
- reader->nbuffer++;
- maux_expand1(maux, ir);
- reader->nbuffer--;
- m = maux->d[ir];
+ // gvcf block should not overlap with another record
+ if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1;
+ maux->buf[i].beg = 0;
+ maux->buf[i].end = 1;
+ maux->buf[i].cur = 0;
+ continue;
}
- if ( reader->nbuffer+1 >= reader->mbuffer )
- error("Uh, did not expect this: %d vs %d\n", reader->nbuffer,reader->mbuffer);
- if ( reader->buffer[reader->nbuffer]->pos!=pos )
+ // Does any of the lines have END set? It is enough to check only the
+ // first line, there should be no duplicate records with END in gVCF
+
+ if ( maux->buf[i].beg==maux->buf[i].end ) continue; // no new record
+
+ int irec = maux->buf[i].beg;
+ bcf_hdr_t *hdr = bcf_sr_get_header(files, i);
+ bcf1_t *line = args->files->readers[i].buffer[irec];
+ int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend);
+ if ( ret==1 )
{
- // 4way swap
- bcf1_t *tmp = reader->buffer[0];
- reader->buffer[0] = reader->buffer[reader->nbuffer+1];
- reader->buffer[reader->nbuffer+1] = reader->buffer[reader->nbuffer];
- reader->buffer[reader->nbuffer] = tmp;
- m[reader->nbuffer].skip = m[0].skip;
- m[reader->nbuffer+1].skip = SKIP_DIFF;
- reader->nbuffer++;
+ // END is set, this is a new gVCF block. Cache this line in gaux[i] and swap with
+ // an empty record: the gaux line must be kept until we reach its END.
+ gaux[i].active = 1;
+ gaux[i].end = end[0] - 1;
+ SWAP(bcf1_t*,args->files->readers[i].buffer[irec],gaux[i].line);
+ gaux[i].line->pos = pos;
+
+ maux->buf[i].lines = &gaux[i].line;
+ maux->buf[i].beg = 0;
+ maux->buf[i].end = 1;
+ maux->buf[i].cur = 0;
+
+ // Set the rid,pos of the swapped line in the buffer or else the
+ // synced reader will have a problem with the next line
+ //
+ args->files->readers[i].buffer[irec]->rid = maux->buf[i].rid;
+ args->files->readers[i].buffer[irec]->pos = maux->pos;
+
+ // Update block offsets
+ if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1;
}
else
- {
- SWAP(bcf1_t*, reader->buffer[0], reader->buffer[reader->nbuffer+1]);
- SWAP(maux1_t, m[0], m[reader->nbuffer+1]);
- }
+ maux->gvcf_break = line->pos; // must break the gvcf block
}
+ maux->ntmp_arr = nend * sizeof(int32_t);
+ maux->tmp_arr = end;
+ if ( maux->gvcf_min==INT_MAX ) maux->gvcf_min = 0;
+}
+
+
+void debug_buffers(FILE *fp, bcf_srs_t *files);
+void debug_buffer(FILE *fp, bcf_srs_t *files, int reader);
+
+/*
+ Flush all buffered and processed records with the same coordinate.
+ Note that synced reader discards buffer[0], so that needs to stay
+ untouched.
+*/
+void clean_buffer(args_t *args)
+{
+ maux_t *ma = args->maux;
+
+ int ir;
+ for (ir=0; ir<ma->n; ir++)
+ {
+ // Invalidate pointer to reader's buffer or else gvcf_flush will attempt
+ // to use the old lines via maux_get_line()
+ if ( ma->gvcf && !ma->gvcf[ir].active ) ma->buf[ir].cur = -1;
- // debug_buffer(fp,reader);
- // fprintf(fp,"<shaken>\t"); for (i=0; i<reader->nbuffer; i++) fprintf(fp," %d", skip[i]);
- // fprintf(fp,"\n\n");
+ bcf_sr_t *reader = bcf_sr_get_reader(args->files,ir);
+ if ( !reader->nbuffer ) continue; // nothing to clean
- // set position of finished buffer[0] line to -1, otherwise swapping may
- // bring it back after next_line()
- reader->buffer[0]->pos = -1;
+ bcf1_t **buf = reader->buffer;
+ if ( buf[1]->rid!=ma->buf[ir].rid || buf[1]->pos!=ma->pos ) continue; // nothing to flush
- // trim the buffer, remove finished lines from the end
- i = reader->nbuffer;
- while ( i>=1 && m[i--].skip&SKIP_DONE )
- reader->nbuffer--;
+ int a = 1, b = 2;
+ while ( b<=reader->nbuffer && buf[b]->rid==ma->buf[ir].rid && buf[b]->pos==ma->pos ) b++;
+ // b now points to the first line we want to preserve
+ while ( b<=reader->nbuffer )
+ {
+ SWAP(bcf1_t*, buf[a], buf[b]);
+ a++; b++;
+ }
+ reader->nbuffer -= b-a;
+ }
}
-void debug_maux(args_t *args, int pos, int var_type)
+void debug_maux(args_t *args)
{
bcf_srs_t *files = args->files;
maux_t *maux = args->maux;
int j,k,l;
- fprintf(pysam_stderr,"Alleles to merge at %d\n", pos+1);
+ fprintf(pysam_stderr,"Alleles to merge at %d, nals=%d\n", maux->pos+1,maux->nals);
for (j=0; j<files->nreaders; j++)
{
bcf_sr_t *reader = &files->readers[j];
+ buffer_t *buf = &maux->buf[j];
fprintf(pysam_stderr," reader %d: ", j);
- for (k=0; k<=reader->nbuffer; k++)
+ for (k=buf->beg; k<buf->end; k++)
{
- if ( maux->d[j][k].skip==SKIP_DONE ) continue;
+ if ( buf->rec[k].skip & SKIP_DONE ) continue;
bcf1_t *line = reader->buffer[k];
- if ( line->pos!=pos ) continue;
fprintf(pysam_stderr,"\t");
- if ( maux->d[j][k].skip ) fprintf(pysam_stderr,"["); // this record will not be merged in this round
+ if ( buf->rec[k].skip ) fprintf(pysam_stderr,"["); // this record will not be merged in this round
for (l=0; l<line->n_allele; l++)
fprintf(pysam_stderr,"%s%s", l==0?"":",", line->d.allele[l]);
- if ( maux->d[j][k].skip ) fprintf(pysam_stderr,"]");
+ if ( buf->rec[k].skip ) fprintf(pysam_stderr,"]");
}
fprintf(pysam_stderr,"\n");
}
fprintf(pysam_stderr," counts: ");
- for (j=0; j<maux->nals; j++) fprintf(pysam_stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); fprintf(pysam_stderr,"\n");
- for (j=0; j<files->nreaders; j++)
- {
- bcf_sr_t *reader = &files->readers[j];
- fprintf(pysam_stderr," out %d: ", j);
- for (k=0; k<=reader->nbuffer; k++)
- {
- if ( maux->d[j][k].skip==SKIP_DONE ) continue;
- bcf1_t *line = reader->buffer[k];
- if ( line->pos!=pos ) continue;
- if ( maux->d[j][k].skip ) continue;
- fprintf(pysam_stderr,"\t");
- for (l=0; l<line->n_allele; l++)
- fprintf(pysam_stderr,"%s%s", l==0?"":",", maux->als[maux->d[j][k].map[l]]);
- }
- fprintf(pysam_stderr,"\n");
- }
- fprintf(pysam_stderr,"\n");
+ for (j=0; j<maux->nals; j++) fprintf(pysam_stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]);
+ fprintf(pysam_stderr,"\n\n");
}
-// Determine which line should be merged from which reader: go through all
-// readers and all buffered lines, expand REF,ALT and try to match lines with
-// the same ALTs. A step towards output independent on input ordering of the
-// lines.
-void merge_buffer(args_t *args)
+
+/*
+ Determine which line should be merged from which reader: go through all
+ readers and all buffered lines, expand REF,ALT and try to match lines with
+ the same ALTs.
+ */
+int can_merge(args_t *args)
{
bcf_srs_t *files = args->files;
- int i, pos = -1, var_type = 0;
- char *id = NULL;
+ int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
maux_t *maux = args->maux;
- maux_reset(maux);
+ gvcf_aux_t *gaux = maux->gvcf;
+ char *id = NULL, ref = 'N';
+ maux->var_types = maux->nals = 0;
- // set the current position
+ int i,j,k, ntodo = 0;
for (i=0; i<files->nreaders; i++)
{
- if ( bcf_sr_has_line(files,i) )
+ buffer_t *buf = &maux->buf[i];
+
+ if ( gaux && gaux[i].active )
{
- bcf1_t *line = bcf_sr_get_line(files,i);
- pos = line->pos;
- var_type = bcf_get_variant_types(line);
- id = line->d.id;
- break;
+ // skip readers with active gvcf blocks
+ buf->rec[buf->beg].skip = SKIP_DIFF;
+ continue;
+ }
+ for (j=buf->beg; j<buf->end; j++)
+ {
+ if ( buf->rec[j].skip & SKIP_DONE ) continue;
+
+ buf->rec[j].skip = SKIP_DIFF;
+ ntodo++;
+
+ if ( args->merge_by_id )
+ id = buf->lines[j]->d.id;
+ else
+ {
+ int var_type = bcf_get_variant_types(buf->lines[j]);
+ maux->var_types |= var_type ? var_type<<1 : 1;
+ }
}
+
+ // for gvcf: find out REF at this position
+ if ( buf->beg < buf->end && ref=='N' )
+ ref = buf->lines[buf->beg]->d.allele[0][0];
}
+ if ( !ntodo ) return 0;
// In this loop we select from each reader compatible candidate lines.
// (i.e. SNPs or indels). Go through all files and all lines at this
@@ -1712,19 +2042,24 @@ void merge_buffer(args_t *args)
for (i=0; i<files->nreaders; i++)
{
bcf_sr_t *reader = &files->readers[i];
- if ( !reader->buffer ) continue;
- int j, k;
- for (j=0; j<=reader->nbuffer; j++)
+ buffer_t *buf = &maux->buf[i];
+
+ if ( gaux && gaux[i].active )
{
- bcf1_t *line = reader->buffer[j];
+ gaux[i].line->d.allele[0][0] = ref;
+ gaux[i].line->pos = maux->pos;
+ }
+
+ for (j=buf->beg; j<buf->end; j++)
+ {
+ if ( buf->rec[j].skip & SKIP_DONE ) continue;
+
+ bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
+
int line_type = bcf_get_variant_types(line);
+ line_type = line_type ? line_type<<1 : 1;
+
// select relevant lines
- maux->d[i][j].skip = SKIP_DIFF;
- if ( pos!=line->pos )
- {
- if ( j==0 ) maux->d[i][j].skip |= SKIP_DONE; // left from previous run, force to ignore
- continue;
- }
if ( args->merge_by_id )
{
if ( strcmp(id,line->d.id) ) continue;
@@ -1735,30 +2070,30 @@ void merge_buffer(args_t *args)
{
// All alleles of the tested record must be present in the
// selected maux record plus variant types must be the same
- if ( var_type!=line->d.var_type ) continue;
+ if ( (maux->var_types & line_type) != line_type ) continue;
if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible
for (k=1; k<line->n_allele; k++)
{
if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break;
}
- if ( k==line->n_allele ) continue; // no matching allele
+ if ( !(line_type&ref_mask) && k==line->n_allele ) continue; // not a REF-only site and there is no matching allele
}
if ( !(args->collapse&COLLAPSE_ANY) )
{
- int compatible = 0;
- if ( line_type==var_type ) compatible = 1;
- else if ( line_type==VCF_REF ) compatible = 1; // REF can go with anything
- else if ( var_type&VCF_SNP && line_type&VCF_SNP ) compatible = 1;
- else if ( var_type&VCF_INDEL && line_type&VCF_INDEL ) compatible = 1;
- else if ( var_type&VCF_MNP && line_type&VCF_MNP ) compatible = 1;
- else if ( var_type&VCF_SNP && line_type&VCF_MNP ) compatible = 1;
- else if ( var_type&VCF_MNP && line_type&VCF_SNP ) compatible = 1;
- if ( !compatible ) continue;
+ // Merge:
+ // - SNPs+SNPs+MNPs+REF if -m both,snps
+ // - indels+indels+REF if -m both,indels, REF only if SNPs are not present
+ // - SNPs come first
+ if ( line_type & indel_mask )
+ {
+ if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first
+ if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks
+ }
}
}
- maux->d[i][j].skip = 0;
+ buf->rec[j].skip = 0;
- hts_expand(int, line->n_allele, maux->d[i][j].mmap, maux->d[i][j].map);
+ hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map);
if ( !maux->nals ) // first record, copy the alleles to the output
{
maux->nals = line->n_allele;
@@ -1766,111 +2101,118 @@ void merge_buffer(args_t *args)
hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
for (k=0; k<maux->nals; k++)
{
+ free(maux->als[k]);
maux->als[k] = strdup(line->d.allele[k]);
- maux->d[i][j].map[k] = k;
+ buf->rec[j].map[k] = k;
maux->cnt[k] = 1;
}
- pos = line->pos;
continue;
}
-
// normalize alleles
- maux->als = merge_alleles(line->d.allele, line->n_allele, maux->d[i][j].map, maux->als, &maux->nals, &maux->mals);
- if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(bcf_sr_get_header(args->files,j),line),line->pos+1,reader->fname);
+ maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals);
+ if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(args->out_hdr,line),line->pos+1,reader->fname);
hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
for (k=1; k<line->n_allele; k++)
- maux->cnt[ maux->d[i][j].map[k] ]++; // how many times an allele appears in the files
+ maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files
maux->cnt[0]++;
}
}
+ return 1;
+}
- // debug_maux(args, pos, var_type);
+/*
+ Select records that have the same alleles; the input ordering of indels
+ must not matter. Multiple VCF lines can be emitted from this loop.
+ We expect only very few alleles and not many records with the same
+ position in the buffers, therefore the nested loops should not slow us
+ much.
+*/
+void stage_line(args_t *args)
+{
+ int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
+ bcf_srs_t *files = args->files;
+ maux_t *maux = args->maux;
- // Select records that have the same alleles; the input ordering of indels
- // must not matter. Multiple VCF lines can be emitted from this loop.
- // We expect only very few alleles and not many records with the same
- // position in the buffers, therefore the nested loops should not slow us
- // much.
- while (1)
+ // debug_maux(args);
+
+ // take the most frequent allele present in multiple files, REF is skipped
+ int i,j,k,icnt = 1;
+ for (i=2; i<maux->nals; i++)
+ if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
+
+ int nout = 0;
+ for (i=0; i<files->nreaders; i++)
{
- // take the most frequent allele present in multiple files
- int icnt = 0;
- for (i=1; i<maux->nals; i++)
- if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
- if ( maux->cnt[icnt]<0 ) break;
+ buffer_t *buf = &maux->buf[i];
+ buf->cur = -1;
+ if ( buf->beg >= buf->end ) continue; // no lines in the buffer
- int nmask = 0;
- for (i=0; i<files->nreaders; i++)
+ // find lines with the same allele
+ for (j=buf->beg; j<buf->end; j++)
{
- maux->has_line[i] = 0;
+ if ( buf->rec[j].skip ) continue; // done or not compatible
+ if ( args->merge_by_id ) break;
+ if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break; // REF-only record
- bcf_sr_t *reader = &files->readers[i];
- if ( !reader->buffer ) continue;
+ for (k=0; k<buf->lines[j]->n_allele; k++)
+ if ( icnt==buf->rec[j].map[k] ) break;
- // find lines with the same allele
- int j;
- for (j=0; j<=reader->nbuffer; j++)
- {
- if ( maux->d[i][j].skip ) continue;
- int k;
- for (k=0; k<reader->buffer[j]->n_allele; k++)
- if ( icnt==maux->d[i][j].map[k] ) break;
- if ( k<reader->buffer[j]->n_allele ) break;
- }
- if ( j>reader->nbuffer )
- {
- // no matching allele found in this file
- if ( args->collapse==COLLAPSE_NONE ) continue;
+ if ( k<buf->lines[j]->n_allele ) break;
+ }
+ if ( j>=buf->end )
+ {
+ // no matching allele found in this file
+ if ( args->collapse==COLLAPSE_NONE ) continue;
- for (j=0; j<=reader->nbuffer; j++)
+ for (j=buf->beg; j<buf->end; j++)
+ {
+ if ( buf->rec[j].skip ) continue; // done or not compatible
+ if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged
+ int line_type = bcf_get_variant_types(buf->lines[j]);
+ if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( line_type==VCF_REF )
{
- if ( maux->d[i][j].skip ) continue;
- if ( args->collapse&COLLAPSE_ANY ) break;
- int line_type = bcf_get_variant_types(reader->buffer[j]);
- if ( var_type&VCF_SNP && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( var_type&VCF_INDEL && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- if ( line_type==VCF_REF )
- {
- if ( var_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( var_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- }
- else if ( var_type==VCF_REF )
- {
- if ( line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- }
+ if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( maux->var_types&ref_mask ) break;
}
- }
- if ( j<=reader->nbuffer )
- {
- // found a suitable line for merging, place it at the beggining
- if ( j>0 )
+ else if ( maux->var_types&ref_mask )
{
- SWAP(bcf1_t*, reader->buffer[0], reader->buffer[j]);
- SWAP(maux1_t, maux->d[i][0], maux->d[i][j]);
+ if ( line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
}
- // mark as finished so that it's ignored next time
- maux->d[i][0].skip |= SKIP_DONE;
- maux->has_line[i] = 1;
- nmask++;
}
}
- if ( !nmask ) break; // done, no more lines suitable for merging found
- merge_line(args); // merge and output the line
- maux->cnt[icnt] = -1; // do not pick this allele again, mark it as finished
+ if ( j<buf->end )
+ {
+ // found a suitable line for merging
+ buf->cur = j;
+
+ // mark as finished so that it's ignored next time
+ buf->rec[j].skip = SKIP_DONE;
+ nout++;
+ }
}
+ assert( nout );
+}
- // clean the alleles
- for (i=0; i<maux->nals; i++)
+void merge_line(args_t *args)
+{
+ if ( args->regs )
{
- free(maux->als[i]);
- maux->als[i] = 0;
+ if ( !regidx_overlap(args->regs,args->maux->chr,args->maux->pos,args->maux->pos,NULL) ) return;
}
- maux->nals = 0;
- // get the buffers ready for the next next_line() call
- for (i=0; i<files->nreaders; i++)
- shake_buffer(maux, i, pos);
+ bcf1_t *out = args->out_line;
+ merge_chrom2qual(args, out);
+ merge_filter(args, out);
+ merge_info(args, out);
+ if ( args->do_gvcf )
+ bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0);
+ merge_format(args, out);
+ bcf_write1(args->out_fh, args->out_hdr, out);
+ bcf_clear1(out);
}
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd)
@@ -1889,6 +2231,8 @@ void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *c
else
ksprintf(&str, " %s", argv[i]);
}
+ kputs("; Date=", &str);
+ time_t tm; time(&tm); kputs(ctime(&tm), &str);
kputc('\n', &str);
bcf_hdr_append(hdr,str.s);
free(str.s);
@@ -1900,7 +2244,7 @@ void merge_vcf(args_t *args)
{
args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
- if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads);
args->out_hdr = bcf_hdr_init("w");
if ( args->header_fname )
@@ -1930,14 +2274,32 @@ void merge_vcf(args_t *args)
}
if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init();
- args->maux = maux_init(args->files);
+ args->maux = maux_init(args);
args->out_line = bcf_init1();
args->tmph = kh_init(strdict);
- int ret;
- while ( (ret=bcf_sr_next_line(args->files)) )
+
+ while ( bcf_sr_next_line(args->files) )
{
- merge_buffer(args);
+ // output cached gVCF blocks which end before the new record
+ if ( args->do_gvcf )
+ gvcf_flush(args,0);
+
+ maux_reset(args->maux);
+
+ // determine which of the new records are gvcf blocks
+ if ( args->do_gvcf )
+ gvcf_stage(args, args->maux->pos);
+
+ while ( can_merge(args) )
+ {
+ stage_line(args);
+ merge_line(args);
+ }
+ clean_buffer(args);
}
+ if ( args->do_gvcf )
+ gvcf_flush(args,1);
+
info_rules_destroy(args);
maux_destroy(args->maux);
bcf_hdr_destroy(args->out_hdr);
@@ -1960,7 +2322,10 @@ static void usage(void)
fprintf(pysam_stderr, " --force-samples resolve duplicate sample names\n");
fprintf(pysam_stderr, " --print-header print only the merged header and exit\n");
fprintf(pysam_stderr, " --use-header <file> use the provided header\n");
+ fprintf(pysam_stderr, " -0 --missing-to-ref assume genotypes at missing sites are 0/0\n");
fprintf(pysam_stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(pysam_stderr, " -F, --filter-logic <x|+> remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
+ fprintf(pysam_stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
fprintf(pysam_stderr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
fprintf(pysam_stderr, " -l, --file-list <file> read file names from the file\n");
fprintf(pysam_stderr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
@@ -1991,7 +2356,9 @@ int main_vcfmerge(int argc, char *argv[])
{
{"help",no_argument,NULL,'h'},
{"merge",required_argument,NULL,'m'},
+ {"gvcf",required_argument,NULL,'g'},
{"file-list",required_argument,NULL,'l'},
+ {"missing-to-ref",no_argument,NULL,'0'},
{"apply-filters",required_argument,NULL,'f'},
{"use-header",required_argument,NULL,1},
{"print-header",no_argument,NULL,2},
@@ -2003,10 +2370,25 @@ int main_vcfmerge(int argc, char *argv[])
{"regions-file",required_argument,NULL,'R'},
{"info-rules",required_argument,NULL,'i'},
{"no-version",no_argument,NULL,8},
+ {"filter-logic",required_argument,NULL,'F'},
{NULL,0,NULL,0}
};
- while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0",loptions,NULL)) >= 0) {
switch (c) {
+ case 'F':
+ if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD;
+ else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE;
+ else error("Filter logic not recognised: %s\n", optarg);
+ break;
+ case '0': args->missing_to_ref = 1; break;
+ case 'g':
+ args->do_gvcf = 1;
+ if ( strcmp("-",optarg) )
+ {
+ args->gvcf_fai = fai_load(optarg);
+ if ( !args->gvcf_fai ) error("Failed to load the fai index: %s\n", optarg);
+ }
+ break;
case 'l': args->file_list = optarg; break;
case 'i': args->info_rules = optarg; break;
case 'o': args->output_fname = optarg; break;
@@ -2047,9 +2429,23 @@ int main_vcfmerge(int argc, char *argv[])
if ( argc-optind<2 && !args->file_list ) usage();
args->files->require_index = 1;
- if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
- error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ if ( regions_is_file )
+ args->regs = regidx_init(args->regions_list,NULL,NULL,sizeof(char*),NULL);
+ else
+ {
+ args->regs = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
+ if ( regidx_insert_list(args->regs,args->regions_list,',') !=0 ) error("Could not parse the regions: %s\n", args->regions_list);
+ regidx_insert(args->regs,NULL);
+ }
+ if ( !args->regs ) error("Could not parse the regions: %s\n", args->regions_list);
+ args->regs_itr = regitr_init(args->regs);
+ }
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
while (optind<argc)
{
if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
@@ -2067,6 +2463,9 @@ int main_vcfmerge(int argc, char *argv[])
}
merge_vcf(args);
bcf_sr_destroy(args->files);
+ if ( args->regs ) regidx_destroy(args->regs);
+ if ( args->regs_itr ) regitr_destroy(args->regs_itr);
+ if ( args->gvcf_fai ) fai_destroy(args->gvcf_fai);
free(args);
return 0;
}
diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c
index 781833c..3a1706b 100644
--- a/bcftools/vcfnorm.c
+++ b/bcftools/vcfnorm.c
@@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
@@ -87,10 +88,21 @@ static inline int replace_iupac_codes(char *seq, int nseq)
for (i=0; i<nseq; i++)
{
char c = toupper(seq[i]);
- if ( c!='A' && c!='C' && c!='G' && c!='T' ) { seq[i] = 'N'; n++; }
+ if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) { seq[i] = 'N'; n++; }
}
return n;
}
+static inline int has_non_acgtn(char *seq, int nseq)
+{
+ char *end = nseq ? seq + nseq : seq + UINT32_MAX; // arbitrary large number
+ while ( *seq && seq<end )
+ {
+ char c = toupper(*seq);
+ if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) return 1;
+ seq++;
+ }
+ return 0;
+}
static void fix_ref(args_t *args, bcf1_t *line)
{
@@ -248,10 +260,11 @@ static void fix_dup_alt(args_t *args, bcf1_t *line)
if ( changed ) bcf_update_genotypes(args->hdr,line,gts,ngts);
}
-#define ERR_DUP_ALLELE -2
-#define ERR_REF_MISMATCH -1
-#define ERR_OK 0
-#define ERR_SYMBOLIC 1
+#define ERR_DUP_ALLELE -2
+#define ERR_REF_MISMATCH -1
+#define ERR_OK 0
+#define ERR_SYMBOLIC 1
+#define ERR_SPANNING_DELETION 2
static int realign(args_t *args, bcf1_t *line)
{
@@ -261,13 +274,17 @@ static int realign(args_t *args, bcf1_t *line)
int i, nref, reflen = strlen(line->d.allele[0]);
char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
- replace_iupac_codes(ref,nref);
+ replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N
- // does REF contain non-standard bases?
- if ( replace_iupac_codes(line->d.allele[0],reflen) )
+ // does VCF REF contain non-standard bases?
+ if ( has_non_acgtn(line->d.allele[0],reflen) )
{
- args->nchanged++;
- bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ if ( args->check_ref==CHECK_REF_EXIT )
+ error("Non-ACGTN reference allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]);
+ if ( args->check_ref & CHECK_REF_WARN )
+ fprintf(stderr,"NON_ACGTN_REF\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]);
+ free(ref);
+ return ERR_REF_MISMATCH;
}
if ( strcasecmp(ref,line->d.allele[0]) )
{
@@ -289,6 +306,16 @@ static int realign(args_t *args, bcf1_t *line)
for (i=0; i<line->n_allele; i++)
{
if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele
+ if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion
+ if ( bcf_get_variant_type(line,i)==VCF_BND ) return ERR_SYMBOLIC; // breakend, not an error
+ if ( has_non_acgtn(line->d.allele[i],0) )
+ {
+ if ( args->check_ref==CHECK_REF_EXIT )
+ error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]);
+ if ( args->check_ref & CHECK_REF_WARN )
+ fprintf(stderr,"NON_ACGTN_ALT\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[i]);
+ return ERR_REF_MISMATCH;
+ }
als[i].l = 0;
kputs(line->d.allele[i], &als[i]);
@@ -390,18 +417,24 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
int len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key); \
if ( len==BCF_VL_A ) \
{ \
- assert( ret==src->n_allele-1); \
+ if ( ret!=src->n_allele-1 ) \
+ error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele-1,ret); \
bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \
} \
else if ( len==BCF_VL_R ) \
{ \
- assert( ret==src->n_allele); \
+ if ( ret!=src->n_allele ) \
+ error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele,ret); \
if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
bcf_update_info_##type(args->hdr,dst,tag,vals,2); \
} \
else if ( len==BCF_VL_G ) \
{ \
- assert( ret==src->n_allele*(src->n_allele+1)/2 ); \
+ if ( ret!=src->n_allele*(src->n_allele+1)/2 ) \
+ error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \
if ( ialt!=0 ) \
{ \
vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \
@@ -545,7 +578,9 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
} \
if ( len==BCF_VL_A ) \
{ \
- assert( nvals==(src->n_allele-1)*nsmpl); \
+ if ( nvals!=(src->n_allele-1)*nsmpl ) \
+ error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,(src->n_allele-1)*nsmpl,nvals); \
nvals /= nsmpl; \
type_t *src_vals = vals, *dst_vals = vals; \
for (i=0; i<nsmpl; i++) \
@@ -558,7 +593,9 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
} \
else if ( len==BCF_VL_R ) \
{ \
- assert( nvals==src->n_allele*nsmpl); \
+ if ( nvals!=src->n_allele*nsmpl ) \
+ error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*nsmpl,nvals); \
nvals /= nsmpl; \
type_t *src_vals = vals, *dst_vals = vals; \
for (i=0; i<nsmpl; i++) \
@@ -682,7 +719,10 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
if ( *se==',' ) nfields++;
se++;
}
- assert( nfields==src->n_allele*(src->n_allele+1)/2 || nfields==src->n_allele );
+ if ( nfields!=src->n_allele*(src->n_allele+1)/2 && nfields!=src->n_allele )
+ error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d or %d, found %d\n",
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields);
+
int len = 0;
if ( nfields==src->n_allele ) // haploid
{
@@ -994,7 +1034,7 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
else
{
int ial = bcf_gt_allele(gt2[k]);
- assert( ial<args->maps[i].nals );
+ if ( ial>=args->maps[i].nals ) error("Error at %s:%d: incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1,ial);
gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]);
}
}
@@ -1583,7 +1623,8 @@ static void normalize_vcf(args_t *args)
{
htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
- if ( args->n_threads ) hts_set_threads(out, args->n_threads);
+ if ( args->n_threads )
+ hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p);
if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
bcf_hdr_write(out, args->hdr);
@@ -1666,7 +1707,7 @@ static void usage(void)
fprintf(stderr, " -c, --check-ref <e|w|x|s> check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
fprintf(stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n");
fprintf(stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|any\n");
- fprintf(stderr, " -f, --fasta-ref <file> reference sequence\n");
+ fprintf(stderr, " -f, --fasta-ref <file> reference sequence (MANDATORY)\n");
fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n");
@@ -1677,7 +1718,7 @@ static void usage(void)
fprintf(stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, " --threads <int> number of extra (de)compression threads [0]\n");
fprintf(stderr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(stderr, "\n");
exit(1);
@@ -1804,6 +1845,7 @@ int main_vcfnorm(int argc, char *argv[])
error("Failed to read the targets: %s\n", args->targets);
}
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n");
init_data(args);
diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c
index 200ce79..da5a2aa 100644
--- a/bcftools/vcfnorm.c.pysam.c
+++ b/bcftools/vcfnorm.c.pysam.c
@@ -25,6 +25,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
@@ -89,10 +90,21 @@ static inline int replace_iupac_codes(char *seq, int nseq)
for (i=0; i<nseq; i++)
{
char c = toupper(seq[i]);
- if ( c!='A' && c!='C' && c!='G' && c!='T' ) { seq[i] = 'N'; n++; }
+ if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) { seq[i] = 'N'; n++; }
}
return n;
}
+static inline int has_non_acgtn(char *seq, int nseq)
+{
+ char *end = nseq ? seq + nseq : seq + UINT32_MAX; // arbitrary large number
+ while ( *seq && seq<end )
+ {
+ char c = toupper(*seq);
+ if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) return 1;
+ seq++;
+ }
+ return 0;
+}
static void fix_ref(args_t *args, bcf1_t *line)
{
@@ -250,10 +262,11 @@ static void fix_dup_alt(args_t *args, bcf1_t *line)
if ( changed ) bcf_update_genotypes(args->hdr,line,gts,ngts);
}
-#define ERR_DUP_ALLELE -2
-#define ERR_REF_MISMATCH -1
-#define ERR_OK 0
-#define ERR_SYMBOLIC 1
+#define ERR_DUP_ALLELE -2
+#define ERR_REF_MISMATCH -1
+#define ERR_OK 0
+#define ERR_SYMBOLIC 1
+#define ERR_SPANNING_DELETION 2
static int realign(args_t *args, bcf1_t *line)
{
@@ -263,13 +276,17 @@ static int realign(args_t *args, bcf1_t *line)
int i, nref, reflen = strlen(line->d.allele[0]);
char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
- replace_iupac_codes(ref,nref);
+ replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N
- // does REF contain non-standard bases?
- if ( replace_iupac_codes(line->d.allele[0],reflen) )
+ // does VCF REF contain non-standard bases?
+ if ( has_non_acgtn(line->d.allele[0],reflen) )
{
- args->nchanged++;
- bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ if ( args->check_ref==CHECK_REF_EXIT )
+ error("Non-ACGTN reference allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]);
+ if ( args->check_ref & CHECK_REF_WARN )
+ fprintf(pysam_stderr,"NON_ACGTN_REF\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]);
+ free(ref);
+ return ERR_REF_MISMATCH;
}
if ( strcasecmp(ref,line->d.allele[0]) )
{
@@ -291,6 +308,16 @@ static int realign(args_t *args, bcf1_t *line)
for (i=0; i<line->n_allele; i++)
{
if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele
+ if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion
+ if ( bcf_get_variant_type(line,i)==VCF_BND ) return ERR_SYMBOLIC; // breakend, not an error
+ if ( has_non_acgtn(line->d.allele[i],0) )
+ {
+ if ( args->check_ref==CHECK_REF_EXIT )
+ error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]);
+ if ( args->check_ref & CHECK_REF_WARN )
+ fprintf(pysam_stderr,"NON_ACGTN_ALT\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[i]);
+ return ERR_REF_MISMATCH;
+ }
als[i].l = 0;
kputs(line->d.allele[i], &als[i]);
@@ -392,18 +419,24 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
int len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key); \
if ( len==BCF_VL_A ) \
{ \
- assert( ret==src->n_allele-1); \
+ if ( ret!=src->n_allele-1 ) \
+ error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele-1,ret); \
bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \
} \
else if ( len==BCF_VL_R ) \
{ \
- assert( ret==src->n_allele); \
+ if ( ret!=src->n_allele ) \
+ error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele,ret); \
if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
bcf_update_info_##type(args->hdr,dst,tag,vals,2); \
} \
else if ( len==BCF_VL_G ) \
{ \
- assert( ret==src->n_allele*(src->n_allele+1)/2 ); \
+ if ( ret!=src->n_allele*(src->n_allele+1)/2 ) \
+ error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \
if ( ialt!=0 ) \
{ \
vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \
@@ -547,7 +580,9 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
} \
if ( len==BCF_VL_A ) \
{ \
- assert( nvals==(src->n_allele-1)*nsmpl); \
+ if ( nvals!=(src->n_allele-1)*nsmpl ) \
+ error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,(src->n_allele-1)*nsmpl,nvals); \
nvals /= nsmpl; \
type_t *src_vals = vals, *dst_vals = vals; \
for (i=0; i<nsmpl; i++) \
@@ -560,7 +595,9 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
} \
else if ( len==BCF_VL_R ) \
{ \
- assert( nvals==src->n_allele*nsmpl); \
+ if ( nvals!=src->n_allele*nsmpl ) \
+ error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*nsmpl,nvals); \
nvals /= nsmpl; \
type_t *src_vals = vals, *dst_vals = vals; \
for (i=0; i<nsmpl; i++) \
@@ -684,7 +721,10 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
if ( *se==',' ) nfields++;
se++;
}
- assert( nfields==src->n_allele*(src->n_allele+1)/2 || nfields==src->n_allele );
+ if ( nfields!=src->n_allele*(src->n_allele+1)/2 && nfields!=src->n_allele )
+ error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d or %d, found %d\n",
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields);
+
int len = 0;
if ( nfields==src->n_allele ) // haploid
{
@@ -996,7 +1036,7 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
else
{
int ial = bcf_gt_allele(gt2[k]);
- assert( ial<args->maps[i].nals );
+ if ( ial>=args->maps[i].nals ) error("Error at %s:%d: incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1,ial);
gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]);
}
}
@@ -1585,7 +1625,8 @@ static void normalize_vcf(args_t *args)
{
htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
- if ( args->n_threads ) hts_set_threads(out, args->n_threads);
+ if ( args->n_threads )
+ hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p);
if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
bcf_hdr_write(out, args->hdr);
@@ -1668,7 +1709,7 @@ static void usage(void)
fprintf(pysam_stderr, " -c, --check-ref <e|w|x|s> check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
fprintf(pysam_stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n");
fprintf(pysam_stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|any\n");
- fprintf(pysam_stderr, " -f, --fasta-ref <file> reference sequence\n");
+ fprintf(pysam_stderr, " -f, --fasta-ref <file> reference sequence (MANDATORY)\n");
fprintf(pysam_stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n");
fprintf(pysam_stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n");
@@ -1679,7 +1720,7 @@ static void usage(void)
fprintf(pysam_stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(pysam_stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra (de)compression threads [0]\n");
fprintf(pysam_stderr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(pysam_stderr, "\n");
exit(1);
@@ -1806,6 +1847,7 @@ int main_vcfnorm(int argc, char *argv[])
error("Failed to read the targets: %s\n", args->targets);
}
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n");
init_data(args);
diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c
index 87a773f..bfd6ad2 100644
--- a/bcftools/vcfplugin.c
+++ b/bcftools/vcfplugin.c
@@ -1,6 +1,6 @@
/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
- Copyright (C) 2013-2015 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
@@ -47,7 +48,7 @@ typedef struct _plugin_t plugin_t;
* Plugin API:
* ----------
* const char *about(void)
- * - short description used by 'bcftools plugin -l'
+ * - short description used by 'bcftools plugin -lv'
*
* const char *usage(void)
* - longer description used by 'bcftools +name -h'
@@ -170,11 +171,11 @@ static void add_plugin_paths(args_t *args, const char *path)
args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1));
args->plugin_paths[args->nplugin_paths] = dir;
args->nplugin_paths++;
- if ( args->verbose ) fprintf(stderr, "plugin directory %s .. ok\n", dir);
+ if ( args->verbose > 1 ) fprintf(stderr, "plugin directory %s .. ok\n", dir);
}
else
{
- if ( args->verbose ) fprintf(stderr, "plugin directory %s .. %s\n", dir, strerror(errno));
+ if ( args->verbose > 1 ) fprintf(stderr, "plugin directory %s .. %s\n", dir, strerror(errno));
free(dir);
}
@@ -210,7 +211,7 @@ static void *dlopen_plugin(args_t *args, const char *fname)
{
tmp = msprintf("%s/%s.so", args->plugin_paths[i],fname);
handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though
- if ( args->verbose )
+ if ( args->verbose > 1 )
{
if ( !handle ) fprintf(stderr,"%s:\n\tdlopen .. %s\n", tmp,dlerror());
else fprintf(stderr,"%s:\n\tdlopen .. ok\n", tmp);
@@ -221,7 +222,7 @@ static void *dlopen_plugin(args_t *args, const char *fname)
}
handle = dlopen(fname, RTLD_NOW);
- if ( args->verbose )
+ if ( args->verbose > 1 )
{
if ( !handle ) fprintf(stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror());
else fprintf(stderr,"%s:\n\tdlopen .. ok\n", fname);
@@ -266,19 +267,19 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi
if ( ret )
plugin->init = NULL;
else
- if ( args->verbose ) fprintf(stderr,"\tinit .. ok\n");
+ if ( args->verbose > 1 ) fprintf(stderr,"\tinit .. ok\n");
plugin->run = (dl_run_f) dlsym(plugin->handle, "run");
ret = dlerror();
if ( ret )
plugin->run = NULL;
else
- if ( args->verbose ) fprintf(stderr,"\trun .. ok\n");
+ if ( args->verbose > 1 ) fprintf(stderr,"\trun .. ok\n");
if ( !plugin->init && !plugin->run )
{
if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name);
- else if ( args->verbose ) fprintf(stderr,"\tinit/run .. not found\n");
+ else if ( args->verbose > 1 ) fprintf(stderr,"\tinit/run .. not found\n");
return -1;
}
@@ -287,7 +288,7 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi
if ( ret )
{
if ( exit_on_error ) error("Could not initialize %s, version string not found\n", plugin->name);
- else if ( args->verbose ) fprintf(stderr,"\tversion .. not found\n");
+ else if ( args->verbose > 1 ) fprintf(stderr,"\tversion .. not found\n");
return -1;
}
@@ -392,8 +393,13 @@ static int list_plugins(args_t *args)
qsort(plugins, nplugins, sizeof(plugins[0]), cmp_plugin_name);
for (i=0; i<nplugins; i++)
- printf("\n-- %s --\n%s", plugins[i].name, plugins[i].about());
- printf("\n");
+ {
+ if ( args->verbose )
+ printf("\n-- %s --\n%s", plugins[i].name, plugins[i].about());
+ else
+ printf("%s\n", plugins[i].name);
+ }
+ if ( args->verbose ) printf("\n");
}
else
print_plugin_usage_hint();
@@ -460,12 +466,33 @@ static void usage(args_t *args)
fprintf(stderr, "Plugin options:\n");
fprintf(stderr, " -h, --help list plugin's options\n");
fprintf(stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
- fprintf(stderr, " -v, --verbose print debugging information on plugin failure\n");
+ fprintf(stderr, " -v, --verbose print verbose information, -vv increases verbosity\n");
fprintf(stderr, " -V, --version print version string and exit\n");
fprintf(stderr, "\n");
exit(1);
}
+static int is_verbose(int argc, char *argv[])
+{
+ int c, verbose = 0, opterr_ori = opterr;
+ static struct option loptions[] =
+ {
+ {"verbose",no_argument,NULL,'v'},
+ {NULL,0,NULL,0}
+ };
+ opterr = 0;
+ while ((c = getopt_long(argc, argv, "-v",loptions,NULL)) >= 0)
+ {
+ switch (c) {
+ case 'v': verbose++; break;
+ case 1:
+ default: break;
+ }
+ }
+ opterr = opterr_ori;
+ optind = 0;
+ return verbose;
+}
int main_plugin(int argc, char *argv[])
{
int c;
@@ -483,6 +510,7 @@ int main_plugin(int argc, char *argv[])
char *plugin_name = NULL;
if ( argv[1][0]!='-' )
{
+ args->verbose = is_verbose(argc, argv);
plugin_name = argv[1];
argc--;
argv++;
@@ -518,7 +546,7 @@ int main_plugin(int argc, char *argv[])
{
switch (c) {
case 'V': version_only = 1; break;
- case 'v': args->verbose = 1; break;
+ case 'v': args->verbose++; break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c
index 8365f7e..ec1d586 100644
--- a/bcftools/vcfplugin.c.pysam.c
+++ b/bcftools/vcfplugin.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
- Copyright (C) 2013-2015 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -25,6 +25,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
@@ -49,7 +50,7 @@ typedef struct _plugin_t plugin_t;
* Plugin API:
* ----------
* const char *about(void)
- * - short description used by 'bcftools plugin -l'
+ * - short description used by 'bcftools plugin -lv'
*
* const char *usage(void)
* - longer description used by 'bcftools +name -h'
@@ -172,11 +173,11 @@ static void add_plugin_paths(args_t *args, const char *path)
args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1));
args->plugin_paths[args->nplugin_paths] = dir;
args->nplugin_paths++;
- if ( args->verbose ) fprintf(pysam_stderr, "plugin directory %s .. ok\n", dir);
+ if ( args->verbose > 1 ) fprintf(pysam_stderr, "plugin directory %s .. ok\n", dir);
}
else
{
- if ( args->verbose ) fprintf(pysam_stderr, "plugin directory %s .. %s\n", dir, strerror(errno));
+ if ( args->verbose > 1 ) fprintf(pysam_stderr, "plugin directory %s .. %s\n", dir, strerror(errno));
free(dir);
}
@@ -212,7 +213,7 @@ static void *dlopen_plugin(args_t *args, const char *fname)
{
tmp = msprintf("%s/%s.so", args->plugin_paths[i],fname);
handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though
- if ( args->verbose )
+ if ( args->verbose > 1 )
{
if ( !handle ) fprintf(pysam_stderr,"%s:\n\tdlopen .. %s\n", tmp,dlerror());
else fprintf(pysam_stderr,"%s:\n\tdlopen .. ok\n", tmp);
@@ -223,7 +224,7 @@ static void *dlopen_plugin(args_t *args, const char *fname)
}
handle = dlopen(fname, RTLD_NOW);
- if ( args->verbose )
+ if ( args->verbose > 1 )
{
if ( !handle ) fprintf(pysam_stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror());
else fprintf(pysam_stderr,"%s:\n\tdlopen .. ok\n", fname);
@@ -268,19 +269,19 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi
if ( ret )
plugin->init = NULL;
else
- if ( args->verbose ) fprintf(pysam_stderr,"\tinit .. ok\n");
+ if ( args->verbose > 1 ) fprintf(pysam_stderr,"\tinit .. ok\n");
plugin->run = (dl_run_f) dlsym(plugin->handle, "run");
ret = dlerror();
if ( ret )
plugin->run = NULL;
else
- if ( args->verbose ) fprintf(pysam_stderr,"\trun .. ok\n");
+ if ( args->verbose > 1 ) fprintf(pysam_stderr,"\trun .. ok\n");
if ( !plugin->init && !plugin->run )
{
if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name);
- else if ( args->verbose ) fprintf(pysam_stderr,"\tinit/run .. not found\n");
+ else if ( args->verbose > 1 ) fprintf(pysam_stderr,"\tinit/run .. not found\n");
return -1;
}
@@ -289,7 +290,7 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi
if ( ret )
{
if ( exit_on_error ) error("Could not initialize %s, version string not found\n", plugin->name);
- else if ( args->verbose ) fprintf(pysam_stderr,"\tversion .. not found\n");
+ else if ( args->verbose > 1 ) fprintf(pysam_stderr,"\tversion .. not found\n");
return -1;
}
@@ -394,8 +395,13 @@ static int list_plugins(args_t *args)
qsort(plugins, nplugins, sizeof(plugins[0]), cmp_plugin_name);
for (i=0; i<nplugins; i++)
- fprintf(pysam_stdout, "\n-- %s --\n%s", plugins[i].name, plugins[i].about());
- fprintf(pysam_stdout, "\n");
+ {
+ if ( args->verbose )
+ fprintf(pysam_stdout, "\n-- %s --\n%s", plugins[i].name, plugins[i].about());
+ else
+ fprintf(pysam_stdout, "%s\n", plugins[i].name);
+ }
+ if ( args->verbose ) fprintf(pysam_stdout, "\n");
}
else
print_plugin_usage_hint();
@@ -462,12 +468,33 @@ static void usage(args_t *args)
fprintf(pysam_stderr, "Plugin options:\n");
fprintf(pysam_stderr, " -h, --help list plugin's options\n");
fprintf(pysam_stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
- fprintf(pysam_stderr, " -v, --verbose print debugging information on plugin failure\n");
+ fprintf(pysam_stderr, " -v, --verbose print verbose information, -vv increases verbosity\n");
fprintf(pysam_stderr, " -V, --version print version string and exit\n");
fprintf(pysam_stderr, "\n");
exit(1);
}
+static int is_verbose(int argc, char *argv[])
+{
+ int c, verbose = 0, opterr_ori = opterr;
+ static struct option loptions[] =
+ {
+ {"verbose",no_argument,NULL,'v'},
+ {NULL,0,NULL,0}
+ };
+ opterr = 0;
+ while ((c = getopt_long(argc, argv, "-v",loptions,NULL)) >= 0)
+ {
+ switch (c) {
+ case 'v': verbose++; break;
+ case 1:
+ default: break;
+ }
+ }
+ opterr = opterr_ori;
+ optind = 0;
+ return verbose;
+}
int main_plugin(int argc, char *argv[])
{
int c;
@@ -485,6 +512,7 @@ int main_plugin(int argc, char *argv[])
char *plugin_name = NULL;
if ( argv[1][0]!='-' )
{
+ args->verbose = is_verbose(argc, argv);
plugin_name = argv[1];
argc--;
argv++;
@@ -520,7 +548,7 @@ int main_plugin(int argc, char *argv[])
{
switch (c) {
case 'V': version_only = 1; break;
- case 'v': args->verbose = 1; break;
+ case 'v': args->verbose++; break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
diff --git a/bcftools/vcfroh.c b/bcftools/vcfroh.c
index 9560559..9437d7e 100644
--- a/bcftools/vcfroh.c
+++ b/bcftools/vcfroh.c
@@ -30,12 +30,19 @@ THE SOFTWARE. */
#include <htslib/synced_bcf_reader.h>
#include <htslib/kstring.h>
#include <htslib/kseq.h>
+#include <htslib/bgzf.h>
+#include <errno.h>
#include "bcftools.h"
#include "HMM.h"
+#include "smpl_ilist.h"
#define STATE_HW 0 // normal state, follows Hardy-Weinberg allele frequencies
#define STATE_AZ 1 // autozygous state
+#define OUTPUT_ST (1<<1)
+#define OUTPUT_RG (1<<2)
+#define OUTPUT_GZ (1<<3)
+
/** Genetic map */
typedef struct
{
@@ -44,6 +51,24 @@ typedef struct
}
genmap_t;
+/** HMM data for each sample */
+typedef struct
+{
+ double *eprob; // emission probs [2*nsites,msites]
+ uint32_t *sites; // positions [nsites,msites]
+ int nsites, msites;
+ int igenmap; // current position in genmap
+ int nused; // some stats to detect if things didn't go wrong
+ int nrid, *rid, *rid_off; // for viterbi training, keep all chromosomes
+ void *snapshot; // hmm snapshot
+ struct {
+ uint32_t beg,end,nqual;
+ double qual;
+ int rid, state;
+ } rg;
+}
+smpl_t;
+
typedef struct _args_t
{
bcf_srs_t *files;
@@ -57,29 +82,32 @@ typedef struct _args_t
double rec_rate; // constant recombination rate if > 0
hmm_t *hmm;
- double *eprob; // emission probs [2*nsites,msites]
- uint32_t *sites; // positions [nsites,msites]
- int nsites, msites;
+ double baum_welch_th;
int nrids, *rids, *rid_offs; // multiple chroms with vi_training
+ int nbuf_max, nbuf_olap;
- int32_t *itmp;
- int nitmp, mitmp;
float *AFs;
- int mAFs;
+ int32_t *itmp;
+ int mAFs, nitmp, mitmp, pl_hdr_id, gt_hdr_id;
double pl2p[256], *pdg;
int32_t skip_rid, prev_rid, prev_pos;
- int ntot, nused; // some stats to detect if things didn't go awfully wrong
- int ismpl, nsmpl; // index of query sample
- char *estimate_AF, *sample; // list of samples for AF estimate and query sample
- char **argv, *targets_list, *regions_list, *af_fname, *af_tag;
- int argc, fake_PLs, snps_only, vi_training;
+ int ntot; // some stats to detect if things didn't go wrong
+ smpl_t *smpl; // HMM data for each sample
+ smpl_ilist_t *af_smpl; // list of samples to estimate AF from (--estimate-AF)
+ smpl_ilist_t *roh_smpl; // list of samples to analyze (--samples, --samples-file)
+ char *estimate_AF; // list of samples for AF estimate and query sample
+ int af_from_PL; // estimate AF from FMT/PL rather than FMT/GT
+ char **argv, *targets_list, *regions_list, *af_fname, *af_tag, *samples, *buffer_size, *output_fname;
+ int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads;
+ BGZF *out;
+ kstring_t str;
}
args_t;
void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
-void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
+void set_tprob_rrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
void *smalloc(size_t size)
{
@@ -90,57 +118,137 @@ void *smalloc(size_t size)
static void init_data(args_t *args)
{
+ int i;
+
args->prev_rid = args->skip_rid = -1;
args->hdr = args->files->readers[0].header;
- if ( !args->sample )
- {
- if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n");
- args->sample = strdup(args->hdr->samples[0]);
- }
if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");
- // Set samples
- kstring_t str = {0,0,0};
- if ( args->estimate_AF && strcmp("-",args->estimate_AF) )
+ if ( !args->fake_PLs )
{
- int i, n;
- char **smpls = hts_readlist(args->estimate_AF, 1, &n);
+ args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) )
+ error("Error: The FORMAT/PL tag not found in the header, consider running with -G\n");
+ if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT )
+ error("Error: The FORMAT/PL tag not defined as Integer in the header\n");
+ }
- // Make sure the query sample is included
- for (i=0; i<n; i++)
- if ( !strcmp(args->sample,smpls[i]) ) break;
+ if ( args->estimate_AF )
+ {
+ if ( !strncmp("GT,",args->estimate_AF,3) ) args->estimate_AF += 3;
+ else if ( !strncmp("PL,",args->estimate_AF,3) ) { args->estimate_AF += 3; args->af_from_PL = 1; }
+ if ( strcmp("-",args->estimate_AF) )
+ args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE);
+ }
- // Add the query sample if not present
- if ( i!=n ) kputs(args->sample, &str);
+ if ( args->estimate_AF || args->fake_PLs )
+ {
+ if ( args->af_from_PL )
+ {
+ args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) )
+ error("Error: The FORMAT/PL tag not found in the header\n");
+ }
+ else
+ {
+ args->gt_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->gt_hdr_id) )
+ error("Error: The FORMAT/GT tag not found in the header\n");
+ }
+ }
+ if ( args->fake_PLs )
+ {
+ args->gt_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->gt_hdr_id) )
+ error("Error: The FORMAT/GT tag not found in the header\n");
+ }
- for (i=0; i<n; i++)
+ args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE);
+ if ( args->samples )
+ {
+ // we may be able to subset to a few samples, for a text VCF this can be a major speedup
+ if ( (bcf_sr_get_reader(args->files,0))->file->format.format==vcf )
{
- if ( str.l ) kputc(',', &str);
- kputs(smpls[i], &str);
- free(smpls[i]);
+ kstring_t str = {0,0,0};
+ smpl_ilist_t *tmp = args->roh_smpl, *rmme = NULL;
+ if ( args->af_smpl )
+ {
+ for (i=0; i<args->roh_smpl->n; i++)
+ {
+ if ( str.l ) kputc(',', &str);
+ kputs(args->hdr->samples[args->roh_smpl->idx[i]], &str);
+ }
+ for (i=0; i<args->af_smpl->n; i++)
+ {
+ kputc(',', &str);
+ kputs(args->hdr->samples[args->af_smpl->idx[i]], &str);
+ }
+ rmme = tmp = smpl_ilist_init(args->hdr, str.s, 0, SMPL_NONE);
+ }
+ if ( tmp->n < bcf_hdr_nsamples(args->hdr) )
+ {
+ str.l = 0;
+ for (i=0; i<tmp->n; i++)
+ {
+ if ( str.l ) kputc(',', &str);
+ kputs(args->hdr->samples[tmp->idx[i]], &str);
+ }
+ int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
+ if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
+ else if ( ret>0 ) error("The %d-th sample not found in the VCF: %s\n", ret,str.s);
+
+ // update sample ids
+ smpl_ilist_destroy(args->roh_smpl);
+ args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE);
+
+ if ( args->af_smpl )
+ {
+ smpl_ilist_destroy(args->af_smpl);
+ args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE);
+ }
+ }
+ free(str.s);
+ if ( rmme )
+ smpl_ilist_destroy(rmme);
}
- free(smpls);
}
- else if ( !args->estimate_AF )
- kputs(args->sample, &str);
- if ( str.l )
+ // check whether all samples are in this list. If so, the lookup will not be needed
+ if ( args->af_smpl && args->af_smpl->n == bcf_hdr_nsamples(args->hdr) )
{
- int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
- if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
- else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
+ // all samples are in this list
+ smpl_ilist_destroy(args->af_smpl);
+ args->af_smpl = NULL;
}
- if ( args->af_tag )
- if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) )
- error("No such INFO tag in the VCF: %s\n", args->af_tag);
+ if ( args->buffer_size )
+ {
+ args->nbuf_olap = -1;
+ char *end;
+ double tmp = strtod(args->buffer_size,&end);
+ if ( *end )
+ {
+ if ( *end!=',') error("Could not parse: --buffer-size %s\n", args->buffer_size);
+ args->nbuf_olap = strtol(end+1,&end,10);
+ if ( *end || args->nbuf_olap<0 ) error("Could not parse: --bufer-size %s\n", args->buffer_size);
+ }
+ if ( tmp<0 )
+ args->nbuf_max = fabs(tmp)*1e6/(4+8*2)/args->roh_smpl->n;
+ else
+ args->nbuf_max = tmp;
- args->nsmpl = bcf_hdr_nsamples(args->hdr);
- args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample);
- free(str.s);
+ if ( args->nbuf_olap<0 )
+ args->nbuf_olap = args->nbuf_max*0.01;
+ }
+ fprintf(stderr,"Number of target samples: %d\n", args->roh_smpl->n);
+ fprintf(stderr,"Number of --estimate-AF samples: %d\n", args->af_smpl ? args->af_smpl->n : (args->estimate_AF ? bcf_hdr_nsamples(args->hdr) : 0));
+ fprintf(stderr,"Number of sites in the buffer/overlap: ");
+ if ( args->nbuf_max ) fprintf(stderr,"%d/%d\n", args->nbuf_max,args->nbuf_olap);
+ else fprintf(stderr,"unlimited\n");
+
+ args->smpl = (smpl_t*) calloc(args->roh_smpl->n,sizeof(smpl_t));
- int i;
for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.);
// Init transition matrix and HMM
@@ -150,40 +258,88 @@ static void init_data(args_t *args)
MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ;
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
+ args->hmm = hmm_init(2, tprob, 10000);
if ( args->genmap_fname )
- {
- args->hmm = hmm_init(2, tprob, 0);
hmm_set_tprob_func(args->hmm, set_tprob_genmap, args);
- }
else if ( args->rec_rate > 0 )
- {
- args->hmm = hmm_init(2, tprob, 0);
- hmm_set_tprob_func(args->hmm, set_tprob_recrate, args);
+ hmm_set_tprob_func(args->hmm, set_tprob_rrate, args);
- }
- else
- args->hmm = hmm_init(2, tprob, 10000);
+ args->out = bgzf_open(strcmp("stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu");
+ if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno));
// print header
- printf("# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
- printf("# The command line was:\tbcftools %s", args->argv[0]);
+ args->str.l = 0;
+ ksprintf(&args->str, "# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
+ ksprintf(&args->str, "# The command line was:\tbcftools %s", args->argv[0]);
for (i=1; i<args->argc; i++)
- printf(" %s",args->argv[i]);
- printf("\n#\n");
- printf("# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
+ ksprintf(&args->str, " %s",args->argv[i]);
+ ksprintf(&args->str, "\n#\n");
+ if ( args->output_type & OUTPUT_RG )
+ {
+ i = 2;
+ ksprintf(&args->str, "# RG");
+ ksprintf(&args->str, "\t[%d]Sample", i++);
+ ksprintf(&args->str, "\t[%d]Chromosome", i++);
+ ksprintf(&args->str, "\t[%d]Start", i++);
+ ksprintf(&args->str, "\t[%d]End", i++);
+ ksprintf(&args->str, "\t[%d]Length (bp)", i++);
+ ksprintf(&args->str, "\t[%d]Number of markers", i++);
+ ksprintf(&args->str, "\t[%d]Quality (average fwd-bwd phred score)", i++);
+ ksprintf(&args->str, "\n");
+ }
+ if ( args->output_type & OUTPUT_ST )
+ {
+ i = 2;
+ ksprintf(&args->str, "# ST");
+ ksprintf(&args->str, "\t[%d]Sample", i++);
+ ksprintf(&args->str, "\t[%d]Chromosome", i++);
+ ksprintf(&args->str, "\t[%d]Position", i++);
+ ksprintf(&args->str, "\t[%d]State (0:HW, 1:AZ)", i++);
+ ksprintf(&args->str, "\t[%d]Quality (fwd-bwd phred score)", i++);
+ ksprintf(&args->str, "\n");
+ }
+ if ( args->vi_training)
+ {
+ i = 2;
+ ksprintf(&args->str, "# VT, Viterbi Training");
+ ksprintf(&args->str, "\t[%d]Sample", i++);
+ ksprintf(&args->str, "\t[%d]Iteration", i++);
+ ksprintf(&args->str, "\t[%d]dAZ", i++);
+ ksprintf(&args->str, "\t[%d]dHW", i++);
+ ksprintf(&args->str, "\t[%d]1 - P(HW|HW)", i++);
+ ksprintf(&args->str, "\t[%d]P(AZ|HW)", i++);
+ ksprintf(&args->str, "\t[%d]1 - P(AZ|AZ)", i++);
+ ksprintf(&args->str, "\t[%d]P(HW|AZ)", i++);
+ ksprintf(&args->str, "\n");
+ }
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l )
+ error("Error writing %s: %s\n", args->output_fname, strerror(errno));
}
static void destroy_data(args_t *args)
{
- free(args->sites);
- free(args->eprob);
- free(args->sample);
+ if ( bgzf_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname);
+ int i;
+ for (i=0; i<args->roh_smpl->n; i++)
+ {
+ free(args->smpl[i].eprob);
+ free(args->smpl[i].sites);
+ free(args->smpl[i].rid);
+ free(args->smpl[i].rid_off);
+ free(args->smpl[i].snapshot);
+ }
+ free(args->str.s);
+ free(args->smpl);
+ if ( args->af_smpl ) smpl_ilist_destroy(args->af_smpl);
+ smpl_ilist_destroy(args->roh_smpl);
free(args->rids);
free(args->rid_offs);
hmm_destroy(args->hmm);
bcf_sr_destroy(args->files);
- free(args->itmp); free(args->AFs); free(args->pdg);
+ free(args->AFs); free(args->pdg);
free(args->genmap);
+ free(args->itmp);
+ free(args->samples);
}
static int load_genmap(args_t *args, bcf1_t *line)
@@ -220,21 +376,22 @@ static int load_genmap(args_t *args, bcf1_t *line)
hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap);
genmap_t *gm = &args->genmap[args->ngenmap-1];
+ // position, convert to 0-based
char *tmp, *end;
gm->pos = strtol(str.s, &tmp, 10);
if ( str.s==tmp ) error("Could not parse %s: %s\n", fname, str.s);
+ gm->pos -= 1;
// skip second column
tmp++;
while ( *tmp && !isspace(*tmp) ) tmp++;
- // read the genetic map in cM
+ // read the genetic map in cM, scale from % to likelihood
gm->rate = strtod(tmp+1, &end);
if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s);
+ gm->rate *= 0.01;
}
if ( !args->ngenmap ) error("Genetic map empty?\n");
- int i;
- for (i=0; i<args->ngenmap; i++) args->genmap[i].rate /= args->genmap[args->ngenmap-1].rate; // scale to 1
if ( hts_close(fp) ) error("Close failed\n");
free(str.s);
return 0;
@@ -255,7 +412,6 @@ static double get_genmap_rate(args_t *args, int start, int end)
// position j to be equal or larger than end
int j = i;
while ( j+1<args->ngenmap && args->genmap[j].pos < end ) j++;
-
if ( i==j )
{
args->igenmap = i;
@@ -272,17 +428,20 @@ static double get_genmap_rate(args_t *args, int start, int end)
void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
{
args_t *args = (args_t*) data;
- double ci = get_genmap_rate(args, pos - prev_pos, pos);
+ double ci = get_genmap_rate(args, prev_pos, pos);
+ if ( args->rec_rate ) ci *= args->rec_rate;
+ if ( ci > 1 ) ci = 1;
MAT(tprob,2,STATE_HW,STATE_AZ) *= ci;
MAT(tprob,2,STATE_AZ,STATE_HW) *= ci;
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ);
MAT(tprob,2,STATE_HW,STATE_HW) = 1 - MAT(tprob,2,STATE_AZ,STATE_HW);
}
-void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
+void set_tprob_rrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
{
args_t *args = (args_t*) data;
double ci = (pos - prev_pos) * args->rec_rate;
+ if ( ci > 1 ) ci = 1;
MAT(tprob,2,STATE_HW,STATE_AZ) *= ci;
MAT(tprob,2,STATE_AZ,STATE_HW) *= ci;
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ);
@@ -315,132 +474,163 @@ void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data,
*
*/
-static void flush_viterbi(args_t *args)
+static void flush_viterbi(args_t *args, int ismpl)
{
- int i,j;
+ smpl_t *smpl = &args->smpl[ismpl];
+ if ( !smpl->nsites ) return;
- if ( !args->nsites ) return;
+ const char *name = args->hdr->samples[ args->roh_smpl->idx[ismpl] ];
- if ( !args->vi_training )
+ int i,j,k;
+
+ if ( !args->vi_training ) // single viterbi pass
{
- // single viterbi pass, one chromsome
- hmm_run_viterbi(args->hmm, args->nsites, args->eprob, args->sites);
- hmm_run_fwd_bwd(args->hmm, args->nsites, args->eprob, args->sites);
+ hmm_restore(args->hmm, smpl->snapshot);
+ int end = (args->nbuf_max && smpl->nsites >= args->nbuf_max && smpl->nsites > args->nbuf_olap) ? smpl->nsites - args->nbuf_olap : smpl->nsites;
+ if ( end < smpl->nsites )
+ smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->nsites - args->nbuf_olap - 1);
+
+ args->igenmap = smpl->igenmap;
+ hmm_run_viterbi(args->hmm, smpl->nsites, smpl->eprob, smpl->sites);
+ hmm_run_fwd_bwd(args->hmm, smpl->nsites, smpl->eprob, smpl->sites);
double *fwd = hmm_get_fwd_bwd_prob(args->hmm);
- const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
- uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
- for (i=0; i<args->nsites; i++)
+ const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
+ uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
+
+ for (i=0; i<end; i++)
{
int state = vpath[i*2]==STATE_AZ ? 1 : 0;
- double *pval = fwd + i*2;
- printf("%s\t%d\t%d\t%.1f\n", chr,args->sites[i]+1, state, phred_score(1.0-pval[state]));
- }
- return;
- }
+ double qual = phred_score(1.0 - fwd[i*2 + state]);
+ if ( args->output_type & OUTPUT_ST )
+ {
+ args->str.l = 0;
+ ksprintf(&args->str, "ST\t%s\t%s\t%d\t%d\t%.1f\n", name,chr,smpl->sites[i]+1, state, qual);
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
+ }
- // viterbi training, multiple chromosomes
- double t2az_prev, t2hw_prev;
- double deltaz, delthw;
- int niter = 0;
- do
- {
- double *tprob_arr = hmm_get_tprob(args->hmm);
- t2az_prev = MAT(tprob_arr,2,1,0); //args->t2AZ;
- t2hw_prev = MAT(tprob_arr,2,0,1); //args->t2HW;
- double tcounts[] = { 0,0,0,0 };
- for (i=0; i<args->nrids; i++)
- {
- // run viterbi for each chromosomes. eprob and sites contain
- // multiple chromosomes, rid_offs mark the boundaries
- int ioff = args->rid_offs[i];
- int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
- hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
-
- // what transitions were observed: add to the total counts
- uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
- for (j=1; j<nsites; j++)
+ if ( args->output_type & OUTPUT_RG )
{
- // count the number of transitions
- int prev_state = vpath[2*(j-1)];
- int curr_state = vpath[2*j];
- MAT(tcounts,2,curr_state,prev_state) += 1;
+ if ( state!=smpl->rg.state )
+ {
+ if ( !state ) // the region ends, flush
+ {
+ args->str.l = 0;
+ ksprintf(&args->str, "RG\t%s\t%s\t%d\t%d\t%d\t%d\t%.1f\n",name,bcf_hdr_id2name(args->hdr,smpl->rg.rid),
+ smpl->rg.beg+1,smpl->rg.end+1,smpl->rg.end-smpl->rg.beg+1,smpl->rg.nqual,smpl->rg.qual/smpl->rg.nqual);
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
+ smpl->rg.state = 0;
+ }
+ else
+ {
+ smpl->rg.state = 1;
+ smpl->rg.beg = smpl->sites[i];
+ smpl->rg.rid = args->prev_rid;
+ }
+ }
+ else if ( state )
+ {
+ smpl->rg.nqual++;
+ smpl->rg.qual += qual;
+ smpl->rg.end = smpl->sites[i];
+ }
}
}
- // update the transition matrix
- int n = 1;
- for (i=0; i<2; i++)
+ if ( end < smpl->nsites )
{
- for (j=0; j<2; j++) n += MAT(tcounts,2,i,j);
+ end = smpl->nsites - args->nbuf_olap;
+ memmove(smpl->sites, smpl->sites + end, sizeof(*smpl->sites)*args->nbuf_olap);
+ memmove(smpl->eprob, smpl->eprob + end*2, sizeof(*smpl->eprob)*args->nbuf_olap*2);
+ smpl->nsites = args->nbuf_olap;
+ smpl->igenmap = args->igenmap;
}
- for (i=0; i<2; i++)
+ else
{
- for (j=0; j<2; j++)
+ smpl->nsites = 0;
+ smpl->igenmap = 0;
+
+ if ( smpl->rg.state )
{
- // no transition to i-th state was observed, set to a small number
- if ( !MAT(tcounts,2,i,j) ) MAT(tcounts,2,i,j) = 0.1/n;
- else MAT(tcounts,2,i,j) /= n;
+ args->str.l = 0;
+ ksprintf(&args->str, "RG\t%s\t%s\t%d\t%d\t%d\t%d\t%.1f\n",name,bcf_hdr_id2name(args->hdr,smpl->rg.rid),
+ smpl->rg.beg+1,smpl->rg.end+1,smpl->rg.end-smpl->rg.beg+1,smpl->rg.nqual,smpl->rg.qual/smpl->rg.nqual);
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
+ smpl->rg.state = 0;
}
}
- // normalize
- for (i=0; i<2; i++)
+ return;
+ }
+
+
+ // viterbi training, multiple chromosomes
+ double t2az_prev, t2hw_prev;
+ double deltaz, delthw;
+
+ double *tprob_arr = hmm_get_tprob(args->hmm);
+ MAT(tprob_arr,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
+ MAT(tprob_arr,2,STATE_HW,STATE_AZ) = args->t2HW;
+ MAT(tprob_arr,2,STATE_AZ,STATE_HW) = args->t2AZ;
+ MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
+ hmm_set_tprob(args->hmm, tprob_arr, 10000);
+
+ int niter = 0;
+ do
+ {
+ tprob_arr = hmm_get_tprob(args->hmm);
+ t2az_prev = MAT(tprob_arr,2,STATE_AZ,STATE_HW); //args->t2AZ;
+ t2hw_prev = MAT(tprob_arr,2,STATE_HW,STATE_AZ); //args->t2HW;
+ double tprob_new[] = { 0,0,0,0 };
+ for (i=0; i<smpl->nrid; i++)
{
- double norm = 0;
- for (j=0; j<2; j++) norm += MAT(tcounts,2,j,i);
- assert( norm!=0 );
- for (j=0; j<2; j++) MAT(tcounts,2,j,i) /= norm;
+ int ioff = smpl->rid_off[i];
+ int nsites = (i+1==smpl->nrid ? smpl->nsites : smpl->rid_off[i+1]) - ioff;
+ args->igenmap = 0;
+ tprob_arr = hmm_run_baum_welch(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff);
+ for (j=0; j<2; j++)
+ for (k=0; k<2; k++) MAT(tprob_new,2,j,k) += MAT(tprob_arr,2,j,k);
}
+ for (j=0; j<2; j++)
+ for (k=0; k<2; k++) MAT(tprob_new,2,j,k) /= smpl->nrid;
- if ( args->genmap_fname || args->rec_rate > 0 )
- hmm_set_tprob(args->hmm, tcounts, 0);
- else
- hmm_set_tprob(args->hmm, tcounts, 10000);
+ hmm_set_tprob(args->hmm, tprob_new, 10000);
- tprob_arr = hmm_get_tprob(args->hmm);
- deltaz = fabs(MAT(tprob_arr,2,1,0)-t2az_prev);
- delthw = fabs(MAT(tprob_arr,2,0,1)-t2hw_prev);
+ deltaz = fabs(MAT(tprob_new,2,STATE_AZ,STATE_HW)-t2az_prev);
+ delthw = fabs(MAT(tprob_new,2,STATE_HW,STATE_AZ)-t2hw_prev);
niter++;
- fprintf(stderr,"Viterbi training, iteration %d: dAZ=%e dHW=%e\tP(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n",
- niter,deltaz,delthw,
- MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
- MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
+ args->str.l = 0;
+ ksprintf(&args->str, "VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n",
+ name,niter,deltaz,delthw,
+ 1-MAT(tprob_new,2,STATE_HW,STATE_HW),MAT(tprob_new,2,STATE_AZ,STATE_HW),
+ 1-MAT(tprob_new,2,STATE_AZ,STATE_AZ),MAT(tprob_new,2,STATE_HW,STATE_AZ));
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
}
- while ( deltaz > 0.0 || delthw > 0.0 );
- double *tprob_arr = hmm_get_tprob(args->hmm);
- fprintf(stderr, "Viterbi training converged in %d iterations to P(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", niter,
- MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
- MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
+ while ( deltaz > args->baum_welch_th || delthw > args->baum_welch_th );
// output the results
- for (i=0; i<args->nrids; i++)
+ for (i=0; i<smpl->nrid; i++)
{
- int ioff = args->rid_offs[i];
- int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
- hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
- hmm_run_fwd_bwd(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
+ int ioff = smpl->rid_off[i];
+ int nsites = (i+1==smpl->nrid ? smpl->nsites : smpl->rid_off[i+1]) - ioff;
+ args->igenmap = 0;
+ hmm_run_viterbi(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff);
+ hmm_run_fwd_bwd(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff);
uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
double *fwd = hmm_get_fwd_bwd_prob(args->hmm);
- const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]);
+ const char *chr = bcf_hdr_id2name(args->hdr,smpl->rid[i]);
for (j=0; j<nsites; j++)
{
- int state = vpath[j*2];
- double pval = fwd[j*2 + state];
- printf("%s\t%d\t%d\t%e\n", chr,args->sites[ioff+j]+1,state==STATE_AZ ? 1 : 0, pval);
+ int state = vpath[j*2]==STATE_AZ ? 1 : 0;
+ double *pval = fwd + j*2;
+ args->str.l = 0;
+ ksprintf(&args->str, "ROH\t%s\t%s\t%d\t%d\t%.1f\n", name,chr,smpl->sites[ioff+j]+1, state, phred_score(1.0-pval[state]));
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
}
}
}
-static void push_rid(args_t *args, int rid)
-{
- args->nrids++;
- args->rids = (int*) realloc(args->rids, args->nrids*sizeof(int));
- args->rid_offs = (int*) realloc(args->rid_offs, args->nrids*sizeof(int));
- args->rids[ args->nrids-1 ] = rid;
- args->rid_offs[ args->nrids-1 ] = args->nsites;
-}
int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
{
@@ -468,27 +658,52 @@ int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
return 0;
}
-int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq)
+int8_t *get_GT(args_t *args, bcf1_t *line)
{
- if ( !args->nitmp )
- {
- args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
- if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid?
- args->nitmp /= args->nsmpl;
- }
+ int i;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==args->gt_hdr_id ) break;
+ if ( i==line->n_fmt ) return NULL; // the tag is not present in this record
+
+ bcf_fmt_t *fmt = &line->d.fmt[i];
+ if ( fmt->n!=2 ) return NULL; // not diploid
+ if ( fmt->type!=BCF_BT_INT8 ) error("This is unexpected, GT type is %d\n", fmt->type);
+ return (int8_t*) fmt->p;
+}
+
+int estimate_AF_from_GT(args_t *args, int8_t *gt, double *alt_freq)
+{
int i, nalt = 0, nref = 0;
- for (i=0; i<args->nsmpl; i++)
+ if ( args->af_smpl ) // subset samples for AF estimate
{
- int32_t *gt = &args->itmp[i*args->nitmp];
+ for (i=0; i<args->af_smpl->n; i++)
+ {
+ int ismpl = args->af_smpl->idx[i];
+ if ( bcf_gt_is_missing(gt[2*ismpl]) || bcf_gt_is_missing(gt[2*ismpl+1]) ) continue;
- if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+ if ( bcf_gt_allele(gt[2*ismpl]) ) nalt++;
+ else nref++;
- if ( bcf_gt_allele(gt[0]) ) nalt++;
- else nref++;
+ if ( bcf_gt_allele(gt[2*ismpl+1]) ) nalt++;
+ else nref++;
+ }
+ }
+ else // all samples used in AF estimate
+ {
+ int8_t *end = gt + 2*bcf_hdr_nsamples(args->hdr);
+ while ( gt < end )
+ {
+ if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+
+ if ( bcf_gt_allele(gt[0]) ) nalt++;
+ else nref++;
+
+ if ( bcf_gt_allele(gt[1]) ) nalt++;
+ else nref++;
- if ( bcf_gt_allele(gt[1]) ) nalt++;
- else nref++;
+ gt += 2;
+ }
}
if ( !nalt && !nref ) return -1;
@@ -496,105 +711,249 @@ int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq)
return 0;
}
+int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_freq)
+{
+ double af = 0;
+ int i, j, naf = 0;
+
+ int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial);
+ if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields
+
+ if ( args->af_smpl ) // subset samples for AF estimate
+ {
+ #define BRANCH(type_t) \
+ { \
+ for (i=0; i<args->af_smpl->n; i++) \
+ { \
+ int ismpl = args->af_smpl->idx[i]; \
+ type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \
+ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \
+ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \
+ double prob[3], norm = 0; \
+ prob[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \
+ prob[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \
+ prob[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \
+ for (j=0; j<3; j++) norm += prob[j]; \
+ for (j=0; j<3; j++) prob[j] /= norm; \
+ af += 0.5*prob[1] + prob[2]; \
+ naf++; \
+ } \
+ }
+ switch (fmt_pl->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ }
+ #undef BRANCH
+ }
+ else // all samples used in AF estimate
+ {
+ int nsmpl = bcf_hdr_nsamples(args->hdr);
+ #define BRANCH(type_t) \
+ { \
+ type_t *p = (type_t*)fmt_pl->p; \
+ p -= fmt_pl->n; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ p += fmt_pl->n; \
+ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \
+ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \
+ double prob[3], norm = 0; \
+ prob[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \
+ prob[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \
+ prob[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \
+ for (j=0; j<3; j++) norm += prob[j]; \
+ for (j=0; j<3; j++) prob[j] /= norm; \
+ af += 0.5*prob[1] + prob[2]; \
+ naf++; \
+ } \
+ }
+ switch (fmt_pl->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ }
+ #undef BRANCH
+ }
+ if ( !naf ) return -1;
+
+ *alt_freq = af / naf;
+ return 0;
+}
+
+bcf_fmt_t *get_PL(args_t *args, bcf1_t *line)
+{
+ int i;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==args->pl_hdr_id ) return &line->d.fmt[i];
+ return NULL;
+}
-int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
+int process_line(args_t *args, bcf1_t *line, int ial)
{
- args->nitmp = 0;
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+
+ double alt_freq;
+ int8_t *GTs = NULL;
+ bcf_fmt_t *fmt_pl = NULL;
// Set allele frequency
- int ret;
+ int ret = 0, i,j;
if ( args->af_tag )
{
// Use an INFO tag provided by the user
ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs);
- if ( ret==1 )
- *alt_freq = args->AFs[0];
+ if ( ret>0 )
+ alt_freq = args->AFs[ial-1];
if ( ret==-2 )
error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1);
}
else if ( args->af_fname )
{
// Read AF from a file
- ret = read_AF(args->files->targets, line, alt_freq);
+ ret = read_AF(args->files->targets, line, &alt_freq);
+ }
+ else if ( args->dflt_AF > 0 )
+ {
+ alt_freq = args->dflt_AF;
+ }
+ else if ( args->estimate_AF )
+ {
+ // Estimate AF from GTs or PLs of all samples or samples listed in a file
+ if ( args->af_from_PL )
+ {
+ fmt_pl = get_PL(args, line);
+ if ( !fmt_pl ) return -1;
+ ret = estimate_AF_from_PL(args, fmt_pl, ial, &alt_freq);
+ }
+ else
+ {
+ GTs = get_GT(args, line);
+ if ( !GTs ) return -1;
+ ret = estimate_AF_from_GT(args, GTs, &alt_freq);
+ }
}
else
{
- // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF
- ret = -1;
- if ( !args->estimate_AF )
+ // Use AC/AN
+ int AC = -1, AN = 0;
+ ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
+ if ( ret==1 )
{
- int AC = -1, AN = 0;
- ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
- if ( ret==1 )
- {
- AN = args->itmp[0];
- ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
- if ( ret>0 )
- AC = args->itmp[0];
- }
- if ( AN<=0 || AC<0 )
- ret = -1;
- else
- *alt_freq = (double) AC/AN;
+ AN = args->itmp[0];
+ ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
+ if ( ret>0 )
+ AC = args->itmp[0];
}
- if ( ret==-1 )
- ret = estimate_AF(args, line, alt_freq); // reads GTs into args->itmp
+ if ( AN<=0 || AC<0 )
+ ret = -1;
+ else
+ alt_freq = (double) AC/AN;
}
if ( ret<0 ) return ret;
- if ( *alt_freq==0.0 )
- {
- if ( args->dflt_AF==0 ) return -1; // we skip sites with AF=0
- *alt_freq = args->dflt_AF;
- }
+ if ( alt_freq==0.0 ) return -1;
- // Set P(D|G)
+ int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial);
if ( args->fake_PLs )
{
- if ( !args->nitmp )
- {
- args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
- if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid?
- args->nitmp /= args->nsmpl;
- }
+ if ( !GTs ) GTs = get_GT(args, line);
+ }
+ else
+ {
+ fmt_pl = get_PL(args, line);
+ if ( !fmt_pl ) return -1;
+ if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields
+ }
- int32_t *gt = &args->itmp[args->ismpl*args->nitmp];
- if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1;
+ for (i=0; i<args->roh_smpl->n; i++)
+ {
+ int ismpl = args->roh_smpl->idx[i];
- int a = bcf_gt_allele(gt[0]);
- int b = bcf_gt_allele(gt[1]);
- if ( a!=b )
- {
- pdg[0] = pdg[2] = args->unseen_PL;
- pdg[1] = 1 - 2*args->unseen_PL;
- }
- else if ( a==0 )
+ // set P(D|G)
+ double pdg[3];
+ if ( args->fake_PLs )
{
- pdg[0] = 1 - 2*args->unseen_PL;
- pdg[1] = pdg[2] = args->unseen_PL;
+ int8_t *gt = GTs + 2*ismpl;
+ if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+
+ int a = bcf_gt_allele(gt[0]);
+ int b = bcf_gt_allele(gt[1]);
+ if ( a!=b )
+ {
+ pdg[0] = pdg[2] = args->unseen_PL;
+ pdg[1] = 1 - 2*args->unseen_PL;
+ }
+ else if ( a==0 )
+ {
+ pdg[0] = 1 - args->unseen_PL - args->unseen_PL*args->unseen_PL;
+ pdg[1] = args->unseen_PL;
+ pdg[2] = args->unseen_PL*args->unseen_PL;
+ }
+ else
+ {
+ pdg[0] = args->unseen_PL*args->unseen_PL;
+ pdg[1] = args->unseen_PL;
+ pdg[2] = 1 - args->unseen_PL - args->unseen_PL*args->unseen_PL;
+ }
}
else
{
- pdg[0] = pdg[1] = args->unseen_PL;
- pdg[2] = 1 - 2*args->unseen_PL;
+ #define BRANCH(type_t) \
+ { \
+ type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \
+ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \
+ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \
+ pdg[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \
+ pdg[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \
+ pdg[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \
+ }
+ switch (fmt_pl->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ }
+ #undef BRANCH
}
- }
- else
- {
- args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp);
- if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2. ) return -1; // not diploid?
- args->nitmp /= args->nsmpl;
-
- int32_t *pl = &args->itmp[args->ismpl*args->nitmp];
- pdg[0] = pl[0] < 256 ? args->pl2p[ pl[0] ] : 1.0;
- pdg[1] = pl[1] < 256 ? args->pl2p[ pl[1] ] : 1.0;
- pdg[2] = pl[2] < 256 ? args->pl2p[ pl[2] ] : 1.0;
double sum = pdg[0] + pdg[1] + pdg[2];
- if ( !sum ) return -1;
- pdg[0] /= sum;
- pdg[1] /= sum;
- pdg[2] /= sum;
+ if ( !sum ) continue;
+ for (j=0; j<3; j++) pdg[j] /= sum;
+ if ( args->skip_homref && pdg[0]>0.99 ) continue;
+
+ smpl_t *smpl = &args->smpl[i];
+ smpl->nused++;
+
+ if ( smpl->nsites >= smpl->msites )
+ {
+ hts_expand(uint32_t,smpl->nsites+1,smpl->msites,smpl->sites);
+ smpl->eprob = (double*) realloc(smpl->eprob,sizeof(*smpl->eprob)*smpl->msites*2);
+ if ( !smpl->eprob ) error("Error: failed to alloc %d bytes\n", sizeof(*smpl->eprob)*smpl->msites*2);
+ }
+
+ // Calculate emission probabilities P(D|AZ) and P(D|HW)
+ double *eprob = &smpl->eprob[2*smpl->nsites];
+ eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq;
+ eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq;
+
+ smpl->sites[smpl->nsites] = line->pos;
+ smpl->nsites++;
+
+ if ( args->vi_training )
+ {
+ if ( !smpl->nrid || line->rid!=smpl->rid[smpl->nrid-1] )
+ {
+ smpl->nrid++;
+ smpl->rid = (int*) realloc(smpl->rid,sizeof(*smpl->rid)*smpl->nrid);
+ smpl->rid[smpl->nrid-1] = line->rid;
+ smpl->rid_off = (int*) realloc(smpl->rid_off,sizeof(*smpl->rid_off)*smpl->nrid);
+ smpl->rid_off[smpl->nrid-1] = smpl->nsites - 1;
+ }
+ }
+ else if ( args->nbuf_max && smpl->nsites >= args->nbuf_max ) flush_viterbi(args, i);
}
return 0;
@@ -602,18 +961,35 @@ int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
static void vcfroh(args_t *args, bcf1_t *line)
{
+ int i;
+
// Are we done?
if ( !line )
{
- flush_viterbi(args);
+ for (i=0; i<args->roh_smpl->n; i++) flush_viterbi(args, i);
return;
}
args->ntot++;
- // Skip unwanted lines
+ // Skip unwanted lines, for simplicity we consider only biallelic sites
if ( line->rid == args->skip_rid ) return;
if ( line->n_allele==1 ) return; // no ALT allele
- if ( line->n_allele!=2 ) return; // only biallelic sites
+ if ( line->n_allele > 3 ) return; // cannot be bi-allelic, even with <*>
+
+ // This can be raw callable VCF with the symbolic unseen allele <*>
+ int ial = 0;
+ for (i=1; i<line->n_allele; i++)
+ if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; }
+ if ( ial==0 ) // normal VCF, the symbolic allele is not present
+ {
+ if ( line->n_allele!=2 ) return; // not biallelic
+ ial = 1;
+ }
+ else
+ {
+ if ( line->n_allele!=3 ) return; // not biallelic
+ ial = ial==1 ? 2 : 1; // <*> can come in any order
+ }
if ( args->snps_only && !bcf_is_snp(line) ) return;
// Initialize genetic map
@@ -623,21 +999,15 @@ static void vcfroh(args_t *args, bcf1_t *line)
args->prev_rid = line->rid;
args->prev_pos = line->pos;
skip_rid = load_genmap(args, line);
- if ( !skip_rid && args->vi_training ) push_rid(args, line->rid);
}
// New chromosome?
if ( args->prev_rid!=line->rid )
{
skip_rid = load_genmap(args, line);
- if ( args->vi_training )
- {
- if ( !skip_rid ) push_rid(args, line->rid);
- }
- else
+ if ( !args->vi_training )
{
- flush_viterbi(args);
- args->nsites = 0;
+ for (i=0; i<args->roh_smpl->n; i++) flush_viterbi(args, i);
}
args->prev_rid = line->rid;
args->prev_pos = line->pos;
@@ -655,25 +1025,8 @@ static void vcfroh(args_t *args, bcf1_t *line)
args->prev_pos = line->pos;
- // Ready for the new site
- int m = args->msites;
- hts_expand(uint32_t,args->nsites+1,args->msites,args->sites);
- if ( args->msites!=m )
- args->eprob = (double*) realloc(args->eprob,sizeof(double)*args->msites*2);
-
- // Set likelihoods and alternate allele frequencies
- double alt_freq, pdg[3];
- if ( parse_line(args, line, &alt_freq, pdg)<0 ) return; // something went wrong
-
- args->nused++;
-
- // Calculate emission probabilities P(D|AZ) and P(D|HW)
- double *eprob = &args->eprob[2*args->nsites];
- eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq;
- eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq;
-
- args->sites[args->nsites] = line->pos;
- args->nsites++;
+ // parse the new line
+ process_line(args, line, ial);
}
static void usage(args_t *args)
@@ -686,21 +1039,32 @@ static void usage(args_t *args)
fprintf(stderr, " --AF-dflt <float> if AF is not known, use this allele frequency [skip]\n");
fprintf(stderr, " --AF-tag <TAG> use TAG for allele frequency\n");
fprintf(stderr, " --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
- fprintf(stderr, " -e, --estimate-AF <file> calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in <file>\n");
- fprintf(stderr, " -G, --GTs-only <float> use GTs, ignore PLs, use <float> for PL of unseen genotypes. Safe value to use is 30 to account for GT errors.\n");
+ fprintf(stderr, " -b --buffer-size <int[,int]> buffer size and the number of overlapping sites, 0 for unlimited [0]\n");
+ fprintf(stderr, " If the first number is negative, it is interpreted as the maximum memory to\n");
+ fprintf(stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n");
+ fprintf(stderr, " -e, --estimate-AF [TAG],<file> estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n");
+ fprintf(stderr, " in <file>. If TAG is not given, the frequency is estimated from GT by default\n");
+ fprintf(stderr, " -G, --GTs-only <float> use GTs and ignore PLs, instead using <float> for PL of the two least likely genotypes.\n");
+ fprintf(stderr, " Safe value to use is 30 to account for GT errors.\n");
+ fprintf(stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n");
fprintf(stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n");
- fprintf(stderr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\" is replaced with chromosome name\n");
+ fprintf(stderr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n");
+ fprintf(stderr, " is replaced with chromosome name\n");
fprintf(stderr, " -M, --rec-rate <float> constant recombination rate per bp\n");
+ fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type [srz] output s:per-site, r:regions, z:compressed [sr]\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " -s, --sample <sample> sample to analyze\n");
+ fprintf(stderr, " -s, --samples <list> list of samples to analyze [all samples]\n");
+ fprintf(stderr, " -S, --samples-file <file> file of samples to analyze [all samples]\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --threads <int> number of extra decompression threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "HMM Options:\n");
fprintf(stderr, " -a, --hw-to-az <float> P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n");
fprintf(stderr, " -H, --az-to-hw <float> P(HW|AZ) transition probability from AZ to HW state [5e-9]\n");
- fprintf(stderr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n");
+ fprintf(stderr, " -V, --viterbi-training <float> estimate HMM parameters, <float> is the convergence threshold, e.g. 1e-10 (experimental)\n");
fprintf(stderr, "\n");
exit(1);
}
@@ -721,12 +1085,17 @@ int main_vcfroh(int argc, char *argv[])
{"AF-tag",1,0,0},
{"AF-file",1,0,1},
{"AF-dflt",1,0,2},
+ {"buffer-size",1,0,'b'},
+ {"ignore-homref",0,0,'i'},
{"estimate-AF",1,0,'e'},
+ {"output",1,0,'o'},
+ {"output-type",1,0,'O'},
{"GTs-only",1,0,'G'},
- {"sample",1,0,'s'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
{"hw-to-az",1,0,'a'},
{"az-to-hw",1,0,'H'},
- {"viterbi-training",0,0,'V'},
+ {"viterbi-training",1,0,'V'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
{"regions",1,0,'r'},
@@ -734,12 +1103,13 @@ int main_vcfroh(int argc, char *argv[])
{"genetic-map",1,0,'m'},
{"rec-rate",1,0,'M'},
{"skip-indels",0,0,'I'},
+ {"threads",1,0,9},
{0,0,0,0}
};
int naf_opts = 0;
char *tmp;
- while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:m:M:G:Ia:e:V",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:S:m:M:G:Ia:e:V:b:O:o:i",loptions,NULL)) >= 0) {
switch (c) {
case 0: args->af_tag = optarg; naf_opts++; break;
case 1: args->af_fname = optarg; naf_opts++; break;
@@ -747,7 +1117,15 @@ int main_vcfroh(int argc, char *argv[])
args->dflt_AF = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST;
+ if ( strchr(optarg,'r') || strchr(optarg,'R') ) args->output_type |= OUTPUT_RG;
+ if ( strchr(optarg,'z') || strchr(optarg,'z') ) args->output_type |= OUTPUT_GZ;
+ break;
case 'e': args->estimate_AF = optarg; naf_opts++; break;
+ case 'b': args->buffer_size = optarg; break;
+ case 'i': args->skip_homref = 1; break;
case 'I': args->snps_only = 1; break;
case 'G':
args->fake_PLs = 1;
@@ -760,7 +1138,8 @@ int main_vcfroh(int argc, char *argv[])
args->rec_rate = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -M %s\n", optarg);
break;
- case 's': args->sample = strdup(optarg); break;
+ case 's': args->samples = strdup(optarg); break;
+ case 'S': args->samples = strdup(optarg); args->samples_is_file = 1; break;
case 'a':
args->t2AZ = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -a %s\n", optarg);
@@ -773,14 +1152,28 @@ int main_vcfroh(int argc, char *argv[])
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
- case 'V': args->vi_training = 1; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'V':
+ args->vi_training = 1;
+ args->baum_welch_th = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: --viterbi-training %s\n", optarg);
+ break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
}
+ if ( !args->output_fname ) args->output_fname = "stdout";
+ if ( !args->output_type ) args->output_type = OUTPUT_ST|OUTPUT_RG;
+ char *fname = NULL;
+ if ( optind==argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(args);
+ }
+ else fname = argv[optind];
- if ( argc<optind+1 ) usage(args);
+ if ( args->vi_training && args->buffer_size ) error("Error: cannot use -b with -V\n");
if ( args->t2AZ<0 || args->t2AZ>1 ) error("Error: The parameter --hw-to-az is not in [0,1]\n", args->t2AZ);
if ( args->t2HW<0 || args->t2HW>1 ) error("Error: The parameter --az-to-hw is not in [0,1]\n", args->t2HW);
if ( naf_opts>1 ) error("Error: The options --AF-tag, --AF-file and -e are mutually exclusive\n");
@@ -800,7 +1193,9 @@ int main_vcfroh(int argc, char *argv[])
if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 )
error("Failed to read the targets: %s\n", args->af_fname);
}
- if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
+ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0)
+ error("Failed to create threads\n");
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
init_data(args);
while ( bcf_sr_next_line(args->files) )
@@ -808,7 +1203,15 @@ int main_vcfroh(int argc, char *argv[])
vcfroh(args, args->files->readers[0].buffer[0]);
}
vcfroh(args, NULL);
- fprintf(stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused);
+ int i, nmin = 0;
+ for (i=0; i<args->roh_smpl->n; i++)
+ if ( !i || args->smpl[i].nused < nmin ) nmin = args->smpl[i].nused;
+ fprintf(stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin);
+ if ( nmin==0 )
+ {
+ fprintf(stderr,"No usable sites were found.");
+ if ( !naf_opts && !args->dflt_AF ) fprintf(stderr, " Consider using one of the AF options.\n");
+ }
destroy_data(args);
free(args);
return 0;
diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c
index 66ddc17..70ed798 100644
--- a/bcftools/vcfroh.c.pysam.c
+++ b/bcftools/vcfroh.c.pysam.c
@@ -32,12 +32,19 @@ THE SOFTWARE. */
#include <htslib/synced_bcf_reader.h>
#include <htslib/kstring.h>
#include <htslib/kseq.h>
+#include <htslib/bgzf.h>
+#include <errno.h>
#include "bcftools.h"
#include "HMM.h"
+#include "smpl_ilist.h"
#define STATE_HW 0 // normal state, follows Hardy-Weinberg allele frequencies
#define STATE_AZ 1 // autozygous state
+#define OUTPUT_ST (1<<1)
+#define OUTPUT_RG (1<<2)
+#define OUTPUT_GZ (1<<3)
+
/** Genetic map */
typedef struct
{
@@ -46,6 +53,24 @@ typedef struct
}
genmap_t;
+/** HMM data for each sample */
+typedef struct
+{
+ double *eprob; // emission probs [2*nsites,msites]
+ uint32_t *sites; // positions [nsites,msites]
+ int nsites, msites;
+ int igenmap; // current position in genmap
+ int nused; // some stats to detect if things didn't go wrong
+ int nrid, *rid, *rid_off; // for viterbi training, keep all chromosomes
+ void *snapshot; // hmm snapshot
+ struct {
+ uint32_t beg,end,nqual;
+ double qual;
+ int rid, state;
+ } rg;
+}
+smpl_t;
+
typedef struct _args_t
{
bcf_srs_t *files;
@@ -59,29 +84,32 @@ typedef struct _args_t
double rec_rate; // constant recombination rate if > 0
hmm_t *hmm;
- double *eprob; // emission probs [2*nsites,msites]
- uint32_t *sites; // positions [nsites,msites]
- int nsites, msites;
+ double baum_welch_th;
int nrids, *rids, *rid_offs; // multiple chroms with vi_training
+ int nbuf_max, nbuf_olap;
- int32_t *itmp;
- int nitmp, mitmp;
float *AFs;
- int mAFs;
+ int32_t *itmp;
+ int mAFs, nitmp, mitmp, pl_hdr_id, gt_hdr_id;
double pl2p[256], *pdg;
int32_t skip_rid, prev_rid, prev_pos;
- int ntot, nused; // some stats to detect if things didn't go awfully wrong
- int ismpl, nsmpl; // index of query sample
- char *estimate_AF, *sample; // list of samples for AF estimate and query sample
- char **argv, *targets_list, *regions_list, *af_fname, *af_tag;
- int argc, fake_PLs, snps_only, vi_training;
+ int ntot; // some stats to detect if things didn't go wrong
+ smpl_t *smpl; // HMM data for each sample
+ smpl_ilist_t *af_smpl; // list of samples to estimate AF from (--estimate-AF)
+ smpl_ilist_t *roh_smpl; // list of samples to analyze (--samples, --samples-file)
+ char *estimate_AF; // list of samples for AF estimate and query sample
+ int af_from_PL; // estimate AF from FMT/PL rather than FMT/GT
+ char **argv, *targets_list, *regions_list, *af_fname, *af_tag, *samples, *buffer_size, *output_fname;
+ int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads;
+ BGZF *out;
+ kstring_t str;
}
args_t;
void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
-void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
+void set_tprob_rrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
void *smalloc(size_t size)
{
@@ -92,57 +120,137 @@ void *smalloc(size_t size)
static void init_data(args_t *args)
{
+ int i;
+
args->prev_rid = args->skip_rid = -1;
args->hdr = args->files->readers[0].header;
- if ( !args->sample )
- {
- if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n");
- args->sample = strdup(args->hdr->samples[0]);
- }
if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");
- // Set samples
- kstring_t str = {0,0,0};
- if ( args->estimate_AF && strcmp("-",args->estimate_AF) )
+ if ( !args->fake_PLs )
{
- int i, n;
- char **smpls = hts_readlist(args->estimate_AF, 1, &n);
+ args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) )
+ error("Error: The FORMAT/PL tag not found in the header, consider running with -G\n");
+ if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT )
+ error("Error: The FORMAT/PL tag not defined as Integer in the header\n");
+ }
- // Make sure the query sample is included
- for (i=0; i<n; i++)
- if ( !strcmp(args->sample,smpls[i]) ) break;
+ if ( args->estimate_AF )
+ {
+ if ( !strncmp("GT,",args->estimate_AF,3) ) args->estimate_AF += 3;
+ else if ( !strncmp("PL,",args->estimate_AF,3) ) { args->estimate_AF += 3; args->af_from_PL = 1; }
+ if ( strcmp("-",args->estimate_AF) )
+ args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE);
+ }
- // Add the query sample if not present
- if ( i!=n ) kputs(args->sample, &str);
+ if ( args->estimate_AF || args->fake_PLs )
+ {
+ if ( args->af_from_PL )
+ {
+ args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) )
+ error("Error: The FORMAT/PL tag not found in the header\n");
+ }
+ else
+ {
+ args->gt_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->gt_hdr_id) )
+ error("Error: The FORMAT/GT tag not found in the header\n");
+ }
+ }
+ if ( args->fake_PLs )
+ {
+ args->gt_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->gt_hdr_id) )
+ error("Error: The FORMAT/GT tag not found in the header\n");
+ }
- for (i=0; i<n; i++)
+ args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE);
+ if ( args->samples )
+ {
+ // we may be able to subset to a few samples, for a text VCF this can be a major speedup
+ if ( (bcf_sr_get_reader(args->files,0))->file->format.format==vcf )
{
- if ( str.l ) kputc(',', &str);
- kputs(smpls[i], &str);
- free(smpls[i]);
+ kstring_t str = {0,0,0};
+ smpl_ilist_t *tmp = args->roh_smpl, *rmme = NULL;
+ if ( args->af_smpl )
+ {
+ for (i=0; i<args->roh_smpl->n; i++)
+ {
+ if ( str.l ) kputc(',', &str);
+ kputs(args->hdr->samples[args->roh_smpl->idx[i]], &str);
+ }
+ for (i=0; i<args->af_smpl->n; i++)
+ {
+ kputc(',', &str);
+ kputs(args->hdr->samples[args->af_smpl->idx[i]], &str);
+ }
+ rmme = tmp = smpl_ilist_init(args->hdr, str.s, 0, SMPL_NONE);
+ }
+ if ( tmp->n < bcf_hdr_nsamples(args->hdr) )
+ {
+ str.l = 0;
+ for (i=0; i<tmp->n; i++)
+ {
+ if ( str.l ) kputc(',', &str);
+ kputs(args->hdr->samples[tmp->idx[i]], &str);
+ }
+ int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
+ if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
+ else if ( ret>0 ) error("The %d-th sample not found in the VCF: %s\n", ret,str.s);
+
+ // update sample ids
+ smpl_ilist_destroy(args->roh_smpl);
+ args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE);
+
+ if ( args->af_smpl )
+ {
+ smpl_ilist_destroy(args->af_smpl);
+ args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE);
+ }
+ }
+ free(str.s);
+ if ( rmme )
+ smpl_ilist_destroy(rmme);
}
- free(smpls);
}
- else if ( !args->estimate_AF )
- kputs(args->sample, &str);
- if ( str.l )
+ // check whether all samples are in this list. If so, the lookup will not be needed
+ if ( args->af_smpl && args->af_smpl->n == bcf_hdr_nsamples(args->hdr) )
{
- int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
- if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
- else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
+ // all samples are in this list
+ smpl_ilist_destroy(args->af_smpl);
+ args->af_smpl = NULL;
}
- if ( args->af_tag )
- if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) )
- error("No such INFO tag in the VCF: %s\n", args->af_tag);
+ if ( args->buffer_size )
+ {
+ args->nbuf_olap = -1;
+ char *end;
+ double tmp = strtod(args->buffer_size,&end);
+ if ( *end )
+ {
+ if ( *end!=',') error("Could not parse: --buffer-size %s\n", args->buffer_size);
+ args->nbuf_olap = strtol(end+1,&end,10);
+ if ( *end || args->nbuf_olap<0 ) error("Could not parse: --bufer-size %s\n", args->buffer_size);
+ }
+ if ( tmp<0 )
+ args->nbuf_max = fabs(tmp)*1e6/(4+8*2)/args->roh_smpl->n;
+ else
+ args->nbuf_max = tmp;
- args->nsmpl = bcf_hdr_nsamples(args->hdr);
- args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample);
- free(str.s);
+ if ( args->nbuf_olap<0 )
+ args->nbuf_olap = args->nbuf_max*0.01;
+ }
+ fprintf(pysam_stderr,"Number of target samples: %d\n", args->roh_smpl->n);
+ fprintf(pysam_stderr,"Number of --estimate-AF samples: %d\n", args->af_smpl ? args->af_smpl->n : (args->estimate_AF ? bcf_hdr_nsamples(args->hdr) : 0));
+ fprintf(pysam_stderr,"Number of sites in the buffer/overlap: ");
+ if ( args->nbuf_max ) fprintf(pysam_stderr,"%d/%d\n", args->nbuf_max,args->nbuf_olap);
+ else fprintf(pysam_stderr,"unlimited\n");
+
+ args->smpl = (smpl_t*) calloc(args->roh_smpl->n,sizeof(smpl_t));
- int i;
for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.);
// Init transition matrix and HMM
@@ -152,40 +260,88 @@ static void init_data(args_t *args)
MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ;
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
+ args->hmm = hmm_init(2, tprob, 10000);
if ( args->genmap_fname )
- {
- args->hmm = hmm_init(2, tprob, 0);
hmm_set_tprob_func(args->hmm, set_tprob_genmap, args);
- }
else if ( args->rec_rate > 0 )
- {
- args->hmm = hmm_init(2, tprob, 0);
- hmm_set_tprob_func(args->hmm, set_tprob_recrate, args);
+ hmm_set_tprob_func(args->hmm, set_tprob_rrate, args);
- }
- else
- args->hmm = hmm_init(2, tprob, 10000);
+ args->out = bgzf_open(strcmp("pysam_stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu");
+ if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno));
// print header
- fprintf(pysam_stdout, "# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
- fprintf(pysam_stdout, "# The command line was:\tbcftools %s", args->argv[0]);
+ args->str.l = 0;
+ ksprintf(&args->str, "# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
+ ksprintf(&args->str, "# The command line was:\tbcftools %s", args->argv[0]);
for (i=1; i<args->argc; i++)
- fprintf(pysam_stdout, " %s",args->argv[i]);
- fprintf(pysam_stdout, "\n#\n");
- fprintf(pysam_stdout, "# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
+ ksprintf(&args->str, " %s",args->argv[i]);
+ ksprintf(&args->str, "\n#\n");
+ if ( args->output_type & OUTPUT_RG )
+ {
+ i = 2;
+ ksprintf(&args->str, "# RG");
+ ksprintf(&args->str, "\t[%d]Sample", i++);
+ ksprintf(&args->str, "\t[%d]Chromosome", i++);
+ ksprintf(&args->str, "\t[%d]Start", i++);
+ ksprintf(&args->str, "\t[%d]End", i++);
+ ksprintf(&args->str, "\t[%d]Length (bp)", i++);
+ ksprintf(&args->str, "\t[%d]Number of markers", i++);
+ ksprintf(&args->str, "\t[%d]Quality (average fwd-bwd phred score)", i++);
+ ksprintf(&args->str, "\n");
+ }
+ if ( args->output_type & OUTPUT_ST )
+ {
+ i = 2;
+ ksprintf(&args->str, "# ST");
+ ksprintf(&args->str, "\t[%d]Sample", i++);
+ ksprintf(&args->str, "\t[%d]Chromosome", i++);
+ ksprintf(&args->str, "\t[%d]Position", i++);
+ ksprintf(&args->str, "\t[%d]State (0:HW, 1:AZ)", i++);
+ ksprintf(&args->str, "\t[%d]Quality (fwd-bwd phred score)", i++);
+ ksprintf(&args->str, "\n");
+ }
+ if ( args->vi_training)
+ {
+ i = 2;
+ ksprintf(&args->str, "# VT, Viterbi Training");
+ ksprintf(&args->str, "\t[%d]Sample", i++);
+ ksprintf(&args->str, "\t[%d]Iteration", i++);
+ ksprintf(&args->str, "\t[%d]dAZ", i++);
+ ksprintf(&args->str, "\t[%d]dHW", i++);
+ ksprintf(&args->str, "\t[%d]1 - P(HW|HW)", i++);
+ ksprintf(&args->str, "\t[%d]P(AZ|HW)", i++);
+ ksprintf(&args->str, "\t[%d]1 - P(AZ|AZ)", i++);
+ ksprintf(&args->str, "\t[%d]P(HW|AZ)", i++);
+ ksprintf(&args->str, "\n");
+ }
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l )
+ error("Error writing %s: %s\n", args->output_fname, strerror(errno));
}
static void destroy_data(args_t *args)
{
- free(args->sites);
- free(args->eprob);
- free(args->sample);
+ if ( bgzf_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname);
+ int i;
+ for (i=0; i<args->roh_smpl->n; i++)
+ {
+ free(args->smpl[i].eprob);
+ free(args->smpl[i].sites);
+ free(args->smpl[i].rid);
+ free(args->smpl[i].rid_off);
+ free(args->smpl[i].snapshot);
+ }
+ free(args->str.s);
+ free(args->smpl);
+ if ( args->af_smpl ) smpl_ilist_destroy(args->af_smpl);
+ smpl_ilist_destroy(args->roh_smpl);
free(args->rids);
free(args->rid_offs);
hmm_destroy(args->hmm);
bcf_sr_destroy(args->files);
- free(args->itmp); free(args->AFs); free(args->pdg);
+ free(args->AFs); free(args->pdg);
free(args->genmap);
+ free(args->itmp);
+ free(args->samples);
}
static int load_genmap(args_t *args, bcf1_t *line)
@@ -222,21 +378,22 @@ static int load_genmap(args_t *args, bcf1_t *line)
hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap);
genmap_t *gm = &args->genmap[args->ngenmap-1];
+ // position, convert to 0-based
char *tmp, *end;
gm->pos = strtol(str.s, &tmp, 10);
if ( str.s==tmp ) error("Could not parse %s: %s\n", fname, str.s);
+ gm->pos -= 1;
// skip second column
tmp++;
while ( *tmp && !isspace(*tmp) ) tmp++;
- // read the genetic map in cM
+ // read the genetic map in cM, scale from % to likelihood
gm->rate = strtod(tmp+1, &end);
if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s);
+ gm->rate *= 0.01;
}
if ( !args->ngenmap ) error("Genetic map empty?\n");
- int i;
- for (i=0; i<args->ngenmap; i++) args->genmap[i].rate /= args->genmap[args->ngenmap-1].rate; // scale to 1
if ( hts_close(fp) ) error("Close failed\n");
free(str.s);
return 0;
@@ -257,7 +414,6 @@ static double get_genmap_rate(args_t *args, int start, int end)
// position j to be equal or larger than end
int j = i;
while ( j+1<args->ngenmap && args->genmap[j].pos < end ) j++;
-
if ( i==j )
{
args->igenmap = i;
@@ -274,17 +430,20 @@ static double get_genmap_rate(args_t *args, int start, int end)
void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
{
args_t *args = (args_t*) data;
- double ci = get_genmap_rate(args, pos - prev_pos, pos);
+ double ci = get_genmap_rate(args, prev_pos, pos);
+ if ( args->rec_rate ) ci *= args->rec_rate;
+ if ( ci > 1 ) ci = 1;
MAT(tprob,2,STATE_HW,STATE_AZ) *= ci;
MAT(tprob,2,STATE_AZ,STATE_HW) *= ci;
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ);
MAT(tprob,2,STATE_HW,STATE_HW) = 1 - MAT(tprob,2,STATE_AZ,STATE_HW);
}
-void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
+void set_tprob_rrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
{
args_t *args = (args_t*) data;
double ci = (pos - prev_pos) * args->rec_rate;
+ if ( ci > 1 ) ci = 1;
MAT(tprob,2,STATE_HW,STATE_AZ) *= ci;
MAT(tprob,2,STATE_AZ,STATE_HW) *= ci;
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ);
@@ -317,132 +476,163 @@ void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data,
*
*/
-static void flush_viterbi(args_t *args)
+static void flush_viterbi(args_t *args, int ismpl)
{
- int i,j;
+ smpl_t *smpl = &args->smpl[ismpl];
+ if ( !smpl->nsites ) return;
- if ( !args->nsites ) return;
+ const char *name = args->hdr->samples[ args->roh_smpl->idx[ismpl] ];
- if ( !args->vi_training )
+ int i,j,k;
+
+ if ( !args->vi_training ) // single viterbi pass
{
- // single viterbi pass, one chromsome
- hmm_run_viterbi(args->hmm, args->nsites, args->eprob, args->sites);
- hmm_run_fwd_bwd(args->hmm, args->nsites, args->eprob, args->sites);
+ hmm_restore(args->hmm, smpl->snapshot);
+ int end = (args->nbuf_max && smpl->nsites >= args->nbuf_max && smpl->nsites > args->nbuf_olap) ? smpl->nsites - args->nbuf_olap : smpl->nsites;
+ if ( end < smpl->nsites )
+ smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->nsites - args->nbuf_olap - 1);
+
+ args->igenmap = smpl->igenmap;
+ hmm_run_viterbi(args->hmm, smpl->nsites, smpl->eprob, smpl->sites);
+ hmm_run_fwd_bwd(args->hmm, smpl->nsites, smpl->eprob, smpl->sites);
double *fwd = hmm_get_fwd_bwd_prob(args->hmm);
- const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
- uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
- for (i=0; i<args->nsites; i++)
+ const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
+ uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
+
+ for (i=0; i<end; i++)
{
int state = vpath[i*2]==STATE_AZ ? 1 : 0;
- double *pval = fwd + i*2;
- fprintf(pysam_stdout, "%s\t%d\t%d\t%.1f\n", chr,args->sites[i]+1, state, phred_score(1.0-pval[state]));
- }
- return;
- }
+ double qual = phred_score(1.0 - fwd[i*2 + state]);
+ if ( args->output_type & OUTPUT_ST )
+ {
+ args->str.l = 0;
+ ksprintf(&args->str, "ST\t%s\t%s\t%d\t%d\t%.1f\n", name,chr,smpl->sites[i]+1, state, qual);
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
+ }
- // viterbi training, multiple chromosomes
- double t2az_prev, t2hw_prev;
- double deltaz, delthw;
- int niter = 0;
- do
- {
- double *tprob_arr = hmm_get_tprob(args->hmm);
- t2az_prev = MAT(tprob_arr,2,1,0); //args->t2AZ;
- t2hw_prev = MAT(tprob_arr,2,0,1); //args->t2HW;
- double tcounts[] = { 0,0,0,0 };
- for (i=0; i<args->nrids; i++)
- {
- // run viterbi for each chromosomes. eprob and sites contain
- // multiple chromosomes, rid_offs mark the boundaries
- int ioff = args->rid_offs[i];
- int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
- hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
-
- // what transitions were observed: add to the total counts
- uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
- for (j=1; j<nsites; j++)
+ if ( args->output_type & OUTPUT_RG )
{
- // count the number of transitions
- int prev_state = vpath[2*(j-1)];
- int curr_state = vpath[2*j];
- MAT(tcounts,2,curr_state,prev_state) += 1;
+ if ( state!=smpl->rg.state )
+ {
+ if ( !state ) // the region ends, flush
+ {
+ args->str.l = 0;
+ ksprintf(&args->str, "RG\t%s\t%s\t%d\t%d\t%d\t%d\t%.1f\n",name,bcf_hdr_id2name(args->hdr,smpl->rg.rid),
+ smpl->rg.beg+1,smpl->rg.end+1,smpl->rg.end-smpl->rg.beg+1,smpl->rg.nqual,smpl->rg.qual/smpl->rg.nqual);
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
+ smpl->rg.state = 0;
+ }
+ else
+ {
+ smpl->rg.state = 1;
+ smpl->rg.beg = smpl->sites[i];
+ smpl->rg.rid = args->prev_rid;
+ }
+ }
+ else if ( state )
+ {
+ smpl->rg.nqual++;
+ smpl->rg.qual += qual;
+ smpl->rg.end = smpl->sites[i];
+ }
}
}
- // update the transition matrix
- int n = 1;
- for (i=0; i<2; i++)
+ if ( end < smpl->nsites )
{
- for (j=0; j<2; j++) n += MAT(tcounts,2,i,j);
+ end = smpl->nsites - args->nbuf_olap;
+ memmove(smpl->sites, smpl->sites + end, sizeof(*smpl->sites)*args->nbuf_olap);
+ memmove(smpl->eprob, smpl->eprob + end*2, sizeof(*smpl->eprob)*args->nbuf_olap*2);
+ smpl->nsites = args->nbuf_olap;
+ smpl->igenmap = args->igenmap;
}
- for (i=0; i<2; i++)
+ else
{
- for (j=0; j<2; j++)
+ smpl->nsites = 0;
+ smpl->igenmap = 0;
+
+ if ( smpl->rg.state )
{
- // no transition to i-th state was observed, set to a small number
- if ( !MAT(tcounts,2,i,j) ) MAT(tcounts,2,i,j) = 0.1/n;
- else MAT(tcounts,2,i,j) /= n;
+ args->str.l = 0;
+ ksprintf(&args->str, "RG\t%s\t%s\t%d\t%d\t%d\t%d\t%.1f\n",name,bcf_hdr_id2name(args->hdr,smpl->rg.rid),
+ smpl->rg.beg+1,smpl->rg.end+1,smpl->rg.end-smpl->rg.beg+1,smpl->rg.nqual,smpl->rg.qual/smpl->rg.nqual);
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
+ smpl->rg.state = 0;
}
}
- // normalize
- for (i=0; i<2; i++)
+ return;
+ }
+
+
+ // viterbi training, multiple chromosomes
+ double t2az_prev, t2hw_prev;
+ double deltaz, delthw;
+
+ double *tprob_arr = hmm_get_tprob(args->hmm);
+ MAT(tprob_arr,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
+ MAT(tprob_arr,2,STATE_HW,STATE_AZ) = args->t2HW;
+ MAT(tprob_arr,2,STATE_AZ,STATE_HW) = args->t2AZ;
+ MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
+ hmm_set_tprob(args->hmm, tprob_arr, 10000);
+
+ int niter = 0;
+ do
+ {
+ tprob_arr = hmm_get_tprob(args->hmm);
+ t2az_prev = MAT(tprob_arr,2,STATE_AZ,STATE_HW); //args->t2AZ;
+ t2hw_prev = MAT(tprob_arr,2,STATE_HW,STATE_AZ); //args->t2HW;
+ double tprob_new[] = { 0,0,0,0 };
+ for (i=0; i<smpl->nrid; i++)
{
- double norm = 0;
- for (j=0; j<2; j++) norm += MAT(tcounts,2,j,i);
- assert( norm!=0 );
- for (j=0; j<2; j++) MAT(tcounts,2,j,i) /= norm;
+ int ioff = smpl->rid_off[i];
+ int nsites = (i+1==smpl->nrid ? smpl->nsites : smpl->rid_off[i+1]) - ioff;
+ args->igenmap = 0;
+ tprob_arr = hmm_run_baum_welch(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff);
+ for (j=0; j<2; j++)
+ for (k=0; k<2; k++) MAT(tprob_new,2,j,k) += MAT(tprob_arr,2,j,k);
}
+ for (j=0; j<2; j++)
+ for (k=0; k<2; k++) MAT(tprob_new,2,j,k) /= smpl->nrid;
- if ( args->genmap_fname || args->rec_rate > 0 )
- hmm_set_tprob(args->hmm, tcounts, 0);
- else
- hmm_set_tprob(args->hmm, tcounts, 10000);
+ hmm_set_tprob(args->hmm, tprob_new, 10000);
- tprob_arr = hmm_get_tprob(args->hmm);
- deltaz = fabs(MAT(tprob_arr,2,1,0)-t2az_prev);
- delthw = fabs(MAT(tprob_arr,2,0,1)-t2hw_prev);
+ deltaz = fabs(MAT(tprob_new,2,STATE_AZ,STATE_HW)-t2az_prev);
+ delthw = fabs(MAT(tprob_new,2,STATE_HW,STATE_AZ)-t2hw_prev);
niter++;
- fprintf(pysam_stderr,"Viterbi training, iteration %d: dAZ=%e dHW=%e\tP(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n",
- niter,deltaz,delthw,
- MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
- MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
+ args->str.l = 0;
+ ksprintf(&args->str, "VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n",
+ name,niter,deltaz,delthw,
+ 1-MAT(tprob_new,2,STATE_HW,STATE_HW),MAT(tprob_new,2,STATE_AZ,STATE_HW),
+ 1-MAT(tprob_new,2,STATE_AZ,STATE_AZ),MAT(tprob_new,2,STATE_HW,STATE_AZ));
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
}
- while ( deltaz > 0.0 || delthw > 0.0 );
- double *tprob_arr = hmm_get_tprob(args->hmm);
- fprintf(pysam_stderr, "Viterbi training converged in %d iterations to P(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", niter,
- MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
- MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
+ while ( deltaz > args->baum_welch_th || delthw > args->baum_welch_th );
// output the results
- for (i=0; i<args->nrids; i++)
+ for (i=0; i<smpl->nrid; i++)
{
- int ioff = args->rid_offs[i];
- int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
- hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
- hmm_run_fwd_bwd(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
+ int ioff = smpl->rid_off[i];
+ int nsites = (i+1==smpl->nrid ? smpl->nsites : smpl->rid_off[i+1]) - ioff;
+ args->igenmap = 0;
+ hmm_run_viterbi(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff);
+ hmm_run_fwd_bwd(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff);
uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
double *fwd = hmm_get_fwd_bwd_prob(args->hmm);
- const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]);
+ const char *chr = bcf_hdr_id2name(args->hdr,smpl->rid[i]);
for (j=0; j<nsites; j++)
{
- int state = vpath[j*2];
- double pval = fwd[j*2 + state];
- fprintf(pysam_stdout, "%s\t%d\t%d\t%e\n", chr,args->sites[ioff+j]+1,state==STATE_AZ ? 1 : 0, pval);
+ int state = vpath[j*2]==STATE_AZ ? 1 : 0;
+ double *pval = fwd + j*2;
+ args->str.l = 0;
+ ksprintf(&args->str, "ROH\t%s\t%s\t%d\t%d\t%.1f\n", name,chr,smpl->sites[ioff+j]+1, state, phred_score(1.0-pval[state]));
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
}
}
}
-static void push_rid(args_t *args, int rid)
-{
- args->nrids++;
- args->rids = (int*) realloc(args->rids, args->nrids*sizeof(int));
- args->rid_offs = (int*) realloc(args->rid_offs, args->nrids*sizeof(int));
- args->rids[ args->nrids-1 ] = rid;
- args->rid_offs[ args->nrids-1 ] = args->nsites;
-}
int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
{
@@ -470,27 +660,52 @@ int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
return 0;
}
-int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq)
+int8_t *get_GT(args_t *args, bcf1_t *line)
{
- if ( !args->nitmp )
- {
- args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
- if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid?
- args->nitmp /= args->nsmpl;
- }
+ int i;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==args->gt_hdr_id ) break;
+ if ( i==line->n_fmt ) return NULL; // the tag is not present in this record
+
+ bcf_fmt_t *fmt = &line->d.fmt[i];
+ if ( fmt->n!=2 ) return NULL; // not diploid
+ if ( fmt->type!=BCF_BT_INT8 ) error("This is unexpected, GT type is %d\n", fmt->type);
+ return (int8_t*) fmt->p;
+}
+
+int estimate_AF_from_GT(args_t *args, int8_t *gt, double *alt_freq)
+{
int i, nalt = 0, nref = 0;
- for (i=0; i<args->nsmpl; i++)
+ if ( args->af_smpl ) // subset samples for AF estimate
{
- int32_t *gt = &args->itmp[i*args->nitmp];
+ for (i=0; i<args->af_smpl->n; i++)
+ {
+ int ismpl = args->af_smpl->idx[i];
+ if ( bcf_gt_is_missing(gt[2*ismpl]) || bcf_gt_is_missing(gt[2*ismpl+1]) ) continue;
- if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+ if ( bcf_gt_allele(gt[2*ismpl]) ) nalt++;
+ else nref++;
- if ( bcf_gt_allele(gt[0]) ) nalt++;
- else nref++;
+ if ( bcf_gt_allele(gt[2*ismpl+1]) ) nalt++;
+ else nref++;
+ }
+ }
+ else // all samples used in AF estimate
+ {
+ int8_t *end = gt + 2*bcf_hdr_nsamples(args->hdr);
+ while ( gt < end )
+ {
+ if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+
+ if ( bcf_gt_allele(gt[0]) ) nalt++;
+ else nref++;
+
+ if ( bcf_gt_allele(gt[1]) ) nalt++;
+ else nref++;
- if ( bcf_gt_allele(gt[1]) ) nalt++;
- else nref++;
+ gt += 2;
+ }
}
if ( !nalt && !nref ) return -1;
@@ -498,105 +713,249 @@ int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq)
return 0;
}
+int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_freq)
+{
+ double af = 0;
+ int i, j, naf = 0;
+
+ int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial);
+ if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields
+
+ if ( args->af_smpl ) // subset samples for AF estimate
+ {
+ #define BRANCH(type_t) \
+ { \
+ for (i=0; i<args->af_smpl->n; i++) \
+ { \
+ int ismpl = args->af_smpl->idx[i]; \
+ type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \
+ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \
+ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \
+ double prob[3], norm = 0; \
+ prob[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \
+ prob[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \
+ prob[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \
+ for (j=0; j<3; j++) norm += prob[j]; \
+ for (j=0; j<3; j++) prob[j] /= norm; \
+ af += 0.5*prob[1] + prob[2]; \
+ naf++; \
+ } \
+ }
+ switch (fmt_pl->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(pysam_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ }
+ #undef BRANCH
+ }
+ else // all samples used in AF estimate
+ {
+ int nsmpl = bcf_hdr_nsamples(args->hdr);
+ #define BRANCH(type_t) \
+ { \
+ type_t *p = (type_t*)fmt_pl->p; \
+ p -= fmt_pl->n; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ p += fmt_pl->n; \
+ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \
+ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \
+ double prob[3], norm = 0; \
+ prob[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \
+ prob[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \
+ prob[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \
+ for (j=0; j<3; j++) norm += prob[j]; \
+ for (j=0; j<3; j++) prob[j] /= norm; \
+ af += 0.5*prob[1] + prob[2]; \
+ naf++; \
+ } \
+ }
+ switch (fmt_pl->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(pysam_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ }
+ #undef BRANCH
+ }
+ if ( !naf ) return -1;
+
+ *alt_freq = af / naf;
+ return 0;
+}
+
+bcf_fmt_t *get_PL(args_t *args, bcf1_t *line)
+{
+ int i;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==args->pl_hdr_id ) return &line->d.fmt[i];
+ return NULL;
+}
-int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
+int process_line(args_t *args, bcf1_t *line, int ial)
{
- args->nitmp = 0;
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+
+ double alt_freq;
+ int8_t *GTs = NULL;
+ bcf_fmt_t *fmt_pl = NULL;
// Set allele frequency
- int ret;
+ int ret = 0, i,j;
if ( args->af_tag )
{
// Use an INFO tag provided by the user
ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs);
- if ( ret==1 )
- *alt_freq = args->AFs[0];
+ if ( ret>0 )
+ alt_freq = args->AFs[ial-1];
if ( ret==-2 )
error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1);
}
else if ( args->af_fname )
{
// Read AF from a file
- ret = read_AF(args->files->targets, line, alt_freq);
+ ret = read_AF(args->files->targets, line, &alt_freq);
+ }
+ else if ( args->dflt_AF > 0 )
+ {
+ alt_freq = args->dflt_AF;
+ }
+ else if ( args->estimate_AF )
+ {
+ // Estimate AF from GTs or PLs of all samples or samples listed in a file
+ if ( args->af_from_PL )
+ {
+ fmt_pl = get_PL(args, line);
+ if ( !fmt_pl ) return -1;
+ ret = estimate_AF_from_PL(args, fmt_pl, ial, &alt_freq);
+ }
+ else
+ {
+ GTs = get_GT(args, line);
+ if ( !GTs ) return -1;
+ ret = estimate_AF_from_GT(args, GTs, &alt_freq);
+ }
}
else
{
- // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF
- ret = -1;
- if ( !args->estimate_AF )
+ // Use AC/AN
+ int AC = -1, AN = 0;
+ ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
+ if ( ret==1 )
{
- int AC = -1, AN = 0;
- ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
- if ( ret==1 )
- {
- AN = args->itmp[0];
- ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
- if ( ret>0 )
- AC = args->itmp[0];
- }
- if ( AN<=0 || AC<0 )
- ret = -1;
- else
- *alt_freq = (double) AC/AN;
+ AN = args->itmp[0];
+ ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
+ if ( ret>0 )
+ AC = args->itmp[0];
}
- if ( ret==-1 )
- ret = estimate_AF(args, line, alt_freq); // reads GTs into args->itmp
+ if ( AN<=0 || AC<0 )
+ ret = -1;
+ else
+ alt_freq = (double) AC/AN;
}
if ( ret<0 ) return ret;
- if ( *alt_freq==0.0 )
- {
- if ( args->dflt_AF==0 ) return -1; // we skip sites with AF=0
- *alt_freq = args->dflt_AF;
- }
+ if ( alt_freq==0.0 ) return -1;
- // Set P(D|G)
+ int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial);
if ( args->fake_PLs )
{
- if ( !args->nitmp )
- {
- args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
- if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid?
- args->nitmp /= args->nsmpl;
- }
+ if ( !GTs ) GTs = get_GT(args, line);
+ }
+ else
+ {
+ fmt_pl = get_PL(args, line);
+ if ( !fmt_pl ) return -1;
+ if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields
+ }
- int32_t *gt = &args->itmp[args->ismpl*args->nitmp];
- if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1;
+ for (i=0; i<args->roh_smpl->n; i++)
+ {
+ int ismpl = args->roh_smpl->idx[i];
- int a = bcf_gt_allele(gt[0]);
- int b = bcf_gt_allele(gt[1]);
- if ( a!=b )
- {
- pdg[0] = pdg[2] = args->unseen_PL;
- pdg[1] = 1 - 2*args->unseen_PL;
- }
- else if ( a==0 )
+ // set P(D|G)
+ double pdg[3];
+ if ( args->fake_PLs )
{
- pdg[0] = 1 - 2*args->unseen_PL;
- pdg[1] = pdg[2] = args->unseen_PL;
+ int8_t *gt = GTs + 2*ismpl;
+ if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+
+ int a = bcf_gt_allele(gt[0]);
+ int b = bcf_gt_allele(gt[1]);
+ if ( a!=b )
+ {
+ pdg[0] = pdg[2] = args->unseen_PL;
+ pdg[1] = 1 - 2*args->unseen_PL;
+ }
+ else if ( a==0 )
+ {
+ pdg[0] = 1 - args->unseen_PL - args->unseen_PL*args->unseen_PL;
+ pdg[1] = args->unseen_PL;
+ pdg[2] = args->unseen_PL*args->unseen_PL;
+ }
+ else
+ {
+ pdg[0] = args->unseen_PL*args->unseen_PL;
+ pdg[1] = args->unseen_PL;
+ pdg[2] = 1 - args->unseen_PL - args->unseen_PL*args->unseen_PL;
+ }
}
else
{
- pdg[0] = pdg[1] = args->unseen_PL;
- pdg[2] = 1 - 2*args->unseen_PL;
+ #define BRANCH(type_t) \
+ { \
+ type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \
+ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \
+ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \
+ pdg[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \
+ pdg[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \
+ pdg[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \
+ }
+ switch (fmt_pl->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(pysam_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ }
+ #undef BRANCH
}
- }
- else
- {
- args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp);
- if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2. ) return -1; // not diploid?
- args->nitmp /= args->nsmpl;
-
- int32_t *pl = &args->itmp[args->ismpl*args->nitmp];
- pdg[0] = pl[0] < 256 ? args->pl2p[ pl[0] ] : 1.0;
- pdg[1] = pl[1] < 256 ? args->pl2p[ pl[1] ] : 1.0;
- pdg[2] = pl[2] < 256 ? args->pl2p[ pl[2] ] : 1.0;
double sum = pdg[0] + pdg[1] + pdg[2];
- if ( !sum ) return -1;
- pdg[0] /= sum;
- pdg[1] /= sum;
- pdg[2] /= sum;
+ if ( !sum ) continue;
+ for (j=0; j<3; j++) pdg[j] /= sum;
+ if ( args->skip_homref && pdg[0]>0.99 ) continue;
+
+ smpl_t *smpl = &args->smpl[i];
+ smpl->nused++;
+
+ if ( smpl->nsites >= smpl->msites )
+ {
+ hts_expand(uint32_t,smpl->nsites+1,smpl->msites,smpl->sites);
+ smpl->eprob = (double*) realloc(smpl->eprob,sizeof(*smpl->eprob)*smpl->msites*2);
+ if ( !smpl->eprob ) error("Error: failed to alloc %d bytes\n", sizeof(*smpl->eprob)*smpl->msites*2);
+ }
+
+ // Calculate emission probabilities P(D|AZ) and P(D|HW)
+ double *eprob = &smpl->eprob[2*smpl->nsites];
+ eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq;
+ eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq;
+
+ smpl->sites[smpl->nsites] = line->pos;
+ smpl->nsites++;
+
+ if ( args->vi_training )
+ {
+ if ( !smpl->nrid || line->rid!=smpl->rid[smpl->nrid-1] )
+ {
+ smpl->nrid++;
+ smpl->rid = (int*) realloc(smpl->rid,sizeof(*smpl->rid)*smpl->nrid);
+ smpl->rid[smpl->nrid-1] = line->rid;
+ smpl->rid_off = (int*) realloc(smpl->rid_off,sizeof(*smpl->rid_off)*smpl->nrid);
+ smpl->rid_off[smpl->nrid-1] = smpl->nsites - 1;
+ }
+ }
+ else if ( args->nbuf_max && smpl->nsites >= args->nbuf_max ) flush_viterbi(args, i);
}
return 0;
@@ -604,18 +963,35 @@ int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
static void vcfroh(args_t *args, bcf1_t *line)
{
+ int i;
+
// Are we done?
if ( !line )
{
- flush_viterbi(args);
+ for (i=0; i<args->roh_smpl->n; i++) flush_viterbi(args, i);
return;
}
args->ntot++;
- // Skip unwanted lines
+ // Skip unwanted lines, for simplicity we consider only biallelic sites
if ( line->rid == args->skip_rid ) return;
if ( line->n_allele==1 ) return; // no ALT allele
- if ( line->n_allele!=2 ) return; // only biallelic sites
+ if ( line->n_allele > 3 ) return; // cannot be bi-allelic, even with <*>
+
+ // This can be raw callable VCF with the symbolic unseen allele <*>
+ int ial = 0;
+ for (i=1; i<line->n_allele; i++)
+ if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; }
+ if ( ial==0 ) // normal VCF, the symbolic allele is not present
+ {
+ if ( line->n_allele!=2 ) return; // not biallelic
+ ial = 1;
+ }
+ else
+ {
+ if ( line->n_allele!=3 ) return; // not biallelic
+ ial = ial==1 ? 2 : 1; // <*> can come in any order
+ }
if ( args->snps_only && !bcf_is_snp(line) ) return;
// Initialize genetic map
@@ -625,21 +1001,15 @@ static void vcfroh(args_t *args, bcf1_t *line)
args->prev_rid = line->rid;
args->prev_pos = line->pos;
skip_rid = load_genmap(args, line);
- if ( !skip_rid && args->vi_training ) push_rid(args, line->rid);
}
// New chromosome?
if ( args->prev_rid!=line->rid )
{
skip_rid = load_genmap(args, line);
- if ( args->vi_training )
- {
- if ( !skip_rid ) push_rid(args, line->rid);
- }
- else
+ if ( !args->vi_training )
{
- flush_viterbi(args);
- args->nsites = 0;
+ for (i=0; i<args->roh_smpl->n; i++) flush_viterbi(args, i);
}
args->prev_rid = line->rid;
args->prev_pos = line->pos;
@@ -657,25 +1027,8 @@ static void vcfroh(args_t *args, bcf1_t *line)
args->prev_pos = line->pos;
- // Ready for the new site
- int m = args->msites;
- hts_expand(uint32_t,args->nsites+1,args->msites,args->sites);
- if ( args->msites!=m )
- args->eprob = (double*) realloc(args->eprob,sizeof(double)*args->msites*2);
-
- // Set likelihoods and alternate allele frequencies
- double alt_freq, pdg[3];
- if ( parse_line(args, line, &alt_freq, pdg)<0 ) return; // something went wrong
-
- args->nused++;
-
- // Calculate emission probabilities P(D|AZ) and P(D|HW)
- double *eprob = &args->eprob[2*args->nsites];
- eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq;
- eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq;
-
- args->sites[args->nsites] = line->pos;
- args->nsites++;
+ // parse the new line
+ process_line(args, line, ial);
}
static void usage(args_t *args)
@@ -688,21 +1041,32 @@ static void usage(args_t *args)
fprintf(pysam_stderr, " --AF-dflt <float> if AF is not known, use this allele frequency [skip]\n");
fprintf(pysam_stderr, " --AF-tag <TAG> use TAG for allele frequency\n");
fprintf(pysam_stderr, " --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
- fprintf(pysam_stderr, " -e, --estimate-AF <file> calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in <file>\n");
- fprintf(pysam_stderr, " -G, --GTs-only <float> use GTs, ignore PLs, use <float> for PL of unseen genotypes. Safe value to use is 30 to account for GT errors.\n");
+ fprintf(pysam_stderr, " -b --buffer-size <int[,int]> buffer size and the number of overlapping sites, 0 for unlimited [0]\n");
+ fprintf(pysam_stderr, " If the first number is negative, it is interpreted as the maximum memory to\n");
+ fprintf(pysam_stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n");
+ fprintf(pysam_stderr, " -e, --estimate-AF [TAG],<file> estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n");
+ fprintf(pysam_stderr, " in <file>. If TAG is not given, the frequency is estimated from GT by default\n");
+ fprintf(pysam_stderr, " -G, --GTs-only <float> use GTs and ignore PLs, instead using <float> for PL of the two least likely genotypes.\n");
+ fprintf(pysam_stderr, " Safe value to use is 30 to account for GT errors.\n");
+ fprintf(pysam_stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n");
fprintf(pysam_stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n");
- fprintf(pysam_stderr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\" is replaced with chromosome name\n");
+ fprintf(pysam_stderr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n");
+ fprintf(pysam_stderr, " is replaced with chromosome name\n");
fprintf(pysam_stderr, " -M, --rec-rate <float> constant recombination rate per bp\n");
+ fprintf(pysam_stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysam_stderr, " -O, --output-type [srz] output s:per-site, r:regions, z:compressed [sr]\n");
fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysam_stderr, " -s, --sample <sample> sample to analyze\n");
+ fprintf(pysam_stderr, " -s, --samples <list> list of samples to analyze [all samples]\n");
+ fprintf(pysam_stderr, " -S, --samples-file <file> file of samples to analyze [all samples]\n");
fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra decompression threads [0]\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "HMM Options:\n");
fprintf(pysam_stderr, " -a, --hw-to-az <float> P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n");
fprintf(pysam_stderr, " -H, --az-to-hw <float> P(HW|AZ) transition probability from AZ to HW state [5e-9]\n");
- fprintf(pysam_stderr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n");
+ fprintf(pysam_stderr, " -V, --viterbi-training <float> estimate HMM parameters, <float> is the convergence threshold, e.g. 1e-10 (experimental)\n");
fprintf(pysam_stderr, "\n");
exit(1);
}
@@ -723,12 +1087,17 @@ int main_vcfroh(int argc, char *argv[])
{"AF-tag",1,0,0},
{"AF-file",1,0,1},
{"AF-dflt",1,0,2},
+ {"buffer-size",1,0,'b'},
+ {"ignore-homref",0,0,'i'},
{"estimate-AF",1,0,'e'},
+ {"output",1,0,'o'},
+ {"output-type",1,0,'O'},
{"GTs-only",1,0,'G'},
- {"sample",1,0,'s'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
{"hw-to-az",1,0,'a'},
{"az-to-hw",1,0,'H'},
- {"viterbi-training",0,0,'V'},
+ {"viterbi-training",1,0,'V'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
{"regions",1,0,'r'},
@@ -736,12 +1105,13 @@ int main_vcfroh(int argc, char *argv[])
{"genetic-map",1,0,'m'},
{"rec-rate",1,0,'M'},
{"skip-indels",0,0,'I'},
+ {"threads",1,0,9},
{0,0,0,0}
};
int naf_opts = 0;
char *tmp;
- while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:m:M:G:Ia:e:V",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:S:m:M:G:Ia:e:V:b:O:o:i",loptions,NULL)) >= 0) {
switch (c) {
case 0: args->af_tag = optarg; naf_opts++; break;
case 1: args->af_fname = optarg; naf_opts++; break;
@@ -749,7 +1119,15 @@ int main_vcfroh(int argc, char *argv[])
args->dflt_AF = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST;
+ if ( strchr(optarg,'r') || strchr(optarg,'R') ) args->output_type |= OUTPUT_RG;
+ if ( strchr(optarg,'z') || strchr(optarg,'z') ) args->output_type |= OUTPUT_GZ;
+ break;
case 'e': args->estimate_AF = optarg; naf_opts++; break;
+ case 'b': args->buffer_size = optarg; break;
+ case 'i': args->skip_homref = 1; break;
case 'I': args->snps_only = 1; break;
case 'G':
args->fake_PLs = 1;
@@ -762,7 +1140,8 @@ int main_vcfroh(int argc, char *argv[])
args->rec_rate = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -M %s\n", optarg);
break;
- case 's': args->sample = strdup(optarg); break;
+ case 's': args->samples = strdup(optarg); break;
+ case 'S': args->samples = strdup(optarg); args->samples_is_file = 1; break;
case 'a':
args->t2AZ = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -a %s\n", optarg);
@@ -775,14 +1154,28 @@ int main_vcfroh(int argc, char *argv[])
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
- case 'V': args->vi_training = 1; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'V':
+ args->vi_training = 1;
+ args->baum_welch_th = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: --viterbi-training %s\n", optarg);
+ break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
}
+ if ( !args->output_fname ) args->output_fname = "pysam_stdout";
+ if ( !args->output_type ) args->output_type = OUTPUT_ST|OUTPUT_RG;
+ char *fname = NULL;
+ if ( optind==argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(args);
+ }
+ else fname = argv[optind];
- if ( argc<optind+1 ) usage(args);
+ if ( args->vi_training && args->buffer_size ) error("Error: cannot use -b with -V\n");
if ( args->t2AZ<0 || args->t2AZ>1 ) error("Error: The parameter --hw-to-az is not in [0,1]\n", args->t2AZ);
if ( args->t2HW<0 || args->t2HW>1 ) error("Error: The parameter --az-to-hw is not in [0,1]\n", args->t2HW);
if ( naf_opts>1 ) error("Error: The options --AF-tag, --AF-file and -e are mutually exclusive\n");
@@ -802,7 +1195,9 @@ int main_vcfroh(int argc, char *argv[])
if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 )
error("Failed to read the targets: %s\n", args->af_fname);
}
- if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
+ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0)
+ error("Failed to create threads\n");
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
init_data(args);
while ( bcf_sr_next_line(args->files) )
@@ -810,7 +1205,15 @@ int main_vcfroh(int argc, char *argv[])
vcfroh(args, args->files->readers[0].buffer[0]);
}
vcfroh(args, NULL);
- fprintf(pysam_stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused);
+ int i, nmin = 0;
+ for (i=0; i<args->roh_smpl->n; i++)
+ if ( !i || args->smpl[i].nused < nmin ) nmin = args->smpl[i].nused;
+ fprintf(pysam_stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin);
+ if ( nmin==0 )
+ {
+ fprintf(pysam_stderr,"No usable sites were found.");
+ if ( !naf_opts && !args->dflt_AF ) fprintf(pysam_stderr, " Consider using one of the AF options.\n");
+ }
destroy_data(args);
free(args);
return 0;
diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c
index 1032bf8..4041a5a 100644
--- a/bcftools/vcfstats.c
+++ b/bcftools/vcfstats.c
@@ -1,6 +1,6 @@
/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
- Copyright (C) 2012-2015 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -39,6 +39,7 @@ THE SOFTWARE. */
#include <inttypes.h>
#include "bcftools.h"
#include "filter.h"
+#include "bin.h"
// Logic of the filters: include or exclude sites which match the filters?
#define FLT_INCLUDE 1
@@ -69,17 +70,6 @@ idist_t;
typedef struct
{
- double x;
- double x2;
- double y;
- double y2;
- double xy;
- double n;
-}
-smpl_r_t;
-
-typedef struct
-{
int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts;
int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons
#if HWE_STATS
@@ -108,9 +98,14 @@ stats_t;
typedef struct
{
- uint64_t m[3], mm[3]; // number of hom, het and non-ref hom matches and mismatches
- float r2sum;
- uint32_t r2n;
+ uint64_t gt2gt[5][5]; // number of RR->RR, RR->RA, etc. matches/mismatches; see type2stats
+ /*
+ Pearson's R^2 is used for aggregate R^2
+ y, yy .. sum of dosage and squared dosage in the query VCF (second file)
+ x, xx .. sum of squared dosage in the truth VCF (first file)
+ n .. number of genotypes
+ */
+ double y, yy, x, xx, yx, n;
}
gtcmp_t;
@@ -135,7 +130,11 @@ typedef struct
int *tmp_iaf, ntmp_iaf, m_af, m_qual, naf_hwe, mtmp_frm;
uint8_t *tmp_frm;
int dp_min, dp_max, dp_step;
- gtcmp_t *af_gts_snps, *af_gts_indels, *smpl_gts_snps, *smpl_gts_indels; // first bin of af_* stats are singletons
+ gtcmp_t *smpl_gts_snps, *smpl_gts_indels;
+ gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons
+ bin_t *af_bins;
+ float *farr;
+ int mfarr;
// indel context
indel_ctx_t *indel_ctx;
@@ -148,21 +147,18 @@ typedef struct
// other
bcf_srs_t *files;
bcf_sr_regions_t *exons;
- char **argv, *exons_fname, *regions_list, *samples_list, *targets_list;
+ char **argv, *exons_fname, *regions_list, *samples_list, *targets_list, *af_bins_list, *af_tag;
int argc, verbose_sites, first_allele_only, samples_is_file;
int split_by_id, nstats;
filter_t *filter[2];
char *filter_str;
int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
-
- // Per Sample r working data arrays of size equal to number of samples
- smpl_r_t* smpl_r_snps;
- smpl_r_t* smpl_r_indels;
+ int n_threads;
}
args_t;
-static int type2dosage[6], type2ploidy[6], type2stats[6];
+static int type2dosage[6], type2ploidy[6], type2stats[7];
static void idist_init(idist_t *d, int min, int max, int step)
{
@@ -187,6 +183,12 @@ static inline int idist_i2bin(idist_t *d, int i)
return i-1+d->min;
}
+static inline int clip_nonnegative(float x, int limit)
+{
+ if (x >= limit || isnan(x)) return limit - 1;
+ else if (x <= 0.0) return 0;
+ else return (int) x;
+}
#define IC_DBG 0
#if IC_DBG
@@ -403,13 +405,30 @@ static void init_stats(args_t *args)
args->filter[0] = filter_init(bcf_sr_get_header(args->files,0), args->filter_str);
if ( args->files->nreaders==2 )
args->filter[1] = filter_init(bcf_sr_get_header(args->files,1), args->filter_str);
+ args->files->max_unpack |= filter_max_unpack(args->filter[0]);
+ }
+
+ // AF corresponds to AC but is more robust to mixtures of haploid and diploid GTs
+ if ( !args->af_bins_list )
+ {
+ args->m_af = 101;
+ for (i=0; i<args->files->nreaders; i++)
+ if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af )
+ args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1;
+ }
+ else
+ {
+ args->af_bins = bin_init(args->af_bins_list,0,1);
+
+ // m_af is used also for other af arrays, where the first bin is for
+ // singletons. However, since the last element is unused in af_bins
+ // (n boundaries form n-1 intervals), the m_af count is good for both.
+ args->m_af = bin_get_size(args->af_bins);
}
- // AF corresponds to AC but is more robust for mixture of haploid and diploid GTs
- args->m_af = 101;
- for (i=0; i<args->files->nreaders; i++)
- if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af )
- args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1;
+ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
+ if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) )
+ error("No such INFO tag: %s\n", args->af_tag);
#if QUAL_STATS
args->m_qual = 999;
@@ -430,8 +449,6 @@ static void init_stats(args_t *args)
args->af_gts_indels = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t));
args->smpl_gts_snps = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
args->smpl_gts_indels = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
- args->smpl_r_snps = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
- args->smpl_r_indels = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
}
for (i=0; i<args->nstats; i++)
{
@@ -503,9 +520,10 @@ static void init_stats(args_t *args)
type2stats[GT_HOM_RR] = 0;
type2stats[GT_HET_RA] = 1;
type2stats[GT_HOM_AA] = 2;
- type2stats[GT_HET_AA] = 1;
+ type2stats[GT_HET_AA] = 3;
type2stats[GT_HAPL_R] = 0;
type2stats[GT_HAPL_A] = 2;
+ type2stats[GT_UNKN] = 4;
}
static void destroy_stats(args_t *args)
@@ -526,7 +544,6 @@ static void destroy_stats(args_t *args)
if (stats->qual_indels) free(stats->qual_indels);
#endif
#if HWE_STATS
- //if ( args->files->n_smpl ) free(stats->af_hwe);
free(stats->af_hwe);
#endif
free(stats->insertions);
@@ -554,6 +571,8 @@ static void destroy_stats(args_t *args)
if ( args->exons ) free(stats->smpl_frm_shifts);
}
for (j=0; j<args->nusr; j++) free(args->usr[j].tag);
+ if ( args->af_bins ) bin_destroy(args->af_bins);
+ free(args->farr);
free(args->usr);
free(args->tmp_frm);
free(args->tmp_iaf);
@@ -562,8 +581,6 @@ static void destroy_stats(args_t *args)
free(args->af_gts_indels);
free(args->smpl_gts_snps);
free(args->smpl_gts_indels);
- free(args->smpl_r_snps);
- free(args->smpl_r_indels);
if (args->indel_ctx) indel_ctx_destroy(args->indel_ctx);
if (args->filter[0]) filter_destroy(args->filter[0]);
if (args->filter[1]) filter_destroy(args->filter[1]);
@@ -572,36 +589,59 @@ static void destroy_stats(args_t *args)
static void init_iaf(args_t *args, bcf_sr_t *reader)
{
bcf1_t *line = reader->buffer[0];
- if ( args->ntmp_iaf < line->n_allele )
+ hts_expand(int32_t,line->n_allele,args->ntmp_iaf,args->tmp_iaf);
+
+ int i, ret;
+ if ( args->af_tag )
{
- args->tmp_iaf = (int*)realloc(args->tmp_iaf, line->n_allele*sizeof(int));
- args->ntmp_iaf = line->n_allele;
+ ret = bcf_get_info_float(reader->header, line, args->af_tag, &args->farr, &args->mfarr);
+ if ( ret<=0 || ret!=line->n_allele-1 )
+ {
+ // the AF tag is not present or wrong number of values, put in the singletons/unknown bin
+ for (i=0; i<line->n_allele; i++) args->tmp_iaf[i] = 0;
+ return;
+ }
+ args->tmp_iaf[0] = 0;
+ for (i=1; i<line->n_allele; i++)
+ {
+ float af = args->farr[i-1];
+ if ( af<0 ) af = 0;
+ else if ( af>1 ) af = 1;
+ int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2);
+ args->tmp_iaf[i] = iaf + 1; // the first tmp_iaf bin is reserved for singletons
+ }
+ return;
}
+
// tmp_iaf is first filled with AC counts in calc_ac and then transformed to
// an index to af_gts_snps
- int i, ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO);
- if ( ret )
+ ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO);
+ if ( !ret )
{
- int an=0;
- for (i=0; i<line->n_allele; i++)
- an += args->tmp_iaf[i];
+ for (i=0; i<line->n_allele; i++) args->tmp_iaf[i] = 0; // singletons/unknown bin
+ return;
+ }
- args->tmp_iaf[0] = 0;
- for (i=1; i<line->n_allele; i++)
+ int an = 0;
+ for (i=0; i<line->n_allele; i++)
+ an += args->tmp_iaf[i];
+
+ args->tmp_iaf[0] = 0;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( args->tmp_iaf[i]==1 )
+ args->tmp_iaf[i] = 0; // singletons into the first bin
+ else if ( !an )
+ args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin
+ else
{
- if ( args->tmp_iaf[i]==1 )
- args->tmp_iaf[i] = 0; // singletons into the first bin
- else if ( !an )
- args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin
- else
- args->tmp_iaf[i] = 1 + args->tmp_iaf[i] * (args->m_af-2.0) / an;
+ float af = (float) args->tmp_iaf[i] / an;
+ if ( af<0 ) af = 0;
+ else if ( af>1 ) af = 1;
+ int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2);
+ args->tmp_iaf[i] = iaf + 1;
}
}
- else
- for (i=0; i<line->n_allele; i++)
- args->tmp_iaf[i] = 0;
-
- // todo: otherwise use AF
}
static inline void do_mnp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
@@ -621,7 +661,7 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
bcf1_t *line = reader->buffer[0];
#if QUAL_STATS
- int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual;
+ int iqual = clip_nonnegative(line->qual, args->m_qual);
stats->qual_indels[iqual]++;
#endif
@@ -756,7 +796,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
if ( ref<0 ) return;
#if QUAL_STATS
- int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual;
+ int iqual = clip_nonnegative(line->qual, args->m_qual);
stats->qual_snps[iqual]++;
#endif
@@ -873,6 +913,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
{
float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot);
int idx = het_frac*(args->naf_hwe - 1);
+//check me: what is this?
if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1];
stats->af_hwe[idx]++;
}
@@ -911,88 +952,42 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
fmt1 = bcf_get_fmt(files->readers[1].header,files->readers[1].buffer[0],"GT"); if ( !fmt1 ) return;
// only the first ALT allele is considered
- int iaf = line->n_allele>1 ? args->tmp_iaf[1] : 1;
+ int iaf = args->tmp_iaf[1];
int line_type = bcf_get_variant_types(files->readers[0].buffer[0]);
gtcmp_t *af_stats = line_type&VCF_SNP ? args->af_gts_snps : args->af_gts_indels;
gtcmp_t *smpl_stats = line_type&VCF_SNP ? args->smpl_gts_snps : args->smpl_gts_indels;
- //
- // Calculates r squared
- // x is mean dosage of x at given site
- // x2 is mean squared dosage of x at given site
- // y is mean dosage of x at given site
- // y2 is mean squared dosage of x at given site
- // xy is mean dosage of x*y at given site
- // r2sum += (xy - x*y)^2 / ( (x2 - x^2) * (y2 - y^2) )
- // r2n is number of sites considered
- // output as r2sum/r2n for each AF bin
- int r2n = 0;
- float x = 0, y = 0, xy = 0, x2 = 0, y2 = 0;
- // Select smpl_r
- smpl_r_t *smpl_r = NULL;
- if (line_type&VCF_SNP)
- {
- smpl_r = args->smpl_r_snps;
- }
- else if (line_type&VCF_INDEL)
- {
- smpl_r = args->smpl_r_indels;
- }
for (is=0; is<files->n_smpl; is++)
{
// Simplified comparison: only 0/0, 0/1, 1/1 is looked at as the identity of
// actual alleles can be enforced by running without the -c option.
int gt0 = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL);
- if ( gt0 == GT_UNKN ) continue;
-
int gt1 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL);
- if ( gt1 == GT_UNKN ) continue;
- if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes
+ int idx0 = type2stats[gt0];
+ int idx1 = type2stats[gt1];
+ af_stats[iaf].gt2gt[idx0][idx1]++;
+ smpl_stats[is].gt2gt[idx0][idx1]++;
- int dsg0 = type2dosage[gt0];
- int dsg1 = type2dosage[gt1];
- x += dsg0;
- x2 += dsg0*dsg0;
- y += dsg1;
- y2 += dsg1*dsg1;
- xy += dsg0*dsg1;
- r2n++;
-
- int idx = type2stats[gt0];
- if ( gt0==gt1 )
- {
- af_stats[iaf].m[idx]++;
- smpl_stats[is].m[idx]++;
- }
- else
- {
- af_stats[iaf].mm[idx]++;
- smpl_stats[is].mm[idx]++;
- }
-
- // Now do it across samples
+ if ( gt0 == GT_UNKN || gt1 == GT_UNKN ) continue;
+ if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes
- if (smpl_r) {
- smpl_r[is].xy += dsg0*dsg1;
- smpl_r[is].x += dsg0;
- smpl_r[is].x2 += dsg0*dsg0;
- smpl_r[is].y += dsg1;
- smpl_r[is].y2 += dsg1*dsg1;
- ++(smpl_r[is].n);
- }
- }
-
- if ( r2n )
- {
- x /= r2n; y /= r2n; x2 /= r2n; y2 /= r2n; xy /= r2n;
- float cov = xy - x*y;
- float var2 = (x2 - x*x) * (y2 - y*y);
- if ( var2!=0 )
- {
- af_stats[iaf].r2sum += cov*cov/var2;
- af_stats[iaf].r2n++;
- }
+ float y = type2dosage[gt0];
+ float x = type2dosage[gt1];
+
+ smpl_stats[is].yx += y*x;
+ smpl_stats[is].x += x;
+ smpl_stats[is].xx += x*x;
+ smpl_stats[is].y += y;
+ smpl_stats[is].yy += y*y;
+ smpl_stats[is].n += 1;
+
+ af_stats[iaf].yx += y*x;
+ af_stats[iaf].x += x;
+ af_stats[iaf].xx += x*x;
+ af_stats[iaf].y += y;
+ af_stats[iaf].yy += y*y;
+ af_stats[iaf].n += 1;
}
if ( args->verbose_sites )
@@ -1129,7 +1124,7 @@ static void print_header(args_t *args)
#define T2S(x) type2stats[x]
static void print_stats(args_t *args)
{
- int i, id;
+ int i, j,k, id;
printf("# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n");
for (id=0; id<args->files->nreaders; id++)
printf("SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header));
@@ -1202,6 +1197,24 @@ static void print_stats(args_t *args)
stats->af_repeats[1][1] += stats->af_repeats[1][0];
stats->af_repeats[2][1] += stats->af_repeats[2][0];
}
+ // move the singletons stats into the first AF bin, singleton stats was collected separately because of init_iaf
+ if ( args->af_gts_snps )
+ {
+ args->af_gts_snps[1].y += args->af_gts_snps[0].y;
+ args->af_gts_snps[1].yy += args->af_gts_snps[0].yy;
+ args->af_gts_snps[1].xx += args->af_gts_snps[0].xx;
+ args->af_gts_snps[1].yx += args->af_gts_snps[0].yx;
+ args->af_gts_snps[1].n += args->af_gts_snps[0].n;
+ }
+ if ( args->af_gts_indels )
+ {
+ args->af_gts_indels[1].y += args->af_gts_indels[0].y;
+ args->af_gts_indels[1].yy += args->af_gts_indels[0].yy;
+ args->af_gts_indels[1].xx += args->af_gts_indels[0].xx;
+ args->af_gts_indels[1].yx += args->af_gts_indels[0].yx;
+ args->af_gts_indels[1].n += args->af_gts_indels[0].n;
+ }
+
printf("# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
for (id=0; id<args->nstats; id++)
{
@@ -1209,7 +1222,8 @@ static void print_stats(args_t *args)
for (i=1; i<args->m_af; i++) // note that af[1] now contains also af[0], see SiS stats output above
{
if ( stats->af_snps[i]+stats->af_ts[i]+stats->af_tv[i]+stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i] == 0 ) continue;
- printf("AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,100.*(i-1)/(args->m_af-1),stats->af_snps[i],stats->af_ts[i],stats->af_tv[i],
+ double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1);
+ printf("AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,af,stats->af_snps[i],stats->af_ts[i],stats->af_tv[i],
stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i],stats->af_repeats[0][i],stats->af_repeats[1][i],stats->af_repeats[2][i]);
}
}
@@ -1266,34 +1280,56 @@ static void print_stats(args_t *args)
printf("SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl);
int x;
- for (x=0; x<2; x++)
+ for (x=0; x<2; x++) // x=0: snps, x=1: indels
{
gtcmp_t *stats;
if ( x==0 )
{
- printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n");
stats = args->af_gts_snps;
}
else
{
- printf("# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ printf("# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n");
stats = args->af_gts_indels;
}
- uint64_t nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0};
+ uint64_t nrd_m[4] = {0,0,0,0}, nrd_mm[4] = {0,0,0,0}; // across all bins
for (i=0; i<args->m_af; i++)
{
- int j, n = 0;
- for (j=0; j<3; j++)
+ int n = 0;
+ uint64_t m[4] = {0,0,0,0}, mm[4] = {0,0,0,0}; // in i-th AF bin
+ for (j=0; j<4; j++) // rr, ra, aa hom, aa het, ./.
+ for (k=0; k<4; k++)
+ {
+ n += stats[i].gt2gt[j][k];
+ if ( j==k )
+ {
+ nrd_m[j] += stats[i].gt2gt[j][k];
+ m[j] += stats[i].gt2gt[j][k];
+ }
+ else
+ {
+ nrd_mm[j] += stats[i].gt2gt[j][k];
+ mm[j] += stats[i].gt2gt[j][k];
+ }
+ }
+ if ( !i || !n ) continue; // skip singleton stats and empty bins
+
+ // Pearson's r2
+ double r2 = 0;
+ if ( stats[i].n )
{
- n += stats[i].m[j] + stats[i].mm[j];
- nrd_m[j] += stats[i].m[j];
- nrd_mm[j] += stats[i].mm[j];
+ r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n);
+ r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n));
+ r2 *= r2;
}
- if ( !i || !n ) continue; // skip singleton stats and empty bins
- printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', 100.*(i-1)/(args->m_af-1));
- printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
- printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
- printf("\t%f\t%"PRId32"\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n);
+ double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1);
+ printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', af);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]);
+ if ( stats[i].n && !isnan(r2) ) printf("\t%f", r2);
+ else printf("\t"NA_STRING);
+ printf("\t%.0f\n", stats[i].n);
}
if ( x==0 )
@@ -1309,8 +1345,8 @@ static void print_stats(args_t *args)
}
else
printf("# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
- uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)];
- uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)];
+ uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)] + nrd_m[T2S(GT_HET_AA)];
+ uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)] + nrd_mm[T2S(GT_HET_AA)];
printf("NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i',
m+mm ? mm*100.0/(m+mm) : 0,
nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)] ? nrd_mm[T2S(GT_HOM_RR)]*100.0/(nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)]) : 0,
@@ -1319,42 +1355,99 @@ static void print_stats(args_t *args)
);
}
- for (x=0; x<2; x++)
+ for (x=0; x<2; x++) // x=0: snps, x=1: indels
{
gtcmp_t *stats;
- smpl_r_t *smpl_r_array;
if ( x==0 )
{
printf("# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
stats = args->smpl_gts_snps;
- smpl_r_array = args->smpl_r_snps;
}
else
{
printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
stats = args->smpl_gts_indels;
- smpl_r_array = args->smpl_r_indels;
}
for (i=0; i<args->files->n_smpl; i++)
{
- uint64_t m = stats[i].m[T2S(GT_HET_RA)] + stats[i].m[T2S(GT_HOM_AA)];
- uint64_t mm = stats[i].mm[T2S(GT_HOM_RR)] + stats[i].mm[T2S(GT_HET_RA)] + stats[i].mm[T2S(GT_HOM_AA)];
- // Calculate r by formula 19.2 - Biostatistical Analysis 4th edition - Jerrold H. Zar
- smpl_r_t *smpl_r = smpl_r_array + i;
- double r = 0.0;
- if (smpl_r->n) {
- double sum_crossprod = smpl_r->xy-(smpl_r->x*smpl_r->y)/smpl_r->n;//per 17.3 machine formula
- double x2_xx = smpl_r->x2-(smpl_r->x*smpl_r->x)/smpl_r->n;
- double y2_yy = smpl_r->y2-(smpl_r->y*smpl_r->y)/smpl_r->n;
- r = (sum_crossprod)/sqrt(x2_xx*y2_yy);
+ uint64_t mm = 0, m = stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)];
+ for (j=0; j<3; j++)
+ for (k=0; k<3; k++)
+ if ( j!=k ) mm += stats[i].gt2gt[j][k];
+
+ // Pearson's r2
+ double r2 = 0;
+ if ( stats[i].n )
+ {
+ r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n);
+ r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n));
+ r2 *= r2;
}
printf("GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0);
- printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
- printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
- if (smpl_r->n && !isnan(r)) printf("\t%f\n", r*r);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"",
+ stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_RR)],
+ stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)],
+ stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"",
+ stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_AA)],
+ stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_AA)],
+ stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HET_RA)]);
+ if ( stats[i].n && !isnan(r2) ) printf("\t%f\n", r2);
else printf("\t"NA_STRING"\n");
}
}
+ for (x=0; x<2; x++) // x=0: snps, x=1: indels
+ {
+ //printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
+
+ gtcmp_t *stats;
+ if ( x==0 )
+ {
+ printf("# GCTs, Genotype concordance table (SNPs)\n# GCTs");
+ stats = args->smpl_gts_snps;
+ }
+ else
+ {
+ printf("# GCTi, Genotype concordance table (indels)\n# GCTi");
+ stats = args->smpl_gts_indels;
+ }
+ i = 1;
+ printf("\t[%d]sample", ++i);
+ printf("\t[%d]RR Hom -> RR Hom", ++i);
+ printf("\t[%d]RR Hom -> RA Het", ++i);
+ printf("\t[%d]RR Hom -> AA Hom", ++i);
+ printf("\t[%d]RR Hom -> AA Het", ++i);
+ printf("\t[%d]RR Hom -> missing", ++i);
+ printf("\t[%d]RA Het -> RR Hom", ++i);
+ printf("\t[%d]RA Het -> RA Het", ++i);
+ printf("\t[%d]RA Het -> AA Hom", ++i);
+ printf("\t[%d]RA Het -> AA Het", ++i);
+ printf("\t[%d]RA Het -> missing", ++i);
+ printf("\t[%d]AA Hom -> RR Hom", ++i);
+ printf("\t[%d]AA Hom -> RA Het", ++i);
+ printf("\t[%d]AA Hom -> AA Hom", ++i);
+ printf("\t[%d]AA Hom -> AA Het", ++i);
+ printf("\t[%d]AA Hom -> missing", ++i);
+ printf("\t[%d]AA Het -> RR Hom", ++i);
+ printf("\t[%d]AA Het -> RA Het", ++i);
+ printf("\t[%d]AA Het -> AA Hom", ++i);
+ printf("\t[%d]AA Het -> AA Het", ++i);
+ printf("\t[%d]AA Het -> missing", ++i);
+ printf("\t[%d]missing -> RR Hom", ++i);
+ printf("\t[%d]missing -> RA Het", ++i);
+ printf("\t[%d]missing -> AA Hom", ++i);
+ printf("\t[%d]missing -> AA Het", ++i);
+ printf("\t[%d]missing -> missing\n", ++i);
+
+ for (i=0; i<args->files->n_smpl; i++)
+ {
+ printf("GCT%c\t%s", x==0 ? 's' : 'i', args->files->samples[i]);
+ for (j=0; j<5; j++)
+ for (k=0; k<5; k++)
+ printf("\t%"PRId64, stats[i].gt2gt[j][k]);
+ printf("\n");
+ }
+ }
}
printf("# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n");
@@ -1423,8 +1516,10 @@ static void print_stats(args_t *args)
for (j=0; j<args->naf_hwe; j++) sum_tot += ptr[j];
if ( !sum_tot ) continue;
+ double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1);
+
int nprn = 3;
- printf("HWE\t%d\t%f\t%d",id,100.*(i-1)/(args->m_af-1),sum_tot);
+ printf("HWE\t%d\t%f\t%d",id,af,sum_tot);
for (j=0; j<args->naf_hwe; j++)
{
sum_tmp += ptr[j];
@@ -1462,6 +1557,8 @@ static void usage(void)
fprintf(stderr, "Usage: bcftools stats [options] <A.vcf.gz> [<B.vcf.gz>]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
+ fprintf(stderr, " --af-bins <list> allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n");
+ fprintf(stderr, " --af-tag <string> allele frequency tag to use, by default estimated from AN,AC or GT\n");
fprintf(stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n");
fprintf(stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
fprintf(stderr, " -d, --depth <int,int,int> depth distribution: min,max,bin size [0,500,1]\n");
@@ -1478,6 +1575,7 @@ static void usage(void)
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
fprintf(stderr, " -u, --user-tstv <TAG[:min:max:n]> collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
+ fprintf(stderr, " --threads <int> number of extra decompression threads [0]\n");
fprintf(stderr, " -v, --verbose produce verbose per-site and per-sample output\n");
fprintf(stderr, "\n");
exit(1);
@@ -1494,6 +1592,8 @@ int main_vcfstats(int argc, char *argv[])
static struct option loptions[] =
{
+ {"af-bins",1,0,1},
+ {"af-tag",1,0,2},
{"1st-allele-only",0,0,'1'},
{"include",1,0,'i'},
{"exclude",1,0,'e'},
@@ -1512,10 +1612,13 @@ int main_vcfstats(int argc, char *argv[])
{"targets-file",1,0,'T'},
{"fasta-ref",1,0,'F'},
{"user-tstv",1,0,'u'},
+ {"threads",1,0,9},
{0,0,0,0}
};
while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:vIE:",loptions,NULL)) >= 0) {
switch (c) {
+ case 1 : args->af_bins_list = optarg; break;
+ case 2 : args->af_tag = optarg; break;
case 'u': add_user_stats(args,optarg); break;
case '1': args->first_allele_only = 1; break;
case 'F': args->ref_fname = optarg; break;
@@ -1547,6 +1650,7 @@ int main_vcfstats(int argc, char *argv[])
case 'I': args->split_by_id = 1; break;
case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
@@ -1571,6 +1675,9 @@ int main_vcfstats(int argc, char *argv[])
error("Failed to read the targets: %s\n", args->targets_list);
if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0)
+ error("Failed to create threads\n");
+
while (fname)
{
if ( !bcf_sr_add_reader(args->files, fname) )
diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c
index 5653760..a5e5a9f 100644
--- a/bcftools/vcfstats.c.pysam.c
+++ b/bcftools/vcfstats.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
- Copyright (C) 2012-2015 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -41,6 +41,7 @@ THE SOFTWARE. */
#include <inttypes.h>
#include "bcftools.h"
#include "filter.h"
+#include "bin.h"
// Logic of the filters: include or exclude sites which match the filters?
#define FLT_INCLUDE 1
@@ -71,17 +72,6 @@ idist_t;
typedef struct
{
- double x;
- double x2;
- double y;
- double y2;
- double xy;
- double n;
-}
-smpl_r_t;
-
-typedef struct
-{
int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts;
int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons
#if HWE_STATS
@@ -110,9 +100,14 @@ stats_t;
typedef struct
{
- uint64_t m[3], mm[3]; // number of hom, het and non-ref hom matches and mismatches
- float r2sum;
- uint32_t r2n;
+ uint64_t gt2gt[5][5]; // number of RR->RR, RR->RA, etc. matches/mismatches; see type2stats
+ /*
+ Pearson's R^2 is used for aggregate R^2
+ y, yy .. sum of dosage and squared dosage in the query VCF (second file)
+ x, xx .. sum of squared dosage in the truth VCF (first file)
+ n .. number of genotypes
+ */
+ double y, yy, x, xx, yx, n;
}
gtcmp_t;
@@ -137,7 +132,11 @@ typedef struct
int *tmp_iaf, ntmp_iaf, m_af, m_qual, naf_hwe, mtmp_frm;
uint8_t *tmp_frm;
int dp_min, dp_max, dp_step;
- gtcmp_t *af_gts_snps, *af_gts_indels, *smpl_gts_snps, *smpl_gts_indels; // first bin of af_* stats are singletons
+ gtcmp_t *smpl_gts_snps, *smpl_gts_indels;
+ gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons
+ bin_t *af_bins;
+ float *farr;
+ int mfarr;
// indel context
indel_ctx_t *indel_ctx;
@@ -150,21 +149,18 @@ typedef struct
// other
bcf_srs_t *files;
bcf_sr_regions_t *exons;
- char **argv, *exons_fname, *regions_list, *samples_list, *targets_list;
+ char **argv, *exons_fname, *regions_list, *samples_list, *targets_list, *af_bins_list, *af_tag;
int argc, verbose_sites, first_allele_only, samples_is_file;
int split_by_id, nstats;
filter_t *filter[2];
char *filter_str;
int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
-
- // Per Sample r working data arrays of size equal to number of samples
- smpl_r_t* smpl_r_snps;
- smpl_r_t* smpl_r_indels;
+ int n_threads;
}
args_t;
-static int type2dosage[6], type2ploidy[6], type2stats[6];
+static int type2dosage[6], type2ploidy[6], type2stats[7];
static void idist_init(idist_t *d, int min, int max, int step)
{
@@ -189,6 +185,12 @@ static inline int idist_i2bin(idist_t *d, int i)
return i-1+d->min;
}
+static inline int clip_nonnegative(float x, int limit)
+{
+ if (x >= limit || isnan(x)) return limit - 1;
+ else if (x <= 0.0) return 0;
+ else return (int) x;
+}
#define IC_DBG 0
#if IC_DBG
@@ -405,13 +407,30 @@ static void init_stats(args_t *args)
args->filter[0] = filter_init(bcf_sr_get_header(args->files,0), args->filter_str);
if ( args->files->nreaders==2 )
args->filter[1] = filter_init(bcf_sr_get_header(args->files,1), args->filter_str);
+ args->files->max_unpack |= filter_max_unpack(args->filter[0]);
+ }
+
+ // AF corresponds to AC but is more robust to mixtures of haploid and diploid GTs
+ if ( !args->af_bins_list )
+ {
+ args->m_af = 101;
+ for (i=0; i<args->files->nreaders; i++)
+ if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af )
+ args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1;
+ }
+ else
+ {
+ args->af_bins = bin_init(args->af_bins_list,0,1);
+
+ // m_af is used also for other af arrays, where the first bin is for
+ // singletons. However, since the last element is unused in af_bins
+ // (n boundaries form n-1 intervals), the m_af count is good for both.
+ args->m_af = bin_get_size(args->af_bins);
}
- // AF corresponds to AC but is more robust for mixture of haploid and diploid GTs
- args->m_af = 101;
- for (i=0; i<args->files->nreaders; i++)
- if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af )
- args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1;
+ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
+ if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) )
+ error("No such INFO tag: %s\n", args->af_tag);
#if QUAL_STATS
args->m_qual = 999;
@@ -432,8 +451,6 @@ static void init_stats(args_t *args)
args->af_gts_indels = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t));
args->smpl_gts_snps = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
args->smpl_gts_indels = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
- args->smpl_r_snps = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
- args->smpl_r_indels = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
}
for (i=0; i<args->nstats; i++)
{
@@ -505,9 +522,10 @@ static void init_stats(args_t *args)
type2stats[GT_HOM_RR] = 0;
type2stats[GT_HET_RA] = 1;
type2stats[GT_HOM_AA] = 2;
- type2stats[GT_HET_AA] = 1;
+ type2stats[GT_HET_AA] = 3;
type2stats[GT_HAPL_R] = 0;
type2stats[GT_HAPL_A] = 2;
+ type2stats[GT_UNKN] = 4;
}
static void destroy_stats(args_t *args)
@@ -528,7 +546,6 @@ static void destroy_stats(args_t *args)
if (stats->qual_indels) free(stats->qual_indels);
#endif
#if HWE_STATS
- //if ( args->files->n_smpl ) free(stats->af_hwe);
free(stats->af_hwe);
#endif
free(stats->insertions);
@@ -556,6 +573,8 @@ static void destroy_stats(args_t *args)
if ( args->exons ) free(stats->smpl_frm_shifts);
}
for (j=0; j<args->nusr; j++) free(args->usr[j].tag);
+ if ( args->af_bins ) bin_destroy(args->af_bins);
+ free(args->farr);
free(args->usr);
free(args->tmp_frm);
free(args->tmp_iaf);
@@ -564,8 +583,6 @@ static void destroy_stats(args_t *args)
free(args->af_gts_indels);
free(args->smpl_gts_snps);
free(args->smpl_gts_indels);
- free(args->smpl_r_snps);
- free(args->smpl_r_indels);
if (args->indel_ctx) indel_ctx_destroy(args->indel_ctx);
if (args->filter[0]) filter_destroy(args->filter[0]);
if (args->filter[1]) filter_destroy(args->filter[1]);
@@ -574,36 +591,59 @@ static void destroy_stats(args_t *args)
static void init_iaf(args_t *args, bcf_sr_t *reader)
{
bcf1_t *line = reader->buffer[0];
- if ( args->ntmp_iaf < line->n_allele )
+ hts_expand(int32_t,line->n_allele,args->ntmp_iaf,args->tmp_iaf);
+
+ int i, ret;
+ if ( args->af_tag )
{
- args->tmp_iaf = (int*)realloc(args->tmp_iaf, line->n_allele*sizeof(int));
- args->ntmp_iaf = line->n_allele;
+ ret = bcf_get_info_float(reader->header, line, args->af_tag, &args->farr, &args->mfarr);
+ if ( ret<=0 || ret!=line->n_allele-1 )
+ {
+ // the AF tag is not present or wrong number of values, put in the singletons/unknown bin
+ for (i=0; i<line->n_allele; i++) args->tmp_iaf[i] = 0;
+ return;
+ }
+ args->tmp_iaf[0] = 0;
+ for (i=1; i<line->n_allele; i++)
+ {
+ float af = args->farr[i-1];
+ if ( af<0 ) af = 0;
+ else if ( af>1 ) af = 1;
+ int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2);
+ args->tmp_iaf[i] = iaf + 1; // the first tmp_iaf bin is reserved for singletons
+ }
+ return;
}
+
// tmp_iaf is first filled with AC counts in calc_ac and then transformed to
// an index to af_gts_snps
- int i, ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO);
- if ( ret )
+ ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO);
+ if ( !ret )
{
- int an=0;
- for (i=0; i<line->n_allele; i++)
- an += args->tmp_iaf[i];
+ for (i=0; i<line->n_allele; i++) args->tmp_iaf[i] = 0; // singletons/unknown bin
+ return;
+ }
- args->tmp_iaf[0] = 0;
- for (i=1; i<line->n_allele; i++)
+ int an = 0;
+ for (i=0; i<line->n_allele; i++)
+ an += args->tmp_iaf[i];
+
+ args->tmp_iaf[0] = 0;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( args->tmp_iaf[i]==1 )
+ args->tmp_iaf[i] = 0; // singletons into the first bin
+ else if ( !an )
+ args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin
+ else
{
- if ( args->tmp_iaf[i]==1 )
- args->tmp_iaf[i] = 0; // singletons into the first bin
- else if ( !an )
- args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin
- else
- args->tmp_iaf[i] = 1 + args->tmp_iaf[i] * (args->m_af-2.0) / an;
+ float af = (float) args->tmp_iaf[i] / an;
+ if ( af<0 ) af = 0;
+ else if ( af>1 ) af = 1;
+ int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2);
+ args->tmp_iaf[i] = iaf + 1;
}
}
- else
- for (i=0; i<line->n_allele; i++)
- args->tmp_iaf[i] = 0;
-
- // todo: otherwise use AF
}
static inline void do_mnp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
@@ -623,7 +663,7 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
bcf1_t *line = reader->buffer[0];
#if QUAL_STATS
- int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual;
+ int iqual = clip_nonnegative(line->qual, args->m_qual);
stats->qual_indels[iqual]++;
#endif
@@ -758,7 +798,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
if ( ref<0 ) return;
#if QUAL_STATS
- int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual;
+ int iqual = clip_nonnegative(line->qual, args->m_qual);
stats->qual_snps[iqual]++;
#endif
@@ -875,6 +915,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
{
float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot);
int idx = het_frac*(args->naf_hwe - 1);
+//check me: what is this?
if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1];
stats->af_hwe[idx]++;
}
@@ -913,88 +954,42 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
fmt1 = bcf_get_fmt(files->readers[1].header,files->readers[1].buffer[0],"GT"); if ( !fmt1 ) return;
// only the first ALT allele is considered
- int iaf = line->n_allele>1 ? args->tmp_iaf[1] : 1;
+ int iaf = args->tmp_iaf[1];
int line_type = bcf_get_variant_types(files->readers[0].buffer[0]);
gtcmp_t *af_stats = line_type&VCF_SNP ? args->af_gts_snps : args->af_gts_indels;
gtcmp_t *smpl_stats = line_type&VCF_SNP ? args->smpl_gts_snps : args->smpl_gts_indels;
- //
- // Calculates r squared
- // x is mean dosage of x at given site
- // x2 is mean squared dosage of x at given site
- // y is mean dosage of x at given site
- // y2 is mean squared dosage of x at given site
- // xy is mean dosage of x*y at given site
- // r2sum += (xy - x*y)^2 / ( (x2 - x^2) * (y2 - y^2) )
- // r2n is number of sites considered
- // output as r2sum/r2n for each AF bin
- int r2n = 0;
- float x = 0, y = 0, xy = 0, x2 = 0, y2 = 0;
- // Select smpl_r
- smpl_r_t *smpl_r = NULL;
- if (line_type&VCF_SNP)
- {
- smpl_r = args->smpl_r_snps;
- }
- else if (line_type&VCF_INDEL)
- {
- smpl_r = args->smpl_r_indels;
- }
for (is=0; is<files->n_smpl; is++)
{
// Simplified comparison: only 0/0, 0/1, 1/1 is looked at as the identity of
// actual alleles can be enforced by running without the -c option.
int gt0 = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL);
- if ( gt0 == GT_UNKN ) continue;
-
int gt1 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL);
- if ( gt1 == GT_UNKN ) continue;
- if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes
+ int idx0 = type2stats[gt0];
+ int idx1 = type2stats[gt1];
+ af_stats[iaf].gt2gt[idx0][idx1]++;
+ smpl_stats[is].gt2gt[idx0][idx1]++;
- int dsg0 = type2dosage[gt0];
- int dsg1 = type2dosage[gt1];
- x += dsg0;
- x2 += dsg0*dsg0;
- y += dsg1;
- y2 += dsg1*dsg1;
- xy += dsg0*dsg1;
- r2n++;
-
- int idx = type2stats[gt0];
- if ( gt0==gt1 )
- {
- af_stats[iaf].m[idx]++;
- smpl_stats[is].m[idx]++;
- }
- else
- {
- af_stats[iaf].mm[idx]++;
- smpl_stats[is].mm[idx]++;
- }
-
- // Now do it across samples
+ if ( gt0 == GT_UNKN || gt1 == GT_UNKN ) continue;
+ if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes
- if (smpl_r) {
- smpl_r[is].xy += dsg0*dsg1;
- smpl_r[is].x += dsg0;
- smpl_r[is].x2 += dsg0*dsg0;
- smpl_r[is].y += dsg1;
- smpl_r[is].y2 += dsg1*dsg1;
- ++(smpl_r[is].n);
- }
- }
-
- if ( r2n )
- {
- x /= r2n; y /= r2n; x2 /= r2n; y2 /= r2n; xy /= r2n;
- float cov = xy - x*y;
- float var2 = (x2 - x*x) * (y2 - y*y);
- if ( var2!=0 )
- {
- af_stats[iaf].r2sum += cov*cov/var2;
- af_stats[iaf].r2n++;
- }
+ float y = type2dosage[gt0];
+ float x = type2dosage[gt1];
+
+ smpl_stats[is].yx += y*x;
+ smpl_stats[is].x += x;
+ smpl_stats[is].xx += x*x;
+ smpl_stats[is].y += y;
+ smpl_stats[is].yy += y*y;
+ smpl_stats[is].n += 1;
+
+ af_stats[iaf].yx += y*x;
+ af_stats[iaf].x += x;
+ af_stats[iaf].xx += x*x;
+ af_stats[iaf].y += y;
+ af_stats[iaf].yy += y*y;
+ af_stats[iaf].n += 1;
}
if ( args->verbose_sites )
@@ -1131,7 +1126,7 @@ static void print_header(args_t *args)
#define T2S(x) type2stats[x]
static void print_stats(args_t *args)
{
- int i, id;
+ int i, j,k, id;
fprintf(pysam_stdout, "# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n");
for (id=0; id<args->files->nreaders; id++)
fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header));
@@ -1204,6 +1199,24 @@ static void print_stats(args_t *args)
stats->af_repeats[1][1] += stats->af_repeats[1][0];
stats->af_repeats[2][1] += stats->af_repeats[2][0];
}
+ // move the singletons stats into the first AF bin, singleton stats was collected separately because of init_iaf
+ if ( args->af_gts_snps )
+ {
+ args->af_gts_snps[1].y += args->af_gts_snps[0].y;
+ args->af_gts_snps[1].yy += args->af_gts_snps[0].yy;
+ args->af_gts_snps[1].xx += args->af_gts_snps[0].xx;
+ args->af_gts_snps[1].yx += args->af_gts_snps[0].yx;
+ args->af_gts_snps[1].n += args->af_gts_snps[0].n;
+ }
+ if ( args->af_gts_indels )
+ {
+ args->af_gts_indels[1].y += args->af_gts_indels[0].y;
+ args->af_gts_indels[1].yy += args->af_gts_indels[0].yy;
+ args->af_gts_indels[1].xx += args->af_gts_indels[0].xx;
+ args->af_gts_indels[1].yx += args->af_gts_indels[0].yx;
+ args->af_gts_indels[1].n += args->af_gts_indels[0].n;
+ }
+
fprintf(pysam_stdout, "# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
for (id=0; id<args->nstats; id++)
{
@@ -1211,7 +1224,8 @@ static void print_stats(args_t *args)
for (i=1; i<args->m_af; i++) // note that af[1] now contains also af[0], see SiS stats output above
{
if ( stats->af_snps[i]+stats->af_ts[i]+stats->af_tv[i]+stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i] == 0 ) continue;
- fprintf(pysam_stdout, "AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,100.*(i-1)/(args->m_af-1),stats->af_snps[i],stats->af_ts[i],stats->af_tv[i],
+ double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1);
+ fprintf(pysam_stdout, "AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,af,stats->af_snps[i],stats->af_ts[i],stats->af_tv[i],
stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i],stats->af_repeats[0][i],stats->af_repeats[1][i],stats->af_repeats[2][i]);
}
}
@@ -1268,34 +1282,56 @@ static void print_stats(args_t *args)
fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl);
int x;
- for (x=0; x<2; x++)
+ for (x=0; x<2; x++) // x=0: snps, x=1: indels
{
gtcmp_t *stats;
if ( x==0 )
{
- fprintf(pysam_stdout, "# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ fprintf(pysam_stdout, "# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n");
stats = args->af_gts_snps;
}
else
{
- fprintf(pysam_stdout, "# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ fprintf(pysam_stdout, "# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n");
stats = args->af_gts_indels;
}
- uint64_t nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0};
+ uint64_t nrd_m[4] = {0,0,0,0}, nrd_mm[4] = {0,0,0,0}; // across all bins
for (i=0; i<args->m_af; i++)
{
- int j, n = 0;
- for (j=0; j<3; j++)
+ int n = 0;
+ uint64_t m[4] = {0,0,0,0}, mm[4] = {0,0,0,0}; // in i-th AF bin
+ for (j=0; j<4; j++) // rr, ra, aa hom, aa het, ./.
+ for (k=0; k<4; k++)
+ {
+ n += stats[i].gt2gt[j][k];
+ if ( j==k )
+ {
+ nrd_m[j] += stats[i].gt2gt[j][k];
+ m[j] += stats[i].gt2gt[j][k];
+ }
+ else
+ {
+ nrd_mm[j] += stats[i].gt2gt[j][k];
+ mm[j] += stats[i].gt2gt[j][k];
+ }
+ }
+ if ( !i || !n ) continue; // skip singleton stats and empty bins
+
+ // Pearson's r2
+ double r2 = 0;
+ if ( stats[i].n )
{
- n += stats[i].m[j] + stats[i].mm[j];
- nrd_m[j] += stats[i].m[j];
- nrd_mm[j] += stats[i].mm[j];
+ r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n);
+ r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n));
+ r2 *= r2;
}
- if ( !i || !n ) continue; // skip singleton stats and empty bins
- fprintf(pysam_stdout, "GC%cAF\t2\t%f", x==0 ? 's' : 'i', 100.*(i-1)/(args->m_af-1));
- fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
- fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
- fprintf(pysam_stdout, "\t%f\t%"PRId32"\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n);
+ double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1);
+ fprintf(pysam_stdout, "GC%cAF\t2\t%f", x==0 ? 's' : 'i', af);
+ fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]);
+ fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]);
+ if ( stats[i].n && !isnan(r2) ) fprintf(pysam_stdout, "\t%f", r2);
+ else fprintf(pysam_stdout, "\t"NA_STRING);
+ fprintf(pysam_stdout, "\t%.0f\n", stats[i].n);
}
if ( x==0 )
@@ -1311,8 +1347,8 @@ static void print_stats(args_t *args)
}
else
fprintf(pysam_stdout, "# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
- uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)];
- uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)];
+ uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)] + nrd_m[T2S(GT_HET_AA)];
+ uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)] + nrd_mm[T2S(GT_HET_AA)];
fprintf(pysam_stdout, "NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i',
m+mm ? mm*100.0/(m+mm) : 0,
nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)] ? nrd_mm[T2S(GT_HOM_RR)]*100.0/(nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)]) : 0,
@@ -1321,42 +1357,99 @@ static void print_stats(args_t *args)
);
}
- for (x=0; x<2; x++)
+ for (x=0; x<2; x++) // x=0: snps, x=1: indels
{
gtcmp_t *stats;
- smpl_r_t *smpl_r_array;
if ( x==0 )
{
fprintf(pysam_stdout, "# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
stats = args->smpl_gts_snps;
- smpl_r_array = args->smpl_r_snps;
}
else
{
fprintf(pysam_stdout, "# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
stats = args->smpl_gts_indels;
- smpl_r_array = args->smpl_r_indels;
}
for (i=0; i<args->files->n_smpl; i++)
{
- uint64_t m = stats[i].m[T2S(GT_HET_RA)] + stats[i].m[T2S(GT_HOM_AA)];
- uint64_t mm = stats[i].mm[T2S(GT_HOM_RR)] + stats[i].mm[T2S(GT_HET_RA)] + stats[i].mm[T2S(GT_HOM_AA)];
- // Calculate r by formula 19.2 - Biostatistical Analysis 4th edition - Jerrold H. Zar
- smpl_r_t *smpl_r = smpl_r_array + i;
- double r = 0.0;
- if (smpl_r->n) {
- double sum_crossprod = smpl_r->xy-(smpl_r->x*smpl_r->y)/smpl_r->n;//per 17.3 machine formula
- double x2_xx = smpl_r->x2-(smpl_r->x*smpl_r->x)/smpl_r->n;
- double y2_yy = smpl_r->y2-(smpl_r->y*smpl_r->y)/smpl_r->n;
- r = (sum_crossprod)/sqrt(x2_xx*y2_yy);
+ uint64_t mm = 0, m = stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)];
+ for (j=0; j<3; j++)
+ for (k=0; k<3; k++)
+ if ( j!=k ) mm += stats[i].gt2gt[j][k];
+
+ // Pearson's r2
+ double r2 = 0;
+ if ( stats[i].n )
+ {
+ r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n);
+ r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n));
+ r2 *= r2;
}
fprintf(pysam_stdout, "GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0);
- fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
- fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
- if (smpl_r->n && !isnan(r)) fprintf(pysam_stdout, "\t%f\n", r*r);
+ fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"",
+ stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_RR)],
+ stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)],
+ stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]);
+ fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"",
+ stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_AA)],
+ stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_AA)],
+ stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HET_RA)]);
+ if ( stats[i].n && !isnan(r2) ) fprintf(pysam_stdout, "\t%f\n", r2);
else fprintf(pysam_stdout, "\t"NA_STRING"\n");
}
}
+ for (x=0; x<2; x++) // x=0: snps, x=1: indels
+ {
+ //printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
+
+ gtcmp_t *stats;
+ if ( x==0 )
+ {
+ fprintf(pysam_stdout, "# GCTs, Genotype concordance table (SNPs)\n# GCTs");
+ stats = args->smpl_gts_snps;
+ }
+ else
+ {
+ fprintf(pysam_stdout, "# GCTi, Genotype concordance table (indels)\n# GCTi");
+ stats = args->smpl_gts_indels;
+ }
+ i = 1;
+ fprintf(pysam_stdout, "\t[%d]sample", ++i);
+ fprintf(pysam_stdout, "\t[%d]RR Hom -> RR Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]RR Hom -> RA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]RR Hom -> AA Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]RR Hom -> AA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]RR Hom -> missing", ++i);
+ fprintf(pysam_stdout, "\t[%d]RA Het -> RR Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]RA Het -> RA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]RA Het -> AA Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]RA Het -> AA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]RA Het -> missing", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Hom -> RR Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Hom -> RA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Hom -> AA Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Hom -> AA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Hom -> missing", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Het -> RR Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Het -> RA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Het -> AA Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Het -> AA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Het -> missing", ++i);
+ fprintf(pysam_stdout, "\t[%d]missing -> RR Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]missing -> RA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]missing -> AA Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]missing -> AA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]missing -> missing\n", ++i);
+
+ for (i=0; i<args->files->n_smpl; i++)
+ {
+ fprintf(pysam_stdout, "GCT%c\t%s", x==0 ? 's' : 'i', args->files->samples[i]);
+ for (j=0; j<5; j++)
+ for (k=0; k<5; k++)
+ fprintf(pysam_stdout, "\t%"PRId64, stats[i].gt2gt[j][k]);
+ fprintf(pysam_stdout, "\n");
+ }
+ }
}
fprintf(pysam_stdout, "# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n");
@@ -1425,8 +1518,10 @@ static void print_stats(args_t *args)
for (j=0; j<args->naf_hwe; j++) sum_tot += ptr[j];
if ( !sum_tot ) continue;
+ double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1);
+
int nprn = 3;
- fprintf(pysam_stdout, "HWE\t%d\t%f\t%d",id,100.*(i-1)/(args->m_af-1),sum_tot);
+ fprintf(pysam_stdout, "HWE\t%d\t%f\t%d",id,af,sum_tot);
for (j=0; j<args->naf_hwe; j++)
{
sum_tmp += ptr[j];
@@ -1464,6 +1559,8 @@ static void usage(void)
fprintf(pysam_stderr, "Usage: bcftools stats [options] <A.vcf.gz> [<B.vcf.gz>]\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " --af-bins <list> allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n");
+ fprintf(pysam_stderr, " --af-tag <string> allele frequency tag to use, by default estimated from AN,AC or GT\n");
fprintf(pysam_stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n");
fprintf(pysam_stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
fprintf(pysam_stderr, " -d, --depth <int,int,int> depth distribution: min,max,bin size [0,500,1]\n");
@@ -1480,6 +1577,7 @@ static void usage(void)
fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
fprintf(pysam_stderr, " -u, --user-tstv <TAG[:min:max:n]> collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra decompression threads [0]\n");
fprintf(pysam_stderr, " -v, --verbose produce verbose per-site and per-sample output\n");
fprintf(pysam_stderr, "\n");
exit(1);
@@ -1496,6 +1594,8 @@ int main_vcfstats(int argc, char *argv[])
static struct option loptions[] =
{
+ {"af-bins",1,0,1},
+ {"af-tag",1,0,2},
{"1st-allele-only",0,0,'1'},
{"include",1,0,'i'},
{"exclude",1,0,'e'},
@@ -1514,10 +1614,13 @@ int main_vcfstats(int argc, char *argv[])
{"targets-file",1,0,'T'},
{"fasta-ref",1,0,'F'},
{"user-tstv",1,0,'u'},
+ {"threads",1,0,9},
{0,0,0,0}
};
while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:vIE:",loptions,NULL)) >= 0) {
switch (c) {
+ case 1 : args->af_bins_list = optarg; break;
+ case 2 : args->af_tag = optarg; break;
case 'u': add_user_stats(args,optarg); break;
case '1': args->first_allele_only = 1; break;
case 'F': args->ref_fname = optarg; break;
@@ -1549,6 +1652,7 @@ int main_vcfstats(int argc, char *argv[])
case 'I': args->split_by_id = 1; break;
case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
@@ -1573,6 +1677,9 @@ int main_vcfstats(int argc, char *argv[])
error("Failed to read the targets: %s\n", args->targets_list);
if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0)
+ error("Failed to create threads\n");
+
while (fname)
{
if ( !bcf_sr_add_reader(args->files, fname) )
diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c
index c14075d..645cc8a 100644
--- a/bcftools/vcfview.c
+++ b/bcftools/vcfview.c
@@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
@@ -181,10 +182,12 @@ static void init_data(args_t *args)
if (args->include_types) {
args->include = 0;
for (i = 0; i < n; ++i) {
- if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP;
- else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL;
- else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP;
- else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER;
+ if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP<<1;
+ else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL<<1;
+ else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP<<1;
+ else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "ref") == 0) args->include |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "bnd") == 0) args->include |= VCF_BND<<1;
else {
fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
@@ -195,10 +198,12 @@ static void init_data(args_t *args)
if (args->exclude_types) {
args->exclude = 0;
for (i = 0; i < n; ++i) {
- if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP;
- else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL;
- else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP;
- else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER;
+ if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP<<1;
+ else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL<<1;
+ else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP<<1;
+ else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "ref") == 0) args->exclude |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "bnd") == 0) args->exclude |= VCF_BND<<1;
else {
fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
@@ -220,7 +225,8 @@ static void init_data(args_t *args)
else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF
args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
- if ( args->n_threads ) hts_set_threads(args->out, args->n_threads);
+ if ( args->n_threads > 0)
+ hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
// headers: hdr=full header, hsub=subset header, hnull=sites only header
if (args->sites_only){
@@ -315,8 +321,8 @@ int subset_vcf(args_t *args, bcf1_t *line)
if (args->include || args->exclude)
{
int line_type = bcf_get_variant_types(line);
- if ( args->include && !(line_type&args->include) ) return 0; // include only given variant types
- if ( args->exclude && line_type&args->exclude ) return 0; // exclude given variant types
+ if ( args->include && !((line_type<<1) & args->include) ) return 0; // include only given variant types
+ if ( args->exclude && (line_type<<1) & args->exclude ) return 0; // exclude given variant types
}
if ( args->filter )
@@ -398,7 +404,7 @@ int subset_vcf(args_t *args, bcf1_t *line)
}
}
- if (args->min_ac)
+ if (args->min_ac!=-1)
{
if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
@@ -406,7 +412,7 @@ int subset_vcf(args_t *args, bcf1_t *line)
else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
}
- if (args->max_ac)
+ if (args->max_ac!=-1)
{
if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
@@ -414,7 +420,7 @@ int subset_vcf(args_t *args, bcf1_t *line)
else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
}
- if (args->min_af)
+ if (args->min_af!=-1)
{
if (an == 0) return 0; // freq not defined, skip site
if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
@@ -423,7 +429,7 @@ int subset_vcf(args_t *args, bcf1_t *line)
else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
}
- if (args->max_af)
+ if (args->max_af!=-1)
{
if (an == 0) return 0; // freq not defined, skip site
if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
@@ -443,7 +449,7 @@ int subset_vcf(args_t *args, bcf1_t *line)
if (args->trim_alts)
{
int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
- if ( ret==-1 ) error("Error: some GT index is out of bounds at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
+ if ( ret<0 ) error("Error: Could not trim alleles at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
}
if (args->phased) {
int phased = bcf_all_phased(args->hdr, line);
@@ -494,7 +500,7 @@ static void usage(args_t *args)
fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
fprintf(stderr, " -t, --targets [^]<region> similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
fprintf(stderr, " -T, --targets-file [^]<file> similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
- fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, " --threads <int> number of extra (de)compression threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Subset options:\n");
fprintf(stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n");
@@ -515,7 +521,7 @@ static void usage(args_t *args)
fprintf(stderr, " -q/Q, --min-af/--max-af <float>[:<type>] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n");
- fprintf(stderr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,other [null]\n");
+ fprintf(stderr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
fprintf(stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
fprintf(stderr, "\n");
exit(1);
@@ -533,6 +539,7 @@ int main_vcfview(int argc, char *argv[])
args->output_type = FT_VCF;
args->n_threads = 0;
args->record_cmd_line = 1;
+ args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
int targets_is_file = 0, regions_is_file = 0;
static struct option loptions[] =
@@ -726,6 +733,7 @@ int main_vcfview(int argc, char *argv[])
error("Failed to read the targets: %s\n", args->targets_list);
}
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
init_data(args);
@@ -734,6 +742,8 @@ int main_vcfview(int argc, char *argv[])
bcf_hdr_write(args->out, out_hdr);
else if ( args->output_type & FT_BCF )
error("BCF output requires header, cannot proceed with -H\n");
+
+ int ret = 0;
if (!args->header_only)
{
while ( bcf_sr_next_line(args->files) )
@@ -743,10 +753,12 @@ int main_vcfview(int argc, char *argv[])
if ( subset_vcf(args, line) )
bcf_write1(args->out, out_hdr, line);
}
+ ret = args->files->errnum;
+ if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
}
hts_close(args->out);
destroy_data(args);
bcf_sr_destroy(args->files);
free(args);
- return 0;
+ return ret;
}
diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c
index 53b7c53..a471f37 100644
--- a/bcftools/vcfview.c.pysam.c
+++ b/bcftools/vcfview.c.pysam.c
@@ -25,6 +25,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
@@ -183,10 +184,12 @@ static void init_data(args_t *args)
if (args->include_types) {
args->include = 0;
for (i = 0; i < n; ++i) {
- if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP;
- else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL;
- else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP;
- else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER;
+ if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP<<1;
+ else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL<<1;
+ else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP<<1;
+ else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "ref") == 0) args->include |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "bnd") == 0) args->include |= VCF_BND<<1;
else {
fprintf(pysam_stderr, "[E::%s] unknown type\n", type_list[i]);
fprintf(pysam_stderr, "Accepted types are snps, indels, mnps, other\n");
@@ -197,10 +200,12 @@ static void init_data(args_t *args)
if (args->exclude_types) {
args->exclude = 0;
for (i = 0; i < n; ++i) {
- if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP;
- else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL;
- else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP;
- else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER;
+ if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP<<1;
+ else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL<<1;
+ else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP<<1;
+ else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "ref") == 0) args->exclude |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "bnd") == 0) args->exclude |= VCF_BND<<1;
else {
fprintf(pysam_stderr, "[E::%s] unknown type\n", type_list[i]);
fprintf(pysam_stderr, "Accepted types are snps, indels, mnps, other\n");
@@ -222,7 +227,8 @@ static void init_data(args_t *args)
else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF
args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
- if ( args->n_threads ) hts_set_threads(args->out, args->n_threads);
+ if ( args->n_threads > 0)
+ hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
// headers: hdr=full header, hsub=subset header, hnull=sites only header
if (args->sites_only){
@@ -317,8 +323,8 @@ int subset_vcf(args_t *args, bcf1_t *line)
if (args->include || args->exclude)
{
int line_type = bcf_get_variant_types(line);
- if ( args->include && !(line_type&args->include) ) return 0; // include only given variant types
- if ( args->exclude && line_type&args->exclude ) return 0; // exclude given variant types
+ if ( args->include && !((line_type<<1) & args->include) ) return 0; // include only given variant types
+ if ( args->exclude && (line_type<<1) & args->exclude ) return 0; // exclude given variant types
}
if ( args->filter )
@@ -400,7 +406,7 @@ int subset_vcf(args_t *args, bcf1_t *line)
}
}
- if (args->min_ac)
+ if (args->min_ac!=-1)
{
if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
@@ -408,7 +414,7 @@ int subset_vcf(args_t *args, bcf1_t *line)
else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
}
- if (args->max_ac)
+ if (args->max_ac!=-1)
{
if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
@@ -416,7 +422,7 @@ int subset_vcf(args_t *args, bcf1_t *line)
else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
}
- if (args->min_af)
+ if (args->min_af!=-1)
{
if (an == 0) return 0; // freq not defined, skip site
if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
@@ -425,7 +431,7 @@ int subset_vcf(args_t *args, bcf1_t *line)
else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
}
- if (args->max_af)
+ if (args->max_af!=-1)
{
if (an == 0) return 0; // freq not defined, skip site
if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
@@ -445,7 +451,7 @@ int subset_vcf(args_t *args, bcf1_t *line)
if (args->trim_alts)
{
int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
- if ( ret==-1 ) error("Error: some GT index is out of bounds at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
+ if ( ret<0 ) error("Error: Could not trim alleles at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
}
if (args->phased) {
int phased = bcf_all_phased(args->hdr, line);
@@ -496,7 +502,7 @@ static void usage(args_t *args)
fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
fprintf(pysam_stderr, " -t, --targets [^]<region> similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
fprintf(pysam_stderr, " -T, --targets-file [^]<file> similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
- fprintf(pysam_stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra (de)compression threads [0]\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "Subset options:\n");
fprintf(pysam_stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n");
@@ -517,7 +523,7 @@ static void usage(args_t *args)
fprintf(pysam_stderr, " -q/Q, --min-af/--max-af <float>[:<type>] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
fprintf(pysam_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
fprintf(pysam_stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n");
- fprintf(pysam_stderr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,other [null]\n");
+ fprintf(pysam_stderr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
fprintf(pysam_stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
fprintf(pysam_stderr, "\n");
exit(1);
@@ -535,6 +541,7 @@ int main_vcfview(int argc, char *argv[])
args->output_type = FT_VCF;
args->n_threads = 0;
args->record_cmd_line = 1;
+ args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
int targets_is_file = 0, regions_is_file = 0;
static struct option loptions[] =
@@ -728,6 +735,7 @@ int main_vcfview(int argc, char *argv[])
error("Failed to read the targets: %s\n", args->targets_list);
}
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
init_data(args);
@@ -736,6 +744,8 @@ int main_vcfview(int argc, char *argv[])
bcf_hdr_write(args->out, out_hdr);
else if ( args->output_type & FT_BCF )
error("BCF output requires header, cannot proceed with -H\n");
+
+ int ret = 0;
if (!args->header_only)
{
while ( bcf_sr_next_line(args->files) )
@@ -745,10 +755,12 @@ int main_vcfview(int argc, char *argv[])
if ( subset_vcf(args, line) )
bcf_write1(args->out, out_hdr, line);
}
+ ret = args->files->errnum;
+ if ( ret ) fprintf(pysam_stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
}
hts_close(args->out);
destroy_data(args);
bcf_sr_destroy(args->files);
free(args);
- return 0;
+ return ret;
}
diff --git a/bcftools/version.h b/bcftools/version.h
index 05929f5..84247e7 100644
--- a/bcftools/version.h
+++ b/bcftools/version.h
@@ -1 +1 @@
-#define BCFTOOLS_VERSION "1.3.1"
+#define BCFTOOLS_VERSION "1.4.1"
diff --git a/buildwheels.sh b/buildwheels.sh
index a5987f1..ae0d953 100755
--- a/buildwheels.sh
+++ b/buildwheels.sh
@@ -22,7 +22,7 @@ if ! grep -q docker /proc/1/cgroup; then
exec docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/$0
fi
-yum install -y zlib-devel
+yum install -y zlib-devel bzip2-devel xz-devel
# Python 2.6 is not supported
rm -r /opt/python/cp26*
diff --git a/doc/api.rst b/doc/api.rst
index 686c60d..8e76686 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -88,11 +88,11 @@ The above code outputs::
Commands available in :term:`csamtools` are available as simple
function calls. For example::
- pysam.sort("ex1.bam", "output")
+ pysam.sort("-o", "output.bam", "ex1.bam")
corresponds to the command line::
- samtools sort ex1.bam output
+ samtools sort -o output.bam ex1.bam
Analogous to :class:`~pysam.AlignmentFile`, a
:class:`~pysam.TabixFile` allows fast random access to compressed and
diff --git a/doc/release.rst b/doc/release.rst
index 1d378f3..3874856 100644
--- a/doc/release.rst
+++ b/doc/release.rst
@@ -2,6 +2,81 @@
Release notes
=============
+Release 0.11.2.2
+================
+
+Bugfix release to address two issues:
+
+* Changes in 0.11.2.1 broke the GTF/GFF3 parser. Corrected and
+ more tests have been added.
+* [#479] Correct VariantRecord edge cases described in issue
+
+
+Release 0.11.2.1
+================
+
+Release to fix release tar-ball containing 0.11.1 pre-compiled
+C-files.
+
+
+Release 0.11.2
+==============
+
+This release wraps htslib/samtools/bcfools versions 1.4.1 in response
+to a security fix in these libraries. Additionaly the following
+issues have been fixed:
+
+* [#452] add GFF3 support for tabix parsers
+* [#461] Multiple fixes related to VariantRecordInfo and handling of INFO/END
+* [#447] limit query name to 251 characters (only partially addresses issue)
+
+VariantFile and related object fixes
+
+* Restore VariantFile.\_\_dealloc\_\_
+* Correct handling of bcf_str_missing in bcf_array_to_object and
+ bcf_object_to_array
+* Added update() and pop() methods to some dict-like proxy objects
+* scalar INFO entries could not be set again after being deleted
+* VariantRecordInfo.__delitem__ now allows unset flags to be deleted without
+ raising a KeyError
+* Multiple other fixes for VariantRecordInfo methods
+* INFO/END is now accessible only via VariantRecord.stop and
+ VariantRecord.rlen. Even if present behind the scenes, it is no longer
+ accessible via VariantRecordInfo.
+* Add argument to issue a warning instead of an exception if input appears
+ to be truncated
+
+Other features and fixes:
+
+* Make AlignmentFile \_\_dealloc\_\_ and close more
+ stringent
+* Add argument AlignmentFile to issue a warning instead of an
+ exception if input appears to be truncated
+
+Release 0.11.1
+==============
+
+Bugfix release
+
+* [#440] add deprecated 'always' option to infer_query_length for backwards compatibility.
+
+Release 0.11.0
+==============
+
+This release wraps the latest versions of htslib/samtools/bcftools and
+implements a few bugfixes.
+
+* [#413] Wrap HTSlib/Samtools/BCFtools 1.4
+* [#422] Fix missing pysam.sort.usage() message
+* [#411] Fix BGZfile initialization bug
+* [#412] Add seek support for BGZFile
+* [#395] Make BGZfile iterable
+* [#433] Correct getQueryEnd
+* [#419] Export SAM enums such as pysam.CMATCH
+* [#415] Fix access by tid in AlignmentFile.fetch()
+* [#405] Writing SAM now outputs a header by default.
+* [#332] split infer_query_length(always) into infer_query_length and infer_read_length
+
Release 0.10.0
==============
diff --git a/doc/usage.rst b/doc/usage.rst
index 936f3bd..6172329 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -123,26 +123,23 @@ Note that the file open mode needs to changed from ``r`` to ``rb``.
Using samtools commands within python
=====================================
-Commands available in :term:`csamtools` are available
-as simple function calls. For example::
+Commands available in :term:`csamtools` are available as simple
+function calls. Command line options are provided as arguments. For
+example::
- pysam.sort("ex1.bam", "output")
+ pysam.sort("-o", "output.bam", "ex1.bam")
corresponds to the command line::
- samtools sort ex1.bam output
+ samtools sort -o output.bam ex1.bam
-Command line options can be provided as arguments::
-
- pysam.sort("-n", "ex1.bam", "output")
-
-or::
+Or for example::
- pysam.sort("-m", "1000000", "ex1.bam", "output")
+ pysam.sort("-m", "1000000", "-o", "output.bam", "ex1.bam")
In order to get usage information, try::
- print pysam.sort.usage()
+ print(pysam.sort.usage())
Argument errors raise a :class:`pysam.SamtoolsError`::
diff --git a/import.py b/import.py
index 12d2016..b8eab01 100644
--- a/import.py
+++ b/import.py
@@ -31,10 +31,22 @@ import hashlib
EXCLUDE = {
"samtools": (
- "razip.c", "bgzip.c", "main.c",
- "calDepth.c", "bam2bed.c", "wgsim.c",
- "md5fa.c", "md5sum-lite.c", "maq2sam.c",
- "bamcheck.c", "chk_indel.c", "vcf-miniview.c",
+ "razip.c",
+ "bgzip.c",
+ "main.c",
+ "calDepth.c",
+ "bam2bed.c",
+ "wgsim.c",
+ "bam_tview.c",
+ "bam_tview.h",
+ "bam_tview_html.c",
+ "bam_tview_curses.c",
+ "md5fa.c",
+ "md5sum-lite.c",
+ "maq2sam.c",
+ "bamcheck.c",
+ "chk_indel.c",
+ "vcf-miniview.c",
"htslib-1.3", # do not import twice
"hfile_irods.c", # requires irods library
),
@@ -73,9 +85,10 @@ def _update_pysam_files(cf, destdir):
if not filename:
continue
dest = filename + ".pysam.c"
- with open(filename) as infile:
+ with open(filename, encoding="utf-8") as infile:
lines = "".join(infile.readlines())
- with open(dest, "w") as outfile:
+
+ with open(dest, "w", encoding="utf-8") as outfile:
outfile.write('#include "pysam.h"\n\n')
subname, _ = os.path.splitext(os.path.basename(filename))
if subname in MAIN.get(basename, []):
@@ -161,9 +174,9 @@ if len(sys.argv) >= 1:
old_file = os.path.join(targetdir, f)
if os.path.exists(old_file):
md5_old = hashlib.md5(
- "".join(open(old_file, "r").readlines())).digest()
+ "".join(open(old_file, "r", encoding="utf-8").readlines()).encode()).digest()
md5_new = hashlib.md5(
- "".join(open(src, "r").readlines())).digest()
+ "".join(open(src, "r", encoding="utf-8").readlines()).encode()).digest()
if md5_old != md5_new:
raise ValueError(
"incompatible files for %s and %s" %
diff --git a/pysam/__init__.py b/pysam/__init__.py
index ed17e04..c142c6c 100644
--- a/pysam/__init__.py
+++ b/pysam/__init__.py
@@ -3,6 +3,8 @@ import sys
import sysconfig
from pysam.libchtslib import *
+from pysam.libcsamtools import *
+from pysam.libcbcftools import *
from pysam.libcutils import *
import pysam.libcutils as libcutils
import pysam.libcfaidx as libcfaidx
diff --git a/pysam/cbcftools_util.h b/pysam/cbcftools_util.h
new file mode 100644
index 0000000..4a9f2e9
--- /dev/null
+++ b/pysam/cbcftools_util.h
@@ -0,0 +1,6 @@
+#ifndef CBCFTOOLS_UTIL_H
+#define CBCFTOOLS_UTIL_H
+
+int bcftools_main(int argc, char *argv[]);
+
+#endif
diff --git a/pysam/csamtools_util.h b/pysam/csamtools_util.h
new file mode 100644
index 0000000..0a03c13
--- /dev/null
+++ b/pysam/csamtools_util.h
@@ -0,0 +1,6 @@
+#ifndef CSAMTOOLS_UTIL_H
+#define CSAMTOOLS_UTIL_H
+
+int samtools_main(int argc, char *argv[]);
+
+#endif
diff --git a/pysam/htslib_util.h b/pysam/htslib_util.h
index f0d582c..c714986 100644
--- a/pysam/htslib_util.h
+++ b/pysam/htslib_util.h
@@ -92,36 +92,16 @@ static inline int pysam_bam_get_l_aux(bam1_t * b) {
static inline char pysam_bam_seqi(uint8_t * s, int i) {
return bam_seqi(s,i);}
-// Wrapping bit field access in bam1_core_t
-// bit fields not supported in cython and due
-// to endian-ness it is not clear which part
-// of the bit-field is in the higher or lower bytes.
-static inline uint16_t pysam_get_bin(bam1_t * b) {
- return b->core.bin;}
-
static inline uint8_t pysam_get_qual(bam1_t * b) {
return b->core.qual;}
-static inline uint8_t pysam_get_l_qname(bam1_t * b) {
- return b->core.l_qname;}
-
-static inline uint16_t pysam_get_flag(bam1_t * b) {
- return b->core.flag;}
static inline uint16_t pysam_get_n_cigar(bam1_t * b) {
return b->core.n_cigar;}
-static inline void pysam_set_bin(bam1_t * b, uint16_t v) {
- b->core.bin=v;}
-
static inline void pysam_set_qual(bam1_t * b, uint8_t v) {
b->core.qual=v;}
-static inline void pysam_set_l_qname(bam1_t * b, uint8_t v) {
- b->core.l_qname=v;}
-
-static inline void pysam_set_flag(bam1_t * b, uint16_t v) {
- b->core.flag=v;}
static inline void pysam_set_n_cigar(bam1_t * b, uint16_t v) {
b->core.n_cigar=v;}
diff --git a/pysam/libcalignedsegment.pxd b/pysam/libcalignedsegment.pxd
index f1d59d1..8441313 100644
--- a/pysam/libcalignedsegment.pxd
+++ b/pysam/libcalignedsegment.pxd
@@ -19,15 +19,9 @@ cdef extern from "htslib_util.h":
int pysam_bam_get_l_aux(bam1_t * b)
char pysam_bam_seqi(uint8_t * s, int i)
- uint16_t pysam_get_bin(bam1_t * b)
uint8_t pysam_get_qual(bam1_t * b)
- uint8_t pysam_get_l_qname(bam1_t * b)
- uint16_t pysam_get_flag(bam1_t * b)
uint16_t pysam_get_n_cigar(bam1_t * b)
- void pysam_set_bin(bam1_t * b, uint16_t v)
void pysam_set_qual(bam1_t * b, uint8_t v)
- void pysam_set_l_qname(bam1_t * b, uint8_t v)
- void pysam_set_flag(bam1_t * b, uint16_t v)
void pysam_set_n_cigar(bam1_t * b, uint16_t v)
void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag)
diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx
index c95bb13..73d426a 100644
--- a/pysam/libcalignedsegment.pyx
+++ b/pysam/libcalignedsegment.pyx
@@ -61,7 +61,7 @@ import struct
cimport cython
from cpython cimport array as c_array
from cpython.version cimport PY_MAJOR_VERSION
-from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize
+from cpython cimport PyBytes_FromStringAndSize
from libc.string cimport strchr
from cpython cimport array as c_array
@@ -281,6 +281,9 @@ cdef inline packTags(tags):
len(value)] + list(value))
elif isinstance(value, array.array):
+ valuetype = value.typecode
+ if valuetype not in datatype2format:
+ valuetype = None
# binary tags from arrays
if valuetype is None:
array_typecode = map_typecode_python_to_htslib(ord(value.typecode))
@@ -325,9 +328,41 @@ cdef inline packTags(tags):
return "".join(fmts), args
-cdef inline int32_t calculateQueryLength(bam1_t * src):
+cdef inline int32_t calculateQueryLengthWithoutHardClipping(bam1_t * src):
"""return query length computed from CIGAR alignment.
+ Length ignores hard-clipped bases.
+
+ Return 0 if there is no CIGAR alignment.
+ """
+
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+
+ if cigar_p == NULL:
+ return 0
+
+ cdef uint32_t k, qpos
+ cdef int op
+ qpos = 0
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+
+ if op == BAM_CMATCH or \
+ op == BAM_CINS or \
+ op == BAM_CSOFT_CLIP or \
+ op == BAM_CEQUAL or \
+ op == BAM_CDIFF:
+ qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ return qpos
+
+
+cdef inline int32_t calculateQueryLengthWithHardClipping(bam1_t * src):
+ """return query length computed from CIGAR alignment.
+
+ Length includes hard-clipped bases.
+
Return 0 if there is no CIGAR alignment.
"""
@@ -356,44 +391,45 @@ cdef inline int32_t calculateQueryLength(bam1_t * src):
cdef inline int32_t getQueryStart(bam1_t *src) except -1:
cdef uint32_t * cigar_p
- cdef uint32_t k, op
cdef uint32_t start_offset = 0
+ cdef uint32_t k, op
- if pysam_get_n_cigar(src):
- cigar_p = pysam_bam_get_cigar(src);
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- if op == BAM_CHARD_CLIP:
- if start_offset != 0 and start_offset != src.core.l_qseq:
- PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
- return -1
- elif op == BAM_CSOFT_CLIP:
- start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
- else:
- break
+ cigar_p = pysam_bam_get_cigar(src);
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op == BAM_CHARD_CLIP:
+ if start_offset != 0 and start_offset != src.core.l_qseq:
+ raise ValueError('Invalid clipping in CIGAR string')
+ elif op == BAM_CSOFT_CLIP:
+ start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ break
return start_offset
cdef inline int32_t getQueryEnd(bam1_t *src) except -1:
- cdef uint32_t * cigar_p
- cdef uint32_t k, op
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
cdef uint32_t end_offset = src.core.l_qseq
+ cdef uint32_t k, op
# if there is no sequence, compute length from cigar string
if end_offset == 0:
- end_offset = calculateQueryLength(src)
-
- # walk backwards in cigar string
- if pysam_get_n_cigar(src) > 1:
- cigar_p = pysam_bam_get_cigar(src);
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op == BAM_CMATCH or \
+ op == BAM_CINS or \
+ op == BAM_CEQUAL or \
+ op == BAM_CDIFF or \
+ (op == BAM_CSOFT_CLIP and end_offset == 0):
+ end_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ # walk backwards in cigar string
for k from pysam_get_n_cigar(src) > k >= 1:
op = cigar_p[k] & BAM_CIGAR_MASK
if op == BAM_CHARD_CLIP:
- if end_offset != 0 and end_offset != src.core.l_qseq:
- PyErr_SetString(ValueError,
- 'Invalid clipping in CIGAR string')
- return -1
+ if end_offset != src.core.l_qseq:
+ raise ValueError('Invalid clipping in CIGAR string')
elif op == BAM_CSOFT_CLIP:
end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
else:
@@ -748,10 +784,13 @@ cdef class AlignedSegment:
if t == o:
return 0
+ cdef uint8_t *a = <uint8_t*>&t.core
+ cdef uint8_t *b = <uint8_t*>&o.core
+
retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
-
if retval:
return retval
+
# cmp(t.l_data, o.l_data)
retval = (t.l_data > o.l_data) - (t.l_data < o.l_data)
if retval:
@@ -819,49 +858,60 @@ cdef class AlignedSegment:
property query_name:
"""the query template name (None if not present)"""
def __get__(self):
- cdef bam1_t * src
- src = self._delegate
- if pysam_get_l_qname(src) == 0:
+
+ cdef bam1_t * src = self._delegate
+ if src.core.l_qname == 0:
return None
+
return charptr_to_str(<char *>pysam_bam_get_qname(src))
def __set__(self, qname):
+
if qname is None or len(qname) == 0:
return
- if len(qname) >= 255:
- raise ValueError("query length out of range {} > 254".format(
+ # See issue #447
+ # (The threshold is 252 chars, but this includes a \0 byte.
+ if len(qname) > 251:
+ raise ValueError("query length out of range {} > 251".format(
len(qname)))
qname = force_bytes(qname)
- cdef bam1_t * src
- cdef int l
- cdef char * p
+ cdef bam1_t * src = self._delegate
+ # the qname is \0 terminated
+ cdef uint8_t l = len(qname) + 1
- src = self._delegate
- p = pysam_bam_get_qname(src)
+ cdef char * p = pysam_bam_get_qname(src)
+ cdef uint8_t l_extranul = 0
+
+ if l % 4 != 0:
+ l_extranul = 4 - l % 4
- # the qname is \0 terminated
- l = len(qname) + 1
pysam_bam_update(src,
- pysam_get_l_qname(src),
- l,
+ src.core.l_qname,
+ l + l_extranul,
<uint8_t*>p)
- pysam_set_l_qname(src, l)
-
+ src.core.l_extranul = l_extranul
+ src.core.l_qname = l + l_extranul
+
# re-acquire pointer to location in memory
# as it might have moved
p = pysam_bam_get_qname(src)
strncpy(p, qname, l)
+ # x might be > 255
+ cdef uint16_t x = 0
+
+ for x from l <= x < l + l_extranul:
+ p[x] = '\0'
property flag:
"""properties flag"""
def __get__(self):
- return pysam_get_flag(self._delegate)
+ return self._delegate.core.flag
def __set__(self, flag):
- pysam_set_flag(self._delegate, flag)
+ self._delegate.core.flag = flag
property reference_name:
""":term:`reference` name (None if no AlignmentFile is associated)"""
@@ -893,19 +943,17 @@ cdef class AlignedSegment:
src = self._delegate
src.core.pos = pos
if pysam_get_n_cigar(src):
- pysam_set_bin(src,
- hts_reg2bin(
- src.core.pos,
- bam_endpos(src),
- 14,
- 5))
+ src.core.bin = hts_reg2bin(
+ src.core.pos,
+ bam_endpos(src),
+ 14,
+ 5)
else:
- pysam_set_bin(src,
- hts_reg2bin(
- src.core.pos,
- src.core.pos + 1,
- 14,
- 5))
+ src.core.bin = hts_reg2bin(
+ src.core.pos,
+ src.core.pos + 1,
+ 14,
+ 5)
property mapping_quality:
"""mapping quality"""
@@ -1156,9 +1204,9 @@ cdef class AlignedSegment:
property bin:
"""properties bin"""
def __get__(self):
- return pysam_get_bin(self._delegate)
+ return self._delegate.core.bin
def __set__(self, bin):
- pysam_set_bin(self._delegate, bin)
+ self._delegate.core.bin = bin
##########################################################
@@ -1344,14 +1392,17 @@ cdef class AlignedSegment:
This the index of the first base in :attr:`seq` that is not
soft-clipped.
-
"""
def __get__(self):
return getQueryStart(self._delegate)
property query_alignment_end:
"""end index of the aligned query portion of the sequence (0-based,
- exclusive)"""
+ exclusive)
+
+ This the index just past the last base in :attr:`seq` that is not
+ soft-clipped.
+ """
def __get__(self):
return getQueryEnd(self._delegate)
@@ -1408,26 +1459,30 @@ cdef class AlignedSegment:
return result
- def infer_query_length(self, always=True):
- """inferred read length from CIGAR string.
+ def infer_query_length(self, always=False):
+ """infer query length from sequence or CIGAR alignment.
- If *always* is set to True, the read length
- will be always inferred. If set to False, the length
- of the read sequence will be returned if it is
- available.
+ This method deduces the query length from the CIGAR alignment
+ but does not include hard-clipped bases.
- Returns None if CIGAR string is not present.
- """
+ Returns None if CIGAR alignment is not present.
- cdef uint32_t * cigar_p
- cdef bam1_t * src
+ If *always* is set to True, `infer_read_length` is used instead.
+ This is deprecated and only present for backward compatibility.
+ """
+ if always is True:
+ return self.infer_read_length()
+ return calculateQueryLengthWithoutHardClipping(self._delegate)
- src = self._delegate
+ def infer_read_length(self):
+ """infer read length from CIGAR alignment.
- if not always and src.core.l_qseq:
- return src.core.l_qseq
+ This method deduces the read length from the CIGAR alignment
+ including hard-clipped bases.
- return calculateQueryLength(src)
+ Returns None if CIGAR alignment is not present.
+ """
+ return calculateQueryLengthWithHardClipping(self._delegate)
def get_reference_sequence(self):
"""return the reference sequence.
@@ -1677,7 +1732,9 @@ cdef class AlignedSegment:
+-----+--------------+-----+
|X |BAM_CDIFF |8 |
+-----+--------------+-----+
- |NM |NM tag |9 |
+ |B |BAM_CBACK |9 |
+ +-----+--------------+-----+
+ |NM |NM tag |10 |
+-----+--------------+-----+
If no cigar string is present, empty arrays will be returned.
@@ -1756,6 +1813,8 @@ cdef class AlignedSegment:
+-----+--------------+-----+
|X |BAM_CDIFF |8 |
+-----+--------------+-----+
+ |B |BAM_CBACK |9 |
+ +-----+--------------+-----+
.. note::
The output is a list of (operation, length) tuples, such as
@@ -1823,12 +1882,11 @@ cdef class AlignedSegment:
k += 1
## setting the cigar string requires updating the bin
- pysam_set_bin(src,
- hts_reg2bin(
- src.core.pos,
- bam_endpos(src),
- 14,
- 5))
+ src.core.bin = hts_reg2bin(
+ src.core.pos,
+ bam_endpos(src),
+ 14,
+ 5)
cpdef set_tag(self,
@@ -2477,7 +2535,71 @@ cdef class PileupRead:
def __get__(self):
return self._is_refskip
+
+cpdef enum CIGAR_OPS:
+ CMATCH = 0
+ CINS = 1
+ CDEL = 2
+ CREF_SKIP = 3
+ CSOFT_CLIP = 4
+ CHARD_CLIP = 5
+ CPAD = 6
+ CEQUAL = 7
+ CDIFF = 8
+ CBACK = 9
+
+
+cpdef enum SAM_FLAGS:
+ # the read is paired in sequencing, no matter whether it is mapped in a pair
+ FPAIRED = 1
+ # the read is mapped in a proper pair
+ FPROPER_PAIR = 2
+ # the read itself is unmapped; conflictive with FPROPER_PAIR
+ FUNMAP = 4
+ # the mate is unmapped
+ FMUNMAP = 8
+ # the read is mapped to the reverse strand
+ FREVERSE = 16
+ # the mate is mapped to the reverse strand
+ FMREVERSE = 32
+ # this is read1
+ FREAD1 = 64
+ # this is read2
+ FREAD2 = 128
+ # not primary alignment
+ FSECONDARY = 256
+ # QC failure
+ FQCFAIL = 512
+ # optical or PCR duplicate
+ FDUP = 1024
+ # supplementary alignment
+ FSUPPLEMENTARY = 2048
+
+
__all__ = [
"AlignedSegment",
"PileupColumn",
- "PileupRead"]
+ "PileupRead",
+ "CMATCH",
+ "CINS",
+ "CDEL",
+ "CREF_SKIP",
+ "CSOFT_CLIP",
+ "CHARD_CLIP",
+ "CPAD",
+ "CEQUAL",
+ "CDIFF",
+ "CBACK",
+ "FPAIRED",
+ "FPROPER_PAIR",
+ "FUNMAP",
+ "FMUNMAP",
+ "FREVERSE",
+ "FMREVERSE",
+ "FREAD1",
+ "FREAD2",
+ "FSECONDARY",
+ "FQCFAIL",
+ "FDUP",
+ "FSUPPLEMENTARY"]
+
diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx
index 2161f87..0b248c1 100644
--- a/pysam/libcalignmentfile.pyx
+++ b/pysam/libcalignmentfile.pyx
@@ -7,16 +7,16 @@
# The principal classes defined in this module are:
#
# class AlignmentFile read/write access to SAM/BAM/CRAM formatted files
-#
+#
# class IndexedReads index a SAM/BAM/CRAM file by query name while keeping
# the original sort order intact
-#
+#
# Additionally this module defines numerous additional classes that
# are part of the internal API. These are:
-#
+#
# Various iterator classes to iterate over alignments in sequential
# (IteratorRow) or in a stacked fashion (IteratorColumn):
-#
+#
# class IteratorRow
# class IteratorRowRegion
# class IteratorRowHead
@@ -76,15 +76,9 @@ else:
cimport cython
########################################################
-## Constants and global variables
-
-# defines imported from samtools
-DEF SEEK_SET = 0
-DEF SEEK_CUR = 1
-DEF SEEK_END = 2
-
+## global variables
# maximum genomic coordinace
-cdef int MAX_POS = 2 << 29
+cdef int MAX_POS = 2 << 29
# valid types for SAM headers
VALID_HEADER_TYPES = {"HD" : dict,
@@ -98,7 +92,7 @@ VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
# default type conversions within SAM header records
KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str},
- "SQ" : {"SN" : str, "LN" : int, "AS" : str,
+ "SQ" : {"SN" : str, "LN" : int, "AS" : str,
"M5" : str, "SP" : str, "UR" : str,
"AH" : str,},
"RG" : {"ID" : str, "CN" : str, "DS" : str,
@@ -106,7 +100,7 @@ KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str},
"LB" : str, "PG" : str, "PI" : str,
"PL" : str, "PM" : str, "PU" : str,
"SM" : str,},
- "PG" : {"ID" : str, "PN" : str, "CL" : str,
+ "PG" : {"ID" : str, "PN" : str, "CL" : str,
"PP" : str, "DS" : str, "VN" : str,},}
# output order of fields within records. Ensure that CL is at
@@ -147,20 +141,15 @@ def build_header_line(fields, record):
return "\t".join(line)
-cdef bam_hdr_t * build_header(new_header):
+cdef bam_hdr_t * build_header_from_dict(new_header):
'''return a new header built from a dictionary in `new_header`.
This method inserts the text field, target_name and target_len.
'''
-
- lines = []
-
- # check if hash exists
+ cdef list lines = []
# create new header and copy old data
- cdef bam_hdr_t * dest
-
- dest = bam_hdr_init()
+ cdef bam_hdr_t * dest = bam_hdr_init()
# first: defined tags
for record in VALID_HEADERS:
@@ -219,13 +208,63 @@ cdef bam_hdr_t * build_header(new_header):
return dest
+cdef bam_hdr_t * build_header_from_list(reference_names,
+ reference_lengths,
+ add_sq_text=True,
+ text=None):
+
+ assert len(reference_names) == len(reference_lengths), \
+ "unequal names and lengths of reference sequences"
+
+ cdef bam_hdr_t * dest = bam_hdr_init()
+
+ # allocate and fill header
+ reference_names = [force_bytes(ref) for ref in reference_names]
+ dest.n_targets = len(reference_names)
+ n = 0
+ for x in reference_names:
+ n += len(x) + 1
+ dest.target_name = <char**>calloc(n, sizeof(char*))
+ dest.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
+ for x from 0 <= x < dest.n_targets:
+ dest.target_len[x] = reference_lengths[x]
+ name = reference_names[x]
+ dest.target_name[x] = <char*>calloc(
+ len(name) + 1, sizeof(char))
+ strncpy(dest.target_name[x], name, len(name))
+
+ # Optionally, if there is no text, add a SAM
+ # compatible header to output file.
+ if text is None and add_sq_text:
+ text = []
+ for x from 0 <= x < dest.n_targets:
+ text.append("@SQ\tSN:%s\tLN:%s\n" % \
+ (force_str(reference_names[x]),
+ reference_lengths[x]))
+ text = ''.join(text)
+
+ cdef char * ctext = NULL
+
+ if text is not None:
+ # copy without \0
+ text = force_bytes(text)
+ ctext = text
+ dest.l_text = strlen(ctext)
+ dest.text = <char*>calloc(
+ strlen(ctext), sizeof(char))
+ memcpy(dest.text, ctext, strlen(ctext))
+
+ return dest
+
+
cdef class AlignmentFile(HTSFile):
"""AlignmentFile(filepath_or_object, mode=None, template=None,
reference_names=None, reference_lengths=None, text=NULL,
header=None, add_sq_text=False, check_header=True, check_sq=True,
- reference_filename=None, filename=None, duplicate_filehandle=True)
+ reference_filename=None, filename=None, duplicate_filehandle=True,
+ ignore_truncation=False)
- A :term:`SAM`/:term:`BAM` formatted file.
+ A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file.
If `filepath_or_object` is a string, the file is automatically
opened. If `filepath_or_object` is a python File object, the
@@ -245,7 +284,7 @@ cdef class AlignmentFile(HTSFile):
:class:`~pysam.AlignmentFile`).
2. If `header` is given, the header is built from a
- multi-level dictionary.
+ multi-level dictionary.
3. If `text` is given, new header text is copied from raw
text.
@@ -297,20 +336,27 @@ cdef class AlignmentFile(HTSFile):
when writing, use the string provided as the header
reference_names : list
- see referece_lengths
+ see reference_lengths
reference_lengths : list
- when writing, build header from list of chromosome names and
- lengths. By default, 'SQ' and 'LN' tags will be added to the
- header text. This option can be changed by unsetting the flag
- `add_sq_text`.
+ when writing or opening a SAM file without header build header
+ from list of chromosome names and lengths. By default, 'SQ'
+ and 'LN' tags will be added to the header text. This option
+ can be changed by unsetting the flag `add_sq_text`.
add_sq_text : bool
do not add 'SQ' and 'LN' tags to header. This option permits
construction :term:`SAM` formatted files without a header.
+ add_sam_header : bool
+ when outputting SAM the default is to output a header. This is
+ equivalent to opening the file in 'wh' mode. If this option is
+ set to False, no header will be output. To read such a file,
+ set `check_header=False`.
+
check_header : bool
- when reading, check if header is present (default=True)
+ obsolete: when reading a SAM file, check if header is present
+ (default=True)
check_sq : bool
when reading, check if SQ entries are present in header
@@ -326,7 +372,7 @@ cdef class AlignmentFile(HTSFile):
Alternative to filepath_or_object. Filename of the file
to be opened.
- duplicate_filehandle: bool
+ duplicate_filehandle: bool
By default, file handles passed either directly or through
File-like objects will be duplicated before passing them to
htslib. The duplication prevents issues where the same stream
@@ -334,6 +380,10 @@ cdef class AlignmentFile(HTSFile):
high-level python object. Set to False to turn off
duplication.
+ ignore_truncation: bool
+ Issue a warning, instead of raising an error if the current file
+ appears to be truncated due to a missing EOF marker. Only applies
+ to bgzipped formats. (Default=False)
"""
def __cinit__(self, *args, **kwargs):
@@ -393,16 +443,19 @@ cdef class AlignmentFile(HTSFile):
header=None,
port=None,
add_sq_text=True,
+ add_sam_header=True,
check_header=True,
check_sq=True,
filepath_index=None,
referencenames=None,
referencelengths=None,
- duplicate_filehandle=True):
+ duplicate_filehandle=True,
+ ignore_truncation=False):
'''open a sam, bam or cram formatted file.
If _open is called on an existing file, the current file
will be closed and a new file will be opened.
+
'''
cdef char *cfilename = NULL
cdef char *creference_filename = NULL
@@ -423,6 +476,9 @@ cdef class AlignmentFile(HTSFile):
if mode is None:
mode = "r"
+ if add_sam_header and mode == "w":
+ mode = "wh"
+
assert mode in ("r", "w", "rb", "wb", "wh",
"wbu", "rU", "wb0",
"rc", "wc"), \
@@ -468,10 +524,6 @@ cdef class AlignmentFile(HTSFile):
self.reference_filename = reference_filename = encode_filename(
reference_filename)
- cdef char * ctext
- cdef hFILE * fp
- ctext = NULL
-
if mode[0] == 'w':
# open file for writing
@@ -479,50 +531,18 @@ cdef class AlignmentFile(HTSFile):
if template:
self.header = bam_hdr_dup(template.header)
elif header:
- self.header = build_header(header)
+ self.header = build_header_from_dict(header)
else:
- # build header from a target names and lengths
assert reference_names and reference_lengths, \
("either supply options `template`, `header` "
"or both `reference_names` and `reference_lengths` "
"for writing")
- assert len(reference_names) == len(reference_lengths), \
- "unequal names and lengths of reference sequences"
-
- # allocate and fill header
- reference_names = [force_bytes(ref) for ref in reference_names]
- self.header = bam_hdr_init()
- self.header.n_targets = len(reference_names)
- n = 0
- for x in reference_names:
- n += len(x) + 1
- self.header.target_name = <char**>calloc(n, sizeof(char*))
- self.header.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
- for x from 0 <= x < self.header.n_targets:
- self.header.target_len[x] = reference_lengths[x]
- name = reference_names[x]
- self.header.target_name[x] = <char*>calloc(
- len(name) + 1, sizeof(char))
- strncpy(self.header.target_name[x], name, len(name))
-
- # Optionally, if there is no text, add a SAM
- # compatible header to output file.
- if text is None and add_sq_text:
- text = []
- for x from 0 <= x < self.header.n_targets:
- text.append("@SQ\tSN:%s\tLN:%s\n" % \
- (force_str(reference_names[x]),
- reference_lengths[x]))
- text = ''.join(text)
-
- if text is not None:
- # copy without \0
- text = force_bytes(text)
- ctext = text
- self.header.l_text = strlen(ctext)
- self.header.text = <char*>calloc(
- strlen(ctext), sizeof(char))
- memcpy(self.header.text, ctext, strlen(ctext))
+ # build header from a target names and lengths
+ self.header = build_header_from_list(
+ reference_names,
+ reference_lengths,
+ add_sq_text=add_sq_text,
+ text=text)
self.htsfile = self._open_htsfile()
@@ -542,7 +562,7 @@ cdef class AlignmentFile(HTSFile):
# open file for reading
if not self._exists():
raise IOError("file `%s` not found" % self.filename)
-
+
self.htsfile = self._open_htsfile()
if self.htsfile == NULL:
@@ -553,6 +573,8 @@ cdef class AlignmentFile(HTSFile):
if self.htsfile.format.category != sequence_data:
raise ValueError("file does not contain alignment data")
+ self.check_truncation(ignore_truncation)
+
# bam files require a valid header
if self.is_bam or self.is_cram:
with nogil:
@@ -562,16 +584,21 @@ cdef class AlignmentFile(HTSFile):
"file does not have valid header (mode='%s') "
"- is it BAM format?" % mode )
else:
- # in sam files it is optional (htsfile full of
- # unmapped reads)
- if check_header:
+ # in sam files a header is optional, but requires
+ # reference names and lengths
+ if reference_names and reference_lengths:
+ self.header = build_header_from_list(
+ reference_names,
+ reference_lengths,
+ add_sq_text=add_sq_text,
+ text=text)
+ else:
with nogil:
self.header = sam_hdr_read(self.htsfile)
if self.header == NULL:
raise ValueError(
- "file does not have valid header (mode='%s') "
- "- is it SAM format?" % mode )
- # self.header.ignore_sam_err = True
+ "file does not have valid header (mode='%s'), "
+ "please provide reference_names and reference_lengths")
# set filename with reference sequences
if self.is_cram and reference_filename:
@@ -669,7 +696,7 @@ cdef class AlignmentFile(HTSFile):
if not self.is_open:
raise ValueError("I/O operation on closed file")
if not 0 <= tid < self.header.n_targets:
- raise ValueError("reference_id %i out of range 0<=tid<%i" %
+ raise ValueError("reference_id %i out of range 0<=tid<%i" %
(tid, self.header.n_targets))
return charptr_to_str(self.header.target_name[tid])
@@ -686,7 +713,7 @@ cdef class AlignmentFile(HTSFile):
Alternatively, a samtools :term:`region` string can be
supplied.
-
+
If any of the coordinates are missing they will be replaced by the
minimum (`start`) or maximum (`end`) coordinate.
@@ -695,14 +722,14 @@ cdef class AlignmentFile(HTSFile):
Returns
-------
-
+
tuple : a tuple of `flag`, :term:`tid`, `start` and `end`. The
flag indicates whether no coordinates were supplied and the
genomic region is the complete genomic space.
Raises
------
-
+
ValueError
for invalid or out of bounds regions.
@@ -711,6 +738,9 @@ cdef class AlignmentFile(HTSFile):
cdef long long rstart
cdef long long rend
+ if reference is None and tid is None and region is None:
+ return 0, 0, 0, 0
+
rtid = -1
rstart = 0
rend = MAX_POS
@@ -735,11 +765,11 @@ cdef class AlignmentFile(HTSFile):
if len(parts) >= 3:
rend = int(parts[2])
- if not reference:
- return 0, 0, 0, 0
-
if tid is not None:
rtid = tid
+ if rtid < 0 or rtid >= self.header.n_targets:
+ raise IndexError("invalid reference, {} out of range 0-{}".format(
+ rtid, self.header.n_targets))
else:
rtid = self.gettid(reference)
@@ -764,7 +794,7 @@ cdef class AlignmentFile(HTSFile):
tid=None,
until_eof=False,
multiple_iterators=False):
- """fetch reads aligned in a :term:`region`.
+ """fetch reads aligned in a :term:`region`.
See :meth:`AlignmentFile.parse_region` for more information
on genomic regions.
@@ -789,7 +819,7 @@ cdef class AlignmentFile(HTSFile):
Parameters
----------
-
+
until_eof : bool
If `until_eof` is True, all reads from the current file
@@ -797,7 +827,7 @@ cdef class AlignmentFile(HTSFile):
file. Using this option will also fetch unmapped reads.
multiple_iterators : bool
-
+
If `multiple_iterators` is True, multiple
iterators on the same file can be used at the same time. The
iterator returned will receive its own copy of a filehandle to
@@ -841,7 +871,7 @@ cdef class AlignmentFile(HTSFile):
if has_coord:
return IteratorRowRegion(
- self, rtid, rstart, rend,
+ self, rtid, rstart, rend,
multiple_iterators=multiple_iterators)
else:
if until_eof:
@@ -857,22 +887,17 @@ cdef class AlignmentFile(HTSFile):
else:
if has_coord:
raise ValueError(
- "fetching by region is not available for sam files")
+ "fetching by region is not available for SAM files")
- if self.header == NULL:
+ if multiple_iterators == True:
raise ValueError(
- "fetch called for htsfile without header")
+ "multiple iterators not implemented for SAM files")
- # check if targets are defined
- # give warning, sam_read1 segfaults
- if self.header.n_targets == 0:
- warnings.warn("fetch called for htsfile without header")
-
return IteratorRowAll(self,
multiple_iterators=multiple_iterators)
def head(self, n, multiple_iterators=True):
- '''return an iterator over the first n alignments.
+ '''return an iterator over the first n alignments.
This iterator is is useful for inspecting the bam-file.
@@ -880,15 +905,15 @@ cdef class AlignmentFile(HTSFile):
----------
multiple_iterators : bool
-
+
is set to True by default in order to
avoid changing the current file position.
-
+
Returns
-------
-
+
an iterator over a collection of reads
-
+
'''
return IteratorRowHead(self, n,
multiple_iterators=multiple_iterators)
@@ -903,14 +928,14 @@ cdef class AlignmentFile(HTSFile):
not re-opened the file.
.. note::
-
+
This method is too slow for high-throughput processing.
If a read needs to be processed with its mate, work
from a read name sorted file or, better, cache reads.
Returns
-------
-
+
:class:`~pysam.AlignedSegment` : the mate
Raises
@@ -1061,7 +1086,7 @@ cdef class AlignmentFile(HTSFile):
Parameters
----------
-
+
reference : string
reference_name of the genomic region (chromosome)
@@ -1070,12 +1095,12 @@ cdef class AlignmentFile(HTSFile):
end : int
end of the genomic region
-
+
region : string
a region string in samtools format.
until_eof : bool
- count until the end of the file, possibly including
+ count until the end of the file, possibly including
unmapped reads as well.
read_callback: string or function
@@ -1135,7 +1160,7 @@ cdef class AlignmentFile(HTSFile):
return counter
@cython.boundscheck(False) # we do manual bounds checking
- def count_coverage(self,
+ def count_coverage(self,
reference=None,
start=None,
end=None,
@@ -1150,7 +1175,7 @@ cdef class AlignmentFile(HTSFile):
Parameters
----------
-
+
reference : string
reference_name of the genomic region (chromosome)
@@ -1165,7 +1190,7 @@ cdef class AlignmentFile(HTSFile):
quality_threshold : int
quality_threshold is the minimum quality score (in phred) a
- base has to reach to be counted.
+ base has to reach to be counted.
read_callback: string or function
@@ -1196,7 +1221,7 @@ cdef class AlignmentFile(HTSFile):
four array.arrays of the same length in order A C G T : tuple
"""
-
+
cdef int _start = start
cdef int _stop = end
cdef int length = _stop - _start
@@ -1221,7 +1246,7 @@ cdef class AlignmentFile(HTSFile):
filter_method = 1
elif read_callback == "nofilter":
filter_method = 2
-
+
cdef int _threshold = quality_threshold
for read in self.fetch(reference=reference,
start=start,
@@ -1283,16 +1308,22 @@ cdef class AlignmentFile(HTSFile):
return res
def close(self):
- '''
- closes the :class:`pysam.AlignmentFile`.'''
+ '''closes the :class:`pysam.AlignmentFile`.'''
if self.htsfile == NULL:
return
cdef int ret = hts_close(self.htsfile)
- hts_idx_destroy(self.index)
self.htsfile = NULL
+ if self.index != NULL:
+ hts_idx_destroy(self.index)
+ self.index = NULL
+
+ if self.header != NULL:
+ bam_hdr_destroy(self.header)
+ self.header = NULL
+
if ret < 0:
global errno
if errno == EPIPE:
@@ -1301,28 +1332,23 @@ cdef class AlignmentFile(HTSFile):
raise OSError(errno, force_str(strerror(errno)))
def __dealloc__(self):
- # remember: dealloc cannot call other methods
- # note: no doc string
- # note: __del__ is not called.
-
- # FIXME[kbj]: isn't self.close a method? I've been duplicating
- # close within __dealloc__ (see BCFFile.__dealloc__). Not a pretty
- # solution and perhaps unnecessary given that calling self.close has
- # been working for years.
- # AH: I have removed the call to close. Even though it is working,
- # it seems to be dangerous according to the documentation as the
- # object be partially deconstructed already.
cdef int ret = 0
if self.htsfile != NULL:
ret = hts_close(self.htsfile)
- hts_idx_destroy(self.index);
self.htsfile = NULL
- bam_destroy1(self.b)
+ if self.index != NULL:
+ hts_idx_destroy(self.index)
+ self.index = NULL
+
if self.header != NULL:
bam_hdr_destroy(self.header)
+ self.header = NULL
+ if self.b:
+ bam_destroy1(self.b)
+ self.b = NULL
if ret < 0:
global errno
@@ -1330,7 +1356,7 @@ cdef class AlignmentFile(HTSFile):
errno = 0
else:
raise OSError(errno, force_str(strerror(errno)))
-
+
cpdef int write(self, AlignedSegment read) except -1:
'''
write a single :class:`pysam.AlignedSegment` to disk.
@@ -1342,7 +1368,7 @@ cdef class AlignmentFile(HTSFile):
Returns
-------
-
+
int : the number of bytes written. If the file is closed,
this will be 0.
'''
@@ -1387,7 +1413,7 @@ cdef class AlignmentFile(HTSFile):
return self.header.n_targets
property references:
- """tuple with the names of :term:`reference` sequences. This is a
+ """tuple with the names of :term:`reference` sequences. This is a
read-only attribute"""
def __get__(self):
if not self.is_open: raise ValueError( "I/O operation on closed file" )
@@ -1455,10 +1481,10 @@ cdef class AlignmentFile(HTSFile):
property text:
'''string with the full contents of the :term:`sam file` header as a
- string.
+ string.
This is a read-only attribute.
-
+
See :attr:`pysam.AlignmentFile.header` to get a parsed
representation of the header.
'''
@@ -1468,13 +1494,13 @@ cdef class AlignmentFile(HTSFile):
return from_string_and_size(self.header.text, self.header.l_text)
property header:
- """two-level dictionay with header information from the file.
-
+ """two-level dictionay with header information from the file.
+
This is a read-only attribute.
The first level contains the record (``HD``, ``SQ``, etc) and
the second level contains the fields (``VN``, ``LN``, etc).
-
+
The parser is validating and will raise an AssertionError if
if encounters any record or field tags that are not part of
the SAM specification. Use the
@@ -1494,7 +1520,7 @@ cdef class AlignmentFile(HTSFile):
raise ValueError( "I/O operation on closed file" )
result = {}
-
+
if self.header.text != NULL:
# convert to python string (note: call self.text to
# create 0-terminated string)
@@ -1518,7 +1544,7 @@ cdef class AlignmentFile(HTSFile):
x = {}
for idx, field in enumerate(fields[1:]):
- if ":" not in field:
+ if ":" not in field:
raise ValueError("malformatted header: no ':' in field" )
key, value = field.split(":", 1)
if key in ("CL",):
@@ -1576,7 +1602,7 @@ cdef class AlignmentFile(HTSFile):
"can not iterate over samfile without header")
return self
- cdef bam1_t * getCurrent( self ):
+ cdef bam1_t * getCurrent(self):
return self.b
cdef int cnext(self):
@@ -1598,12 +1624,12 @@ cdef class AlignmentFile(HTSFile):
raise IOError('truncated file')
else:
raise StopIteration
-
+
# Compatibility functions for pysam < 0.8.3
def gettid(self, reference):
"""deprecated, use get_tid() instead"""
return self.get_tid(reference)
-
+
def getrname(self, tid):
"""deprecated, use get_reference_name() instead"""
return self.get_reference_name(tid)
@@ -1637,7 +1663,7 @@ cdef class IteratorRow:
def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
cdef char *cfilename
cdef char *creference_filename
-
+
if not samfile.is_open:
raise ValueError("I/O operation on closed file")
@@ -1711,7 +1737,7 @@ cdef class IteratorRowRegion(IteratorRow):
tid,
beg,
end)
-
+
def __iter__(self):
return self
@@ -1766,7 +1792,7 @@ cdef class IteratorRowHead(IteratorRow):
def __iter__(self):
return self
- cdef bam1_t * getCurrent( self ):
+ cdef bam1_t * getCurrent(self):
return self.b
cdef int cnext(self):
@@ -1814,7 +1840,7 @@ cdef class IteratorRowAll(IteratorRow):
def __iter__(self):
return self
- cdef bam1_t * getCurrent( self ):
+ cdef bam1_t * getCurrent(self):
return self.b
cdef int cnext(self):
@@ -1988,7 +2014,7 @@ cdef int __advance_snpcalls(void * data, bam1_t * b):
the samtools pileup.
'''
- # Note that this method requries acces to some
+ # Note that this method requries acces to some
# functions in the samtools code base and is thus
# not htslib only.
# The functions accessed in samtools are:
@@ -2029,11 +2055,13 @@ cdef int __advance_snpcalls(void * data, bam1_t * b):
skip = 0
# realign read - changes base qualities
- if d.seq != NULL and is_cns and not is_nobaq:
- bam_prob_realn(b, d.seq)
+ if d.seq != NULL and is_cns and not is_nobaq:
+ # flag:
+ # apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
+ sam_prob_realn(b, d.seq, d.seq_len, 0)
if d.seq != NULL and capQ_thres > 10:
- q = bam_cap_mapQ(b, d.seq, capQ_thres)
+ q = sam_cap_mapq(b, d.seq, d.seq_len, capQ_thres)
if q < 0:
skip = 1
elif b.core.qual > q:
@@ -2089,7 +2117,7 @@ cdef class IteratorColumn:
Valid values are None, "all" (default), "nofilter" or "samtools".
See AlignmentFile.pileup for description.
-
+
fastafile
A :class:`~pysam.FastaFile` object
@@ -2271,7 +2299,7 @@ cdef class IteratorColumnRegion(IteratorColumn):
if self.plp == NULL:
raise StopIteration
-
+
if self.truncate:
if self.start > self.pos: continue
if self.pos >= self.end: raise StopIteration
@@ -2313,7 +2341,7 @@ cdef class IteratorColumnAllRefs(IteratorColumn):
self.pos,
self.n_plp,
self.samfile)
-
+
# otherwise, proceed to next reference or stop
self.tid += 1
if self.tid < self.samfile.nreferences:
@@ -2465,7 +2493,7 @@ cdef class IndexedReads:
Raises
------
-
+
KeyError
if the `query_name` is not in the index.
diff --git a/pysam/libcbcf.pxd b/pysam/libcbcf.pxd
index fc7f56c..1d4129b 100644
--- a/pysam/libcbcf.pxd
+++ b/pysam/libcbcf.pxd
@@ -38,45 +38,44 @@ from pysam.libchtslib cimport *
cdef class VariantHeader(object):
cdef bcf_hdr_t *ptr
- cpdef VariantRecord new_record(self)
cdef _subset_samples(self, include_samples)
cdef class VariantHeaderRecord(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef bcf_hrec_t *ptr
cdef class VariantHeaderRecords(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef class VariantHeaderContigs(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef class VariantHeaderSamples(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef class VariantContig(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef int id
cdef class VariantMetadata(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef int type
cdef int id
cdef class VariantHeaderMetadata(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef int32_t type
cdef class VariantRecord(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef bcf1_t *ptr
@@ -107,7 +106,7 @@ cdef class BaseIndex(object):
cdef class BCFIndex(BaseIndex):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef hts_idx_t *ptr
@@ -139,6 +138,4 @@ cdef class VariantFile(HTSFile):
cdef readonly bint is_reading # true if file has begun reading records
cdef readonly bint header_written # true if header has already been written
- cpdef VariantRecord new_record(self)
-
cpdef int write(self, VariantRecord record) except -1
diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx
index 8f40451..9413e70 100644
--- a/pysam/libcbcf.pyx
+++ b/pysam/libcbcf.pyx
@@ -101,9 +101,6 @@ from cpython.version cimport PY_MAJOR_VERSION
from pysam.libchtslib cimport HTSFile, hisremote
-from warnings import warn
-
-
__all__ = ['VariantFile',
'VariantHeader',
'VariantHeaderRecord',
@@ -131,6 +128,13 @@ from pysam.libcutils cimport encode_filename, from_string_and_size
########################################################################
########################################################################
+## Sentinel object
+########################################################################
+
+cdef object _nothing = object()
+
+########################################################################
+########################################################################
## VCF/BCF string intern system
########################################################################
@@ -156,6 +160,55 @@ cdef inline bcf_str_cache_get_charptr(const char* s):
########################################################################
########################################################################
+## Genotype math
+########################################################################
+
+cdef int comb(int n, int k) except -1:
+ """Return binomial coeffient: n choose k
+
+ >>> comb(5, 1)
+ 5
+ >>> comb(5, 2)
+ 10
+ >>> comb(2, 2)
+ 1
+ >>> comb(100, 2)
+ 4950
+ """
+ if k > n:
+ return 0
+ elif k == n:
+ return 1
+ elif k > n // 2:
+ k = n - k
+
+ cdef d, result
+
+ d = result = n - k + 1
+ for i in range(2, k + 1):
+ d += 1
+ result *= d
+ result //= i
+ return result
+
+
+cdef inline int bcf_geno_combinations(int ploidy, int alleles) except -1:
+ """Return the count of genotypes expected for the given ploidy and number of alleles.
+
+ >>> bcf_geno_combinations(1, 2)
+ 2
+ >>> bcf_geno_combinations(2, 2)
+ 3
+ >>> bcf_geno_combinations(2, 3)
+ 6
+ >>> bcf_geno_combinations(3, 2)
+ 4
+ """
+ return comb(alleles + ploidy - 1, ploidy)
+
+
+########################################################################
+########################################################################
## Low level type conversion helpers
########################################################################
@@ -165,7 +218,32 @@ cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id):
cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
- return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), "GT") == 0
+ return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), 'GT') == 0
+
+
+cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1:
+ if sample < 0:
+ raise ValueError('genotype is only valid as a format field')
+
+ cdef int32_t *gt_arr = NULL
+ cdef int ngt = 0
+ ngt = bcf_get_genotypes(hdr, rec, >_arr, &ngt)
+
+ if ngt <= 0 or not gt_arr:
+ return 0
+
+ assert ngt % rec.n_sample == 0
+ cdef int max_ploidy = ngt // rec.n_sample
+ cdef int32_t *gt = gt_arr + sample * max_ploidy
+ cdef int ploidy = 0
+
+ while ploidy < max_ploidy and gt[0] != bcf_int32_vector_end:
+ gt += 1
+ ploidy += 1
+
+ free(<void*>gt_arr)
+
+ return bcf_geno_combinations(ploidy, rec.n_allele)
cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
@@ -185,19 +263,25 @@ cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int sca
cdef int32_t *data32
cdef float *dataf
cdef int i
+ cdef bytes b
if not data or n <= 0:
return None
if type == BCF_BT_CHAR:
datac = <char *>data
- while n and datac[n-1] == bcf_str_vector_end:
- n -= 1
- value = charptr_to_str_w_len(datac, n) if datac[0] != bcf_str_missing else None
- # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
- value = tuple(v or None for v in value.split(',')) if value else ()
- # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
+ if not n:
+ value = ()
+ else:
+ # Check if at least one null terminator is present
+ if datac[n-1] == bcf_str_vector_end:
+ # If so, create a string up to the first null terminator
+ b = datac
+ else:
+ # Otherwise, copy the entire block
+ b = datac[:n]
+ value = tuple(v.decode('ascii') if v and v != bcf_str_missing else None for v in b.split(b','))
else:
value = []
if type == BCF_BT_INT8:
@@ -251,13 +335,13 @@ cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
cdef float *dataf
cdef ssize_t i, value_count = len(values)
- assert(value_count <= n)
+ assert value_count <= n
if bt_type == BCF_BT_CHAR:
if not isinstance(values, (str, bytes)):
- values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
+ values = b','.join(force_bytes(v) if v else bcf_str_missing for v in values)
value_count = len(values)
- assert(value_count <= n)
+ assert value_count <= n
datac = <char *>data
memcpy(datac, <char *>values, value_count)
for i in range(value_count, n):
@@ -392,7 +476,7 @@ cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values,
raise TypeError('unsupported types')
-cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar):
+cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar, int sample):
if record is None:
raise ValueError('record must not be None')
@@ -418,7 +502,7 @@ cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *cou
elif length == BCF_VL_A:
count[0] = r.n_allele - 1
elif length == BCF_VL_G:
- count[0] = r.n_allele * (r.n_allele + 1) // 2
+ count[0] = bcf_genotype_count(hdr, r, sample)
elif length == BCF_VL_VAR:
count[0] = -1
else:
@@ -435,7 +519,7 @@ cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
cdef ssize_t count
cdef int scalar
- bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar)
+ bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar, -1)
if z.len == 0:
if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
@@ -466,14 +550,15 @@ cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
return value
-cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_type,
+cdef object bcf_check_values(VariantRecord record, value, int sample,
+ int hl_type, int ht_type,
int id, int bt_type, ssize_t bt_len,
ssize_t *value_count, int *scalar, int *realloc):
if record is None:
raise ValueError('record must not be None')
- bcf_get_value_count(record, hl_type, id, value_count, scalar)
+ bcf_get_value_count(record, hl_type, id, value_count, scalar, sample)
# Validate values now that we know the type and size
values = (value,) if not isinstance(value, (list, tuple)) else value
@@ -485,11 +570,12 @@ cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_ty
# KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
value_count[0] = -1
- if value_count[0] != -1 and value_count[0] != len(values):
+ cdef int given = len(values)
+ if value_count[0] != -1 and value_count[0] != given:
if scalar[0]:
- raise TypeError('value expected to be scalar'.format(value_count[0]))
+ raise TypeError('value expected to be scalar, given len={}'.format(value_count[0], given))
else:
- raise TypeError('values expected to be {:d}-tuple'.format(value_count[0]))
+ raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))
if ht_type == BCF_HT_REAL:
for v in values:
@@ -572,33 +658,29 @@ cdef bcf_info_set_value(VariantRecord record, key, value):
cdef bcf_hdr_t *hdr = record.header.ptr
cdef bcf1_t *r = record.ptr
- cdef vdict_t *d
- cdef khiter_t k
cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
if info:
info_id = info.key
else:
- d = <vdict_t *>hdr.dict[BCF_DT_ID]
- k = kh_get_vdict(d, bkey)
+ info_id = bcf_header_get_info_id(hdr, bkey)
- if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
- raise KeyError('unknown INFO')
-
- info_id = kh_val_vdict(d, k).id
+ if info_id < 0:
+ raise KeyError('unknown INFO: {}'.format(key))
if not check_header_id(hdr, BCF_HL_INFO, info_id):
raise ValueError('Invalid header')
info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
- values = bcf_check_values(record, value, BCF_HL_INFO, info_type, info_id,
+ values = bcf_check_values(record, value, -1,
+ BCF_HL_INFO, info_type, info_id,
info.type if info else -1,
info.len if info else -1,
&value_count, &scalar, &realloc)
@@ -611,13 +693,16 @@ cdef bcf_info_set_value(VariantRecord record, key, value):
vlen = value_count < 0
value_count = len(values)
+ # DISABLED DUE TO ISSUES WITH THE CRAZY POINTERS
# If we can, write updated values to existing allocated storage
- if info and not realloc:
+ if 0 and info and not realloc:
r.d.shared_dirty |= BCF1_DIRTY_INF
if value_count == 0:
info.len = 0
- # FIXME: Check if need to free vptr if info.len > 0?
+ if not info.vptr:
+ info.vptr = <uint8_t *>&info.v1.i
+
elif value_count == 1:
# FIXME: Check if need to free vptr if info.len > 0?
if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
@@ -626,9 +711,13 @@ cdef bcf_info_set_value(VariantRecord record, key, value):
bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
else:
raise TypeError('unsupported info type code')
+
info.len = 1
+ if not info.vptr:
+ info.vptr = <uint8_t *>&info.v1.i
else:
bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
+
return
alloc_len = max(1, value_count)
@@ -665,13 +754,13 @@ cdef bcf_info_del_value(VariantRecord record, key):
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
if not info:
raise KeyError(key)
- bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar)
+ bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar, -1)
if value_count <= 0:
null_value = ()
@@ -695,16 +784,16 @@ cdef bcf_format_get_value(VariantRecordSample sample, key):
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if not fmt or not fmt.p:
- raise KeyError('invalid FORMAT')
+ raise KeyError('invalid FORMAT: {}'.format(key))
if is_gt_fmt(hdr, fmt.id):
return bcf_format_get_allele_indices(sample)
- bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar)
+ bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar, sample.index)
if fmt.p and fmt.n and fmt.size:
return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
@@ -720,6 +809,10 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value):
if sample is None:
raise ValueError('sample must not be None')
+ if key == 'phased':
+ sample.phased = bool(value)
+ return
+
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int fmt_id
@@ -731,7 +824,7 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value):
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if fmt:
@@ -741,7 +834,7 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value):
k = kh_get_vdict(d, bkey)
if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
- raise KeyError('unknown format')
+ raise KeyError('unknown format: {}'.format(key))
fmt_id = kh_val_vdict(d, k).id
@@ -758,7 +851,8 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value):
# KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
fmt_type = BCF_HT_INT
- values = bcf_check_values(sample.record, value, BCF_HL_FMT, fmt_type, fmt_id,
+ values = bcf_check_values(sample.record, value, sample.index,
+ BCF_HL_FMT, fmt_type, fmt_id,
fmt.type if fmt else -1,
fmt.n if fmt else -1,
&value_count, &scalar, &realloc)
@@ -776,7 +870,7 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value):
if fmt and fmt.n > alloc_len:
alloc_len = fmt.n
- n = bcf_hdr_nsamples(hdr)
+ n = r.n_sample
new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen)
cdef char *valp = <char *>new_values
@@ -816,13 +910,13 @@ cdef bcf_format_del_value(VariantRecordSample sample, key):
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if not fmt or not fmt.p:
raise KeyError(key)
- bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar)
+ bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar, sample.index)
if value_count <= 0:
null_value = ()
@@ -840,7 +934,7 @@ cdef bcf_format_get_allele_indices(VariantRecordSample sample):
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
+ cdef int32_t n = r.n_sample
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
@@ -900,7 +994,7 @@ cdef bcf_format_get_alleles(VariantRecordSample sample):
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
- cdef int32_t nsamples = bcf_hdr_nsamples(hdr)
+ cdef int32_t nsamples = r.n_sample
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
@@ -951,7 +1045,7 @@ cdef bint bcf_sample_get_phased(VariantRecordSample sample):
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
+ cdef int32_t n = r.n_sample
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
@@ -1014,7 +1108,7 @@ cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
+ cdef int32_t n = r.n_sample
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
@@ -1061,6 +1155,29 @@ cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
data32[i] = (data32[i] & 0xFFFFFFFE) | phased
+cdef inline bcf_sync_end(VariantRecord record):
+ cdef bcf_hdr_t *hdr = record.header.ptr
+ cdef bcf_info_t *info
+ cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
+ cdef int ref_len = len(record.ref)
+
+ # Delete INFO/END if no alleles are present or if rlen is equal to len(ref)
+ if not record.ptr.n_allele or record.ptr.rlen == ref_len:
+ # If INFO/END is not defined in the header, it doesn't exist in the record
+ if end_id >= 0:
+ info = bcf_get_info(hdr, record.ptr, b'END')
+ if info and info.vptr:
+ if bcf_update_info(hdr, record.ptr, b'END', NULL, 0, info.type) < 0:
+ raise ValueError('Unable to delete END')
+ else:
+ # Create END header, if not present
+ if end_id < 0:
+ record.header.info.add('END', number=1, type='Integer', description='Stop position of the interval')
+
+ # Update to reflect stop position
+ bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen)
+
+
########################################################################
########################################################################
## Variant Header objects
@@ -1205,6 +1322,28 @@ cdef class VariantHeaderRecord(object):
"""D.values() -> list of D's values"""
return list(self.itervalues())
+ def update(self, items=None, **kwargs):
+ """D.update([E, ]**F) -> None.
+
+ Update D from dict/iterable E and F.
+ """
+ for k, v in items.items():
+ self[k] = v
+
+ if kwargs:
+ for k, v in kwargs.items():
+ self[k] = v
+
+ def pop(self, key, default=_nothing):
+ try:
+ value = self[key]
+ del self[key]
+ return value
+ except KeyError:
+ if default is not _nothing:
+ return default
+ raise
+
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
@@ -1235,9 +1374,8 @@ cdef class VariantHeaderRecord(object):
cdef bcf_hrec_t *r = self.ptr
if not r:
return
- assert(r.key)
+ assert r.key
cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
- print('Removing header type={} key={} value={} hdr={}'.format(METADATA_TYPES[r.type], r.key, r.value, key))
bcf_hdr_remove(hdr, r.type, key)
self.ptr = NULL
@@ -1358,8 +1496,8 @@ cdef class VariantMetadata(object):
def remove_header(self):
cdef bcf_hdr_t *hdr = self.header.ptr
- cdef const char *bkey = hdr.id[BCF_DT_ID][self.id].key
- bcf_hdr_remove(hdr, self.type, bkey)
+ cdef const char *key = hdr.id[BCF_DT_ID][self.id].key
+ bcf_hdr_remove(hdr, self.type, key)
cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
@@ -1437,11 +1575,11 @@ cdef class VariantHeaderMetadata(object):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef khiter_t k = kh_get_vdict(d, bkey)
if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
- raise KeyError('invalid key')
+ raise KeyError('invalid key: {}'.format(key))
return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
@@ -1449,11 +1587,11 @@ cdef class VariantHeaderMetadata(object):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef khiter_t k = kh_get_vdict(d, bkey)
if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
- raise KeyError('invalid key')
+ raise KeyError('invalid key: {}'.format(key))
bcf_hdr_remove(hdr, self.type, bkey)
#bcf_hdr_sync(hdr)
@@ -1555,7 +1693,7 @@ cdef class VariantContig(object):
return length if length else None
@property
- def header(self):
+ def header_record(self):
""":class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
cdef bcf_hdr_t *hdr = self.header.ptr
cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
@@ -1563,8 +1701,8 @@ cdef class VariantContig(object):
def remove_header(self):
cdef bcf_hdr_t *hdr = self.header.ptr
- cdef const char *bkey = hdr.id[BCF_DT_CTG][self.id].key
- bcf_hdr_remove(hdr, BCF_HL_CTG, bkey)
+ cdef const char *key = hdr.id[BCF_DT_CTG][self.id].key
+ bcf_hdr_remove(hdr, BCF_HL_CTG, key)
cdef VariantContig makeVariantContig(VariantHeader header, int id):
@@ -1607,11 +1745,11 @@ cdef class VariantHeaderContigs(object):
return makeVariantContig(self.header, index)
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef khiter_t k = kh_get_vdict(d, bkey)
if k == kh_end(d):
- raise KeyError('invalid contig')
+ raise KeyError('invalid contig: {}'.format(key))
cdef int id = kh_val_vdict(d, k).id
@@ -1620,7 +1758,7 @@ cdef class VariantHeaderContigs(object):
def remove_header(self, key):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef int index
- cdef const char *bkey
+ cdef const char *ckey
cdef vdict_t *d
cdef khiter_t k
@@ -1628,15 +1766,15 @@ cdef class VariantHeaderContigs(object):
index = key
if index < 0 or index >= hdr.n[BCF_DT_CTG]:
raise IndexError('invalid contig index')
- bkey = hdr.id[BCF_DT_CTG][self.id].key
+ ckey = hdr.id[BCF_DT_CTG][self.id].key
else:
d = <vdict_t *>hdr.dict[BCF_DT_CTG]
key = force_bytes(key)
if kh_get_vdict(d, key) == kh_end(d):
- raise KeyError('invalid contig')
- bkey = key
+ raise KeyError('invalid contig: {}'.format(key))
+ ckey = key
- bcf_hdr_remove(hdr, BCF_HL_CTG, bkey)
+ bcf_hdr_remove(hdr, BCF_HL_CTG, ckey)
def clear_header(self):
cdef bcf_hdr_t *hdr = self.header.ptr
@@ -1704,7 +1842,8 @@ cdef class VariantHeaderContigs(object):
if id in self:
raise ValueError('Header already exists for contig {}'.format(id))
- items = [('ID', id)] + kwargs.items()
+ items = [('ID', id)]
+ items += kwargs.items()
self.header.add_meta('contig', items=items)
@@ -1749,7 +1888,7 @@ cdef class VariantHeaderSamples(object):
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef khiter_t k = kh_get_vdict(d, bkey)
return k != kh_end(d)
@@ -1796,7 +1935,6 @@ cdef class VariantHeader(object):
self.ptr = NULL
def __bool__(self):
- # self.ptr == NULL should be impossible
return self.ptr != NULL
def copy(self):
@@ -1886,11 +2024,50 @@ cdef class VariantHeader(object):
finally:
free(hstr)
- cpdef VariantRecord new_record(self):
- """Create a new empty VariantRecord"""
- r = makeVariantRecord(self, bcf_init())
- r.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
- return r
+ def new_record(self, contig=None, start=0, stop=0, alleles=None,
+ id=None, qual=None, filter=None, info=None, samples=None,
+ **kwargs):
+ """Create a new empty VariantRecord.
+
+ Arguments are currently experimental. Use with caution and expect
+ changes in upcoming releases.
+
+ """
+ rec = makeVariantRecord(self, bcf_init())
+ rec.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
+
+ if contig is not None:
+ rec.contig = contig
+ if alleles is not None:
+ rec.alleles = alleles
+
+ rec.start = start
+ rec.stop = stop
+ rec.id = id
+ rec.qual = qual
+
+ if filter is not None:
+ if isinstance(filter, (list, tuple, VariantRecordFilter)):
+ for f in filter:
+ rec.filter.add(f)
+ else:
+ rec.filter.add(filter)
+
+ if info:
+ rec.info.update(info)
+
+ if kwargs:
+ if 'GT' in kwargs:
+ rec.samples[0]['GT'] = kwargs.pop('GT')
+ rec.samples[0].update(kwargs)
+
+ if samples:
+ for i, sample in enumerate(samples):
+ if 'GT' in sample:
+ rec.samples[i]['GT'] = sample.pop('GT')
+ rec.samples[i].update(sample)
+
+ return rec
def add_record(self, VariantHeaderRecord record):
"""Add an existing :class:`VariantHeaderRecord` to this header"""
@@ -1963,6 +2140,23 @@ cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
return header
+cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? -2:
+ cdef vdict_t *d
+ cdef khiter_t k
+ cdef int info_id
+
+ if isinstance(key, str):
+ key = force_bytes(key)
+
+ d = <vdict_t *>hdr.dict[BCF_DT_ID]
+ k = kh_get_vdict(d, key)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
+ return -1
+
+ return kh_val_vdict(d, k).id
+
+
########################################################################
########################################################################
## Variant Record objects
@@ -2001,7 +2195,7 @@ cdef class VariantRecordFilter(object):
id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
- raise KeyError('Invalid filter')
+ raise KeyError('Invalid filter: {}'.format(key))
return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
@@ -2014,11 +2208,11 @@ cdef class VariantRecordFilter(object):
if key == '.':
key = 'PASS'
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
if not check_header_id(hdr, BCF_HL_FLT, id):
- raise KeyError('Invalid filter')
+ raise KeyError('Invalid filter: {}'.format(key))
bcf_add_filter(hdr, r, id)
@@ -2043,7 +2237,7 @@ cdef class VariantRecordFilter(object):
id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
- raise KeyError('Invalid filter')
+ raise KeyError('Invalid filter: {}'.format(key))
bcf_remove_filter(hdr, r, id, 0)
@@ -2071,7 +2265,7 @@ cdef class VariantRecordFilter(object):
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
return bcf_has_filter(hdr, r, bkey) == 1
def iterkeys(self):
@@ -2100,6 +2294,20 @@ cdef class VariantRecordFilter(object):
"""D.values() -> list of D's values"""
return list(self.itervalues())
+ def __richcmp__(VariantRecordFilter self not None, VariantRecordFilter other not None, int op):
+ if op != 2 and op != 3:
+ return NotImplemented
+
+ cdef bcf1_t *s = self.record.ptr
+ cdef bcf1_t *o = other.record.ptr
+
+ cdef bint cmp = (s.d.n_flt == o.d.n_flt and list(self) == list(other))
+
+ if op == 3:
+ cmp = not cmp
+
+ return cmp
+
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
@@ -2146,11 +2354,11 @@ cdef class VariantRecordFormat(object):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if not fmt or not fmt.p:
- raise KeyError('unknown format')
+ raise KeyError('unknown format: {}'.format(key))
return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
@@ -2158,11 +2366,11 @@ cdef class VariantRecordFormat(object):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if not fmt or not fmt.p:
- raise KeyError('unknown format')
+ raise KeyError('unknown format: {}'.format(key))
if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
raise ValueError('Unable to delete FORMAT')
@@ -2204,7 +2412,7 @@ cdef class VariantRecordFormat(object):
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
return fmt != NULL and fmt.p != NULL
@@ -2259,38 +2467,65 @@ cdef class VariantRecordInfo(object):
raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
- return self.record.ptr.n_info
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_info_t *info
+ cdef const char *key
+ cdef int i, count = 0
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
+ count += 1
+
+ return count
def __bool__(self):
- return self.record.ptr.n_info != 0
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_info_t *info
+ cdef const char *key
+ cdef int i
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
+ return True
+
+ return False
def __getitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef vdict_t *d
- cdef khiter_t k
- cdef info_id
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
- cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+ cdef bytes bkey = force_bytes(key)
- if not info:
- d = <vdict_t *>hdr.dict[BCF_DT_ID]
- k = kh_get_vdict(d, bkey)
+ if strcmp(bkey, b'END') == 0:
+ raise KeyError('END is a reserved attribute; access is via record.stop')
- if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
- raise KeyError('Unknown INFO field: {}'.format(key))
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
- info_id = kh_val_vdict(d, k).id
- else:
- info_id = info.key
+ # Cannot stop here if info == NULL, since flags must return False
+ cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
+
+ if info_id < 0:
+ raise KeyError('Unknown INFO field: {}'.format(key))
if not check_header_id(hdr, BCF_HL_INFO, info_id):
raise ValueError('Invalid header')
+ # Handle type=Flag values
if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
return info != NULL and info.vptr != NULL
@@ -2300,18 +2535,42 @@ cdef class VariantRecordInfo(object):
return bcf_info_get_value(self.record, info)
def __setitem__(self, key, value):
+ cdef bytes bkey = force_bytes(key)
+
+ if strcmp(bkey, b'END') == 0:
+ raise KeyError('END is a reserved attribute; access is via record.stop')
+
+ if bcf_unpack(self.record.ptr, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
bcf_info_set_value(self.record, key, value)
def __delitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
+ cdef bytes bkey = force_bytes(key)
+ if strcmp(bkey, b'END') == 0:
+ raise KeyError('END is a reserved attribute; access is via record.stop')
+
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+ # Cannot stop here if info == NULL, since flags must return False
+ cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
+
+ if info_id < 0:
+ raise KeyError('Unknown INFO field: {}'.format(key))
+
+ if not check_header_id(hdr, BCF_HL_INFO, info_id):
+ raise ValueError('Invalid header')
+
+ # Handle flags
+ if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
+ return
+
if not info or not info.vptr:
raise KeyError('Unknown INFO field: {}'.format(key))
@@ -2333,6 +2592,8 @@ cdef class VariantRecordInfo(object):
info = &r.d.info[i]
if info and info.vptr:
key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ if strcmp(key, b'END') == 0:
+ continue
if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
raise ValueError('Unable to delete INFO')
@@ -2340,20 +2601,49 @@ cdef class VariantRecordInfo(object):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
+ cdef const char *key
cdef int i
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
for i in range(r.n_info):
info = &r.d.info[i]
if info and info.vptr:
- yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, info.key))
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ if strcmp(key, b'END') != 0:
+ yield bcf_str_cache_get_charptr(key)
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ cdef bytes bkey = force_bytes(key)
+
+ if strcmp(bkey, b'END') == 0:
return default
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ # Cannot stop here if info == NULL, since flags must return False
+ cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
+
+ if not check_header_id(hdr, BCF_HL_INFO, info_id):
+ raise ValueError('Invalid header')
+
+ # Handle flags
+ if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
+ return info != NULL and info.vptr != NULL
+
+ if not info or not info.vptr:
+ return default
+
+ return bcf_info_get_value(self.record, info)
+
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
@@ -2361,10 +2651,14 @@ cdef class VariantRecordInfo(object):
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
+
+ if strcmp(bkey, b'END') == 0:
+ return False
+
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
- return info != NULL
+ return info != NULL and info.vptr != NULL
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
@@ -2372,28 +2666,40 @@ cdef class VariantRecordInfo(object):
def itervalues(self):
"""D.itervalues() -> an iterator over the values of D"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
+ cdef const char *key
cdef int i
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
for i in range(r.n_info):
info = &r.d.info[i]
if info and info.vptr:
- yield bcf_info_get_value(self.record, info)
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ if strcmp(key, b'END') != 0:
+ yield bcf_info_get_value(self.record, info)
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
+ cdef const char *key
cdef int i
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
for i in range(r.n_info):
info = &r.d.info[i]
if info and info.vptr:
key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
- value = bcf_info_get_value(self.record, info)
- yield bcf_str_cache_get_charptr(key), value
+ if strcmp(key, b'END') != 0:
+ value = bcf_info_get_value(self.record, info)
+ yield bcf_str_cache_get_charptr(key), value
def keys(self):
"""D.keys() -> list of D's keys"""
@@ -2407,11 +2713,75 @@ cdef class VariantRecordInfo(object):
"""D.values() -> list of D's values"""
return list(self.itervalues())
+ def update(self, items=None, **kwargs):
+ """D.update([E, ]**F) -> None.
+
+ Update D from dict/iterable E and F.
+ """
+ for k, v in items.items():
+ if k != 'END':
+ self[k] = v
+
+ if kwargs:
+ kwargs.pop('END', None)
+ for k, v in kwargs.items():
+ self[k] = v
+
+ def pop(self, key, default=_nothing):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ cdef bytes bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ # Cannot stop here if info == NULL, since flags must return False
+ cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
+
+ if info_id < 0:
+ if default is _nothing:
+ raise KeyError('Unknown INFO field: {}'.format(key))
+ return default
+
+ if not check_header_id(hdr, BCF_HL_INFO, info_id):
+ raise ValueError('Invalid header')
+
+ # Handle flags
+ if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
+ return
+
+ if not info or not info.vptr:
+ if default is _nothing:
+ raise KeyError('Unknown INFO field: {}'.format(key))
+ return default
+
+ value = bcf_info_get_value(self.record, info)
+
+ if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
+ raise ValueError('Unable to delete INFO')
+
+ return value
+
+ def __richcmp__(VariantRecordInfo self not None, VariantRecordInfo other not None, int op):
+ if op != 2 and op != 3:
+ return NotImplemented
+
+ cdef bcf1_t *s = self.record.ptr
+ cdef bcf1_t *o = other.record.ptr
+
+ # Cannot use n_info as shortcut logic, since null values may remain
+ cdef bint cmp = dict(self) == dict(other)
+
+ if op == 3:
+ cmp = not cmp
+
+ return cmp
+
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
- #TODO: implement __richcmp__
-
cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
if not record:
@@ -2429,15 +2799,15 @@ cdef class VariantRecordSamples(object):
raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
- return bcf_hdr_nsamples(self.record.header.ptr)
+ return self.record.ptr.n_sample # bcf_hdr_nsamples(self.record.header.ptr)
def __bool__(self):
- return bcf_hdr_nsamples(self.record.header.ptr) != 0
+ return self.record.ptr.n_sample != 0 # bcf_hdr_nsamples(self.record.header.ptr) != 0
def __getitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int n = bcf_hdr_nsamples(hdr)
+ cdef int n = self.record.ptr.n_sample
cdef int sample_index
cdef vdict_t *d
cdef khiter_t k
@@ -2448,7 +2818,7 @@ cdef class VariantRecordSamples(object):
bkey = force_bytes(key)
sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
if sample_index < 0:
- raise KeyError('invalid sample name')
+ raise KeyError('invalid sample name: {}'.format(key))
if sample_index < 0 or sample_index >= n:
raise IndexError('invalid sample index')
@@ -2458,7 +2828,7 @@ cdef class VariantRecordSamples(object):
def __iter__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+ cdef int32_t i, n = self.record.ptr.n_sample
for i in range(n):
yield charptr_to_str(hdr.samples[i])
@@ -2473,7 +2843,7 @@ cdef class VariantRecordSamples(object):
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int n = bcf_hdr_nsamples(hdr)
+ cdef int n = self.record.ptr.n_sample
cdef int sample_index
cdef vdict_t *d
cdef khiter_t k
@@ -2484,7 +2854,7 @@ cdef class VariantRecordSamples(object):
bkey = force_bytes(key)
sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
if sample_index < 0:
- raise KeyError('invalid sample name')
+ raise KeyError('invalid sample name: {}'.format(key))
return 0 <= sample_index < n
@@ -2496,7 +2866,7 @@ cdef class VariantRecordSamples(object):
"""D.itervalues() -> an iterator over the values of D"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+ cdef int32_t i, n = self.record.ptr.n_sample
for i in range(n):
yield makeVariantRecordSample(self.record, i)
@@ -2505,7 +2875,7 @@ cdef class VariantRecordSamples(object):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+ cdef int32_t i, n = self.record.ptr.n_sample
for i in range(n):
yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
@@ -2522,11 +2892,45 @@ cdef class VariantRecordSamples(object):
"""D.values() -> list of D's values"""
return list(self.itervalues())
+ def update(self, items=None, **kwargs):
+ """D.update([E, ]**F) -> None.
+
+ Update D from dict/iterable E and F.
+ """
+ for k, v in items.items():
+ self[k] = v
+
+ if kwargs:
+ for k, v in kwargs.items():
+ self[k] = v
+
+ def pop(self, key, default=_nothing):
+ try:
+ value = self[key]
+ del self[key]
+ return value
+ except KeyError:
+ if default is not _nothing:
+ return default
+ raise
+
+ def __richcmp__(VariantRecordSamples self not None, VariantRecordSamples other not None, int op):
+ if op != 2 and op != 3:
+ return NotImplemented
+
+ cdef bcf1_t *s = self.record.ptr
+ cdef bcf1_t *o = other.record.ptr
+
+ cdef bint cmp = (s.n_sample == o.n_sample and self.values() == other.values())
+
+ if op == 3:
+ cmp = not cmp
+
+ return cmp
+
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
- #TODO: implement __richcmp__
-
cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
if not record:
@@ -2566,6 +2970,7 @@ cdef class VariantRecord(object):
raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr)))
bcf_translate(dst_hdr, src_hdr, self.ptr)
+ self.header = dst_header
@property
def rid(self):
@@ -2627,6 +3032,7 @@ cdef class VariantRecord(object):
if p < 1:
raise ValueError('Position must be positive')
self.ptr.pos = p - 1
+ bcf_sync_end(self)
@property
def start(self):
@@ -2639,6 +3045,7 @@ cdef class VariantRecord(object):
if s < 0:
raise ValueError('Start coordinate must be non-negative')
self.ptr.pos = s
+ bcf_sync_end(self)
@property
def stop(self):
@@ -2648,25 +3055,21 @@ cdef class VariantRecord(object):
@stop.setter
def stop(self, value):
cdef int s = value
- if s < self.ptr.pos:
- raise ValueError('Stop coordinate must be greater than or equal to start')
+ if s < 0:
+ raise ValueError('Stop coordinate must be non-negative')
self.ptr.rlen = s - self.ptr.pos
- if self.ptr.rlen != len(self.ref) or 'END' in self.info:
- self.info['END'] = s
+ bcf_sync_end(self)
@property
def rlen(self):
- """record length on chrom/contig (typically rec.stop - rec.start unless END info is supplied)"""
+ """record length on chrom/contig (aka rec.stop - rec.start)"""
return self.ptr.rlen
@rlen.setter
def rlen(self, value):
cdef int r = value
- if r < 0:
- raise ValueError('Reference length must be non-negative')
self.ptr.rlen = r
- if r != len(self.ref) or 'END' in self.info:
- self.info['END'] = self.ptr.pos + r
+ bcf_sync_end(self)
@property
def qual(self):
@@ -2732,6 +3135,8 @@ cdef class VariantRecord(object):
else:
alleles = [value]
self.alleles = alleles
+ self.ptr.rlen = len(value)
+ bcf_sync_end(self)
@property
def alleles(self):
@@ -2749,17 +3154,28 @@ cdef class VariantRecord(object):
return res
@alleles.setter
- def alleles(self, value):
+ def alleles(self, values):
cdef bcf1_t *r = self.ptr
+
if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
- value = [force_bytes(v) for v in value]
- if b'' in value:
+
+ values = [force_bytes(v) for v in values]
+
+ if len(values) < 2:
+ raise ValueError('must set at least 2 alleles')
+
+ if b'' in values:
raise ValueError('cannot set null allele')
- value = b','.join(value)
+
+ value = b','.join(values)
+
if bcf_update_alleles_str(self.header.ptr, r, value) < 0:
raise ValueError('Error updating alleles')
+ self.ptr.rlen = len(values[0])
+ bcf_sync_end(self)
+
@property
def alts(self):
"""tuple of alt alleles"""
@@ -2815,6 +3231,32 @@ cdef class VariantRecord(object):
raise ValueError('Error unpacking VariantRecord')
return makeVariantRecordSamples(self)
+ def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op):
+ if op != 2 and op != 3:
+ return NotImplemented
+
+ cdef bcf1_t *s = self.ptr
+ cdef bcf1_t *o = other.ptr
+
+ cdef bint cmp = self is other or (
+ s.pos == o.pos
+ and s.rlen == o.rlen
+ and ((bcf_float_is_missing(s.qual) and bcf_float_is_missing(o.qual))
+ or s.qual == o.qual)
+ and s.n_sample == o.n_sample
+ and s.n_allele == o.n_allele
+ and self.contig == other.contig
+ and self.alleles == other.alleles
+ and self.id == other.id
+ and self.info == other.info
+ and self.filter == other.filter
+ and self.samples == other.samples)
+
+ if op == 3:
+ cmp = not cmp
+
+ return cmp
+
def __str__(self):
cdef kstring_t line
cdef char c
@@ -2896,7 +3338,7 @@ cdef class VariantRecordSample(object):
"""sample name"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
+ cdef int32_t n = r.n_sample
if self.index < 0 or self.index >= n:
raise ValueError('invalid sample index')
@@ -3006,7 +3448,7 @@ cdef class VariantRecordSample(object):
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
return fmt != NULL and fmt.p != NULL
@@ -3036,11 +3478,42 @@ cdef class VariantRecordSample(object):
"""D.values() -> list of D's values"""
return list(self.itervalues())
+ def update(self, items=None, **kwargs):
+ """D.update([E, ]**F) -> None.
+
+ Update D from dict/iterable E and F.
+ """
+ for k, v in items.items():
+ self[k] = v
+
+ if kwargs:
+ for k, v in kwargs.items():
+ self[k] = v
+
+ def pop(self, key, default=_nothing):
+ try:
+ value = self[key]
+ del self[key]
+ return value
+ except KeyError:
+ if default is not _nothing:
+ return default
+ raise
+
+ def __richcmp__(VariantRecordSample self not None, VariantRecordSample other not None, int op):
+ if op != 2 and op != 3:
+ return NotImplemented
+
+ cdef bint cmp = dict(self) == dict(other)
+
+ if op == 3:
+ cmp = not cmp
+
+ return cmp
+
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
- #TODO: implement __richcmp__
-
cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
if not record or sample_index < 0:
@@ -3120,6 +3593,28 @@ cdef class BaseIndex(object):
"""D.values() -> list of D's values"""
return list(self.itervalues())
+ def update(self, items=None, **kwargs):
+ """D.update([E, ]**F) -> None.
+
+ Update D from dict/iterable E and F.
+ """
+ for k, v in items.items():
+ self[k] = v
+
+ if kwargs:
+ for k, v in kwargs.items():
+ self[k] = v
+
+ def pop(self, key, default=_nothing):
+ try:
+ value = self[key]
+ del self[key]
+ return value
+ except KeyError:
+ if default is not _nothing:
+ return default
+ raise
+
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
@@ -3253,7 +3748,7 @@ cdef class BCFIterator(BaseIterator):
try:
rid = index.refmap[contig]
except KeyError:
- raise ValueError('Unknown contig specified')
+ raise ValueError('Unknown contig specified: {}'.format(contig))
if start is None:
start = 0
@@ -3409,7 +3904,7 @@ cdef class TabixIterator(BaseIterator):
cdef class VariantFile(HTSFile):
"""*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
- duplicate_filehandle=True)*
+ duplicate_filehandle=True, ignore_truncation=False)*
A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
opened.
@@ -3451,7 +3946,7 @@ cdef class VariantFile(HTSFile):
drop_samples: bool
Ignore sample information when reading.
- duplicate_filehandle: bool
+ duplicate_filehandle: bool
By default, file handles passed either directly or through
File-like objects will be duplicated before passing them to
htslib. The duplication prevents issues where the same stream
@@ -3459,6 +3954,11 @@ cdef class VariantFile(HTSFile):
high-level python object. Set to False to turn off
duplication.
+ ignore_truncation: bool
+ Issue a warning, instead of raising an error if the current file
+ appears to be truncated due to a missing EOF marker. Only applies
+ to bgzipped formats. (Default=False)
+
"""
def __cinit__(self, *args, **kwargs):
self.htsfile = NULL
@@ -3478,19 +3978,39 @@ cdef class VariantFile(HTSFile):
self.open(*args, **kwargs)
+ def __dealloc__(self):
+ if not self.htsfile or not self.header:
+ return
+
+ # Write header if no records were written
+ if self.htsfile.is_write and not self.header_written:
+ with nogil:
+ bcf_hdr_write(self.htsfile, self.header.ptr)
+
+ cdef int ret = hts_close(self.htsfile)
+ self.htsfile = NULL
+ self.header = self.index = None
+
+ if ret < 0:
+ global errno
+ if errno == EPIPE:
+ errno = 0
+ else:
+ raise OSError(errno, force_str(strerror(errno)))
+
def close(self):
"""closes the :class:`pysam.VariantFile`."""
- cdef int ret = 0
- self.header = self.index = None
- if self.htsfile:
- # Write header if no records were written
- if self.htsfile.is_write and not self.header_written:
- self.header_written = True
- with nogil:
- bcf_hdr_write(self.htsfile, self.header.ptr)
+ if not self.htsfile:
+ return
- ret = hts_close(self.htsfile)
- self.htsfile = NULL
+ # Write header if no records were written
+ if self.htsfile.is_write and not self.header_written:
+ with nogil:
+ bcf_hdr_write(self.htsfile, self.header.ptr)
+
+ cdef int ret = hts_close(self.htsfile)
+ self.htsfile = NULL
+ self.header = self.index = None
if ret < 0:
global errno
@@ -3525,7 +4045,7 @@ cdef class VariantFile(HTSFile):
if ret == -1:
raise StopIteration
elif ret == -2:
- raise IOError('truncated file')
+ raise OSError('truncated file')
else:
raise ValueError('Variant read failed')
@@ -3572,7 +4092,8 @@ cdef class VariantFile(HTSFile):
index_filename=None,
VariantHeader header=None,
drop_samples=False,
- duplicate_filehandle=True):
+ duplicate_filehandle=True,
+ ignore_truncation=False):
"""open a vcf/bcf file.
If open is called on an existing VariantFile, the current file will be
@@ -3656,7 +4177,6 @@ cdef class VariantFile(HTSFile):
elif mode.startswith(b'r'):
# open file for reading
-
if not self._exists():
raise IOError('file `{}` not found'.format(filename))
@@ -3668,10 +4188,7 @@ cdef class VariantFile(HTSFile):
if self.htsfile.format.format not in (bcf, vcf):
raise ValueError("invalid file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
- if self.htsfile.format.compression == bgzf:
- bgzfp = hts_get_bgzfp(self.htsfile)
- if bgzfp and bgzf_check_EOF(bgzfp) == 0:
- warn('[%s] Warning: no BGZF EOF marker; file may be truncated'.format(filename))
+ self.check_truncation(ignore_truncation)
with nogil:
hdr = bcf_hdr_read(self.htsfile)
@@ -3710,7 +4227,6 @@ cdef class VariantFile(HTSFile):
"""reset file position to beginning of file just after the header."""
return self.seek(self.start_offset)
-
def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False):
"""fetch records in a :term:`region` using 0-based indexing. The
region is specified by :term:`contig`, *start* and *end*.
@@ -3750,9 +4266,12 @@ cdef class VariantFile(HTSFile):
self.is_reading = 1
return self.index.fetch(self, contig, start, stop, region, reopen)
- cpdef VariantRecord new_record(self):
- """Create a new empty VariantRecord"""
- return self.header.new_record()
+ def new_record(self, *args, **kwargs):
+ """Create a new empty :class:`VariantRecord`.
+
+ See :meth:`VariantHeader.new_record`
+ """
+ return self.header.new_record(*args, **kwargs)
cpdef int write(self, VariantRecord record) except -1:
"""
@@ -3782,6 +4301,9 @@ cdef class VariantFile(HTSFile):
msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
+ # Sync END annotation before writing
+ bcf_sync_end(record)
+
cdef int ret
with nogil:
diff --git a/pysam/libcbcftools.pxd b/pysam/libcbcftools.pxd
new file mode 100644
index 0000000..7c8e632
--- /dev/null
+++ b/pysam/libcbcftools.pxd
@@ -0,0 +1,3 @@
+cdef extern from "cbcftools_util.h":
+
+ int bcftools_main(int argc, char *argv[])
diff --git a/pysam/libcbcftools.pyx b/pysam/libcbcftools.pyx
new file mode 100644
index 0000000..8e90388
--- /dev/null
+++ b/pysam/libcbcftools.pyx
@@ -0,0 +1,2 @@
+def py_bcftools():
+ pass
diff --git a/pysam/libcbgzf.pyx b/pysam/libcbgzf.pyx
index 558ceff..f1d2fa9 100644
--- a/pysam/libcbgzf.pyx
+++ b/pysam/libcbgzf.pyx
@@ -14,9 +14,10 @@ from libc.stdlib cimport malloc, calloc, realloc, free
from cpython.object cimport PyObject
from cpython.bytes cimport PyBytes_FromStringAndSize, _PyBytes_Resize
-from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
-from pysam.libchtslib cimport *
-
+from pysam.libcutils cimport force_bytes, encode_filename
+from pysam.libchtslib cimport bgzf_open, bgzf_index_build_init, bgzf_write, bgzf_read, \
+ bgzf_flush, bgzf_index_dump, bgzf_close, bgzf_seek, \
+ bgzf_tell, bgzf_getline, kstring_t, SEEK_SET, BGZF
__all__ = ["BGZFile"]
@@ -32,7 +33,7 @@ cdef class BGZFile(object):
compressed file in text mode, use the gzip.open() function.
"""
cdef BGZF* bgzf
- cdef bytes name, index
+ cdef readonly object name, index
def __init__(self, filename, mode=None, index=None):
"""Constructor for the BGZFile class.
@@ -47,10 +48,14 @@ cdef class BGZFile(object):
raise ValueError("Invalid mode: {!r}".format(mode))
if not mode:
mode = 'rb'
- if mode and 'b' not in mode:
+ elif mode and 'b' not in mode:
mode += 'b'
- self.name = force_bytes(filename)
- self.index = force_bytes(index) if index is not None else None
+
+ mode = force_bytes(mode)
+
+ self.name = encode_filename(filename)
+ self.index = encode_filename(index) if index is not None else None
+
self.bgzf = bgzf_open(self.name, mode)
if self.bgzf.is_write and index is not None and bgzf_index_build_init(self.bgzf) < 0:
@@ -59,7 +64,7 @@ cdef class BGZFile(object):
def __dealloc__(self):
self.close()
- def write(self,data):
+ def write(self, data):
if not self.bgzf:
raise ValueError("write() on closed BGZFile object")
@@ -177,6 +182,15 @@ cdef class BGZFile(object):
def seekable(self):
return True
+ def tell(self):
+ if not self.bgzf:
+ raise ValueError("seek() on closed BGZFile object")
+ cdef int64_t off = bgzf_tell(self.bgzf)
+ if off < 0:
+ raise IOError('Error in tell on BGZFFile object')
+
+ return off
+
def seek(self, offset, whence=io.SEEK_SET):
if not self.bgzf:
raise ValueError("seek() on closed BGZFile object")
@@ -198,12 +212,27 @@ cdef class BGZFile(object):
line.l = line.m = 0
line.s = NULL
- if bgzf_getline(self.bgzf, '\n', &line) < 0:
- raise IOError('Error reading line in BGZFFile object')
- ret = charptr_to_str_w_len(line.s, line.l)
+ cdef int ret = bgzf_getline(self.bgzf, '\n', &line)
+ if ret == -1:
+ s = b''
+ elif ret == -2:
+ if line.m:
+ free(line.s)
+ raise IOError('Error reading line in BGZFFile object')
+ else:
+ s = line.s[:line.l]
if line.m:
free(line.s)
- return ret
+ return s
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ line = self.readline()
+ if not line:
+ raise StopIteration()
+ return line
diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx
index 774152d..3af76f6 100644
--- a/pysam/libcfaidx.pyx
+++ b/pysam/libcfaidx.pyx
@@ -59,7 +59,7 @@ from cpython.version cimport PY_MAJOR_VERSION
from pysam.libchtslib cimport \
faidx_nseq, fai_load, fai_destroy, fai_fetch, \
- faidx_seq_len, \
+ faidx_seq_len, faidx_iseq, faidx_seq_len, \
faidx_fetch_seq, hisremote, \
bgzf_open, bgzf_close
@@ -154,21 +154,17 @@ cdef class FastaFile:
if self.fastafile == NULL:
raise IOError("could not open file `%s`" % filename)
- if self.is_remote:
- filepath_index = os.path.basename(
- re.sub("[^:]+:[/]*", "", filename)) + ".fai"
- elif filepath_index is None:
- filepath_index = filename + ".fai"
-
- if not os.path.exists(filepath_index):
- raise ValueError("could not locate index file {}".format(
- filepath_index))
-
- with open(filepath_index) as inf:
- data = [x.split("\t") for x in inf]
- self._references = tuple(x[0] for x in data)
- self._lengths = tuple(int(x[1]) for x in data)
- self.reference2length = dict(zip(self._references, self._lengths))
+ cdef int nreferences = faidx_nseq(self.fastafile)
+ cdef int x
+ cdef const char * s
+ self._references = []
+ self._lengths = []
+ for x from 0 <= x < nreferences:
+ s = faidx_iseq(self.fastafile, x)
+ ss = force_str(s)
+ self._references.append(ss)
+ self._lengths.append(faidx_seq_len(self.fastafile, s))
+ self.reference2length = dict(zip(self._references, self._lengths))
def close(self):
"""close the file."""
@@ -447,6 +443,9 @@ cdef class FastxFile:
... print(entry.sequence)
... print(entry.comment)
... print(entry.quality)
+ >>> with pysam.FastxFile(filename) as fin, open(out_filename, mode='w') as fout:
+ ... for entry in fin:
+ ... fout.write(str(entry))
"""
def __cinit__(self, *args, **kwargs):
diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd
index 657a754..78a55f8 100644
--- a/pysam/libchtslib.pxd
+++ b/pysam/libchtslib.pxd
@@ -9,6 +9,12 @@ cdef extern from "Python.h":
FILE* PyFile_AsFile(object)
+# cython does not wrap stdarg
+cdef extern from "stdarg.h":
+ ctypedef struct va_list:
+ pass
+
+
cdef extern from "htslib/kstring.h" nogil:
ctypedef struct kstring_t:
size_t l, m
@@ -54,7 +60,7 @@ cdef extern from "htslib/hfile.h" nogil:
# @abstract Open the named file or URL as a stream
# @return An hFILE pointer, or NULL (with errno set) if an error occurred.
- hFILE *hopen(const char *filename, const char *mode)
+ hFILE *hopen(const char *filename, const char *mode, ...)
# @abstract Associate a stream with an existing open file descriptor
# @return An hFILE pointer, or NULL (with errno set) if an error occurred.
@@ -97,6 +103,40 @@ cdef extern from "htslib/hfile.h" nogil:
# @return The character read, or EOF on end-of-file or error
int hgetc(hFILE *fp)
+ # Read from the stream until the delimiter, up to a maximum length
+ # @param buffer The buffer into which bytes will be written
+ # @param size The size of the buffer
+ # @param delim The delimiter (interpreted as an `unsigned char`)
+ # @param fp The file stream
+ # @return The number of bytes read, or negative on error.
+ # @since 1.4
+ #
+ # Bytes will be read into the buffer up to and including a delimiter, until
+ # EOF is reached, or _size-1_ bytes have been written, whichever comes first.
+ # The string will then be terminated with a NUL byte (`\0`).
+ ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp)
+
+ # Read a line from the stream, up to a maximum length
+ # @param buffer The buffer into which bytes will be written
+ # @param size The size of the buffer
+ # @param fp The file stream
+ # @return The number of bytes read, or negative on error.
+ # @since 1.4
+ #
+ # Specialization of hgetdelim() for a `\n` delimiter.
+ ssize_t hgetln(char *buffer, size_t size, hFILE *fp)
+
+ # Read a line from the stream, up to a maximum length
+ # @param buffer The buffer into which bytes will be written
+ # @param size The size of the buffer (must be > 1 to be useful)
+ # @param fp The file stream
+ # @return _buffer_ on success, or `NULL` if an error occurred.
+ # @since 1.4
+ #
+ # This function can be used as a replacement for `fgets(3)`, or together with
+ # kstring's `kgetline()` to read arbitrarily-long lines into a _kstring_t_.
+ char *hgets(char *buffer, int size, hFILE *fp)
+
# @abstract Peek at characters to be read without removing them from buffers
# @param fp The file stream
# @param buffer The buffer to which the peeked bytes will be written
@@ -623,7 +663,7 @@ cdef extern from "htslib/hts.h" nogil:
# @return The index, or NULL if an error occurred.
hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
- uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta)
+ uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
int hts_idx_get_stat(const hts_idx_t* idx, int tid,
@@ -694,6 +734,79 @@ cdef extern from "htslib/hts.h" nogil:
int hts_file_type(const char *fname)
+ # /***************************
+ # * Revised MAQ error model *
+ # ***************************/
+
+ ctypedef struct errmod_t
+
+ errmod_t *errmod_init(double depcorr)
+ void errmod_destroy(errmod_t *em)
+
+ # /*
+ # n: number of bases
+ # m: maximum base
+ # bases[i]: qual:6, strand:1, base:4
+ # q[i*m+j]: phred-scaled likelihood of (i,j)
+ # */
+ int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *Probabilistic)
+
+ # /*****************************************
+ # * q banded glocal alignment *
+ # *****************************************/
+
+ ctypedef struct probaln_par_t:
+ float d, e
+ int bw;
+
+ int probaln_glocal(const uint8_t *ref,
+ int l_ref,
+ const uint8_t *query,
+ int l_query, const uint8_t *iqual,
+ const probaln_par_t *c,
+ int *state, uint8_t *q)
+
+ # /**********************
+ # * MD5 implementation *
+ # **********************/
+
+ ctypedef struct hts_md5_context
+
+ # /*! @abstract Intialises an MD5 context.
+ # * @discussion
+ # * The expected use is to allocate an hts_md5_context using
+ # * hts_md5_init(). This pointer is then passed into one or more calls
+ # * of hts_md5_update() to compute successive internal portions of the
+ # * MD5 sum, which can then be externalised as a full 16-byte MD5sum
+ # * calculation by calling hts_md5_final(). This can then be turned
+ # * into ASCII via hts_md5_hex().
+ # *
+ # * To dealloate any resources created by hts_md5_init() call the
+ # * hts_md5_destroy() function.
+ # *
+ # * @return hts_md5_context pointer on success, NULL otherwise.
+ # */
+ hts_md5_context *hts_md5_init()
+
+ # /*! @abstract Updates the context with the MD5 of the data. */
+ void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size)
+
+ # /*! @abstract Computes the final 128-bit MD5 hash from the given context */
+ void hts_md5_final(unsigned char *digest, hts_md5_context *ctx)
+
+ # /*! @abstract Resets an md5_context to the initial state, as returned
+ # * by hts_md5_init().
+ # */
+ void hts_md5_reset(hts_md5_context *ctx)
+
+ # /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated
+ # * hex string.
+ # */
+ void hts_md5_hex(char *hex, const unsigned char *digest)
+
+ # /*! @abstract Deallocates any memory allocated by hts_md5_init. */
+ void hts_md5_destroy(hts_md5_context *ctx)
+
inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
inline int hts_bin_bot(int bin, int n_lvls)
@@ -803,7 +916,9 @@ cdef extern from "htslib/sam.h" nogil:
uint8_t qual
uint8_t l_qname
uint16_t flag
- uint16_t n_cigar
+ uint8_t unused1
+ uint8_t l_extranul
+ uint32_t n_cigar
int32_t l_qseq
int32_t mtid
int32_t mpos
@@ -999,7 +1114,7 @@ cdef extern from "htslib/sam.h" nogil:
#*************************************
uint8_t *bam_aux_get(const bam1_t *b, const char *tag)
- int32_t bam_aux2i(const uint8_t *s)
+ int64_t bam_aux2i(const uint8_t *s)
double bam_aux2f(const uint8_t *s)
char bam_aux2A(const uint8_t *s)
char *bam_aux2Z(const uint8_t *s)
@@ -1011,6 +1126,18 @@ cdef extern from "htslib/sam.h" nogil:
#*** Pileup and Mpileup ***
#**************************
+ # @abstract Generic pileup 'client data'.
+ # @discussion The pileup iterator allows setting a constructor and
+ # destructor function, which will be called every time a sequence is
+ # fetched and discarded. This permits caching of per-sequence data in
+ # a tidy manner during the pileup process. This union is the cached
+ # data to be manipulated by the "client" (the caller of pileup).
+ #
+ union bam_pileup_cd:
+ void *p
+ int64_t i
+ double f
+
# @abstract Structure for one alignment covering the pileup position.
# @field b pointer to the alignment
# @field qpos position of the read base at the pileup site, 0-based
@@ -1041,6 +1168,7 @@ cdef extern from "htslib/sam.h" nogil:
uint32_t is_tail
uint32_t is_refskip
uint32_t aux
+ bam_pileup_cd cd
ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b)
ctypedef int (*bam_test_f)()
@@ -1079,34 +1207,116 @@ cdef extern from "htslib/sam.h" nogil:
# Added by AH
# ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *"
+ # ***********************************
+ # * BAQ calculation and realignment *
+ # ***********************************/
+ int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres)
+ int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag)
+
cdef extern from "htslib/faidx.h" nogil:
ctypedef struct faidx_t:
pass
+ # /// Build index for a FASTA or bgzip-compressed FASTA file.
+ # /** @param fn FASTA file name
+ # @param fnfai Name of .fai file to build.
+ # @param fngzi Name of .gzi file to build (if fn is bgzip-compressed).
+ # @return 0 on success; or -1 on failure
+
+ # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
+ # If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI
+ # file will only be built if fn is bgzip-compressed.
+ # */
+ int fai_build3(const char *fn,
+ const char *fnfai,
+ const char *fngzi)
+
+ # /// Build index for a FASTA or bgzip-compressed FASTA file.
+ # /** @param fn FASTA file name
+ # @return 0 on success; or -1 on failure
+ #
+ # File "fn.fai" will be generated. This function is equivalent to
+ # fai_build3(fn, NULL, NULL);
+ # */
int fai_build(char *fn)
+ # /// Destroy a faidx_t struct
void fai_destroy(faidx_t *fai)
+ # /// Load FASTA indexes.
+ # /** @param fn File name of the FASTA file (can be compressed with bgzip).
+ # @param fnfai File name of the FASTA index.
+ # @param fngzi File name of the bgzip index.
+ # @param flags Option flags to control index file caching and creation.
+ # @return Pointer to a faidx_t struct on success, NULL on failure.
+
+ # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
+ # If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name.
+ # The bgzip index is only needed if fn is compressed.
+
+ # If (flags & FAI_CREATE) is true, the index files will be built using
+ # fai_build3() if they are not already present.
+ # */
+ faidx_t *fai_load3(const char *fn,
+ const char *fnfai,
+ const char *fngzi,
+ int flags)
+
+ # /// Load index from "fn.fai".
+ # /** @param fn File name of the FASTA file
+ # @return Pointer to a faidx_t struct on success, NULL on failure.
+ # This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE);
+ # */
faidx_t *fai_load(char *fn)
+ # /// Fetch the sequence in a region
+ # /** @param fai Pointer to the faidx_t struct
+ # @param reg Region in the format "chr2:20,000-30,000"
+ # @param len Length of the region; -2 if seq not present, -1 general error
+ # @return Pointer to the sequence; `NULL` on failure
+ # The returned sequence is allocated by `malloc()` family and should be destroyed
+ # by end users by calling `free()` on it.
+ # */
char *fai_fetch(faidx_t *fai,
char *reg,
int *len)
- int faidx_nseq(faidx_t *fai)
-
- int faidx_has_seq(faidx_t *fai, const char *seq)
-
+ # /// Fetch the sequence in a region
+ # /** @param fai Pointer to the faidx_t struct
+ # @param c_name Region name
+ # @param p_beg_i Beginning position number (zero-based)
+ # @param p_end_i End position number (zero-based)
+ # @param len Length of the region; -2 if c_name not present, -1 general error
+ # @return Pointer to the sequence; null on failure
+ # The returned sequence is allocated by `malloc()` family and should be destroyed
+ # by end users by calling `free()` on it.
+ # */
char *faidx_fetch_seq(faidx_t *fai,
char *c_name,
int p_beg_i,
int p_end_i,
int *len)
- int faidx_seq_len(faidx_t *fai, const char *seq)
+ # /// Query if sequence is present
+ # /** @param fai Pointer to the faidx_t struct
+ # @param seq Sequence name
+ # @return 1 if present or 0 if absent
+ # */
+ int faidx_has_seq(faidx_t *fai, const char *seq)
+
+ # /// Fetch the number of sequences
+ # /** @param fai Pointer to the faidx_t struct
+ # @return The number of sequences
+ # */
+ int faidx_nseq(const faidx_t *fai)
+ # /// Return name of i-th sequence
+ const char *faidx_iseq(const faidx_t *fai, int i)
+
+ # /// Return sequence length, -1 if not present
+ int faidx_seq_len(faidx_t *fai, const char *seq)
# tabix support
cdef extern from "htslib/tbx.h" nogil:
@@ -1695,7 +1905,7 @@ cdef extern from "htslib/vcf.h" nogil:
int bcf_get_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
int bcf_get_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
int bcf_get_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
- int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int **dst, int *ndst)
+ int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int32_t **dst, int *ndst)
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
@@ -1901,6 +2111,455 @@ cdef extern from "htslib/vcfutils.h" nogil:
uint32_t bcf_ij2G(uint32_t i, uint32_t j)
+cdef extern from "htslib/cram.h" nogil:
+
+ enum cram_block_method:
+ ERROR
+ RAW
+ GZIP
+ BZIP2
+ LZMA
+ RANS
+ RANS0
+ RANS1
+ GZIP_RLE
+
+ enum cram_content_type:
+ CT_ERROR
+ FILE_HEADER
+ COMPRESSION_HEADER
+ MAPPED_SLICE
+ UNMAPPED_SLICE
+ EXTERNAL
+ CORE
+
+ # Opaque data types, see cram_structs for the fully fledged versions.
+ ctypedef struct SAM_hdr
+ ctypedef struct cram_file_def
+ ctypedef struct cram_fd
+ ctypedef struct cram_container
+ ctypedef struct cram_block
+ ctypedef struct cram_slice
+ ctypedef struct cram_metrics
+ ctypedef struct cram_block_slice_hdr
+ ctypedef struct cram_block_compression_hdr
+ ctypedef struct refs_t
+
+ # Accessor functions
+
+ #
+ #-----------------------------------------------------------------------------
+ # cram_fd
+ #
+ SAM_hdr *cram_fd_get_header(cram_fd *fd)
+ void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr)
+
+ int cram_fd_get_version(cram_fd *fd)
+ void cram_fd_set_version(cram_fd *fd, int vers)
+
+ int cram_major_vers(cram_fd *fd)
+ int cram_minor_vers(cram_fd *fd)
+
+ hFILE *cram_fd_get_fp(cram_fd *fd)
+ void cram_fd_set_fp(cram_fd *fd, hFILE *fp)
+
+ #
+ #-----------------------------------------------------------------------------
+ # cram_container
+ #
+ int32_t cram_container_get_length(cram_container *c)
+ void cram_container_set_length(cram_container *c, int32_t length)
+ int32_t cram_container_get_num_blocks(cram_container *c)
+ void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks)
+ int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks)
+ void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks,
+ int32_t *landmarks)
+
+ # Returns true if the container is empty (EOF marker) */
+ int cram_container_is_empty(cram_fd *fd)
+
+
+ #
+ #-----------------------------------------------------------------------------
+ # cram_block
+ #
+ int32_t cram_block_get_content_id(cram_block *b)
+ int32_t cram_block_get_comp_size(cram_block *b)
+ int32_t cram_block_get_uncomp_size(cram_block *b)
+ int32_t cram_block_get_crc32(cram_block *b)
+ void * cram_block_get_data(cram_block *b)
+
+ cram_content_type cram_block_get_content_type(cram_block *b)
+
+ void cram_block_set_content_id(cram_block *b, int32_t id)
+ void cram_block_set_comp_size(cram_block *b, int32_t size)
+ void cram_block_set_uncomp_size(cram_block *b, int32_t size)
+ void cram_block_set_crc32(cram_block *b, int32_t crc)
+ void cram_block_set_data(cram_block *b, void *data)
+
+ int cram_block_append(cram_block *b, void *data, int size)
+ void cram_block_update_size(cram_block *b)
+
+ # Offset is known as "size" internally, but it can be confusing.
+ size_t cram_block_get_offset(cram_block *b)
+ void cram_block_set_offset(cram_block *b, size_t offset)
+
+ #
+ # Computes the size of a cram block, including the block
+ # header itself.
+ #
+ uint32_t cram_block_size(cram_block *b)
+
+ #
+ # Renumbers RG numbers in a cram compression header.
+ #
+ # CRAM stores RG as the Nth number in the header, rather than a
+ # string holding the ID: tag. This is smaller in space, but means
+ # "samtools cat" to join files together that contain single but
+ # different RG lines needs a way of renumbering them.
+ #
+ # The file descriptor is expected to be immediately after the
+ # cram_container structure (ie before the cram compression header).
+ # Due to the nature of the CRAM format, this needs to read and write
+ # the blocks itself. Note that there may be multiple slices within
+ # the container, meaning multiple compression headers to manipulate.
+ # Changing RG may change the size of the compression header and
+ # therefore the length field in the container. Hence we rewrite all
+ # blocks just incase and also emit the adjusted container.
+ #
+ # The current implementation can only cope with renumbering a single
+ # RG (and only then if it is using HUFFMAN or BETA codecs). In
+ # theory it *may* be possible to renumber multiple RGs if they use
+ # HUFFMAN to the CORE block or use an external block unshared by any
+ # other data series. So we have an API that can be upgraded to
+ # support this, but do not implement it for now. An example
+ # implementation of RG as an EXTERNAL block would be to find that
+ # block and rewrite it, returning the number of blocks consumed.
+ #
+ # Returns 0 on success;
+ # -1 if unable to edit;
+ # -2 on other errors (eg I/O).
+ #
+ int cram_transcode_rg(cram_fd *input, cram_fd *output,
+ cram_container *c,
+ int nrg, int *in_rg, int *out_rg)
+
+ #
+ # Copies the blocks representing the next num_slice slices from a
+ # container from 'in' to 'out'. It is expected that the file pointer
+ # is just after the read of the cram_container and cram compression
+ # header.
+ #
+ # Returns 0 on success
+ # -1 on failure
+ #
+ int cram_copy_slice(cram_fd *input, cram_fd *output, int32_t num_slice)
+
+ #
+ #-----------------------------------------------------------------------------
+ # SAM_hdr
+ #
+
+ # Tokenises a SAM header into a hash table.
+ #
+ # Also extracts a few bits on specific data types, such as @RG lines.
+ #
+ # @return
+ # Returns a SAM_hdr struct on success (free with sam_hdr_free())
+ # NULL on failure
+ #
+ SAM_hdr *sam_hdr_parse_(const char *hdr, int len)
+
+
+ #
+ #-----------------------------------------------------------------------------
+ # cram_io basics
+ #
+
+ # CRAM blocks - the dynamically growable data block. We have code to
+ # create, update, (un)compress and read/write.
+ #
+ # These are derived from the deflate_interlaced.c blocks, but with the
+ # CRAM extension of content types and IDs.
+ #
+
+ # Allocates a new cram_block structure with a specified content_type and
+ # id.
+ #
+ # @return
+ # Returns block pointer on success;
+ # NULL on failure
+ #
+ cram_block *cram_new_block(cram_content_type content_type,
+ int content_id)
+
+ # Reads a block from a cram file.
+ #
+ # @return
+ # Returns cram_block pointer on success;
+ # NULL on failure
+ #
+ cram_block *cram_read_block(cram_fd *fd)
+
+ # Writes a CRAM block.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_write_block(cram_fd *fd, cram_block *b)
+
+ # Frees a CRAM block, deallocating internal data too.
+ #
+ void cram_free_block(cram_block *b)
+
+ # Uncompresses a CRAM block, if compressed.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_uncompress_block(cram_block *b)
+
+ # Compresses a block.
+ #
+ # Compresses a block using one of two different zlib strategies. If we only
+ # want one choice set strat2 to be -1.
+ #
+ # The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
+ # or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
+ # significantly faster.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
+ int method, int level)
+
+ # Containers
+ #
+
+ # Creates a new container, specifying the maximum number of slices
+ # and records permitted.
+ #
+ # @return
+ # Returns cram_container ptr on success;
+ # NULL on failure
+ #
+ cram_container *cram_new_container(int nrec, int nslice)
+ void cram_free_container(cram_container *c)
+
+ # Reads a container header.
+ #
+ # @return
+ # Returns cram_container on success;
+ # NULL on failure or no container left (fd->err == 0).
+ #
+ cram_container *cram_read_container(cram_fd *fd)
+
+ # Writes a container structure.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_write_container(cram_fd *fd, cram_container *h)
+
+ #
+ # Stores the container structure in dat and returns *size as the
+ # number of bytes written to dat[]. The input size of dat is also
+ # held in *size and should be initialised to cram_container_size(c).
+ #
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
+
+ int cram_container_size(cram_container *c)
+
+ # The top-level cram opening, closing and option handling
+ #
+
+ # Opens a CRAM file for read (mode "rb") or write ("wb").
+ #
+ # The filename may be "-" to indicate stdin or stdout.
+ #
+ # @return
+ # Returns file handle on success;
+ # NULL on failure.
+ #
+ cram_fd *cram_open(const char *filename, const char *mode)
+
+ # Opens an existing stream for reading or writing.
+ #
+ # @return
+ # Returns file handle on success;
+ # NULL on failure.
+ #
+ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode)
+
+ # Closes a CRAM file.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_close(cram_fd *fd)
+
+ #
+ # Seek within a CRAM file.
+ #
+ # Returns 0 on success
+ # -1 on failure
+ #
+ int cram_seek(cram_fd *fd, off_t offset, int whence)
+
+ #
+ # Flushes a CRAM file.
+ # Useful for when writing to stdout without wishing to close the stream.
+ #
+ # Returns 0 on success
+ # -1 on failure
+ #
+ int cram_flush(cram_fd *fd)
+
+ # Checks for end of file on a cram_fd stream.
+ #
+ # @return
+ # Returns 0 if not at end of file
+ # 1 if we hit an expected EOF (end of range or EOF block)
+ # 2 for other EOF (end of stream without EOF block)
+ #
+ int cram_eof(cram_fd *fd)
+
+ # Sets options on the cram_fd.
+ #
+ # See CRAM_OPT_* definitions in hts.h.
+ # Use this immediately after opening.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_set_option(cram_fd *fd, hts_fmt_option opt, ...)
+
+ # Sets options on the cram_fd.
+ #
+ # See CRAM_OPT_* definitions in hts.h.
+ # Use this immediately after opening.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_set_voption(cram_fd *fd, hts_fmt_option opt, va_list args)
+
+ #
+ # Attaches a header to a cram_fd.
+ #
+ # This should be used when creating a new cram_fd for writing where
+ # we have an SAM_hdr already constructed (eg from a file we've read
+ # in).
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_set_header(cram_fd *fd, SAM_hdr *hdr)
+
+ # Check if this file has a proper EOF block
+ #
+ # @return
+ # Returns 3 if the file is a version of CRAM that does not contain EOF blocks
+ # 2 if the file is a stream and thus unseekable
+ # 1 if the file contains an EOF block
+ # 0 if the file does not contain an EOF block
+ # -1 if an error occured whilst reading the file or we could not seek back to where we were
+ #
+ #
+ int cram_check_EOF(cram_fd *fd)
+
+ # As int32_decoded/encode, but from/to blocks instead of cram_fd */
+ int int32_put_blk(cram_block *b, int32_t val)
+
+ # Deallocates all storage used by a SAM_hdr struct.
+ #
+ # This also decrements the header reference count. If after decrementing
+ # it is still non-zero then the header is assumed to be in use by another
+ # caller and the free is not done.
+ #
+ # This is a synonym for sam_hdr_dec_ref().
+ #
+ void sam_hdr_free(SAM_hdr *hdr)
+
+ # Returns the current length of the SAM_hdr in text form.
+ #
+ # Call sam_hdr_rebuild() first if editing has taken place.
+ #
+ int sam_hdr_length(SAM_hdr *hdr)
+
+ # Returns the string form of the SAM_hdr.
+ #
+ # Call sam_hdr_rebuild() first if editing has taken place.
+ #
+ char *sam_hdr_str(SAM_hdr *hdr)
+
+ # Appends a formatted line to an existing SAM header.
+ #
+ # Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
+ # optional new-line. If it contains more than 1 line then multiple lines
+ # will be added in order.
+ #
+ # Len is the length of the text data, or 0 if unknown (in which case
+ # it should be null terminated).
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+
+ # Add an @PG line.
+ #
+ # If we wish complete control over this use sam_hdr_add() directly. This
+ # function uses that, but attempts to do a lot of tedious house work for
+ # you too.
+ #
+ # - It will generate a suitable ID if the supplied one clashes.
+ # - It will generate multiple @PG records if we have multiple PG chains.
+ #
+ # Call it as per sam_hdr_add() with a series of key,value pairs ending
+ # in NULL.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...)
+
+ #
+ # A function to help with construction of CL tags in @PG records.
+ # Takes an argc, argv pair and returns a single space-separated string.
+ # This string should be deallocated by the calling function.
+ #
+ # @return
+ # Returns malloced char * on success;
+ # NULL on failure
+ #
+ char *stringify_argv(int argc, char *argv[])
+
+ #
+ # Returns the refs_t structure used by a cram file handle.
+ #
+ # This may be used in conjunction with option CRAM_OPT_SHARED_REF to
+ # share reference memory between multiple file handles.
+ #
+ # @return
+ # Returns NULL if none exists or the file handle is not a CRAM file.
+ #
+ refs_t *cram_get_refs(htsFile *fd)
+
+
cdef class HTSFile(object):
cdef htsFile *htsfile # pointer to htsFile structure
cdef int64_t start_offset # BGZF offset of first record
diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx
index 7eea059..4b8d9c0 100644
--- a/pysam/libchtslib.pyx
+++ b/pysam/libchtslib.pyx
@@ -2,8 +2,11 @@
# cython: profile=True
# adds doc-strings for sphinx
import os
+import io
from posix.unistd cimport dup
+from libc.errno cimport errno
+from cpython cimport PyBytes_FromStringAndSize
from pysam.libchtslib cimport *
@@ -11,15 +14,24 @@ from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_
from pysam.libcutils cimport encode_filename, from_string_and_size
-__all__ = ["get_verbosity", "set_verbosity"]
+from warnings import warn
+__all__ = ['get_verbosity', 'set_verbosity', 'HFile', 'HTSFile']
+
+# defines imported from samtools
+DEF SEEK_SET = 0
+DEF SEEK_CUR = 1
+DEF SEEK_END = 2
+
########################################################################
########################################################################
## Constants
########################################################################
+# maximum genomic coordinace
cdef int MAX_POS = 2 << 29
+
cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')
cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED')
@@ -35,6 +47,230 @@ cpdef get_verbosity():
return hts_get_verbosity()
+cdef class HFile(object):
+ cdef hFILE *fp
+ cdef readonly object name, mode
+
+ def __init__(self, name, mode='r', closedf=True):
+ self._open(name, mode, closefd=True)
+
+ def __dealloc__(self):
+ self.close()
+
+ @property
+ def closed(self):
+ return self.fp == NULL
+
+ cdef _open(self, name, mode, closefd=True):
+ self.name = name
+ self.mode = mode
+
+ mode = force_bytes(mode)
+
+ if isinstance(name, int):
+ if self.fp != NULL:
+ name = dup(name)
+ self.fp = hdopen(name, mode)
+ else:
+ name = encode_filename(name)
+ self.fp = hopen(name, mode)
+
+ if not self.fp:
+ raise OSError(errno, 'failed to open HFile', self.name)
+
+ def close(self):
+ if self.fp == NULL:
+ return
+
+ cdef hFILE *fp = self.fp
+ self.fp = NULL
+
+ if hclose(fp) != 0:
+ raise OSError(herrno(self.fp), 'failed to close HFile', self.name)
+
+ def fileno(self):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+ if isinstance(self.name, int):
+ return self.name
+ else:
+ raise AttributeError('fileno not available')
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type, value, tb):
+ self.close()
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ line = self.readline()
+ if not line:
+ raise StopIteration()
+ return line
+
+ def flush(self):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+ if hflush(self.fp) != 0:
+ raise OSError(herrno(self.fp), 'failed to flush HFile', self.name)
+
+ def isatty(self):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+ return False
+
+ def readable(self):
+ return self.fp != NULL and 'r' in self.mode
+
+ def read(self, Py_ssize_t size=-1):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+
+ if size == 0:
+ return b''
+
+ cdef list parts = []
+ cdef bytes part
+ cdef Py_ssize_t chunk_size, ret, bytes_read = 0
+ cdef char *cpart
+
+ while size == -1 or bytes_read < size:
+ chunk_size = 4096
+ if size != -1:
+ chunk_size = min(chunk_size, size - bytes_read)
+
+ part = PyBytes_FromStringAndSize(NULL, chunk_size)
+ cpart = <char *>part
+ ret = hread(self.fp, <void *>cpart, chunk_size)
+
+ if ret < 0:
+ OSError(herrno(self.fp), 'failed to read HFile', self.name)
+ elif not ret:
+ break
+
+ bytes_read += ret
+
+ if ret < chunk_size:
+ part = cpart[:ret]
+
+ parts.append(part)
+
+ return b''.join(parts)
+
+ def readall(self):
+ return self.read()
+
+ def readinto(self, buf):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+
+ size = len(buf)
+
+ if size == 0:
+ return size
+
+ mv = memoryview(buf)
+ ret = hread(self.fp, <void *>mv, size)
+
+ if ret < 0:
+ OSError(herrno(self.fp), 'failed to read HFile', self.name)
+
+ return ret
+
+ def readline(self, Py_ssize_t size=-1):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+
+ if size == 0:
+ return b''
+
+ cdef list parts = []
+ cdef bytes part
+ cdef Py_ssize_t chunk_size, ret, bytes_read = 0
+ cdef char *cpart
+
+ while size == -1 or bytes_read < size:
+ chunk_size = 4096
+ if size != -1:
+ chunk_size = min(chunk_size, size - bytes_read)
+
+ part = PyBytes_FromStringAndSize(NULL, chunk_size)
+ cpart = <char *>part
+
+ # Python bytes objects allocate an extra byte for a null terminator
+ ret = hgetln(cpart, chunk_size+1, self.fp)
+
+ if ret < 0:
+ OSError(herrno(self.fp), 'failed to read HFile', self.name)
+ elif not ret:
+ break
+
+ bytes_read += ret
+
+ if ret < chunk_size:
+ part = cpart[:ret]
+ cpart = <char *>part
+
+ parts.append(part)
+
+ if cpart[ret-1] == b'\n':
+ break
+
+ return b''.join(parts)
+
+ def readlines(self):
+ return list(self)
+
+ def seek(self, Py_ssize_t offset, int whence=SEEK_SET):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+
+ cdef Py_ssize_t off = hseek(self.fp, offset, whence)
+
+ if off < 0:
+ raise OSError(herrno(self.fp), 'seek failed on HFile', self.name)
+
+ return off
+
+ def tell(self):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+
+ ret = htell(self.fp)
+
+ if ret < 0:
+ raise OSError(herrno(self.fp), 'tell failed on HFile', self.name)
+
+ return ret
+
+ def seekable(self):
+ return self.fp != NULL
+
+ def truncate(self, size=None):
+ raise NotImplementedError()
+
+ def writable(self):
+ return self.fp != NULL and 'w' in self.mode
+
+ def write(self, bytes b):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+
+ got = hwrite(self.fp, <void *>b, len(b))
+
+ if got < 0:
+ raise OSError(herrno(self.fp), 'write failed on HFile', self.name)
+
+ return got
+
+ def writelines(self, lines):
+ for line in lines:
+ self.write(line)
+
+
class CallableValue(object):
def __init__(self, value):
self.value = value
@@ -62,11 +298,38 @@ cdef class HTSFile(object):
self.htsfile = NULL
self.duplicate_filehandle = True
+ def close(self):
+ if self.htsfile:
+ hts_close(self.htsfile)
+ self.htsfile = NULL
+
def __dealloc__(self):
if self.htsfile:
hts_close(self.htsfile)
self.htsfile = NULL
+ def check_truncation(self, ignore_truncation=False):
+ """Check if file is truncated."""
+ if not self.htsfile:
+ return
+
+ if self.htsfile.format.compression != bgzf:
+ return
+
+ cdef BGZF *bgzfp = hts_get_bgzfp(self.htsfile)
+ if not bgzfp:
+ return
+
+ cdef int ret = bgzf_check_EOF(bgzfp)
+ if ret < 0:
+ raise OSError(errno, 'error checking for EOF marker')
+ elif ret == 0:
+ msg = 'no BGZF EOF marker; file may be truncated'.format(self.filename)
+ if ignore_truncation:
+ warn(msg)
+ else:
+ raise OSError(msg)
+
def __enter__(self):
return self
@@ -189,12 +452,15 @@ cdef class HTSFile(object):
raise OSError('seek not available in streams')
cdef int64_t ret
- if self.htsfile.format.compression != no_compression:
+ if self.htsfile.format.compression == bgzf:
with nogil:
ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
- else:
+ elif self.htsfile.format.compression == no_compression:
with nogil:
ret = hts_useek(self.htsfile, <int>offset, SEEK_SET)
+ else:
+ raise NotImplementedError("seek not implemented in files compressed by method {}".format(
+ self.htsfile.format.compression))
return ret
def tell(self):
@@ -205,12 +471,19 @@ cdef class HTSFile(object):
raise OSError('tell not available in streams')
cdef int64_t ret
- if self.htsfile.format.compression != no_compression:
+ if self.htsfile.format.compression == bgzf:
with nogil:
ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
- else:
+ elif self.htsfile.format.compression == no_compression:
with nogil:
ret = hts_utell(self.htsfile)
+ elif self.htsfile.format.format == cram:
+ with nogil:
+ ret = htell(cram_fd_get_fp(self.htsfile.fp.cram))
+ else:
+ raise NotImplementedError("seek not implemented in files compressed by method {}".format(
+ self.htsfile.format.compression))
+
return ret
cdef htsFile *_open_htsfile(self) except? NULL:
@@ -227,7 +500,7 @@ cdef class HTSFile(object):
fd = self.filename
else:
fd = self.filename.fileno()
-
+
if self.duplicate_filehandle:
dup_fd = dup(fd)
else:
diff --git a/pysam/libcsamtools.pxd b/pysam/libcsamtools.pxd
new file mode 100644
index 0000000..5fdc57f
--- /dev/null
+++ b/pysam/libcsamtools.pxd
@@ -0,0 +1,3 @@
+cdef extern from "csamtools_util.h":
+
+ int samtools_main(int argc, char *argv[])
diff --git a/pysam/libcsamtools.pyx b/pysam/libcsamtools.pyx
new file mode 100644
index 0000000..cc60ace
--- /dev/null
+++ b/pysam/libcsamtools.pyx
@@ -0,0 +1,2 @@
+def py_samtools():
+ pass
diff --git a/pysam/libctabix.pxd b/pysam/libctabix.pxd
index 12cd9dd..c986f03 100644
--- a/pysam/libctabix.pxd
+++ b/pysam/libctabix.pxd
@@ -81,6 +81,10 @@ cdef class asGTF(Parser):
pass
+cdef class asGFF3(Parser):
+ pass
+
+
cdef class asBed(Parser):
pass
diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx
index 10dc23b..b10c0d0 100644
--- a/pysam/libctabix.pyx
+++ b/pysam/libctabix.pyx
@@ -9,7 +9,8 @@
# class TabixFile class wrapping tabix indexed files in bgzf format
#
# class asTuple Parser class for tuples
-# class asGT Parser class for GTF formatted rows
+# class asGTF Parser class for GTF formatted rows
+# class asGFF3 Parser class for GFF3 formatted rows
# class asBed Parser class for Bed formatted rows
# class asVCF Parser class for VCF formatted rows
#
@@ -110,6 +111,42 @@ cdef class asTuple(Parser):
return r
+cdef class asGFF3(Parser):
+ '''converts a :term:`tabix row` into a GFF record with the following
+ fields:
+
+ +----------+----------+-------------------------------+
+ |*Column* |*Name* |*Content* |
+ +----------+----------+-------------------------------+
+ |1 |contig |the chromosome name |
+ +----------+----------+-------------------------------+
+ |2 |feature |The feature type |
+ +----------+----------+-------------------------------+
+ |3 |source |The feature source |
+ +----------+----------+-------------------------------+
+ |4 |start |genomic start coordinate |
+ | | |(0-based) |
+ +----------+----------+-------------------------------+
+ |5 |end |genomic end coordinate |
+ | | |(0-based) |
+ +----------+----------+-------------------------------+
+ |6 |score |feature score |
+ +----------+----------+-------------------------------+
+ |7 |strand |strand |
+ +----------+----------+-------------------------------+
+ |8 |frame |frame |
+ +----------+----------+-------------------------------+
+ |9 |attributes|the attribute field |
+ +----------+----------+-------------------------------+
+
+ '''
+ cdef parse(self, char * buffer, int len):
+ cdef ctabixproxies.GFF3Proxy r
+ r = ctabixproxies.GFF3Proxy(self.encoding)
+ r.copy(buffer, len)
+ return r
+
+
cdef class asGTF(Parser):
'''converts a :term:`tabix row` into a GTF record with the following
fields:
@@ -155,7 +192,7 @@ cdef class asGTF(Parser):
r = ctabixproxies.GTFProxy(self.encoding)
r.copy(buffer, len)
return r
-
+
cdef class asBed(Parser):
'''converts a :term:`tabix row` into a bed record
@@ -1178,6 +1215,7 @@ __all__ = [
"Tabixfile",
"asTuple",
"asGTF",
+ "asGFF3",
"asVCF",
"asBed",
"GZIterator",
diff --git a/pysam/libctabixproxies.pxd b/pysam/libctabixproxies.pxd
index 5317b81..edea701 100644
--- a/pysam/libctabixproxies.pxd
+++ b/pysam/libctabixproxies.pxd
@@ -25,19 +25,21 @@ cdef class TupleProxy:
cdef copy(self, char * buffer, size_t nbytes, bint reset=*)
cdef update(self, char * buffer, size_t nbytes)
-cdef class GTFProxy(TupleProxy) :
- cdef:
- char * _attributes
- cdef bint hasOwnAttributes
+cdef class NamedTupleProxy(TupleProxy):
+ pass
+
+cdef class GTFProxy(NamedTupleProxy):
+ cdef object attribute_dict
cpdef int getMaxFields(self)
cpdef int getMinFields(self)
- cdef char * getAttributes(self)
-cdef class NamedTupleProxy(TupleProxy):
+
+cdef class GFF3Proxy(GTFProxy):
pass
+
cdef class BedProxy(NamedTupleProxy):
cdef:
diff --git a/pysam/libctabixproxies.pyx b/pysam/libctabixproxies.pyx
index 9a8a678..dc434e0 100644
--- a/pysam/libctabixproxies.pyx
+++ b/pysam/libctabixproxies.pyx
@@ -10,18 +10,21 @@ from pysam.libcutils cimport encode_filename, from_string_and_size
import collections
+
cdef char *StrOrEmpty(char * buffer):
if buffer == NULL:
return ""
else: return buffer
+
cdef int isNew(char * p, char * buffer, size_t nbytes):
"""return True if `p` is located within `buffer` of size
`nbytes`
"""
if p == NULL:
return 0
- return not (buffer <= p < buffer + nbytes)
+
+ return not (buffer <= p <= buffer + nbytes)
cdef class TupleProxy:
@@ -230,7 +233,7 @@ cdef class TupleProxy:
self.nfields = field
if self.nfields < self.getMinFields():
raise ValueError(
- "parsing error: fewer that %i fields in line: %s" %
+ "parsing error: fewer than %i fields in line: %s" %
(self.getMinFields(), buffer))
def _getindex(self, int index):
@@ -268,7 +271,7 @@ cdef class TupleProxy:
raise IndexError("list index out of range")
if isNew(self.fields[idx], self.data, self.nbytes):
- free(self.fields[idx] )
+ free(self.fields[idx])
self.is_modified = 1
@@ -350,7 +353,62 @@ def quote(v):
return str(v)
-cdef class GTFProxy(TupleProxy):
+cdef class NamedTupleProxy(TupleProxy):
+
+ map_key2field = {}
+
+ def __setattr__(self, key, value):
+ '''set attribute.'''
+ cdef int idx
+ idx, f = self.map_key2field[key]
+ if self.nfields < idx:
+ raise KeyError("field %s not set" % key)
+ TupleProxy.__setitem__(self, idx, str(value))
+
+ def __getattr__(self, key):
+ cdef int idx
+ idx, f = self.map_key2field[key]
+ if self.nfields < idx:
+ raise KeyError("field %s not set" % key)
+ if f == str:
+ return force_str(self.fields[idx],
+ self.encoding)
+ return f(self.fields[idx])
+
+
+cdef dot_or_float(v):
+ if v == "" or v == b".":
+ return None
+ else:
+ try:
+ return int(v)
+ except ValueError:
+ return float(v)
+
+
+cdef dot_or_int(v):
+ if v == "" or v == b".":
+ return None
+ else:
+ return int(v)
+
+
+cdef dot_or_str(v):
+ if v == "" or v == b".":
+ return None
+ else:
+ return force_str(v)
+
+
+cdef int from1based(v):
+ return atoi(v) - 1
+
+
+cdef str to1based(int v):
+ return str(v + 1)
+
+
+cdef class GTFProxy(NamedTupleProxy):
'''Proxy class for access to GTF fields.
This class represents a GTF entry for fast read-access.
@@ -361,18 +419,29 @@ cdef class GTFProxy(TupleProxy):
The only exception is the attributes field when set from
a dictionary - this field will manage its own memory.
+
'''
+ separator = "; "
+ # first value is field index, the tuple contains conversion
+ # functions for getting (converting internal string representation
+ # to pythonic value) and setting (converting pythonic value to
+ # interval string representation)
+ map_key2field = {
+ 'contig' : (0, (str, str)),
+ 'source' : (1, (dot_or_str, str)),
+ 'feature': (2, (dot_or_str, str)),
+ 'start' : (3, (from1based, to1based)),
+ 'end' : (4, (int, int)),
+ 'score' : (5, (dot_or_float, toDot)),
+ 'strand' : (6, (dot_or_str, str)),
+ 'frame' : (7, (dot_or_int, toDot)),
+ 'attributes': (8, (str, str))}
+
def __cinit__(self):
# automatically calls TupleProxy.__cinit__
- self.hasOwnAttributes = False
- self._attributes = NULL
-
- def __dealloc__(self):
- # automatically calls TupleProxy.__dealloc__
- if self.hasOwnAttributes:
- free(self._attributes)
-
+ self.attribute_dict = None
+
cpdef int getMinFields(self):
'''return minimum number of fields.'''
return 9
@@ -381,182 +450,18 @@ cdef class GTFProxy(TupleProxy):
'''return max number of fields.'''
return 9
- property contig:
- '''contig of feature.'''
- def __get__(self):
- return self._getindex(0)
- def __set__(self, value):
- self._setindex(0, value)
-
- property source:
- '''feature source.'''
- def __get__(self):
- return self._getindex(1)
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(1, value)
-
- property feature:
- '''feature name.'''
- def __get__(self):
- return self._getindex(2)
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(2, value)
-
- property start:
- '''feature start (in 0-based open/closed coordinates).'''
- def __get__(self ):
- return int( self._getindex(3)) - 1
- def __set__(self, value ):
- self._setindex(3, str(value+1))
-
- property end:
- '''feature end (in 0-based open/closed coordinates).'''
- def __get__(self):
- return int(self._getindex(4))
- def __set__(self, value):
- self._setindex(4, str(value))
-
- property score:
- '''feature score.'''
- def __get__(self):
- v = self._getindex(5)
- if v == "" or v[0] == '.':
- return None
- else:
- return float(v)
-
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(5, str(value))
-
- property strand:
- '''feature strand.'''
- def __get__(self):
- return self._getindex(6)
- def __set__(self, value ):
- if value is None:
- value = "."
- self._setindex(6, value)
-
- property frame:
- '''feature frame.'''
- def __get__(self):
- v = self._getindex(7)
- if v == "" or v[0] == '.':
- return v
- else:
- return int(v)
-
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(7, str(value))
-
- property attributes:
- '''feature attributes (as a string).'''
- def __get__(self):
- if self.hasOwnAttributes:
- return force_str(self._attributes)
- else:
- return force_str(self._getindex(8))
- def __set__( self, value):
- if self.hasOwnAttributes:
- free(self._attributes)
- self._attributes = NULL
- self.hasOwnAttributes = False
- self._setindex(8, value)
-
- cdef char * getAttributes(self):
- '''return pointer to attributes.'''
- cdef char * attributes
- if self.hasOwnAttributes:
- attributes = self._attributes
- else:
- attributes = self.fields[8]
- if attributes == NULL:
- raise KeyError("no attributes defined GTF entry")
- return attributes
-
def asDict(self):
"""parse attributes - return as dict
"""
-
- # remove comments
- attributes = self.attributes
-
- # separate into fields
- # Fields might contain a ";", for example in ENSEMBL GTF file
- # for mouse, v78:
- # ...; transcript_name "TXNRD2;-001"; ....
- # The current heuristic is to split on a semicolon followed by a
- # space, see also http://mblab.wustl.edu/GTF22.html
-
- # Remove white space to prevent a last empty field.
- fields = [x.strip() for x in attributes.strip().split("; ")]
-
- result = collections.OrderedDict()
-
- for f in fields:
-
- # strip semicolon (GTF files without a space after the last semicolon)
- if f.endswith(";"):
- f = f[:-1]
-
- # split at most once in order to avoid separating
- # multi-word values
- d = [x.strip() for x in f.split(" ", 1)]
-
- n,v = d[0], d[1]
- if len(d) > 2:
- v = d[1:]
-
- if v[0] == '"' and v[-1] == '"':
- v = v[1:-1]
- else:
- ## try to convert to a value
- try:
- v = float(v)
- v = int(v)
- except ValueError:
- pass
- except TypeError:
- pass
-
- result[n] = v
-
- return result
+ return collections.OrderedDict(self.attribute_iterator())
def fromDict(self, d):
'''set attributes from a dictionary.'''
- cdef char * p
- cdef int l
-
- # clean up if this field is set twice
- if self.hasOwnAttributes:
- free(self._attributes)
-
- aa = []
- for k,v in d.items():
- if isinstance(v, str):
- aa.append( '%s "%s"' % (k,v) )
- else:
- aa.append( '%s %s' % (k,str(v)) )
-
- a = force_bytes("; ".join(aa) + ";")
- p = a
- l = len(a)
- self._attributes = <char *>calloc(l + 1, sizeof(char))
- if self._attributes == NULL:
- raise ValueError("out of memory")
- memcpy(self._attributes, p, l)
-
- self.hasOwnAttributes = True
- self.is_modified = True
+ self.attribute_dict = None
+ attribute_string = force_bytes(
+ self.attribute_dict2string(d),
+ self.encoding)
+ self._setindex(8, attribute_string)
def __str__(self):
cdef char * cpy
@@ -565,9 +470,9 @@ cdef class GTFProxy(TupleProxy):
if self.is_modified:
return "\t".join(
(self.contig,
- self.source,
- self.feature,
- str(self.start+1),
+ toDot(self.source),
+ toDot(self.feature),
+ str(self.start + 1),
str(self.end),
toDot(self.score),
toDot(self.strand),
@@ -589,73 +494,26 @@ cdef class GTFProxy(TupleProxy):
def keys(self):
'''return a list of attributes defined in this entry.'''
- r = self.attributes
- return [x.strip().split(" ")[0]
- # separator is ';' followed by space
- for x in r.split("; ") if x.strip() != '']
+ if not self.attribute_dict:
+ self.attribute_dict = self.attribute_string2dict(
+ self.attributes)
+ return self.attribute_dict.keys()
def __getitem__(self, key):
return self.__getattr__(key)
- def __getattr__(self, item):
- """Generic lookup of attribute from GFF/GTF attributes
- Only called if there *isn't* an attribute with this name
- """
- cdef char * start
- cdef char * query
- cdef char * cpy
- cdef char * end
- cdef int l
-
- #
- # important to use the getAttributes function.
- # Using the self.attributes property to access
- # the attributes caused a hard-to-trace bug
- # in which fields in the attribute string were
- # set to 0.
- # Running through valgrind complained that
- # memory was accessed in the memory field
- # that has been released. It is not clear
- # why this happened and might be a cython bug
- # (Version 0.16). The valgrind warnings
- # disappeard after accessing the C data structures
- # directly and so did the bug.
- cdef char * attributes = self.getAttributes()
- if attributes == NULL:
- raise KeyError("key %s not found, no attributes" % item)
-
- # add space in order to make sure
- # to not pick up a field that is a prefix of another field
- r = force_bytes(item + " ")
- query = r
- start = strstr(attributes, query)
-
- if start == NULL:
- raise AttributeError("'GTFProxy' has no attribute '%s'" % item)
-
- start += strlen(query)
- # skip gaps before
- while start[0] == ' ':
- start += 1
-
- if start[0] == '"':
- start += 1
- end = start
- while end[0] != '\0' and end[0] != '"':
- end += 1
- l = end - start
- result = force_str(PyBytes_FromStringAndSize(start, l),
- self.encoding)
- return result
- else:
- return force_str(start, self.encoding)
-
def setAttribute(self, name, value):
- '''convenience method to set an attribute.'''
- r = self.asDict()
- r[name] = value
- self.fromDict(r)
-
+ '''convenience method to set an attribute.
+ '''
+ if not self.attribute_dict:
+ self.attribute_dict = self.attribute_string2dict(
+ self.attributes)
+ self.attribute_dict[name] = value
+
+ def attribute_string2dict(self, s):
+ return collections.OrderedDict(
+ self.attribute_string2iterator(s))
+
def __cmp__(self, other):
return (self.contig, self.strand, self.start) < \
(other.contig, other.strand, other.start)
@@ -676,29 +534,148 @@ cdef class GTFProxy(TupleProxy):
err_msg = "op {0} isn't implemented yet".format(op)
raise NotImplementedError(err_msg)
+ def dict2attribute_string(self, d):
+ """convert dictionary to attribute string in GTF format.
-cdef class NamedTupleProxy(TupleProxy):
+ """
+ aa = []
+ for k, v in d.items():
+ if isinstance(v, str):
+ aa.append('{} "{}"'.format(k, v))
+ else:
+ aa.append("{} {}".format(k, str(v)))
- map_key2field = {}
+ return self.separator.join(aa) + ";"
+
+ def attribute_string2iterator(self, s):
+ """convert attribute string in GTF format to records
+ and iterate over key, value pairs.
+ """
+
+ # remove comments
+ attributes = force_str(s, encoding=self.encoding)
+
+ # separate into fields
+ # Fields might contain a ";", for example in ENSEMBL GTF file
+ # for mouse, v78:
+ # ...; transcript_name "TXNRD2;-001"; ....
+ # The current heuristic is to split on a semicolon followed by a
+ # space, see also http://mblab.wustl.edu/GTF22.html
+
+ # Remove white space to prevent a last empty field.
+ fields = [x.strip() for x in attributes.strip().split("; ")]
+ for f in fields:
+
+ # strip semicolon (GTF files without a space after the last semicolon)
+ if f.endswith(";"):
+ f = f[:-1]
+
+ # split at most once in order to avoid separating
+ # multi-word values
+ d = [x.strip() for x in f.split(" ", 1)]
+
+ n, v = d[0], d[1]
+ if len(d) > 2:
+ v = d[1:]
+
+ if v[0] == '"' and v[-1] == '"':
+ v = v[1:-1]
+ else:
+ ## try to convert to a value
+ try:
+ v = float(v)
+ v = int(v)
+ except ValueError:
+ pass
+ except TypeError:
+ pass
+
+ yield n, v
+
+ def __getattr__(self, key):
+ """Generic lookup of attribute from GFF/GTF attributes
+ """
+
+ # Only called if there *isn't* an attribute with this name
+ cdef int idx
+ idx, f = self.map_key2field.get(key, (-1, None))
+ if idx >= 0:
+ # deal with known attributes (fields 0-8)
+ if idx == 8:
+ # flush attributes if requested
+ if self.is_modified and self.attribute_dict is not None:
+ s = self.dict2attribute_string(self.attribute_dict)
+ TupleProxy._setindex(self, idx, s)
+ self.attribute_dict = None
+ return s
+
+ if f[0] == str:
+ return force_str(self.fields[idx],
+ self.encoding)
+ else:
+ return f[0](self.fields[idx])
+ else:
+ # deal with generic attributes (gene_id, ...)
+ if self.attribute_dict is None:
+ self.attribute_dict = self.attribute_string2dict(
+ self.attributes)
+ return self.attribute_dict[key]
def __setattr__(self, key, value):
'''set attribute.'''
- cdef int idx
- idx, f = self.map_key2field[key]
- if self.nfields < idx:
- raise KeyError("field %s not set" % key)
- TupleProxy.__setitem__(self, idx, str(value))
- def __getattr__(self, key):
+ # Note that __setattr__ is called before properties, so __setattr__ and
+ # properties don't mix well. This is different from __getattr__ which is
+ # called after any properties have been resolved.
cdef int idx
- idx, f = self.map_key2field[key]
- if self.nfields < idx:
- raise KeyError("field %s not set" % key)
- if f == str:
- return force_str(self.fields[idx],
- self.encoding)
- return f(self.fields[idx])
+ idx, f = self.map_key2field.get(key, (-1, None))
+
+ if idx >= 0:
+ if value is None:
+ s = "."
+ elif f[1] == str:
+ s = force_bytes(value,
+ self.encoding)
+ else:
+ s = str(f[1](value))
+ TupleProxy._setindex(self, idx, s)
+ else:
+ if self.attribute_dict is None:
+ self.attribute_dict = self.attribute_string2dict(
+ self.attributes)
+ self.attribute_dict[key] = value
+ self.is_modified = True
+
+
+cdef class GFF3Proxy(GTFProxy):
+
+ def dict2attribute_string(self, d):
+ """convert dictionary to attribute string."""
+ return ";".join(["{}={}".format(k, v) for k, v in d.items()])
+
+ def attribute_string2iterator(self, s):
+ """convert attribute string in GFF3 format to records
+ and iterate over key, value pairs.
+ """
+
+ for f in (x.strip() for x in s.split(";")):
+ if not f:
+ continue
+ key, value = f.split("=", 1)
+ value = value.strip()
+
+ ## try to convert to a value
+ try:
+ value = float(value)
+ value = int(value)
+ except ValueError:
+ pass
+ except TypeError:
+ pass
+
+ yield key.strip(), value
+
cdef class BedProxy(NamedTupleProxy):
'''Proxy class for access to Bed fields.
@@ -762,7 +739,7 @@ cdef class BedProxy(NamedTupleProxy):
self.nfields = save_fields
return retval
- def __setattr__(self, key, value ):
+ def __setattr__(self, key, value):
'''set attribute.'''
if key == "start":
self.start = value
@@ -771,7 +748,8 @@ cdef class BedProxy(NamedTupleProxy):
cdef int idx
idx, f = self.map_key2field[key]
- TupleProxy._setindex(self, idx, str(value) )
+ TupleProxy._setindex(self, idx, str(value))
+
cdef class VCFProxy(NamedTupleProxy):
'''Proxy class for access to VCF fields.
diff --git a/pysam/libcutils.pxd b/pysam/libcutils.pxd
index 81e544a..479d337 100644
--- a/pysam/libcutils.pxd
+++ b/pysam/libcutils.pxd
@@ -28,11 +28,11 @@ cdef from_string_and_size(const char *s, size_t length)
cdef extern from "pysam_util.h":
- int samtools_main(int argc, char *argv[])
- int bcftools_main(int argc, char *argv[])
void pysam_set_stderr(int fd)
void pysam_unset_stderr()
void pysam_set_stdout(int fd)
void pysam_set_stdout_fn(const char *)
void pysam_unset_stdout()
void set_optind(int)
+ extern int samtools_main(int argc, char *argv[])
+ extern int bcftools_main(int argc, char *argv[])
diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx
index 80bd9e4..2b90420 100644
--- a/pysam/libcutils.pyx
+++ b/pysam/libcutils.pyx
@@ -16,6 +16,9 @@ from libc.stdio cimport fprintf, stderr, fflush
from libc.stdio cimport stdout as c_stdout
from posix.fcntl cimport open as c_open, O_WRONLY
+from libcbcftools cimport bcftools_main
+from libcsamtools cimport samtools_main
+
#####################################################################
# hard-coded constants
cdef int MAX_POS = 2 << 29
@@ -234,16 +237,22 @@ def _pysam_dispatch(collection,
method,
args=None,
catch_stdout=True,
+ is_usage=False,
save_stdout=None):
'''call ``method`` in samtools/bcftools providing arguments in args.
+ By default, stdout is redirected to a temporary file using the patched
+ C sources except for a few commands that have an explicit output option
+ (typically: -o). In these commands (such as samtools view), this explicit
+ option is used. If *is_usage* is True, then these explicit output options
+ will not be used.
+
Catching of stdout can be turned off by setting *catch_stdout* to
False.
-
'''
if method == "index":
- if not os.path.exists(args[0]):
+ if args and not os.path.exists(args[0]):
raise IOError("No such file or directory: '%s'" % args[0])
if args is None:
@@ -267,17 +276,16 @@ def _pysam_dispatch(collection,
pysam_set_stdout(stdout_h)
elif catch_stdout:
stdout_h, stdout_f = tempfile.mkstemp()
-
MAP_STDOUT_OPTIONS = {
- "samtools": {
- "view": "-o {}",
- "mpileup": "-o {}",
- "depad": "-o {}",
- "calmd": "", # uses pysam_stdout_fn
- },
+ "samtools": {
+ "view": "-o {}",
+ "mpileup": "-o {}",
+ "depad": "-o {}",
+ "calmd": "", # uses pysam_stdout_fn
+ },
"bcftools": {}
}
-
+
stdout_option = None
if collection == "bcftools":
# in bcftools, most methods accept -o, the exceptions
@@ -289,7 +297,7 @@ def _pysam_dispatch(collection,
if not(method == "view" and "-c" in args):
stdout_option = MAP_STDOUT_OPTIONS[collection][method]
- if stdout_option is not None:
+ if stdout_option is not None and not is_usage:
os.close(stdout_h)
pysam_set_stdout_fn(force_bytes(stdout_f))
args.extend(stdout_option.format(stdout_f).split(" "))
diff --git a/pysam/pysam_util.c b/pysam/pysam_util.c
index 94717c8..5940a35 100644
--- a/pysam/pysam_util.c
+++ b/pysam/pysam_util.c
@@ -2,8 +2,10 @@
#include <assert.h>
#include <unistd.h>
#include <stdio.h>
-#include "bam.h"
-#include "bam_endian.h"
+
+/* #include "bam.h" */
+/* #include "bam_endian.h" */
+
#include "htslib/khash.h"
#include "htslib/ksort.h"
#include "htslib/knetfile.h"
diff --git a/pysam/pysam_util.h b/pysam/pysam_util.h
index a30808f..8627d96 100644
--- a/pysam/pysam_util.h
+++ b/pysam/pysam_util.h
@@ -34,4 +34,8 @@ int pysam_dispatch(int argc, char *argv[]);
void set_optind(int);
+extern int samtools_main(int argc, char *argv[]);
+
+extern int bcftools_main(int argc, char *argv[]);
+
#endif
diff --git a/pysam/samfile_util.c b/pysam/samfile_util.c
index f5724ae..b6917ed 100644
--- a/pysam/samfile_util.c
+++ b/pysam/samfile_util.c
@@ -1,8 +1,6 @@
#include "samfile_util.h"
#include "htslib/sam.h"
-#include "kprobaln.h"
-
// taken from bam_md.c
// replace bam1_{qual,seq,cigar} with bam_get_{qual,seq,cigar}
// bam1_seqi -> bam_seqi
@@ -14,175 +12,5 @@
char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
-int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
-{
- uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b);
- uint32_t *cigar = bam_get_cigar(b);
- bam1_core_t *c = &b->core;
- int i, x, y, mm, q, len, clip_l, clip_q;
- double t;
- if (thres < 0) thres = 40; // set the default
- mm = q = len = clip_l = clip_q = 0;
- for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
- int j, l = cigar[i]>>4, op = cigar[i]&0xf;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j) {
- int z = y + j;
- int c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
- if (ref[x+j] == 0) break; // out of boundary
- if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
- ++len;
- if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
- ++mm;
- q += qual[z] > 33? 33 : qual[z];
- }
- }
- }
- if (j < l) break;
- x += l; y += l; len += l;
- } else if (op == BAM_CDEL) {
- for (j = 0; j < l; ++j)
- if (ref[x+j] == 0) break;
- if (j < l) break;
- x += l;
- } else if (op == BAM_CSOFT_CLIP) {
- for (j = 0; j < l; ++j) clip_q += qual[y+j];
- clip_l += l;
- y += l;
- } else if (op == BAM_CHARD_CLIP) {
- clip_q += 13 * l;
- clip_l += l;
- } else if (op == BAM_CINS) y += l;
- else if (op == BAM_CREF_SKIP) x += l;
- }
- for (i = 0, t = 1; i < mm; ++i)
- t *= (double)len / (i+1);
- t = q - 4.343 * log(t) + clip_q / 5.;
- if (t > thres) return -1;
- if (t < 0) t = 0;
- t = sqrt((thres - t) / thres) * thres;
-// fprintf(stderr, "%s %lf %d\n", bam1_qname(b), t, q);
- return (int)(t + .499);
-}
-
-
-int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
-{
- int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
- uint32_t *cigar = bam_get_cigar(b);
- bam1_core_t *c = &b->core;
- kpa_par_t conf = kpa_par_def;
- uint8_t *bq = 0, *zq = 0, *qual = bam_get_qual(b);
- if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing
- // test if BQ or ZQ is present
- if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
- if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
- if (bq && redo_baq)
- {
- bam_aux_del(b, bq-1);
- bq = 0;
- }
- if (bq && zq) { // remove the ZQ tag
- bam_aux_del(b, zq-1);
- zq = 0;
- }
- if (bq || zq) {
- if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
- if (bq && apply_baq) { // then convert BQ to ZQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
- *(bq - 3) = 'Z';
- } else if (zq && !apply_baq) { // then convert ZQ to BQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] += (int)zq[i] - 64;
- *(zq - 3) = 'B';
- }
- return 0;
- }
- // find the start and end of the alignment
- x = c->pos, y = 0, yb = ye = xb = xe = -1;
- for (k = 0; k < c->n_cigar; ++k) {
- int op, l;
- op = cigar[k]&0xf; l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- if (yb < 0) yb = y;
- if (xb < 0) xb = x;
- ye = y + l; xe = x + l;
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
- }
- // set bandwidth and the start and the end
- bw = 7;
- if (abs((xe - xb) - (ye - yb)) > bw)
- bw = abs((xe - xb) - (ye - yb)) + 3;
- conf.bw = bw;
- xb -= yb + bw/2; if (xb < 0) xb = 0;
- xe += c->l_qseq - ye + bw/2;
- if (xe - xb - c->l_qseq > bw)
- xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
- { // glocal
- uint8_t *s, *r, *q, *seq = bam_get_seq(b), *bq;
- int *state;
- bq = calloc(c->l_qseq + 1, 1);
- memcpy(bq, qual, c->l_qseq);
- s = calloc(c->l_qseq, 1);
- for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam_seqi(seq, i)];
- r = calloc(xe - xb, 1);
- for (i = xb; i < xe; ++i) {
- if (ref[i] == 0) { xe = i; break; }
- r[i-xb] = bam_nt16_nt4_table[seq_nt16_table[(int)ref[i]]];
- }
- state = calloc(c->l_qseq, sizeof(int));
- q = calloc(c->l_qseq, 1);
- kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
- if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i) {
- if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
- else bq[i] = bq[i] < q[i]? bq[i] : q[i];
- }
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
- } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
- uint8_t *left, *rght;
- left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i)
- bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
- for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
- left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
- for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
- rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
- for (i = y; i < y + l; ++i)
- bq[i] = left[i] < rght[i]? left[i] : rght[i];
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
- free(left); free(rght);
- }
- if (apply_baq) {
- for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
- bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
- } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
- free(bq); free(s); free(r); free(q); free(state);
- }
- return 0;
-}
-
-int bam_prob_realn(bam1_t *b, const char *ref)
-{
- return bam_prob_realn_core(b, ref, 1);
-}
diff --git a/pysam/samfile_util.h b/pysam/samfile_util.h
index dd3e27a..94ce096 100644
--- a/pysam/samfile_util.h
+++ b/pysam/samfile_util.h
@@ -3,8 +3,5 @@
#include "htslib/sam.h"
-int bam_cap_mapQ(bam1_t *b, char *ref, int thres);
-int bam_prob_realn(bam1_t *b, const char *ref);
-
#endif
diff --git a/pysam/tabix_util.c b/pysam/tabix_util.c
index bff140e..319808a 100644
--- a/pysam/tabix_util.c
+++ b/pysam/tabix_util.c
@@ -1,6 +1,7 @@
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
+#include <string.h>
#if !(_POSIX_C_SOURCE >= 200809L || _XOPEN_SOURCE >= 700)
/*
diff --git a/pysam/utils.py b/pysam/utils.py
index 5c045df..239f5db 100644
--- a/pysam/utils.py
+++ b/pysam/utils.py
@@ -92,7 +92,14 @@ class PysamDispatcher(object):
def usage(self):
'''return the samtools usage information for this command'''
- retval, stderr, stdout = csamtools._samtools_dispatch(
- self.dispatch)
- return stderr
+ retval, stderr, stdout = _pysam_dispatch(
+ self.collection,
+ self.dispatch,
+ is_usage=True,
+ catch_stdout=True)
+ # some tools write usage to stderr, such as mpileup
+ if stderr:
+ return stderr
+ else:
+ return stdout
diff --git a/pysam/version.py b/pysam/version.py
index facb3bb..ac832cf 100644
--- a/pysam/version.py
+++ b/pysam/version.py
@@ -1,9 +1,10 @@
# pysam versioning information
+__version__ = "0.11.2.2"
-__version__ = "0.10.0"
+# TODO: upgrade number
+__samtools_version__ = "1.4.1"
-__samtools_version__ = "1.3.1"
+# TODO: upgrade code and number
+__bcftools_version__ = "1.4.1"
-__bcftools_version__ = "1.3.1"
-
-__htslib_version__ = "1.3.2"
+__htslib_version__ = "1.4.1"
diff --git a/samtools/bam.h b/samtools/bam.h
index e928ce4..108987c 100644
--- a/samtools/bam.h
+++ b/samtools/bam.h
@@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE. */
@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "1.3.1"
+#define BAM_VERSION "1.4.1"
#include <stdint.h>
#include <stdlib.h>
diff --git a/samtools/bam2bcf.c b/samtools/bam2bcf.c
index 85ce307..a824d5a 100644
--- a/samtools/bam2bcf.c
+++ b/samtools/bam2bcf.c
@@ -29,11 +29,11 @@ DEALINGS IN THE SOFTWARE. */
#include <stdint.h>
#include <assert.h>
#include <float.h>
+#include <htslib/hts.h>
#include <htslib/sam.h>
#include <htslib/kstring.h>
#include <htslib/kfunc.h>
#include "bam2bcf.h"
-#include "errmod.h"
extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
diff --git a/samtools/bam2bcf.c.pysam.c b/samtools/bam2bcf.c.pysam.c
index 6938ec0..3e3e01c 100644
--- a/samtools/bam2bcf.c.pysam.c
+++ b/samtools/bam2bcf.c.pysam.c
@@ -31,11 +31,11 @@ DEALINGS IN THE SOFTWARE. */
#include <stdint.h>
#include <assert.h>
#include <float.h>
+#include <htslib/hts.h>
#include <htslib/sam.h>
#include <htslib/kstring.h>
#include <htslib/kfunc.h>
#include "bam2bcf.h"
-#include "errmod.h"
extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
diff --git a/samtools/bam2bcf.h b/samtools/bam2bcf.h
index 22c67cc..54e5faa 100644
--- a/samtools/bam2bcf.h
+++ b/samtools/bam2bcf.h
@@ -27,8 +27,8 @@ DEALINGS IN THE SOFTWARE. */
#define BAM2BCF_H
#include <stdint.h>
+#include <htslib/hts.h>
#include <htslib/vcf.h>
-#include "errmod.h"
/**
* A simplified version of Mann-Whitney U-test is calculated
diff --git a/samtools/bam2bcf_indel.c b/samtools/bam2bcf_indel.c
index 5b353fc..9749d5b 100644
--- a/samtools/bam2bcf_indel.c
+++ b/samtools/bam2bcf_indel.c
@@ -28,9 +28,9 @@ DEALINGS IN THE SOFTWARE. */
#include <assert.h>
#include <ctype.h>
#include <string.h>
+#include "htslib/hts.h"
#include "htslib/sam.h"
#include "bam2bcf.h"
-#include "kprobaln.h"
#include "htslib/khash.h"
KHASH_SET_INIT_STR(rg)
@@ -359,7 +359,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
bca->indelreg = 0;
for (t = 0; t < n_types; ++t) {
int l, ir;
- kpa_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
+ probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
apf1.bw = apf2.bw = abs(types[t]) + 3;
// compute indelreg
if (types[t] == 0) ir = 0;
@@ -412,14 +412,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
}
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
if (l > 255) l = 255;
score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
if (sc > 5) {
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
l = (int)(100. * sc / (qend - qbeg) + .499);
if (l > 255) l = 255;
score2[K*n_types + t] = sc<<8 | l;
@@ -439,10 +439,13 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
free(ref2); free(query);
{ // compute indelQ
- int *sc, tmp, *sumq;
- sc = alloca(n_types * sizeof(int));
- sumq = alloca(n_types * sizeof(int));
- memset(sumq, 0, sizeof(int) * n_types);
+ int sc_a[16], sumq_a[16];
+ int tmp, *sc = sc_a, *sumq = sumq_a;
+ if (n_types > 16) {
+ sc = (int *)malloc(n_types * sizeof(int));
+ sumq = (int *)malloc(n_types * sizeof(int));
+ }
+ memset(sumq, 0, n_types * sizeof(int));
for (s = K = 0; s < n; ++s) {
for (i = 0; i < n_plp[s]; ++i, ++K) {
bam_pileup1_t *p = plp[s] + i;
@@ -523,6 +526,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
//fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
}
}
+
+ if (sc != sc_a) free(sc);
+ if (sumq != sumq_a) free(sumq);
}
free(score1); free(score2);
// free
diff --git a/samtools/bam2bcf_indel.c.pysam.c b/samtools/bam2bcf_indel.c.pysam.c
index 21cbb03..fcbc90f 100644
--- a/samtools/bam2bcf_indel.c.pysam.c
+++ b/samtools/bam2bcf_indel.c.pysam.c
@@ -30,9 +30,9 @@ DEALINGS IN THE SOFTWARE. */
#include <assert.h>
#include <ctype.h>
#include <string.h>
+#include "htslib/hts.h"
#include "htslib/sam.h"
#include "bam2bcf.h"
-#include "kprobaln.h"
#include "htslib/khash.h"
KHASH_SET_INIT_STR(rg)
@@ -361,7 +361,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
bca->indelreg = 0;
for (t = 0; t < n_types; ++t) {
int l, ir;
- kpa_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
+ probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
apf1.bw = apf2.bw = abs(types[t]) + 3;
// compute indelreg
if (types[t] == 0) ir = 0;
@@ -414,14 +414,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
}
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
if (l > 255) l = 255;
score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
if (sc > 5) {
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
l = (int)(100. * sc / (qend - qbeg) + .499);
if (l > 255) l = 255;
score2[K*n_types + t] = sc<<8 | l;
@@ -441,10 +441,13 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
free(ref2); free(query);
{ // compute indelQ
- int *sc, tmp, *sumq;
- sc = alloca(n_types * sizeof(int));
- sumq = alloca(n_types * sizeof(int));
- memset(sumq, 0, sizeof(int) * n_types);
+ int sc_a[16], sumq_a[16];
+ int tmp, *sc = sc_a, *sumq = sumq_a;
+ if (n_types > 16) {
+ sc = (int *)malloc(n_types * sizeof(int));
+ sumq = (int *)malloc(n_types * sizeof(int));
+ }
+ memset(sumq, 0, n_types * sizeof(int));
for (s = K = 0; s < n; ++s) {
for (i = 0; i < n_plp[s]; ++i, ++K) {
bam_pileup1_t *p = plp[s] + i;
@@ -525,6 +528,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
//fprintf(pysam_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
}
}
+
+ if (sc != sc_a) free(sc);
+ if (sumq != sumq_a) free(sumq);
}
free(score1); free(score2);
// free
diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c
index 21220f1..b732e8e 100644
--- a/samtools/bam2depth.c
+++ b/samtools/bam2depth.c
@@ -80,13 +80,13 @@ static int usage() {
fprintf(stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n");
fprintf(stderr, " -b <bed> list of positions or regions\n");
fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
- fprintf(stderr, " -l <int> read length threshold (ignore reads shorter than <int>)\n");
+ fprintf(stderr, " -l <int> read length threshold (ignore reads shorter than <int>) [0]\n");
fprintf(stderr, " -d/-m <int> maximum coverage depth [8000]\n"); // the htslib's default
- fprintf(stderr, " -q <int> base quality threshold\n");
- fprintf(stderr, " -Q <int> mapping quality threshold\n");
+ fprintf(stderr, " -q <int> base quality threshold [0]\n");
+ fprintf(stderr, " -Q <int> mapping quality threshold [0]\n");
fprintf(stderr, " -r <chr:from-to> region\n");
- sam_global_opt_help(stderr, "-.--.");
+ sam_global_opt_help(stderr, "-.--.-");
fprintf(stderr, "\n");
fprintf(stderr, "The output is a simple tab-separated table with three columns: reference name,\n");
@@ -99,7 +99,7 @@ static int usage() {
int main_depth(int argc, char *argv[])
{
- int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0;
+ int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0;
int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1;
const bam_pileup1_t **plp;
char *reg = 0; // specified region
@@ -112,7 +112,7 @@ int main_depth(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
{ NULL, 0, NULL, 0 }
};
@@ -149,7 +149,7 @@ int main_depth(int argc, char *argv[])
else
n = argc - optind; // the number of BAMs on the command line
data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input
- beg = 0; end = INT_MAX; // set the default region
+ reg_tid = 0; beg = 0; end = INT_MAX; // set the default region
for (i = 0; i < n; ++i) {
int rf;
data[i] = calloc(1, sizeof(aux_t));
@@ -199,6 +199,7 @@ int main_depth(int argc, char *argv[])
if (reg) {
beg = data[0]->iter->beg; // and to the parsed region coordinates
end = data[0]->iter->end;
+ reg_tid = data[0]->iter->tid;
}
// the core multi-pileup loop
@@ -210,12 +211,12 @@ int main_depth(int argc, char *argv[])
while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
if (pos < beg || pos >= end) continue; // out of range; skip
if (tid >= h->n_targets) continue; // diff number of @SQ lines per file?
- if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
if (all) {
while (tid > last_tid) {
- if (last_tid >= 0 && all > 1 && !reg) {
- // Deal with remainder or entirety of last tid
+ if (last_tid >= 0 && !reg) {
+ // Deal with remainder or entirety of last tid.
while (++last_pos < h->target_len[last_tid]) {
+ // Horribly inefficient, but the bed API is an obfuscated black box.
if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
continue;
fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1);
@@ -226,6 +227,8 @@ int main_depth(int argc, char *argv[])
}
last_tid++;
last_pos = -1;
+ if (all < 2)
+ break;
}
// Deal with missing portion of current tid
@@ -242,6 +245,7 @@ int main_depth(int argc, char *argv[])
last_tid = tid;
last_pos = pos;
}
+ if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue;
fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
for (i = 0; i < n; ++i) { // base level filters have to go here
int j, m = 0;
@@ -260,7 +264,11 @@ int main_depth(int argc, char *argv[])
if (all) {
// Handle terminating region
- while (last_tid < h->n_targets) {
+ if (last_tid < 0 && reg && all > 1) {
+ last_tid = reg_tid;
+ last_pos = beg-1;
+ }
+ while (last_tid >= 0 && last_tid < h->n_targets) {
while (++last_pos < h->target_len[last_tid]) {
if (last_pos >= end) break;
if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c
index 9d9dc40..4d9110b 100644
--- a/samtools/bam2depth.c.pysam.c
+++ b/samtools/bam2depth.c.pysam.c
@@ -82,13 +82,13 @@ static int usage() {
fprintf(pysam_stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n");
fprintf(pysam_stderr, " -b <bed> list of positions or regions\n");
fprintf(pysam_stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
- fprintf(pysam_stderr, " -l <int> read length threshold (ignore reads shorter than <int>)\n");
+ fprintf(pysam_stderr, " -l <int> read length threshold (ignore reads shorter than <int>) [0]\n");
fprintf(pysam_stderr, " -d/-m <int> maximum coverage depth [8000]\n"); // the htslib's default
- fprintf(pysam_stderr, " -q <int> base quality threshold\n");
- fprintf(pysam_stderr, " -Q <int> mapping quality threshold\n");
+ fprintf(pysam_stderr, " -q <int> base quality threshold [0]\n");
+ fprintf(pysam_stderr, " -Q <int> mapping quality threshold [0]\n");
fprintf(pysam_stderr, " -r <chr:from-to> region\n");
- sam_global_opt_help(pysam_stderr, "-.--.");
+ sam_global_opt_help(pysam_stderr, "-.--.-");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "The output is a simple tab-separated table with three columns: reference name,\n");
@@ -101,7 +101,7 @@ static int usage() {
int main_depth(int argc, char *argv[])
{
- int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0;
+ int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0;
int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1;
const bam_pileup1_t **plp;
char *reg = 0; // specified region
@@ -114,7 +114,7 @@ int main_depth(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
{ NULL, 0, NULL, 0 }
};
@@ -151,7 +151,7 @@ int main_depth(int argc, char *argv[])
else
n = argc - optind; // the number of BAMs on the command line
data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input
- beg = 0; end = INT_MAX; // set the default region
+ reg_tid = 0; beg = 0; end = INT_MAX; // set the default region
for (i = 0; i < n; ++i) {
int rf;
data[i] = calloc(1, sizeof(aux_t));
@@ -201,6 +201,7 @@ int main_depth(int argc, char *argv[])
if (reg) {
beg = data[0]->iter->beg; // and to the parsed region coordinates
end = data[0]->iter->end;
+ reg_tid = data[0]->iter->tid;
}
// the core multi-pileup loop
@@ -212,12 +213,12 @@ int main_depth(int argc, char *argv[])
while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
if (pos < beg || pos >= end) continue; // out of range; skip
if (tid >= h->n_targets) continue; // diff number of @SQ lines per file?
- if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
if (all) {
while (tid > last_tid) {
- if (last_tid >= 0 && all > 1 && !reg) {
- // Deal with remainder or entirety of last tid
+ if (last_tid >= 0 && !reg) {
+ // Deal with remainder or entirety of last tid.
while (++last_pos < h->target_len[last_tid]) {
+ // Horribly inefficient, but the bed API is an obfuscated black box.
if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
continue;
fputs(h->target_name[last_tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", last_pos+1);
@@ -228,6 +229,8 @@ int main_depth(int argc, char *argv[])
}
last_tid++;
last_pos = -1;
+ if (all < 2)
+ break;
}
// Deal with missing portion of current tid
@@ -244,6 +247,7 @@ int main_depth(int argc, char *argv[])
last_tid = tid;
last_pos = pos;
}
+ if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue;
fputs(h->target_name[tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", pos+1); // a customized fprintf(pysam_stdout, ) would be faster
for (i = 0; i < n; ++i) { // base level filters have to go here
int j, m = 0;
@@ -262,7 +266,11 @@ int main_depth(int argc, char *argv[])
if (all) {
// Handle terminating region
- while (last_tid < h->n_targets) {
+ if (last_tid < 0 && reg && all > 1) {
+ last_tid = reg_tid;
+ last_pos = beg-1;
+ }
+ while (last_tid >= 0 && last_tid < h->n_targets) {
while (++last_pos < h->target_len[last_tid]) {
if (last_pos >= end) break;
if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
diff --git a/samtools/bam_addrprg.c b/samtools/bam_addrprg.c
index f7bbfab..99a198d 100644
--- a/samtools/bam_addrprg.c
+++ b/samtools/bam_addrprg.c
@@ -1,6 +1,6 @@
/* bam_addrprg.c -- samtools command to add or replace readgroups.
- Copyright (c) 2013, 2015 Genome Research Limited.
+ Copyright (c) 2013, 2015, 2016 Genome Research Limited.
Author: Martin O. Pollard <mp15 at sanger.ac.uk>
@@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE. */
#include <htslib/sam.h>
#include <htslib/kstring.h>
#include "samtools.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include <string.h>
#include <stdio.h>
@@ -48,6 +49,7 @@ struct parsed_opts {
char* rg_line;
rg_mode mode;
sam_global_args ga;
+ htsThreadPool p;
};
struct state;
@@ -69,6 +71,7 @@ static void cleanup_opts(parsed_opts_t* opts)
free(opts->rg_id);
free(opts->output_name);
free(opts->input_name);
+ if (opts->p.pool) hts_tpool_destroy(opts->p.pool);
sam_global_args_free(&opts->ga);
free(opts);
}
@@ -131,6 +134,19 @@ static char* basic_unescape(const char* in)
return tmp;
}
+// Malloc a string containing [s,slim) or to the end of s if slim is NULL.
+// If lenp is non-NULL, stores the length of the resulting string there.
+static char *dup_substring(const char *s, const char *slim, size_t *lenp)
+{
+ size_t len = slim? (slim - s) : strlen(s);
+ char *ns = malloc(len+1);
+ if (ns == NULL) return NULL;
+ memcpy(ns, s, len);
+ ns[len] = '\0';
+ if (lenp) *lenp = len;
+ return ns;
+}
+
// These are to be replaced by samtools header parser
// Extracts the first @RG line from a string.
static char* get_rg_line(const char* text, size_t* last)
@@ -143,37 +159,17 @@ static char* get_rg_line(const char* text, size_t* last)
rg++;//skip initial \n
}
// duplicate the line for return
- char* line;
- char* end = strchr(rg, '\n');
- if (end) {
- line = strndup(rg,(end-rg));
- *last = end - rg;
- } else {
- line = strdup(rg);
- *last = strlen(rg);
- }
- return line;
+ return dup_substring(rg, strchr(rg, '\n'), last);
}
// Given a @RG line return the id
-static char* get_rg_id(const char* input)
+static char* get_rg_id(const char *line)
{
- assert(input!=NULL);
- char* line = strdup(input);
- char *next = line;
- char* token = strsep(&next, "\t");
- token = strsep(&next,"\t"); // skip first token it should always be "@RG"
- while (next != NULL) {
- char* key = strsep(&token,":");
- if (!strcmp(key,"ID")) {
- char* retval = strdup(token);
- free(line);
- return retval;
- }
- token = strsep(&next,"\t");
- }
- free(line);
- return NULL;
+ const char *id = strstr(line, "\tID:");
+ if (! id) return NULL;
+
+ id += 4;
+ return dup_substring(id, strchr(id, '\t'), NULL);
}
// Confirms the existance of an RG line with a given ID in a bam header
@@ -181,9 +177,8 @@ static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid )
{
assert( hdr != NULL && rgid != NULL );
- char *ptr, *start;
+ const char *ptr = hdr->text;
bool found = false;
- start = ptr = strndup(hdr->text, hdr->l_text);
while (ptr != NULL && *ptr != '\0' && found == false ) {
size_t end = 0;
char* line = get_rg_line(ptr, &end);
@@ -196,16 +191,14 @@ static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid )
free(line);
ptr += end;
}
- free(start);
return found;
}
static char* get_first_rgid( const bam_hdr_t *hdr )
{
assert( hdr != NULL );
- char *ptr, *start;
+ const char *ptr = hdr->text;
char* found = NULL;
- start = ptr = strndup(hdr->text, hdr->l_text);
while (ptr != NULL && *ptr != '\0' && found == NULL ) {
size_t end = 0;
char* line = get_rg_line(ptr, &end);
@@ -215,7 +208,6 @@ static char* get_first_rgid( const bam_hdr_t *hdr )
free(line);
ptr += end;
}
- free(start);
return found;
}
@@ -230,7 +222,7 @@ static void usage(FILE *fp)
" -r STRING @RG line text\n"
" -R STRING ID of @RG line in existing header to use\n"
);
- sam_global_opt_help(fp, "..O..");
+ sam_global_opt_help(fp, "..O..@");
}
static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
@@ -249,12 +241,12 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
retval->mode = overwrite_all;
sam_global_args_init(&retval->ga);
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
kstring_t rg_line = {0,0,NULL};
- while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h", lopts, NULL)) >= 0) {
+ while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h@:", lopts, NULL)) >= 0) {
switch (n) {
case 'r':
// Are we adding to existing rg line?
@@ -328,6 +320,13 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
}
retval->input_name = strdup(argv[optind+0]);
+ if (retval->ga.nthreads > 0) {
+ if (!(retval->p.pool = hts_tpool_init(retval->ga.nthreads))) {
+ fprintf(stderr, "Error creating thread pool\n");
+ return false;
+ }
+ }
+
*opts = retval;
return true;
}
@@ -369,7 +368,7 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) {
// Open files
retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
if (retval->input_file == NULL) {
- fprintf(stderr, "[init] Could not open input file: %s\n", opts->input_name);
+ print_error_errno("addreplacerg", "could not open \"%s\"", opts->input_name);
return false;
}
retval->input_header = sam_hdr_read(retval->input_file);
@@ -378,10 +377,15 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) {
retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, "w", &opts->ga.out);
if (retval->output_file == NULL) {
- print_error_errno("addreplacerg", "Could not open output file: %s\n", opts->output_name);
+ print_error_errno("addreplacerg", "could not create \"%s\"", opts->output_name);
return false;
}
+ if (opts->p.pool) {
+ hts_set_opt(retval->input_file, HTS_OPT_THREAD_POOL, &opts->p);
+ hts_set_opt(retval->output_file, HTS_OPT_THREAD_POOL, &opts->p);
+ }
+
if (opts->rg_line) {
// Append new RG line to header.
// Check does not already exist
@@ -466,13 +470,13 @@ int main_addreplacerg(int argc, char** argv)
if (!readgroupise(state)) goto error;
- cleanup_opts(opts);
cleanup_state(state);
+ cleanup_opts(opts);
return EXIT_SUCCESS;
error:
- cleanup_opts(opts);
cleanup_state(state);
+ cleanup_opts(opts);
return EXIT_FAILURE;
}
diff --git a/samtools/bam_addrprg.c.pysam.c b/samtools/bam_addrprg.c.pysam.c
index 2ddd1b1..56986dd 100644
--- a/samtools/bam_addrprg.c.pysam.c
+++ b/samtools/bam_addrprg.c.pysam.c
@@ -2,7 +2,7 @@
/* bam_addrprg.c -- samtools command to add or replace readgroups.
- Copyright (c) 2013, 2015 Genome Research Limited.
+ Copyright (c) 2013, 2015, 2016 Genome Research Limited.
Author: Martin O. Pollard <mp15 at sanger.ac.uk>
@@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */
#include <htslib/sam.h>
#include <htslib/kstring.h>
#include "samtools.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include <string.h>
#include <stdio.h>
@@ -50,6 +51,7 @@ struct parsed_opts {
char* rg_line;
rg_mode mode;
sam_global_args ga;
+ htsThreadPool p;
};
struct state;
@@ -71,6 +73,7 @@ static void cleanup_opts(parsed_opts_t* opts)
free(opts->rg_id);
free(opts->output_name);
free(opts->input_name);
+ if (opts->p.pool) hts_tpool_destroy(opts->p.pool);
sam_global_args_free(&opts->ga);
free(opts);
}
@@ -133,6 +136,19 @@ static char* basic_unescape(const char* in)
return tmp;
}
+// Malloc a string containing [s,slim) or to the end of s if slim is NULL.
+// If lenp is non-NULL, stores the length of the resulting string there.
+static char *dup_substring(const char *s, const char *slim, size_t *lenp)
+{
+ size_t len = slim? (slim - s) : strlen(s);
+ char *ns = malloc(len+1);
+ if (ns == NULL) return NULL;
+ memcpy(ns, s, len);
+ ns[len] = '\0';
+ if (lenp) *lenp = len;
+ return ns;
+}
+
// These are to be replaced by samtools header parser
// Extracts the first @RG line from a string.
static char* get_rg_line(const char* text, size_t* last)
@@ -145,37 +161,17 @@ static char* get_rg_line(const char* text, size_t* last)
rg++;//skip initial \n
}
// duplicate the line for return
- char* line;
- char* end = strchr(rg, '\n');
- if (end) {
- line = strndup(rg,(end-rg));
- *last = end - rg;
- } else {
- line = strdup(rg);
- *last = strlen(rg);
- }
- return line;
+ return dup_substring(rg, strchr(rg, '\n'), last);
}
// Given a @RG line return the id
-static char* get_rg_id(const char* input)
+static char* get_rg_id(const char *line)
{
- assert(input!=NULL);
- char* line = strdup(input);
- char *next = line;
- char* token = strsep(&next, "\t");
- token = strsep(&next,"\t"); // skip first token it should always be "@RG"
- while (next != NULL) {
- char* key = strsep(&token,":");
- if (!strcmp(key,"ID")) {
- char* retval = strdup(token);
- free(line);
- return retval;
- }
- token = strsep(&next,"\t");
- }
- free(line);
- return NULL;
+ const char *id = strstr(line, "\tID:");
+ if (! id) return NULL;
+
+ id += 4;
+ return dup_substring(id, strchr(id, '\t'), NULL);
}
// Confirms the existance of an RG line with a given ID in a bam header
@@ -183,9 +179,8 @@ static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid )
{
assert( hdr != NULL && rgid != NULL );
- char *ptr, *start;
+ const char *ptr = hdr->text;
bool found = false;
- start = ptr = strndup(hdr->text, hdr->l_text);
while (ptr != NULL && *ptr != '\0' && found == false ) {
size_t end = 0;
char* line = get_rg_line(ptr, &end);
@@ -198,16 +193,14 @@ static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid )
free(line);
ptr += end;
}
- free(start);
return found;
}
static char* get_first_rgid( const bam_hdr_t *hdr )
{
assert( hdr != NULL );
- char *ptr, *start;
+ const char *ptr = hdr->text;
char* found = NULL;
- start = ptr = strndup(hdr->text, hdr->l_text);
while (ptr != NULL && *ptr != '\0' && found == NULL ) {
size_t end = 0;
char* line = get_rg_line(ptr, &end);
@@ -217,7 +210,6 @@ static char* get_first_rgid( const bam_hdr_t *hdr )
free(line);
ptr += end;
}
- free(start);
return found;
}
@@ -232,7 +224,7 @@ static void usage(FILE *fp)
" -r STRING @RG line text\n"
" -R STRING ID of @RG line in existing header to use\n"
);
- sam_global_opt_help(fp, "..O..");
+ sam_global_opt_help(fp, "..O..@");
}
static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
@@ -251,12 +243,12 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
retval->mode = overwrite_all;
sam_global_args_init(&retval->ga);
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
kstring_t rg_line = {0,0,NULL};
- while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h", lopts, NULL)) >= 0) {
+ while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h@:", lopts, NULL)) >= 0) {
switch (n) {
case 'r':
// Are we adding to existing rg line?
@@ -330,6 +322,13 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
}
retval->input_name = strdup(argv[optind+0]);
+ if (retval->ga.nthreads > 0) {
+ if (!(retval->p.pool = hts_tpool_init(retval->ga.nthreads))) {
+ fprintf(pysam_stderr, "Error creating thread pool\n");
+ return false;
+ }
+ }
+
*opts = retval;
return true;
}
@@ -371,7 +370,7 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) {
// Open files
retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
if (retval->input_file == NULL) {
- fprintf(pysam_stderr, "[init] Could not open input file: %s\n", opts->input_name);
+ print_error_errno("addreplacerg", "could not open \"%s\"", opts->input_name);
return false;
}
retval->input_header = sam_hdr_read(retval->input_file);
@@ -380,10 +379,15 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) {
retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, "w", &opts->ga.out);
if (retval->output_file == NULL) {
- print_error_errno("addreplacerg", "Could not open output file: %s\n", opts->output_name);
+ print_error_errno("addreplacerg", "could not create \"%s\"", opts->output_name);
return false;
}
+ if (opts->p.pool) {
+ hts_set_opt(retval->input_file, HTS_OPT_THREAD_POOL, &opts->p);
+ hts_set_opt(retval->output_file, HTS_OPT_THREAD_POOL, &opts->p);
+ }
+
if (opts->rg_line) {
// Append new RG line to header.
// Check does not already exist
@@ -468,13 +472,13 @@ int main_addreplacerg(int argc, char** argv)
if (!readgroupise(state)) goto error;
- cleanup_opts(opts);
cleanup_state(state);
+ cleanup_opts(opts);
return EXIT_SUCCESS;
error:
- cleanup_opts(opts);
cleanup_state(state);
+ cleanup_opts(opts);
return EXIT_FAILURE;
}
diff --git a/samtools/bam_cat.c b/samtools/bam_cat.c
index 5c303d1..95498ec 100644
--- a/samtools/bam_cat.c
+++ b/samtools/bam_cat.c
@@ -40,6 +40,7 @@ Illumina.
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
+#include <strings.h>
#include "htslib/bgzf.h"
#include "htslib/sam.h"
@@ -468,7 +469,7 @@ int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam)
}
if (in->block_offset < in->block_length) {
- if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
+ if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
if (bgzf_flush(fp) != 0) goto write_fail;
}
@@ -531,10 +532,12 @@ int main_cat(int argc, char *argv[])
{
bam_hdr_t *h = 0;
char *outfn = 0;
+ char **infns = NULL; // files to concatenate
+ int infns_size = 0;
int c, ret = 0;
samFile *in;
- while ((c = getopt(argc, argv, "h:o:")) >= 0) {
+ while ((c = getopt(argc, argv, "h:o:b:")) >= 0) {
switch (c) {
case 'h': {
samFile *fph = sam_open(optarg, "r");
@@ -553,29 +556,61 @@ int main_cat(int argc, char *argv[])
break;
}
case 'o': outfn = strdup(optarg); break;
+ case 'b': {
+ // add file names in "optarg" to the list
+ // of files to concatenate
+ int nfns;
+ char **fns_read = hts_readlines(optarg, &nfns);
+ if (fns_read) {
+ infns = realloc(infns, (infns_size + nfns) * sizeof(char*));
+ if (infns == NULL) { ret = 1; goto end; }
+ memcpy(infns+infns_size, fns_read, nfns * sizeof(char*));
+ infns_size += nfns;
+ free(fns_read);
+ } else {
+ print_error("cat", "Invalid file list \"%s\"", optarg);
+ ret = 1;
+ }
+ break;
+ }
}
}
- if (argc - optind < 1) {
- fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> [...]\n");
+
+ // Append files specified in argv to the list.
+ int nargv_fns = argc - optind;
+ if (nargv_fns > 0) {
+ infns = realloc(infns, (infns_size + nargv_fns) * sizeof(char*));
+ if (infns == NULL) { ret = 1; goto end; }
+ memcpy(infns + infns_size, argv + optind, nargv_fns * sizeof(char*));
+ }
+
+ // Require at least one input file
+ if (infns_size + nargv_fns == 0) {
+ fprintf(stderr, "Usage: samtools cat [options] <in1.bam> [... <inN.bam>]\n");
+ fprintf(stderr, " samtools cat [options] <in1.cram> [... <inN.cram>]\n\n");
+ fprintf(stderr, "Concatenate BAM or CRAM files, first those in <bamlist.fofn>, then those\non the command line.\n\n");
+ fprintf(stderr, "Options: -b FILE list of input BAM/CRAM file names, one per line\n");
+ fprintf(stderr, " -h FILE copy the header from FILE [default is 1st input file]\n");
+ fprintf(stderr, " -o FILE output BAM/CRAM\n");
return 1;
}
- in = sam_open(argv[optind], "r");
+ in = sam_open(infns[0], "r");
if (!in) {
- print_error_errno("cat", "failed to open file '%s'", argv[optind]);
+ print_error_errno("cat", "failed to open file '%s'", infns[0]);
return 1;
}
switch (hts_get_format(in)->format) {
case bam:
sam_close(in);
- if (bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0)
+ if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0)
ret = 1;
break;
case cram:
sam_close(in);
- if (cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0)
+ if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0)
ret = 1;
break;
@@ -584,7 +619,16 @@ int main_cat(int argc, char *argv[])
fprintf(stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__);
return 1;
}
+
+ end:
+ if (infns_size > 0) {
+ int i;
+ for (i=0; i<infns_size; i++)
+ free(infns[i]);
+ }
+
free(outfn);
+ free(infns);
if (h)
bam_hdr_destroy(h);
diff --git a/samtools/bam_cat.c.pysam.c b/samtools/bam_cat.c.pysam.c
index daa0454..20adbc1 100644
--- a/samtools/bam_cat.c.pysam.c
+++ b/samtools/bam_cat.c.pysam.c
@@ -42,6 +42,7 @@ Illumina.
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
+#include <strings.h>
#include "htslib/bgzf.h"
#include "htslib/sam.h"
@@ -470,7 +471,7 @@ int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam)
}
if (in->block_offset < in->block_length) {
- if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
+ if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
if (bgzf_flush(fp) != 0) goto write_fail;
}
@@ -533,10 +534,12 @@ int main_cat(int argc, char *argv[])
{
bam_hdr_t *h = 0;
char *outfn = 0;
+ char **infns = NULL; // files to concatenate
+ int infns_size = 0;
int c, ret = 0;
samFile *in;
- while ((c = getopt(argc, argv, "h:o:")) >= 0) {
+ while ((c = getopt(argc, argv, "h:o:b:")) >= 0) {
switch (c) {
case 'h': {
samFile *fph = sam_open(optarg, "r");
@@ -555,29 +558,61 @@ int main_cat(int argc, char *argv[])
break;
}
case 'o': outfn = strdup(optarg); break;
+ case 'b': {
+ // add file names in "optarg" to the list
+ // of files to concatenate
+ int nfns;
+ char **fns_read = hts_readlines(optarg, &nfns);
+ if (fns_read) {
+ infns = realloc(infns, (infns_size + nfns) * sizeof(char*));
+ if (infns == NULL) { ret = 1; goto end; }
+ memcpy(infns+infns_size, fns_read, nfns * sizeof(char*));
+ infns_size += nfns;
+ free(fns_read);
+ } else {
+ print_error("cat", "Invalid file list \"%s\"", optarg);
+ ret = 1;
+ }
+ break;
+ }
}
}
- if (argc - optind < 1) {
- fprintf(pysam_stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> [...]\n");
+
+ // Append files specified in argv to the list.
+ int nargv_fns = argc - optind;
+ if (nargv_fns > 0) {
+ infns = realloc(infns, (infns_size + nargv_fns) * sizeof(char*));
+ if (infns == NULL) { ret = 1; goto end; }
+ memcpy(infns + infns_size, argv + optind, nargv_fns * sizeof(char*));
+ }
+
+ // Require at least one input file
+ if (infns_size + nargv_fns == 0) {
+ fprintf(pysam_stderr, "Usage: samtools cat [options] <in1.bam> [... <inN.bam>]\n");
+ fprintf(pysam_stderr, " samtools cat [options] <in1.cram> [... <inN.cram>]\n\n");
+ fprintf(pysam_stderr, "Concatenate BAM or CRAM files, first those in <bamlist.fofn>, then those\non the command line.\n\n");
+ fprintf(pysam_stderr, "Options: -b FILE list of input BAM/CRAM file names, one per line\n");
+ fprintf(pysam_stderr, " -h FILE copy the header from FILE [default is 1st input file]\n");
+ fprintf(pysam_stderr, " -o FILE output BAM/CRAM\n");
return 1;
}
- in = sam_open(argv[optind], "r");
+ in = sam_open(infns[0], "r");
if (!in) {
- print_error_errno("cat", "failed to open file '%s'", argv[optind]);
+ print_error_errno("cat", "failed to open file '%s'", infns[0]);
return 1;
}
switch (hts_get_format(in)->format) {
case bam:
sam_close(in);
- if (bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0)
+ if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0)
ret = 1;
break;
case cram:
sam_close(in);
- if (cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0)
+ if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0)
ret = 1;
break;
@@ -586,7 +621,16 @@ int main_cat(int argc, char *argv[])
fprintf(pysam_stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__);
return 1;
}
+
+ end:
+ if (infns_size > 0) {
+ int i;
+ for (i=0; i<infns_size; i++)
+ free(infns[i]);
+ }
+
free(outfn);
+ free(infns);
if (h)
bam_hdr_destroy(h);
diff --git a/samtools/bam_index.c b/samtools/bam_index.c
index 3a5acf6..40b7e0f 100644
--- a/samtools/bam_index.c
+++ b/samtools/bam_index.c
@@ -46,20 +46,23 @@ static void index_usage(FILE *fp)
"Options:\n"
" -b Generate BAI-format index for BAM files [default]\n"
" -c Generate CSI-format index for BAM files\n"
-" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n", BAM_LIDX_SHIFT);
+" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n"
+" -@ INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
}
int bam_index(int argc, char *argv[])
{
int csi = 0;
int min_shift = BAM_LIDX_SHIFT;
+ int n_threads = 0;
int c, ret;
- while ((c = getopt(argc, argv, "bcm:")) >= 0)
+ while ((c = getopt(argc, argv, "bcm:@:")) >= 0)
switch (c) {
case 'b': csi = 0; break;
case 'c': csi = 1; break;
case 'm': csi = 1; min_shift = atoi(optarg); break;
+ case '@': n_threads = atoi(optarg); break;
default:
index_usage(stderr);
return 1;
@@ -70,18 +73,32 @@ int bam_index(int argc, char *argv[])
return 1;
}
- ret = sam_index_build2(argv[optind], argv[optind+1], csi? min_shift : 0);
- if (ret != 0) {
- if (ret == -2)
- print_error_errno("index", "failed to open \"%s\"", argv[optind]);
- else if (ret == -3)
- print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]);
+ ret = sam_index_build3(argv[optind], argv[optind+1], csi? min_shift : 0, n_threads);
+ switch (ret) {
+ case 0:
+ return 0;
+
+ case -2:
+ print_error_errno("index", "failed to open \"%s\"", argv[optind]);
+ break;
+
+ case -3:
+ print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]);
+ break;
+
+ case -4:
+ if (argv[optind+1])
+ print_error("index", "failed to create or write index \"%s\"", argv[optind+1]);
else
- print_error("index", "\"%s\" is corrupted or unsorted", argv[optind]);
- return EXIT_FAILURE;
+ print_error("index", "failed to create or write index");
+ break;
+
+ default:
+ print_error_errno("index", "failed to create index for \"%s\"", argv[optind]);
+ break;
}
- return 0;
+ return EXIT_FAILURE;
}
int bam_idxstats(int argc, char *argv[])
@@ -95,15 +112,20 @@ int bam_idxstats(int argc, char *argv[])
return 1;
}
fp = sam_open(argv[1], "r");
- if (fp == NULL) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
+ if (fp == NULL) {
+ print_error_errno("idxstats", "failed to open \"%s\"", argv[1]);
+ return 1;
+ }
header = sam_hdr_read(fp);
if (header == NULL) {
- fprintf(stderr, "[%s] failed to read header for '%s'.\n",
- __func__, argv[1]);
+ print_error("idxstats", "failed to read header for \"%s\"", argv[1]);
return 1;
}
idx = sam_index_load(fp, argv[1]);
- if (idx == NULL) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
+ if (idx == NULL) {
+ print_error("idxstats", "fail to load index for \"%s\"", argv[1]);
+ return 1;
+ }
int i;
for (i = 0; i < header->n_targets; ++i) {
diff --git a/samtools/bam_index.c.pysam.c b/samtools/bam_index.c.pysam.c
index 6c0efdc..a91ee76 100644
--- a/samtools/bam_index.c.pysam.c
+++ b/samtools/bam_index.c.pysam.c
@@ -48,20 +48,23 @@ static void index_usage(FILE *fp)
"Options:\n"
" -b Generate BAI-format index for BAM files [default]\n"
" -c Generate CSI-format index for BAM files\n"
-" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n", BAM_LIDX_SHIFT);
+" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n"
+" -@ INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
}
int bam_index(int argc, char *argv[])
{
int csi = 0;
int min_shift = BAM_LIDX_SHIFT;
+ int n_threads = 0;
int c, ret;
- while ((c = getopt(argc, argv, "bcm:")) >= 0)
+ while ((c = getopt(argc, argv, "bcm:@:")) >= 0)
switch (c) {
case 'b': csi = 0; break;
case 'c': csi = 1; break;
case 'm': csi = 1; min_shift = atoi(optarg); break;
+ case '@': n_threads = atoi(optarg); break;
default:
index_usage(pysam_stderr);
return 1;
@@ -72,18 +75,32 @@ int bam_index(int argc, char *argv[])
return 1;
}
- ret = sam_index_build2(argv[optind], argv[optind+1], csi? min_shift : 0);
- if (ret != 0) {
- if (ret == -2)
- print_error_errno("index", "failed to open \"%s\"", argv[optind]);
- else if (ret == -3)
- print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]);
+ ret = sam_index_build3(argv[optind], argv[optind+1], csi? min_shift : 0, n_threads);
+ switch (ret) {
+ case 0:
+ return 0;
+
+ case -2:
+ print_error_errno("index", "failed to open \"%s\"", argv[optind]);
+ break;
+
+ case -3:
+ print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]);
+ break;
+
+ case -4:
+ if (argv[optind+1])
+ print_error("index", "failed to create or write index \"%s\"", argv[optind+1]);
else
- print_error("index", "\"%s\" is corrupted or unsorted", argv[optind]);
- return EXIT_FAILURE;
+ print_error("index", "failed to create or write index");
+ break;
+
+ default:
+ print_error_errno("index", "failed to create index for \"%s\"", argv[optind]);
+ break;
}
- return 0;
+ return EXIT_FAILURE;
}
int bam_idxstats(int argc, char *argv[])
@@ -97,15 +114,20 @@ int bam_idxstats(int argc, char *argv[])
return 1;
}
fp = sam_open(argv[1], "r");
- if (fp == NULL) { fprintf(pysam_stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
+ if (fp == NULL) {
+ print_error_errno("idxstats", "failed to open \"%s\"", argv[1]);
+ return 1;
+ }
header = sam_hdr_read(fp);
if (header == NULL) {
- fprintf(pysam_stderr, "[%s] failed to read header for '%s'.\n",
- __func__, argv[1]);
+ print_error("idxstats", "failed to read header for \"%s\"", argv[1]);
return 1;
}
idx = sam_index_load(fp, argv[1]);
- if (idx == NULL) { fprintf(pysam_stderr, "[%s] fail to load the index.\n", __func__); return 1; }
+ if (idx == NULL) {
+ print_error("idxstats", "fail to load index for \"%s\"", argv[1]);
+ return 1;
+ }
int i;
for (i = 0; i < header->n_targets; ++i) {
diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c
index 5b13b2e..75c2f51 100644
--- a/samtools/bam_mate.c
+++ b/samtools/bam_mate.c
@@ -1,6 +1,6 @@
/* bam_mate.c -- fix mate pairing information and clean up flags.
- Copyright (C) 2009, 2011-2016 Genome Research Ltd.
+ Copyright (C) 2009, 2011-2017 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Portions copyright (C) 2012 Peter Cock, The James Hutton Institute.
@@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "htslib/kstring.h"
#include "htslib/sam.h"
@@ -155,9 +156,30 @@ static bool plausibly_properly_paired(bam1_t* a, bam1_t* b)
return false;
}
-static void sync_mq(bam1_t* src, bam1_t* dest)
+// Returns 0 on success, -1 on failure.
+static int bam_format_cigar(const bam1_t* b, kstring_t* str)
+{
+ // An empty cigar is a special case return "*" rather than ""
+ if (b->core.n_cigar == 0) {
+ return (kputc('*', str) == EOF) ? -1 : 0;
+ }
+
+ const uint32_t *cigar = bam_get_cigar(b);
+ uint32_t i;
+
+ for (i = 0; i < b->core.n_cigar; ++i) {
+ if (kputw(bam_cigar_oplen(cigar[i]), str) == EOF) return -1;
+ if (kputc(bam_cigar_opchr(cigar[i]), str) == EOF) return -1;
+ }
+
+ return 0;
+}
+
+// Returns 0 on success, -1 on failure.
+static int sync_mq_mc(bam1_t* src, bam1_t* dest)
{
if ( (src->core.flag & BAM_FUNMAP) == 0 ) { // If mapped
+ // Copy Mate Mapping Quality
uint32_t mq = src->core.qual;
uint8_t* data;
if ((data = bam_aux_get(dest,"MQ")) != NULL) {
@@ -166,17 +188,34 @@ static void sync_mq(bam1_t* src, bam1_t* dest)
bam_aux_append(dest, "MQ", 'i', sizeof(uint32_t), (uint8_t*)&mq);
}
+ // Copy mate cigar if either read is mapped
+ if ( (src->core.flag & BAM_FUNMAP) == 0 || (dest->core.flag & BAM_FUNMAP) == 0 ) {
+ uint8_t* data_mc;
+ if ((data_mc = bam_aux_get(dest,"MC")) != NULL) {
+ bam_aux_del(dest, data_mc);
+ }
+
+ // Convert cigar to string
+ kstring_t mc = { 0, 0, NULL };
+ if (bam_format_cigar(src, &mc) < 0) return -1;
+
+ bam_aux_append(dest, "MC", 'Z', ks_len(&mc)+1, (uint8_t*)ks_str(&mc));
+ free(mc.s);
+ }
+ return 0;
}
-// copy flags
-static void sync_mate(bam1_t* a, bam1_t* b)
+// Copy flags.
+// Returns 0 on success, -1 on failure.
+static int sync_mate(bam1_t* a, bam1_t* b)
{
sync_unmapped_pos_inner(a,b);
sync_unmapped_pos_inner(b,a);
sync_mate_inner(a,b);
sync_mate_inner(b,a);
- sync_mq(a,b);
- sync_mq(b,a);
+ if (sync_mq_mc(a,b) < 0) return -1;
+ if (sync_mq_mc(b,a) < 0) return -1;
+ return 0;
}
// currently, this function ONLY works if each read has one hit
@@ -239,7 +278,7 @@ static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int prop
if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name
pre->core.flag |= BAM_FPAIRED;
cur->core.flag |= BAM_FPAIRED;
- sync_mate(pre, cur);
+ if (sync_mate(pre, cur)) goto fail;
if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
&& !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE
@@ -324,7 +363,7 @@ void usage(FILE* where)
" -p Disable FR proper pair check\n"
" -c Add template cigar ct tag\n");
- sam_global_opt_help(where, "-.O..");
+ sam_global_opt_help(where, "-.O..@");
fprintf(where,
"\n"
@@ -335,18 +374,19 @@ void usage(FILE* where)
int bam_mating(int argc, char *argv[])
{
+ htsThreadPool p = {NULL, 0};
samFile *in = NULL, *out = NULL;
int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
char wmode[3] = {'w', 'b', 0};
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
// parse args
if (argc == 1) { usage(stdout); return 0; }
- while ((c = getopt_long(argc, argv, "rpcO:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "rpcO:@:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': remove_reads = 1; break;
case 'p': proper_pair_check = 0; break;
@@ -369,6 +409,15 @@ int bam_mating(int argc, char *argv[])
goto fail;
}
+ if (ga.nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(stderr, "Error creating thread pool\n");
+ goto fail;
+ }
+ hts_set_opt(in, HTS_OPT_THREAD_POOL, &p);
+ hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+ }
+
// run
res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct);
@@ -379,12 +428,14 @@ int bam_mating(int argc, char *argv[])
res = 1;
}
+ if (p.pool) hts_tpool_destroy(p.pool);
sam_global_args_free(&ga);
return res;
fail:
if (in) sam_close(in);
if (out) sam_close(out);
+ if (p.pool) hts_tpool_destroy(p.pool);
sam_global_args_free(&ga);
return 1;
}
diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c
index a416d07..a03de96 100644
--- a/samtools/bam_mate.c.pysam.c
+++ b/samtools/bam_mate.c.pysam.c
@@ -2,7 +2,7 @@
/* bam_mate.c -- fix mate pairing information and clean up flags.
- Copyright (C) 2009, 2011-2016 Genome Research Ltd.
+ Copyright (C) 2009, 2011-2017 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Portions copyright (C) 2012 Peter Cock, The James Hutton Institute.
@@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "htslib/kstring.h"
#include "htslib/sam.h"
@@ -157,9 +158,30 @@ static bool plausibly_properly_paired(bam1_t* a, bam1_t* b)
return false;
}
-static void sync_mq(bam1_t* src, bam1_t* dest)
+// Returns 0 on success, -1 on failure.
+static int bam_format_cigar(const bam1_t* b, kstring_t* str)
+{
+ // An empty cigar is a special case return "*" rather than ""
+ if (b->core.n_cigar == 0) {
+ return (kputc('*', str) == EOF) ? -1 : 0;
+ }
+
+ const uint32_t *cigar = bam_get_cigar(b);
+ uint32_t i;
+
+ for (i = 0; i < b->core.n_cigar; ++i) {
+ if (kputw(bam_cigar_oplen(cigar[i]), str) == EOF) return -1;
+ if (kputc(bam_cigar_opchr(cigar[i]), str) == EOF) return -1;
+ }
+
+ return 0;
+}
+
+// Returns 0 on success, -1 on failure.
+static int sync_mq_mc(bam1_t* src, bam1_t* dest)
{
if ( (src->core.flag & BAM_FUNMAP) == 0 ) { // If mapped
+ // Copy Mate Mapping Quality
uint32_t mq = src->core.qual;
uint8_t* data;
if ((data = bam_aux_get(dest,"MQ")) != NULL) {
@@ -168,17 +190,34 @@ static void sync_mq(bam1_t* src, bam1_t* dest)
bam_aux_append(dest, "MQ", 'i', sizeof(uint32_t), (uint8_t*)&mq);
}
+ // Copy mate cigar if either read is mapped
+ if ( (src->core.flag & BAM_FUNMAP) == 0 || (dest->core.flag & BAM_FUNMAP) == 0 ) {
+ uint8_t* data_mc;
+ if ((data_mc = bam_aux_get(dest,"MC")) != NULL) {
+ bam_aux_del(dest, data_mc);
+ }
+
+ // Convert cigar to string
+ kstring_t mc = { 0, 0, NULL };
+ if (bam_format_cigar(src, &mc) < 0) return -1;
+
+ bam_aux_append(dest, "MC", 'Z', ks_len(&mc)+1, (uint8_t*)ks_str(&mc));
+ free(mc.s);
+ }
+ return 0;
}
-// copy flags
-static void sync_mate(bam1_t* a, bam1_t* b)
+// Copy flags.
+// Returns 0 on success, -1 on failure.
+static int sync_mate(bam1_t* a, bam1_t* b)
{
sync_unmapped_pos_inner(a,b);
sync_unmapped_pos_inner(b,a);
sync_mate_inner(a,b);
sync_mate_inner(b,a);
- sync_mq(a,b);
- sync_mq(b,a);
+ if (sync_mq_mc(a,b) < 0) return -1;
+ if (sync_mq_mc(b,a) < 0) return -1;
+ return 0;
}
// currently, this function ONLY works if each read has one hit
@@ -241,7 +280,7 @@ static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int prop
if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name
pre->core.flag |= BAM_FPAIRED;
cur->core.flag |= BAM_FPAIRED;
- sync_mate(pre, cur);
+ if (sync_mate(pre, cur)) goto fail;
if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
&& !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE
@@ -326,7 +365,7 @@ void usage(FILE* where)
" -p Disable FR proper pair check\n"
" -c Add template cigar ct tag\n");
- sam_global_opt_help(where, "-.O..");
+ sam_global_opt_help(where, "-.O..@");
fprintf(where,
"\n"
@@ -337,18 +376,19 @@ void usage(FILE* where)
int bam_mating(int argc, char *argv[])
{
+ htsThreadPool p = {NULL, 0};
samFile *in = NULL, *out = NULL;
int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
char wmode[3] = {'w', 'b', 0};
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
// parse args
if (argc == 1) { usage(pysam_stdout); return 0; }
- while ((c = getopt_long(argc, argv, "rpcO:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "rpcO:@:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': remove_reads = 1; break;
case 'p': proper_pair_check = 0; break;
@@ -371,6 +411,15 @@ int bam_mating(int argc, char *argv[])
goto fail;
}
+ if (ga.nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(pysam_stderr, "Error creating thread pool\n");
+ goto fail;
+ }
+ hts_set_opt(in, HTS_OPT_THREAD_POOL, &p);
+ hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+ }
+
// run
res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct);
@@ -381,12 +430,14 @@ int bam_mating(int argc, char *argv[])
res = 1;
}
+ if (p.pool) hts_tpool_destroy(p.pool);
sam_global_args_free(&ga);
return res;
fail:
if (in) sam_close(in);
if (out) sam_close(out);
+ if (p.pool) hts_tpool_destroy(p.pool);
sam_global_args_free(&ga);
return 1;
}
diff --git a/samtools/bam_md.c b/samtools/bam_md.c
index 71206cd..f095030 100644
--- a/samtools/bam_md.c
+++ b/samtools/bam_md.c
@@ -25,15 +25,15 @@ DEALINGS IN THE SOFTWARE. */
#include <config.h>
-#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
-#include <math.h>
#include "htslib/faidx.h"
#include "htslib/sam.h"
#include "htslib/kstring.h"
-#include "kprobaln.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "samtools.h"
@@ -161,178 +161,6 @@ void bam_fillmd1(bam1_t *b, char *ref, int flag)
bam_fillmd1_core(b, ref, INT_MAX, flag, 0);
}
-int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres)
-{
- uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b);
- uint32_t *cigar = bam_get_cigar(b);
- bam1_core_t *c = &b->core;
- int i, x, y, mm, q, len, clip_l, clip_q;
- double t;
- if (thres < 0) thres = 40; // set the default
- mm = q = len = clip_l = clip_q = 0;
- for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
- int j, l = cigar[i]>>4, op = cigar[i]&0xf;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j) {
- int c1, c2, z = y + j;
- if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
- c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
- if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
- ++len;
- if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
- ++mm;
- q += qual[z] > 33? 33 : qual[z];
- }
- }
- }
- if (j < l) break;
- x += l; y += l; len += l;
- } else if (op == BAM_CDEL) {
- for (j = 0; j < l; ++j)
- if (x+j >= ref_len || ref[x+j] == '\0') break;
- if (j < l) break;
- x += l;
- } else if (op == BAM_CSOFT_CLIP) {
- for (j = 0; j < l; ++j) clip_q += qual[y+j];
- clip_l += l;
- y += l;
- } else if (op == BAM_CHARD_CLIP) {
- clip_q += 13 * l;
- clip_l += l;
- } else if (op == BAM_CINS) y += l;
- else if (op == BAM_CREF_SKIP) x += l;
- }
- for (i = 0, t = 1; i < mm; ++i)
- t *= (double)len / (i+1);
- t = q - 4.343 * log(t) + clip_q / 5.;
- if (t > thres) return -1;
- if (t < 0) t = 0;
- t = sqrt((thres - t) / thres) * thres;
-// fprintf(stderr, "%s %lf %d\n", bam_get_qname(b), t, q);
- return (int)(t + .499);
-}
-
-int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag)
-{
- int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
- uint32_t *cigar = bam_get_cigar(b);
- bam1_core_t *c = &b->core;
- kpa_par_t conf = kpa_par_def;
- uint8_t *bq = 0, *zq = 0, *qual = bam_get_qual(b);
- if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1)
- return -1; // do nothing
-
- // test if BQ or ZQ is present
- if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
- if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
- if (bq && redo_baq)
- {
- bam_aux_del(b, bq-1);
- bq = 0;
- }
- if (bq && zq) { // remove the ZQ tag
- bam_aux_del(b, zq-1);
- zq = 0;
- }
- if (bq || zq) {
- if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
- if (bq && apply_baq) { // then convert BQ to ZQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
- *(bq - 3) = 'Z';
- } else if (zq && !apply_baq) { // then convert ZQ to BQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] += (int)zq[i] - 64;
- *(zq - 3) = 'B';
- }
- return 0;
- }
- // find the start and end of the alignment
- x = c->pos, y = 0, yb = ye = xb = xe = -1;
- for (k = 0; k < c->n_cigar; ++k) {
- int op, l;
- op = cigar[k]&0xf; l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- if (yb < 0) yb = y;
- if (xb < 0) xb = x;
- ye = y + l; xe = x + l;
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
- }
- // set bandwidth and the start and the end
- bw = 7;
- if (abs((xe - xb) - (ye - yb)) > bw)
- bw = abs((xe - xb) - (ye - yb)) + 3;
- conf.bw = bw;
- xb -= yb + bw/2; if (xb < 0) xb = 0;
- xe += c->l_qseq - ye + bw/2;
- if (xe - xb - c->l_qseq > bw)
- xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
- { // glocal
- uint8_t *s, *r, *q, *seq = bam_get_seq(b), *bq;
- int *state;
- bq = calloc(c->l_qseq + 1, 1);
- memcpy(bq, qual, c->l_qseq);
- s = calloc(c->l_qseq, 1);
- for (i = 0; i < c->l_qseq; ++i) s[i] = seq_nt16_int[bam_seqi(seq, i)];
- r = calloc(xe - xb, 1);
- for (i = xb; i < xe; ++i) {
- if (i >= ref_len || ref[i] == '\0') { xe = i; break; }
- r[i-xb] = seq_nt16_int[seq_nt16_table[(int)ref[i]]];
- }
- state = calloc(c->l_qseq, sizeof(int));
- q = calloc(c->l_qseq, 1);
- kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
- if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i) {
- if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
- else bq[i] = bq[i] < q[i]? bq[i] : q[i];
- }
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
- } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
- uint8_t *left, *rght;
- left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i)
- bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
- for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
- left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
- for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
- rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
- for (i = y; i < y + l; ++i)
- bq[i] = left[i] < rght[i]? left[i] : rght[i];
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
- free(left); free(rght);
- }
- if (apply_baq) {
- for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
- bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
- } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
- free(bq); free(s); free(r); free(q); free(state);
- }
- return 0;
-}
-
-int bam_prob_realn(bam1_t *b, const char *ref)
-{
- return bam_prob_realn_core(b, ref, INT_MAX, 1);
-}
-
int calmd_usage() {
fprintf(stderr,
"Usage: samtools calmd [-eubrAES] <aln.bam> <ref.fasta>\n"
@@ -345,13 +173,14 @@ int calmd_usage() {
" -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"
" -E extended BAQ for better sensitivity but lower specificity\n");
- sam_global_opt_help(stderr, "-....");
+ sam_global_opt_help(stderr, "-....@");
return 1;
}
int bam_fillmd(int argc, char *argv[])
{
int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
+ htsThreadPool p = {NULL, 0};
samFile *fp = NULL, *fpout = NULL;
bam_hdr_t *header = NULL;
faidx_t *fai = NULL;
@@ -360,14 +189,14 @@ int bam_fillmd(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'),
{ NULL, 0, NULL, 0 }
};
flt_flag = UPDATE_NM | UPDATE_MD;
is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
strcpy(mode_w, "w");
- while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad@:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': is_realn = 1; break;
case 'e': flt_flag |= USE_EQUAL; break;
@@ -415,6 +244,15 @@ int bam_fillmd(int argc, char *argv[])
goto fail;
}
+ if (ga.nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(stderr, "Error creating thread pool\n");
+ goto fail;
+ }
+ hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
+ hts_set_opt(fpout, HTS_OPT_THREAD_POOL, &p);
+ }
+
ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference;
fai = fai_load(ref_file);
@@ -440,9 +278,9 @@ int bam_fillmd(int argc, char *argv[])
if (is_realn || capQ > 10) goto fail; // Would otherwise crash
}
}
- if (is_realn) bam_prob_realn_core(b, ref, len, baq_flag);
+ if (is_realn) sam_prob_realn(b, ref, len, baq_flag);
if (capQ > 10) {
- int q = bam_cap_mapQ(b, ref, len, capQ);
+ int q = sam_cap_mapq(b, ref, len, capQ);
if (b->core.qual > q) b->core.qual = q;
}
if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm);
@@ -466,6 +304,8 @@ int bam_fillmd(int argc, char *argv[])
fprintf(stderr, "[bam_fillmd] error when closing output file\n");
return 1;
}
+ if (p.pool) hts_tpool_destroy(p.pool);
+
return 0;
fail:
@@ -475,5 +315,7 @@ int bam_fillmd(int argc, char *argv[])
if (fai) fai_destroy(fai);
if (fp) sam_close(fp);
if (fpout) sam_close(fpout);
+ if (p.pool) hts_tpool_destroy(p.pool);
+
return 1;
}
diff --git a/samtools/bam_md.c.pysam.c b/samtools/bam_md.c.pysam.c
index d00c01d..5e4cdb5 100644
--- a/samtools/bam_md.c.pysam.c
+++ b/samtools/bam_md.c.pysam.c
@@ -27,15 +27,15 @@ DEALINGS IN THE SOFTWARE. */
#include <config.h>
-#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
-#include <math.h>
#include "htslib/faidx.h"
#include "htslib/sam.h"
#include "htslib/kstring.h"
-#include "kprobaln.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "samtools.h"
@@ -163,178 +163,6 @@ void bam_fillmd1(bam1_t *b, char *ref, int flag)
bam_fillmd1_core(b, ref, INT_MAX, flag, 0);
}
-int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres)
-{
- uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b);
- uint32_t *cigar = bam_get_cigar(b);
- bam1_core_t *c = &b->core;
- int i, x, y, mm, q, len, clip_l, clip_q;
- double t;
- if (thres < 0) thres = 40; // set the default
- mm = q = len = clip_l = clip_q = 0;
- for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
- int j, l = cigar[i]>>4, op = cigar[i]&0xf;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j) {
- int c1, c2, z = y + j;
- if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
- c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
- if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
- ++len;
- if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
- ++mm;
- q += qual[z] > 33? 33 : qual[z];
- }
- }
- }
- if (j < l) break;
- x += l; y += l; len += l;
- } else if (op == BAM_CDEL) {
- for (j = 0; j < l; ++j)
- if (x+j >= ref_len || ref[x+j] == '\0') break;
- if (j < l) break;
- x += l;
- } else if (op == BAM_CSOFT_CLIP) {
- for (j = 0; j < l; ++j) clip_q += qual[y+j];
- clip_l += l;
- y += l;
- } else if (op == BAM_CHARD_CLIP) {
- clip_q += 13 * l;
- clip_l += l;
- } else if (op == BAM_CINS) y += l;
- else if (op == BAM_CREF_SKIP) x += l;
- }
- for (i = 0, t = 1; i < mm; ++i)
- t *= (double)len / (i+1);
- t = q - 4.343 * log(t) + clip_q / 5.;
- if (t > thres) return -1;
- if (t < 0) t = 0;
- t = sqrt((thres - t) / thres) * thres;
-// fprintf(pysam_stderr, "%s %lf %d\n", bam_get_qname(b), t, q);
- return (int)(t + .499);
-}
-
-int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag)
-{
- int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
- uint32_t *cigar = bam_get_cigar(b);
- bam1_core_t *c = &b->core;
- kpa_par_t conf = kpa_par_def;
- uint8_t *bq = 0, *zq = 0, *qual = bam_get_qual(b);
- if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1)
- return -1; // do nothing
-
- // test if BQ or ZQ is present
- if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
- if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
- if (bq && redo_baq)
- {
- bam_aux_del(b, bq-1);
- bq = 0;
- }
- if (bq && zq) { // remove the ZQ tag
- bam_aux_del(b, zq-1);
- zq = 0;
- }
- if (bq || zq) {
- if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
- if (bq && apply_baq) { // then convert BQ to ZQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
- *(bq - 3) = 'Z';
- } else if (zq && !apply_baq) { // then convert ZQ to BQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] += (int)zq[i] - 64;
- *(zq - 3) = 'B';
- }
- return 0;
- }
- // find the start and end of the alignment
- x = c->pos, y = 0, yb = ye = xb = xe = -1;
- for (k = 0; k < c->n_cigar; ++k) {
- int op, l;
- op = cigar[k]&0xf; l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- if (yb < 0) yb = y;
- if (xb < 0) xb = x;
- ye = y + l; xe = x + l;
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
- }
- // set bandwidth and the start and the end
- bw = 7;
- if (abs((xe - xb) - (ye - yb)) > bw)
- bw = abs((xe - xb) - (ye - yb)) + 3;
- conf.bw = bw;
- xb -= yb + bw/2; if (xb < 0) xb = 0;
- xe += c->l_qseq - ye + bw/2;
- if (xe - xb - c->l_qseq > bw)
- xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
- { // glocal
- uint8_t *s, *r, *q, *seq = bam_get_seq(b), *bq;
- int *state;
- bq = calloc(c->l_qseq + 1, 1);
- memcpy(bq, qual, c->l_qseq);
- s = calloc(c->l_qseq, 1);
- for (i = 0; i < c->l_qseq; ++i) s[i] = seq_nt16_int[bam_seqi(seq, i)];
- r = calloc(xe - xb, 1);
- for (i = xb; i < xe; ++i) {
- if (i >= ref_len || ref[i] == '\0') { xe = i; break; }
- r[i-xb] = seq_nt16_int[seq_nt16_table[(int)ref[i]]];
- }
- state = calloc(c->l_qseq, sizeof(int));
- q = calloc(c->l_qseq, 1);
- kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
- if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i) {
- if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
- else bq[i] = bq[i] < q[i]? bq[i] : q[i];
- }
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
- } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
- uint8_t *left, *rght;
- left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i)
- bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
- for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
- left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
- for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
- rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
- for (i = y; i < y + l; ++i)
- bq[i] = left[i] < rght[i]? left[i] : rght[i];
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
- free(left); free(rght);
- }
- if (apply_baq) {
- for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
- bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
- } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
- free(bq); free(s); free(r); free(q); free(state);
- }
- return 0;
-}
-
-int bam_prob_realn(bam1_t *b, const char *ref)
-{
- return bam_prob_realn_core(b, ref, INT_MAX, 1);
-}
-
int calmd_usage() {
fprintf(pysam_stderr,
"Usage: samtools calmd [-eubrAES] <aln.bam> <ref.fasta>\n"
@@ -347,13 +175,14 @@ int calmd_usage() {
" -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"
" -E extended BAQ for better sensitivity but lower specificity\n");
- sam_global_opt_help(pysam_stderr, "-....");
+ sam_global_opt_help(pysam_stderr, "-....@");
return 1;
}
int bam_fillmd(int argc, char *argv[])
{
int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
+ htsThreadPool p = {NULL, 0};
samFile *fp = NULL, *fpout = NULL;
bam_hdr_t *header = NULL;
faidx_t *fai = NULL;
@@ -362,14 +191,14 @@ int bam_fillmd(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'),
{ NULL, 0, NULL, 0 }
};
flt_flag = UPDATE_NM | UPDATE_MD;
is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
strcpy(mode_w, "w");
- while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad@:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': is_realn = 1; break;
case 'e': flt_flag |= USE_EQUAL; break;
@@ -406,7 +235,7 @@ int bam_fillmd(int argc, char *argv[])
fprintf(pysam_stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
goto fail;
}
-
+
fpout = sam_open_format(pysam_stdout_fn, mode_w, &ga.out);
if (fpout == NULL) {
print_error_errno("calmd", "Failed to open output");
@@ -417,6 +246,15 @@ int bam_fillmd(int argc, char *argv[])
goto fail;
}
+ if (ga.nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(pysam_stderr, "Error creating thread pool\n");
+ goto fail;
+ }
+ hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
+ hts_set_opt(fpout, HTS_OPT_THREAD_POOL, &p);
+ }
+
ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference;
fai = fai_load(ref_file);
@@ -442,9 +280,9 @@ int bam_fillmd(int argc, char *argv[])
if (is_realn || capQ > 10) goto fail; // Would otherwise crash
}
}
- if (is_realn) bam_prob_realn_core(b, ref, len, baq_flag);
+ if (is_realn) sam_prob_realn(b, ref, len, baq_flag);
if (capQ > 10) {
- int q = bam_cap_mapQ(b, ref, len, capQ);
+ int q = sam_cap_mapq(b, ref, len, capQ);
if (b->core.qual > q) b->core.qual = q;
}
if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm);
@@ -468,6 +306,8 @@ int bam_fillmd(int argc, char *argv[])
fprintf(pysam_stderr, "[bam_fillmd] error when closing output file\n");
return 1;
}
+ if (p.pool) hts_tpool_destroy(p.pool);
+
return 0;
fail:
@@ -477,5 +317,7 @@ int bam_fillmd(int argc, char *argv[])
if (fai) fai_destroy(fai);
if (fp) sam_close(fp);
if (fpout) sam_close(fpout);
+ if (p.pool) hts_tpool_destroy(p.pool);
+
return 1;
}
diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c
index dc12bf3..d17e9d6 100644
--- a/samtools/bam_plcmd.c
+++ b/samtools/bam_plcmd.c
@@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
#include <ctype.h>
#include <string.h>
+#include <strings.h>
#include <limits.h>
#include <errno.h>
#include <sys/stat.h>
@@ -118,7 +119,7 @@ void bed_destroy(void *_h);
int bed_overlap(const void *_h, const char *chr, int beg, int end);
typedef struct {
- int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+ int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all;
int rflag_require, rflag_filter;
int openQ, extQ, tandemQ, min_support; // for indels
double min_frac; // for indels
@@ -209,11 +210,22 @@ static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) {
return 1;
}
+static void
+print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname,
+ int pos, int n, const char *ref, int ref_len)
+{
+ int i;
+ fprintf(fp, "%s\t%d\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N');
+ for (i = 0; i < n; ++i) {
+ fputs("\t0\t*\t*", fp);
+ if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp);
+ if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp);
+ }
+ putc('\n', fp);
+}
+
static int mplp_func(void *data, bam1_t *b)
{
- extern int bam_realn(bam1_t *b, const char *ref);
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag);
- extern int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres);
char *ref;
mplp_aux_t *ma = (mplp_aux_t*)data;
int ret, skip = 0, ref_len;
@@ -229,7 +241,7 @@ static int mplp_func(void *data, bam1_t *b)
}
if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; }
if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; }
- if (ma->conf->bed) { // test overlap
+ if (ma->conf->bed && ma->conf->all == 0) { // test overlap
skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_endpos(b));
if (skip) continue;
}
@@ -258,9 +270,9 @@ static int mplp_func(void *data, bam1_t *b)
}
skip = 0;
- if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+ if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
if (has_ref && ma->conf->capQ_thres > 10) {
- int q = bam_cap_mapQ(b, ref, ref_len, ma->conf->capQ_thres);
+ int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
if (q < 0) skip = 1;
else if (b->core.qual > q) b->core.qual = q;
}
@@ -308,7 +320,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
extern void bcf_call_del_rghash(void *rghash);
mplp_aux_t **data;
- int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, ref_len, max_depth, max_indel_depth;
+ int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, tid0 = 0, ref_len, max_depth, max_indel_depth;
const bam_pileup1_t **plp;
mplp_ref_t mp_ref = MPLP_REF_INIT;
bam_mplp_t iter;
@@ -379,7 +391,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
fprintf(stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]);
exit(EXIT_FAILURE);
}
- if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
+ if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end, tid0 = data[i]->iter->tid;
hts_idx_destroy(idx);
}
else
@@ -551,14 +563,16 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bam_mplp_set_maxcnt(iter, max_depth);
bcf1_t *bcf_rec = bcf_init1();
int ret;
+ int last_tid = -1, last_pos = -1;
+
// begin pileup
while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
- if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
mplp_get_ref(data[0], tid, &ref, &ref_len);
//printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref);
if (conf->flag & MPLP_BCF) {
int total_depth, _ref0, ref16;
+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
@@ -584,6 +598,35 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
}
}
} else {
+ if (conf->all) {
+ // Deal with missing portions of previous tids
+ while (tid > last_tid) {
+ if (last_tid >= 0 && !conf->reg) {
+ while (++last_pos < h->target_len[last_tid]) {
+ if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
+ continue;
+ print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len);
+ }
+ }
+ last_tid++;
+ last_pos = -1;
+ if (conf->all < 2)
+ break;
+ }
+ }
+ if (conf->all) {
+ // Deal with missing portion of current tid
+ while (++last_pos < pos) {
+ if (conf->reg && last_pos < beg0) continue; // out of range; skip
+ if (conf->bed && bed_overlap(conf->bed, h->target_name[tid], last_pos, last_pos + 1) == 0)
+ continue;
+ print_empty_pileup(pileup_fp, conf, h->target_name[tid], last_pos, n, ref, ref_len);
+ }
+ last_tid = tid;
+ last_pos = pos;
+ }
+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
+
fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
for (i = 0; i < n; ++i) {
int j, cnt;
@@ -600,14 +643,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp);
if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp);
} else {
+ int n = 0;
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
int c = p->qpos < p->b->core.l_qseq
? bam_get_qual(p->b)[p->qpos]
: 0;
if (c >= conf->min_baseQ)
- pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
+ n++, pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
}
+ if (!n) putc('*', pileup_fp);
+
+ n = 0;
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
@@ -617,9 +664,13 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
if (c >= conf->min_baseQ) {
c = c + 33 < 126? c + 33 : 126;
putc(c, pileup_fp);
+ n++;
}
}
+ if (!n) putc('*', pileup_fp);
+
if (conf->flag & MPLP_PRINT_MAPQ) {
+ n = 0;
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
@@ -628,19 +679,24 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
c = plp[i][j].b->core.qual + 33;
if (c > 126) c = 126;
putc(c, pileup_fp);
+ n++;
}
+ if (!n) putc('*', pileup_fp);
}
+
if (conf->flag & MPLP_PRINT_POS) {
+ n = 0;
putc('\t', pileup_fp);
- int last = 0;
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
int c = bam_get_qual(p->b)[p->qpos];
if ( c < conf->min_baseQ ) continue;
- if (last++) putc(',', pileup_fp);
+ if (n > 0) putc(',', pileup_fp);
fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
+ n++;
}
+ if (!n) putc('*', pileup_fp);
}
}
}
@@ -648,6 +704,27 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
}
}
+ if (conf->all && !(conf->flag & MPLP_BCF)) {
+ // Handle terminating region
+ if (last_tid < 0 && conf->reg && conf->all > 1) {
+ last_tid = tid0;
+ last_pos = beg0-1;
+ mplp_get_ref(data[0], tid0, &ref, &ref_len);
+ }
+ while (last_tid >= 0 && last_tid < h->n_targets) {
+ while (++last_pos < h->target_len[last_tid]) {
+ if (last_pos >= end0) break;
+ if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
+ continue;
+ print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len);
+ }
+ last_tid++;
+ last_pos = -1;
+ if (conf->all < 2 || conf->reg)
+ break;
+ }
+ }
+
// clean up
free(bc.tmp.s);
bcf_destroy1(bcf_rec);
@@ -681,6 +758,13 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
return ret;
}
+static int is_url(const char *s)
+{
+ static const char uri_scheme_chars[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
+ return s[strspn(s, uri_scheme_chars)] == ':';
+}
+
#define MAX_PATH_LEN 1024
int read_file_list(const char *file_list,int *n,char **argv[])
{
@@ -710,7 +794,7 @@ int read_file_list(const char *file_list,int *n,char **argv[])
// check sanity of the file list
buf[len] = 0;
- if (stat(buf, &sb) != 0)
+ if (! (is_url(buf) || stat(buf, &sb) == 0))
{
// no such file, check if it is safe to print its name
int i, safe_to_print = 1;
@@ -814,6 +898,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
"Output options for mpileup format (without -g/-v):\n"
" -O, --output-BP output base positions on reads\n"
" -s, --output-MQ output mapping quality\n"
+" -a output all positions (including zero depth)\n"
+" -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"
"\n"
"Output options for genotype likelihoods (when -g/-v is used):\n"
" -t, --output-tags LIST optional tags to output:\n"
@@ -836,7 +922,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
fprintf(fp,
" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
" -P, --platforms STR comma separated list of platforms for indels [all]\n");
- sam_global_opt_help(fp, "-.--.");
+ sam_global_opt_help(fp, "-.--.-");
fprintf(fp,
"\n"
"Notes: Assuming diploid individuals.\n");
@@ -862,11 +948,12 @@ int bam_mpileup(int argc, char *argv[])
mplp.argc = argc; mplp.argv = argv;
mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
mplp.output_fname = NULL;
+ mplp.all = 0;
sam_global_args_init(&mplp.ga);
static const struct option lopts[] =
{
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
{"rf", required_argument, NULL, 1}, // require flag
{"ff", required_argument, NULL, 2}, // filter flag
{"incl-flags", required_argument, NULL, 1},
@@ -916,7 +1003,7 @@ int bam_mpileup(int argc, char *argv[])
{"platforms", required_argument, NULL, 'P'},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:",lopts,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:a",lopts,NULL)) >= 0) {
switch (c) {
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 1 :
@@ -988,6 +1075,7 @@ int bam_mpileup(int argc, char *argv[])
}
break;
case 't': mplp.fmt_flag |= parse_format_flag(optarg); break;
+ case 'a': mplp.all++; break;
default:
if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break;
/* else fall-through */
diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c
index 650e818..03e5f8a 100644
--- a/samtools/bam_plcmd.c.pysam.c
+++ b/samtools/bam_plcmd.c.pysam.c
@@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
#include <ctype.h>
#include <string.h>
+#include <strings.h>
#include <limits.h>
#include <errno.h>
#include <sys/stat.h>
@@ -120,7 +121,7 @@ void bed_destroy(void *_h);
int bed_overlap(const void *_h, const char *chr, int beg, int end);
typedef struct {
- int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+ int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all;
int rflag_require, rflag_filter;
int openQ, extQ, tandemQ, min_support; // for indels
double min_frac; // for indels
@@ -211,11 +212,22 @@ static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) {
return 1;
}
+static void
+print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname,
+ int pos, int n, const char *ref, int ref_len)
+{
+ int i;
+ fprintf(fp, "%s\t%d\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N');
+ for (i = 0; i < n; ++i) {
+ fputs("\t0\t*\t*", fp);
+ if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp);
+ if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp);
+ }
+ putc('\n', fp);
+}
+
static int mplp_func(void *data, bam1_t *b)
{
- extern int bam_realn(bam1_t *b, const char *ref);
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag);
- extern int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres);
char *ref;
mplp_aux_t *ma = (mplp_aux_t*)data;
int ret, skip = 0, ref_len;
@@ -231,7 +243,7 @@ static int mplp_func(void *data, bam1_t *b)
}
if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; }
if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; }
- if (ma->conf->bed) { // test overlap
+ if (ma->conf->bed && ma->conf->all == 0) { // test overlap
skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_endpos(b));
if (skip) continue;
}
@@ -260,9 +272,9 @@ static int mplp_func(void *data, bam1_t *b)
}
skip = 0;
- if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+ if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
if (has_ref && ma->conf->capQ_thres > 10) {
- int q = bam_cap_mapQ(b, ref, ref_len, ma->conf->capQ_thres);
+ int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
if (q < 0) skip = 1;
else if (b->core.qual > q) b->core.qual = q;
}
@@ -310,7 +322,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
extern void bcf_call_del_rghash(void *rghash);
mplp_aux_t **data;
- int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, ref_len, max_depth, max_indel_depth;
+ int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, tid0 = 0, ref_len, max_depth, max_indel_depth;
const bam_pileup1_t **plp;
mplp_ref_t mp_ref = MPLP_REF_INIT;
bam_mplp_t iter;
@@ -381,7 +393,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
fprintf(pysam_stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]);
exit(EXIT_FAILURE);
}
- if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
+ if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end, tid0 = data[i]->iter->tid;
hts_idx_destroy(idx);
}
else
@@ -553,14 +565,16 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bam_mplp_set_maxcnt(iter, max_depth);
bcf1_t *bcf_rec = bcf_init1();
int ret;
+ int last_tid = -1, last_pos = -1;
+
// begin pileup
while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
- if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
mplp_get_ref(data[0], tid, &ref, &ref_len);
//printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref);
if (conf->flag & MPLP_BCF) {
int total_depth, _ref0, ref16;
+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
@@ -586,6 +600,35 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
}
}
} else {
+ if (conf->all) {
+ // Deal with missing portions of previous tids
+ while (tid > last_tid) {
+ if (last_tid >= 0 && !conf->reg) {
+ while (++last_pos < h->target_len[last_tid]) {
+ if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
+ continue;
+ print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len);
+ }
+ }
+ last_tid++;
+ last_pos = -1;
+ if (conf->all < 2)
+ break;
+ }
+ }
+ if (conf->all) {
+ // Deal with missing portion of current tid
+ while (++last_pos < pos) {
+ if (conf->reg && last_pos < beg0) continue; // out of range; skip
+ if (conf->bed && bed_overlap(conf->bed, h->target_name[tid], last_pos, last_pos + 1) == 0)
+ continue;
+ print_empty_pileup(pileup_fp, conf, h->target_name[tid], last_pos, n, ref, ref_len);
+ }
+ last_tid = tid;
+ last_pos = pos;
+ }
+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
+
fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
for (i = 0; i < n; ++i) {
int j, cnt;
@@ -602,14 +645,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp);
if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp);
} else {
+ int n = 0;
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
int c = p->qpos < p->b->core.l_qseq
? bam_get_qual(p->b)[p->qpos]
: 0;
if (c >= conf->min_baseQ)
- pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
+ n++, pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
}
+ if (!n) putc('*', pileup_fp);
+
+ n = 0;
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
@@ -619,9 +666,13 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
if (c >= conf->min_baseQ) {
c = c + 33 < 126? c + 33 : 126;
putc(c, pileup_fp);
+ n++;
}
}
+ if (!n) putc('*', pileup_fp);
+
if (conf->flag & MPLP_PRINT_MAPQ) {
+ n = 0;
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
@@ -630,19 +681,24 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
c = plp[i][j].b->core.qual + 33;
if (c > 126) c = 126;
putc(c, pileup_fp);
+ n++;
}
+ if (!n) putc('*', pileup_fp);
}
+
if (conf->flag & MPLP_PRINT_POS) {
+ n = 0;
putc('\t', pileup_fp);
- int last = 0;
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
int c = bam_get_qual(p->b)[p->qpos];
if ( c < conf->min_baseQ ) continue;
- if (last++) putc(',', pileup_fp);
+ if (n > 0) putc(',', pileup_fp);
fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: fprintf(pysam_stdout, ) is very slow...
+ n++;
}
+ if (!n) putc('*', pileup_fp);
}
}
}
@@ -650,6 +706,27 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
}
}
+ if (conf->all && !(conf->flag & MPLP_BCF)) {
+ // Handle terminating region
+ if (last_tid < 0 && conf->reg && conf->all > 1) {
+ last_tid = tid0;
+ last_pos = beg0-1;
+ mplp_get_ref(data[0], tid0, &ref, &ref_len);
+ }
+ while (last_tid >= 0 && last_tid < h->n_targets) {
+ while (++last_pos < h->target_len[last_tid]) {
+ if (last_pos >= end0) break;
+ if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
+ continue;
+ print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len);
+ }
+ last_tid++;
+ last_pos = -1;
+ if (conf->all < 2 || conf->reg)
+ break;
+ }
+ }
+
// clean up
free(bc.tmp.s);
bcf_destroy1(bcf_rec);
@@ -683,6 +760,13 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
return ret;
}
+static int is_url(const char *s)
+{
+ static const char uri_scheme_chars[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
+ return s[strspn(s, uri_scheme_chars)] == ':';
+}
+
#define MAX_PATH_LEN 1024
int read_file_list(const char *file_list,int *n,char **argv[])
{
@@ -712,7 +796,7 @@ int read_file_list(const char *file_list,int *n,char **argv[])
// check sanity of the file list
buf[len] = 0;
- if (stat(buf, &sb) != 0)
+ if (! (is_url(buf) || stat(buf, &sb) == 0))
{
// no such file, check if it is safe to print its name
int i, safe_to_print = 1;
@@ -816,6 +900,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
"Output options for mpileup format (without -g/-v):\n"
" -O, --output-BP output base positions on reads\n"
" -s, --output-MQ output mapping quality\n"
+" -a output all positions (including zero depth)\n"
+" -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"
"\n"
"Output options for genotype likelihoods (when -g/-v is used):\n"
" -t, --output-tags LIST optional tags to output:\n"
@@ -838,7 +924,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
fprintf(fp,
" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
" -P, --platforms STR comma separated list of platforms for indels [all]\n");
- sam_global_opt_help(fp, "-.--.");
+ sam_global_opt_help(fp, "-.--.-");
fprintf(fp,
"\n"
"Notes: Assuming diploid individuals.\n");
@@ -864,11 +950,12 @@ int bam_mpileup(int argc, char *argv[])
mplp.argc = argc; mplp.argv = argv;
mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
mplp.output_fname = NULL;
+ mplp.all = 0;
sam_global_args_init(&mplp.ga);
static const struct option lopts[] =
{
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
{"rf", required_argument, NULL, 1}, // require flag
{"ff", required_argument, NULL, 2}, // filter flag
{"incl-flags", required_argument, NULL, 1},
@@ -918,7 +1005,7 @@ int bam_mpileup(int argc, char *argv[])
{"platforms", required_argument, NULL, 'P'},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:",lopts,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:a",lopts,NULL)) >= 0) {
switch (c) {
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 1 :
@@ -990,6 +1077,7 @@ int bam_mpileup(int argc, char *argv[])
}
break;
case 't': mplp.fmt_flag |= parse_format_flag(optarg); break;
+ case 'a': mplp.all++; break;
default:
if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break;
/* else fall-through */
diff --git a/samtools/bam_quickcheck.c b/samtools/bam_quickcheck.c
index 6c3c664..02616fe 100644
--- a/samtools/bam_quickcheck.c
+++ b/samtools/bam_quickcheck.c
@@ -26,7 +26,6 @@ DEALINGS IN THE SOFTWARE. */
#include <htslib/hts.h>
#include <htslib/sam.h>
-#include <htslib/bgzf.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
@@ -102,7 +101,7 @@ int main_quickcheck(int argc, char** argv)
// attempt to open
htsFile *hts_fp = hts_open(fn, "r");
if (hts_fp == NULL) {
- if (verbose >= 2) fprintf(stderr, "%s could not be opened for reading\n", fn);
+ if (verbose >= 2) fprintf(stderr, "%s could not be opened for reading.\n", fn);
file_state |= 2;
}
else {
@@ -110,37 +109,54 @@ int main_quickcheck(int argc, char** argv)
// make sure we have sequence data
const htsFormat *fmt = hts_get_format(hts_fp);
if (fmt->category != sequence_data ) {
- if (verbose >= 2) fprintf(stderr, "%s was not identified as sequence data\n", fn);
+ if (verbose >= 2) fprintf(stderr, "%s was not identified as sequence data.\n", fn);
file_state |= 4;
}
else {
if (verbose >= 3) fprintf(stderr, "%s is sequence data\n", fn);
// check header
bam_hdr_t *header = sam_hdr_read(hts_fp);
- if (header->n_targets <= 0) {
- if (verbose >= 2) fprintf(stderr, "%s had no targets in header\n", fn);
+ if (header == NULL) {
+ if (verbose >= 2) fprintf(stderr, "%s caused an error whilst reading its header.\n", fn);
file_state |= 8;
- }
- else {
- if (verbose >= 3) fprintf(stderr, "%s has %d targets in header\n", fn, header->n_targets);
- }
-
- // only check EOF on BAM for now
- // TODO implement and use hts_check_EOF() to include CRAM support
- if (fmt->format == bam) {
- if (bgzf_check_EOF(hts_fp->fp.bgzf) <= 0) {
- if (verbose >= 2) fprintf(stderr, "%s was missing EOF block\n", fn);
- file_state |= 16;
+ } else {
+ if (header->n_targets <= 0) {
+ if (verbose >= 2) fprintf(stderr, "%s had no targets in header.\n", fn);
+ file_state |= 8;
}
else {
- if (verbose >= 3) fprintf(stderr, "%s has good EOF block\n", fn);
+ if (verbose >= 3) fprintf(stderr, "%s has %d targets in header.\n", fn, header->n_targets);
}
+ bam_hdr_destroy(header);
+ }
+ }
+ // check EOF on formats that support this
+ int ret;
+ if ((ret = hts_check_EOF(hts_fp)) < 0) {
+ if (verbose >= 2) fprintf(stderr, "%s caused an error whilst checking for EOF block.\n", fn);
+ file_state |= 16;
+ }
+ else {
+ switch (ret) {
+ case 0:
+ if (verbose >= 2) fprintf(stderr, "%s was missing EOF block when one should be present.\n", fn);
+ file_state |= 16;
+ break;
+ case 1:
+ if (verbose >= 3) fprintf(stderr, "%s has good EOF block.\n", fn);
+ break;
+ case 2:
+ if (verbose >= 3) fprintf(stderr, "%s cannot be checked for EOF block as it is not seekable.\n", fn);
+ break;
+ case 3:
+ if (verbose >= 3) fprintf(stderr, "%s cannot be checked for EOF block because its filetype does not contain one.\n", fn);
+ break;
}
}
if (hts_close(hts_fp) < 0) {
file_state |= 32;
- if (verbose >= 2) fprintf(stderr, "%s did not close cleanly\n", fn);
+ if (verbose >= 2) fprintf(stderr, "%s did not close cleanly.\n", fn);
}
}
diff --git a/samtools/bam_quickcheck.c.pysam.c b/samtools/bam_quickcheck.c.pysam.c
index 26dbeb9..c9dc3d2 100644
--- a/samtools/bam_quickcheck.c.pysam.c
+++ b/samtools/bam_quickcheck.c.pysam.c
@@ -28,7 +28,6 @@ DEALINGS IN THE SOFTWARE. */
#include <htslib/hts.h>
#include <htslib/sam.h>
-#include <htslib/bgzf.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
@@ -104,7 +103,7 @@ int main_quickcheck(int argc, char** argv)
// attempt to open
htsFile *hts_fp = hts_open(fn, "r");
if (hts_fp == NULL) {
- if (verbose >= 2) fprintf(pysam_stderr, "%s could not be opened for reading\n", fn);
+ if (verbose >= 2) fprintf(pysam_stderr, "%s could not be opened for reading.\n", fn);
file_state |= 2;
}
else {
@@ -112,37 +111,54 @@ int main_quickcheck(int argc, char** argv)
// make sure we have sequence data
const htsFormat *fmt = hts_get_format(hts_fp);
if (fmt->category != sequence_data ) {
- if (verbose >= 2) fprintf(pysam_stderr, "%s was not identified as sequence data\n", fn);
+ if (verbose >= 2) fprintf(pysam_stderr, "%s was not identified as sequence data.\n", fn);
file_state |= 4;
}
else {
if (verbose >= 3) fprintf(pysam_stderr, "%s is sequence data\n", fn);
// check header
bam_hdr_t *header = sam_hdr_read(hts_fp);
- if (header->n_targets <= 0) {
- if (verbose >= 2) fprintf(pysam_stderr, "%s had no targets in header\n", fn);
+ if (header == NULL) {
+ if (verbose >= 2) fprintf(pysam_stderr, "%s caused an error whilst reading its header.\n", fn);
file_state |= 8;
- }
- else {
- if (verbose >= 3) fprintf(pysam_stderr, "%s has %d targets in header\n", fn, header->n_targets);
- }
-
- // only check EOF on BAM for now
- // TODO implement and use hts_check_EOF() to include CRAM support
- if (fmt->format == bam) {
- if (bgzf_check_EOF(hts_fp->fp.bgzf) <= 0) {
- if (verbose >= 2) fprintf(pysam_stderr, "%s was missing EOF block\n", fn);
- file_state |= 16;
+ } else {
+ if (header->n_targets <= 0) {
+ if (verbose >= 2) fprintf(pysam_stderr, "%s had no targets in header.\n", fn);
+ file_state |= 8;
}
else {
- if (verbose >= 3) fprintf(pysam_stderr, "%s has good EOF block\n", fn);
+ if (verbose >= 3) fprintf(pysam_stderr, "%s has %d targets in header.\n", fn, header->n_targets);
}
+ bam_hdr_destroy(header);
+ }
+ }
+ // check EOF on formats that support this
+ int ret;
+ if ((ret = hts_check_EOF(hts_fp)) < 0) {
+ if (verbose >= 2) fprintf(pysam_stderr, "%s caused an error whilst checking for EOF block.\n", fn);
+ file_state |= 16;
+ }
+ else {
+ switch (ret) {
+ case 0:
+ if (verbose >= 2) fprintf(pysam_stderr, "%s was missing EOF block when one should be present.\n", fn);
+ file_state |= 16;
+ break;
+ case 1:
+ if (verbose >= 3) fprintf(pysam_stderr, "%s has good EOF block.\n", fn);
+ break;
+ case 2:
+ if (verbose >= 3) fprintf(pysam_stderr, "%s cannot be checked for EOF block as it is not seekable.\n", fn);
+ break;
+ case 3:
+ if (verbose >= 3) fprintf(pysam_stderr, "%s cannot be checked for EOF block because its filetype does not contain one.\n", fn);
+ break;
}
}
if (hts_close(hts_fp) < 0) {
file_state |= 32;
- if (verbose >= 2) fprintf(pysam_stderr, "%s did not close cleanly\n", fn);
+ if (verbose >= 2) fprintf(pysam_stderr, "%s did not close cleanly.\n", fn);
}
}
diff --git a/samtools/bam_reheader.c b/samtools/bam_reheader.c
index 0469c06..acaebd4 100644
--- a/samtools/bam_reheader.c
+++ b/samtools/bam_reheader.c
@@ -91,7 +91,7 @@ int bam_reheader(BGZF *in, bam_hdr_t *h, int fd,
goto fail;
}
if (in->block_offset < in->block_length) {
- if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
+ if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
if (bgzf_flush(fp) < 0) goto write_fail;
}
while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
@@ -246,7 +246,7 @@ int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list
int32_put_blk(b, header_len);
cram_block_append(b, sam_hdr_str(hdr), header_len);
// Zero the remaining block
- memset(cram_block_get_data(b)+cram_block_get_offset(b), 0,
+ memset((char *)cram_block_get_data(b)+cram_block_get_offset(b), 0,
cram_block_get_uncomp_size(b) - cram_block_get_offset(b));
// Make sure all sizes and byte-offsets are consistent after memset
cram_block_set_offset(b, cram_block_get_uncomp_size(b));
diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c
index 16990e6..18cb6c4 100644
--- a/samtools/bam_reheader.c.pysam.c
+++ b/samtools/bam_reheader.c.pysam.c
@@ -93,7 +93,7 @@ int bam_reheader(BGZF *in, bam_hdr_t *h, int fd,
goto fail;
}
if (in->block_offset < in->block_length) {
- if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
+ if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
if (bgzf_flush(fp) < 0) goto write_fail;
}
while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
@@ -248,7 +248,7 @@ int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list
int32_put_blk(b, header_len);
cram_block_append(b, sam_hdr_str(hdr), header_len);
// Zero the remaining block
- memset(cram_block_get_data(b)+cram_block_get_offset(b), 0,
+ memset((char *)cram_block_get_data(b)+cram_block_get_offset(b), 0,
cram_block_get_uncomp_size(b) - cram_block_get_offset(b));
// Make sure all sizes and byte-offsets are consistent after memset
cram_block_set_offset(b, cram_block_get_uncomp_size(b));
@@ -436,7 +436,7 @@ int cram_reheader_inplace(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
}
}
-static void usage(FILE *fp, int ret) {
+static int usage(FILE *fp, int ret) {
fprintf(fp,
"Usage: samtools reheader [-P] in.header.sam in.bam > out.bam\n"
" or samtools reheader [-P] -i in.header.sam file.bam\n"
@@ -445,7 +445,7 @@ static void usage(FILE *fp, int ret) {
" -P, --no-PG Do not generate an @PG header line.\n"
" -i, --in-place Modify the bam/cram file directly.\n"
" (Defaults to outputting to pysam_stdout.)\n");
- exit(ret);
+ return(ret);
}
int main_reheader(int argc, char *argv[])
@@ -466,15 +466,15 @@ int main_reheader(int argc, char *argv[])
switch (c) {
case 'P': add_PG = 0; break;
case 'i': inplace = 1; break;
- case 'h': usage(pysam_stdout, 0); break;
+ case 'h': return(usage(pysam_stdout, 0)); break;
default:
fprintf(pysam_stderr, "Invalid option '%c'\n", c);
- usage(pysam_stderr, 1);
+ return(usage(pysam_stderr, 1));
}
}
if (argc - optind != 2)
- usage(pysam_stderr, 1);
+ return(usage(pysam_stderr, 1));
{ // read the header
samFile *fph = sam_open(argv[optind], "r");
diff --git a/samtools/bam_rmdup.c b/samtools/bam_rmdup.c
index 57612b4..513848d 100644
--- a/samtools/bam_rmdup.c
+++ b/samtools/bam_rmdup.c
@@ -258,7 +258,7 @@ static int rmdup_usage(void) {
fprintf(stderr, "Option: -s rmdup for SE reads\n");
fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n");
- sam_global_opt_help(stderr, "-....");
+ sam_global_opt_help(stderr, "-....-");
return 1;
}
@@ -271,7 +271,7 @@ int bam_rmdup(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'),
{ NULL, 0, NULL, 0 }
};
diff --git a/samtools/bam_rmdup.c.pysam.c b/samtools/bam_rmdup.c.pysam.c
index 3c16025..6742fc8 100644
--- a/samtools/bam_rmdup.c.pysam.c
+++ b/samtools/bam_rmdup.c.pysam.c
@@ -260,7 +260,7 @@ static int rmdup_usage(void) {
fprintf(pysam_stderr, "Option: -s rmdup for SE reads\n");
fprintf(pysam_stderr, " -S treat PE reads as SE in rmdup (force -s)\n");
- sam_global_opt_help(pysam_stderr, "-....");
+ sam_global_opt_help(pysam_stderr, "-....-");
return 1;
}
@@ -273,7 +273,7 @@ int bam_rmdup(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'),
{ NULL, 0, NULL, 0 }
};
diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c
index 4955dcc..be9789c 100644
--- a/samtools/bam_sort.c
+++ b/samtools/bam_sort.c
@@ -43,6 +43,17 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/kstring.h"
#include "htslib/sam.h"
#include "sam_opts.h"
+#include "samtools.h"
+
+/* Minimum memory required in megabytes before sort will attempt to run. This
+ is to prevent accidents where failing to use the -m option correctly results
+ in the creation of a temporary file for each read in the input file.
+ Don't forget to update the man page if you change this. */
+const size_t SORT_MIN_MEGS_PER_THREAD = 1;
+
+/* Default per-thread memory for sort. Must be >= SORT_MIN_MEGS_PER_THREAD.
+ Don't forget to update the man page if you change this. */
+const size_t SORT_DEFAULT_MEGS_PER_THREAD = 768;
#if !defined(__DARWIN_C_LEVEL) || __DARWIN_C_LEVEL < 900000L
#define NEED_MEMSET_PATTERN4
@@ -1098,6 +1109,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
@param flag flags that control how the merge is undertaken
@param reg region to merge
@param n_threads number of threads to use (passed to htslib)
+ @param cmd command name (used in print_error() etc)
@param in_fmt format options for input files
@param out_fmt output file format and options
@discussion Padding information may NOT correctly maintained. This
@@ -1105,7 +1117,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
*/
int bam_merge_core2(int by_qname, const char *out, const char *mode,
const char *headers, int n, char * const *fn, int flag,
- const char *reg, int n_threads,
+ const char *reg, int n_threads, const char *cmd,
const htsFormat *in_fmt, const htsFormat *out_fmt)
{
samFile *fpout, **fp = NULL;
@@ -1126,25 +1138,15 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
if (headers) {
samFile* fpheaders = sam_open(headers, "r");
if (fpheaders == NULL) {
- const char *message = strerror(errno);
- fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
+ print_error_errno(cmd, "cannot open \"%s\"", headers);
return -1;
}
hin = sam_hdr_read(fpheaders);
sam_close(fpheaders);
if (hin == NULL) {
- fprintf(stderr, "[bam_merge_core] couldn't read headers for '%s'\n",
- headers);
- goto mem_fail;
- }
- } else {
- hout = bam_hdr_init();
- if (!hout) {
- fprintf(stderr, "[bam_merge_core] couldn't allocate bam header\n");
+ print_error(cmd, "couldn't read headers from \"%s\"", headers);
goto mem_fail;
}
- hout->text = strdup("");
- if (!hout->text) goto mem_fail;
}
g_is_by_qname = by_qname;
@@ -1194,13 +1196,12 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
bam_hdr_t *hin;
fp[i] = sam_open_format(fn[i], "r", in_fmt);
if (fp[i] == NULL) {
- fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
+ print_error_errno(cmd, "fail to open \"%s\"", fn[i]);
goto fail;
}
hin = sam_hdr_read(fp[i]);
if (hin == NULL) {
- fprintf(stderr, "[bam_merge_core] failed to read header for '%s'\n",
- fn[i]);
+ print_error(cmd, "failed to read header from \"%s\"", fn[i]);
goto fail;
}
@@ -1218,6 +1219,16 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
}
+
+ // Potential future improvement is to share headers between CRAM files for
+ // samtools sort (where all headers are identical.
+ // Eg:
+ //
+ // if (i > 1) {
+ // sam_hdr_free(cram_fd_get_header(fp[i]->fp.cram));
+ // cram_fd_set_header(fp[i]->fp.cram, cram_fd_get_header(fp[0]->fp.cram));
+ // sam_hdr_incr_ref(cram_fd_get_header(fp[0]->fp.cram));
+ // }
}
// Did we get an @HD line?
@@ -1326,19 +1337,18 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
bam_destroy1(h->b);
h->b = NULL;
} else {
- fprintf(stderr, "[%s] failed to read first record from %s\n",
- __func__, fn[i]);
+ print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
goto fail;
}
}
// Open output file and write header
if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) {
- fprintf(stderr, "[%s] failed to create \"%s\": %s\n", __func__, out, strerror(errno));
+ print_error_errno(cmd, "failed to create \"%s\"", out);
return -1;
}
if (sam_hdr_write(fpout, hout) != 0) {
- fprintf(stderr, "[%s] failed to write header.\n", __func__);
+ print_error_errno(cmd, "failed to write header to \"%s\"", out);
sam_close(fpout);
return -1;
}
@@ -1354,7 +1364,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
}
if (sam_write1(fpout, hout, b) < 0) {
- fprintf(stderr, "[%s] failed to write to output file.\n", __func__);
+ print_error_errno(cmd, "failed writing to \"%s\"", out);
sam_close(fpout);
return -1;
}
@@ -1367,8 +1377,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
bam_destroy1(heap->b);
heap->b = NULL;
} else {
- fprintf(stderr, "[bam_merge_core] error: '%s' is truncated.\n",
- fn[heap->i]);
+ print_error(cmd, "\"%s\" is truncated", fn[heap->i]);
goto fail;
}
ks_heapadjust(heap, 0, n, heap);
@@ -1390,13 +1399,13 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
free_merged_header(merged_hdr);
free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
if (sam_close(fpout) < 0) {
- fprintf(stderr, "[bam_merge_core] error closing output file\n");
+ print_error(cmd, "error closing output file");
return -1;
}
return 0;
mem_fail:
- fprintf(stderr, "[bam_merge_core] Out of memory\n");
+ print_error(cmd, "Out of memory");
fail:
if (flag & MERGE_RG) {
@@ -1430,7 +1439,7 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch
strcpy(mode, "wb");
if (flag & MERGE_UNCOMP) strcat(mode, "0");
else if (flag & MERGE_LEVEL1) strcat(mode, "1");
- return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, NULL, NULL);
+ return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, "merge", NULL, NULL);
}
static void merge_usage(FILE *to)
@@ -1450,15 +1459,13 @@ static void merge_usage(FILE *to)
" -c Combine @RG headers with colliding IDs [alter IDs to be distinct]\n"
" -p Combine @PG headers with colliding IDs [alter IDs to be distinct]\n"
" -s VALUE Override random seed\n"
-" -b FILE List of input BAM filenames, one per line [null]\n"
-" -@, --threads INT\n"
-" Number of BAM/CRAM compression threads [0]\n");
- sam_global_opt_help(to, "-.O..");
+" -b FILE List of input BAM filenames, one per line [null]\n");
+ sam_global_opt_help(to, "-.O..@");
}
int bam_merge(int argc, char *argv[])
{
- int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1;
+ int c, is_by_qname = 0, flag = 0, ret = 0, level = -1;
char *fn_headers = NULL, *reg = NULL, mode[12];
long random_seed = (long)time(NULL);
char** fn = NULL;
@@ -1466,7 +1473,7 @@ int bam_merge(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ "threads", required_argument, NULL, '@' },
{ NULL, 0, NULL, 0 }
};
@@ -1486,7 +1493,6 @@ int bam_merge(int argc, char *argv[])
case 'u': flag |= MERGE_UNCOMP; level = 0; break;
case 'R': reg = strdup(optarg); break;
case 'l': level = atoi(optarg); break;
- case '@': n_threads = atoi(optarg); break;
case 'c': flag |= MERGE_COMBINE_RG; break;
case 'p': flag |= MERGE_COMBINE_PG; break;
case 's': random_seed = atol(optarg); break;
@@ -1500,9 +1506,10 @@ int bam_merge(int argc, char *argv[])
if (fn == NULL) { ret = 1; goto end; }
memcpy(fn+fn_size, fn_read, nfiles * sizeof(char*));
fn_size += nfiles;
+ free(fn_read);
}
else {
- fprintf(stderr, "[%s] Invalid file list \"%s\"\n", __func__, optarg);
+ print_error("merge", "Invalid file list \"%s\"", optarg);
ret = 1;
}
break;
@@ -1514,7 +1521,7 @@ int bam_merge(int argc, char *argv[])
}
}
if ( argc - optind < 1 ) {
- fprintf(stderr, "You must at least specify the output file.\n");
+ print_error("merge", "You must at least specify the output file");
merge_usage(stderr);
return 1;
}
@@ -1537,7 +1544,7 @@ int bam_merge(int argc, char *argv[])
memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*));
}
if (fn_size+nargcfiles < 1) {
- fprintf(stderr, "You must specify at least one (and usually two or more) input files.\n");
+ print_error("merge", "You must specify at least one (and usually two or more) input files");
merge_usage(stderr);
return 1;
}
@@ -1545,8 +1552,8 @@ int bam_merge(int argc, char *argv[])
sam_open_mode(mode+1, argv[optind], NULL);
if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
if (bam_merge_core2(is_by_qname, argv[optind], mode, fn_headers,
- fn_size+nargcfiles, fn, flag, reg, n_threads,
- &ga.in, &ga.out) < 0)
+ fn_size+nargcfiles, fn, flag, reg, ga.nthreads,
+ "merge", &ga.in, &ga.out) < 0)
ret = 1;
end:
@@ -1651,18 +1658,30 @@ static void *worker(void *data)
name = (char*)calloc(strlen(w->prefix) + 20, 1);
if (!name) { w->error = errno; return 0; }
sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
- if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0)
- w->error = errno;
-
-// Consider using CRAM temporary files if the final output is CRAM.
-// Typically it is comparable speed while being smaller.
-// hts_opt opt[2] = {
-// {"version=3.0", CRAM_OPT_VERSION, {"3.0"}, NULL},
-// {"no_ref", CRAM_OPT_NO_REF, {1}, NULL}
-// };
-// opt[0].next = &opt[1];
-// if (write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt) < 0)
-// w->error = errno;
+
+ uint32_t max_ncigar = 0;
+ int i;
+ for (i = 0; i < w->buf_len; i++) {
+ uint32_t nc = w->buf[i]->core.n_cigar;
+ if (max_ncigar < nc)
+ max_ncigar = nc;
+ }
+
+ if (max_ncigar > 65535) {
+ htsFormat fmt;
+ memset(&fmt, 0, sizeof(fmt));
+ if (hts_parse_format(&fmt, "cram,version=3.0,no_ref,seqs_per_slice=1000") < 0) {
+ w->error = errno;
+ free(name);
+ return 0;
+ }
+
+ if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt) < 0)
+ w->error = errno;
+ } else {
+ if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0)
+ w->error = errno;
+ }
free(name);
return 0;
@@ -1697,7 +1716,8 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
for (i = 0; i < n_threads; ++i) {
pthread_join(tid[i], 0);
if (w[i].error != 0) {
- fprintf(stderr, "[bam_sort_core] failed to create temporary file \"%s.%.4d.bam\": %s\n", prefix, w[i].index, strerror(w[i].error));
+ errno = w[i].error;
+ print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index);
n_failed++;
}
}
@@ -1741,17 +1761,23 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
buf = NULL;
fp = sam_open_format(fn, "r", in_fmt);
if (fp == NULL) {
- const char *message = strerror(errno);
- fprintf(stderr, "[bam_sort_core] fail to open '%s': %s\n", fn, message);
+ print_error_errno("sort", "can't open \"%s\"", fn);
return -2;
}
header = sam_hdr_read(fp);
if (header == NULL) {
- fprintf(stderr, "[bam_sort_core] failed to read header for '%s'\n", fn);
+ print_error("sort", "failed to read header from \"%s\"", fn);
goto err;
}
if (is_by_qname) change_SO(header, "queryname");
else change_SO(header, "coordinate");
+
+ // No gain to using the thread pool here as the flow of this code
+ // is such that we are *either* reading *or* sorting. Hence a shared
+ // pool makes no real difference except to reduce the thread count a little.
+ if (n_threads > 1)
+ hts_set_threads(fp, n_threads);
+
// write sub files
for (;;) {
if (k == max_k) {
@@ -1780,7 +1806,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
}
}
if (ret != -1) {
- fprintf(stderr, "[bam_sort_core] truncated file. Aborting.\n");
+ print_error("sort", "truncated file. Aborting");
ret = -1;
goto err;
}
@@ -1789,7 +1815,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
if (n_files == 0) { // a single block
ks_mergesort(sort, k, buf, 0);
if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) {
- fprintf(stderr, "[bam_sort_core] failed to create \"%s\": %s\n", fnout, strerror(errno));
+ print_error_errno("sort", "failed to create \"%s\"", fnout);
ret = -1;
goto err;
}
@@ -1808,7 +1834,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
}
if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns,
MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO,
- NULL, n_threads, in_fmt, out_fmt) < 0) {
+ NULL, n_threads, "sort", in_fmt, out_fmt) < 0) {
// Propagate bam_merge_core2() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
goto err;
@@ -1851,23 +1877,38 @@ static void sort_usage(FILE *fp)
" -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
" -n Sort by read name\n"
" -o FILE Write final output to FILE rather than standard output\n"
-" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"
-" -@, --threads INT\n"
-" Set number of sorting and compression threads [1]\n");
- sam_global_opt_help(fp, "-.O..");
+" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n");
+ sam_global_opt_help(fp, "-.O..@");
+}
+
+static void complain_about_memory_setting(size_t max_mem) {
+ char *suffix = "";
+ const size_t nine_k = 9<<10;
+ if (max_mem > nine_k) { max_mem >>= 10; suffix = "K"; }
+ if (max_mem > nine_k) { max_mem >>= 10; suffix = "M"; }
+
+ fprintf(stderr,
+"[bam_sort] -m setting (%zu%s bytes) is less than the minimum required (%zuM).\n\n"
+"Trying to run with -m too small can lead to the creation of a very large number\n"
+"of temporary files. This may make sort fail due to it exceeding limits on the\n"
+"number of files it can have open at the same time.\n\n"
+"Please check your -m parameter. It should be an integer followed by one of the\n"
+"letters K (for kilobytes), M (megabytes) or G (gigabytes). You should ensure it\n"
+"is at least the minimum above, and much higher if you are sorting a large file.\n",
+ max_mem, suffix, SORT_MIN_MEGS_PER_THREAD);
}
int bam_sort(int argc, char *argv[])
{
- size_t max_mem = 768<<20; // 512MB
- int c, nargs, is_by_qname = 0, ret, o_seen = 0, n_threads = 0, level = -1;
+ size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20;
+ int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1;
char *fnout = "-", modeout[12];
kstring_t tmpprefix = { 0, 0, NULL };
struct stat st;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ "threads", required_argument, NULL, '@' },
{ NULL, 0, NULL, 0 }
};
@@ -1885,7 +1926,6 @@ int bam_sort(int argc, char *argv[])
break;
}
case 'T': kputs(optarg, &tmpprefix); break;
- case '@': n_threads = atoi(optarg); break;
case 'l': level = atoi(optarg); break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
@@ -1910,6 +1950,12 @@ int bam_sort(int argc, char *argv[])
goto sort_end;
}
+ if (max_mem < (SORT_MIN_MEGS_PER_THREAD << 20)) {
+ complain_about_memory_setting(max_mem);
+ ret = EXIT_FAILURE;
+ goto sort_end;
+ }
+
strcpy(modeout, "wb");
sam_open_mode(modeout+1, fnout, NULL);
if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9);
@@ -1925,7 +1971,7 @@ int bam_sort(int argc, char *argv[])
}
ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-",
- tmpprefix.s, fnout, modeout, max_mem, n_threads,
+ tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
&ga.in, &ga.out);
if (ret >= 0)
ret = EXIT_SUCCESS;
diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c
index b2b625d..ea2a30d 100644
--- a/samtools/bam_sort.c.pysam.c
+++ b/samtools/bam_sort.c.pysam.c
@@ -45,6 +45,17 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/kstring.h"
#include "htslib/sam.h"
#include "sam_opts.h"
+#include "samtools.h"
+
+/* Minimum memory required in megabytes before sort will attempt to run. This
+ is to prevent accidents where failing to use the -m option correctly results
+ in the creation of a temporary file for each read in the input file.
+ Don't forget to update the man page if you change this. */
+const size_t SORT_MIN_MEGS_PER_THREAD = 1;
+
+/* Default per-thread memory for sort. Must be >= SORT_MIN_MEGS_PER_THREAD.
+ Don't forget to update the man page if you change this. */
+const size_t SORT_DEFAULT_MEGS_PER_THREAD = 768;
#if !defined(__DARWIN_C_LEVEL) || __DARWIN_C_LEVEL < 900000L
#define NEED_MEMSET_PATTERN4
@@ -1100,6 +1111,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
@param flag flags that control how the merge is undertaken
@param reg region to merge
@param n_threads number of threads to use (passed to htslib)
+ @param cmd command name (used in print_error() etc)
@param in_fmt format options for input files
@param out_fmt output file format and options
@discussion Padding information may NOT correctly maintained. This
@@ -1107,7 +1119,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
*/
int bam_merge_core2(int by_qname, const char *out, const char *mode,
const char *headers, int n, char * const *fn, int flag,
- const char *reg, int n_threads,
+ const char *reg, int n_threads, const char *cmd,
const htsFormat *in_fmt, const htsFormat *out_fmt)
{
samFile *fpout, **fp = NULL;
@@ -1128,25 +1140,15 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
if (headers) {
samFile* fpheaders = sam_open(headers, "r");
if (fpheaders == NULL) {
- const char *message = strerror(errno);
- fprintf(pysam_stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
+ print_error_errno(cmd, "cannot open \"%s\"", headers);
return -1;
}
hin = sam_hdr_read(fpheaders);
sam_close(fpheaders);
if (hin == NULL) {
- fprintf(pysam_stderr, "[bam_merge_core] couldn't read headers for '%s'\n",
- headers);
- goto mem_fail;
- }
- } else {
- hout = bam_hdr_init();
- if (!hout) {
- fprintf(pysam_stderr, "[bam_merge_core] couldn't allocate bam header\n");
+ print_error(cmd, "couldn't read headers from \"%s\"", headers);
goto mem_fail;
}
- hout->text = strdup("");
- if (!hout->text) goto mem_fail;
}
g_is_by_qname = by_qname;
@@ -1196,13 +1198,12 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
bam_hdr_t *hin;
fp[i] = sam_open_format(fn[i], "r", in_fmt);
if (fp[i] == NULL) {
- fprintf(pysam_stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
+ print_error_errno(cmd, "fail to open \"%s\"", fn[i]);
goto fail;
}
hin = sam_hdr_read(fp[i]);
if (hin == NULL) {
- fprintf(pysam_stderr, "[bam_merge_core] failed to read header for '%s'\n",
- fn[i]);
+ print_error(cmd, "failed to read header from \"%s\"", fn[i]);
goto fail;
}
@@ -1220,6 +1221,16 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
fprintf(pysam_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
}
+
+ // Potential future improvement is to share headers between CRAM files for
+ // samtools sort (where all headers are identical.
+ // Eg:
+ //
+ // if (i > 1) {
+ // sam_hdr_free(cram_fd_get_header(fp[i]->fp.cram));
+ // cram_fd_set_header(fp[i]->fp.cram, cram_fd_get_header(fp[0]->fp.cram));
+ // sam_hdr_incr_ref(cram_fd_get_header(fp[0]->fp.cram));
+ // }
}
// Did we get an @HD line?
@@ -1328,19 +1339,18 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
bam_destroy1(h->b);
h->b = NULL;
} else {
- fprintf(pysam_stderr, "[%s] failed to read first record from %s\n",
- __func__, fn[i]);
+ print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
goto fail;
}
}
// Open output file and write header
if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) {
- fprintf(pysam_stderr, "[%s] failed to create \"%s\": %s\n", __func__, out, strerror(errno));
+ print_error_errno(cmd, "failed to create \"%s\"", out);
return -1;
}
if (sam_hdr_write(fpout, hout) != 0) {
- fprintf(pysam_stderr, "[%s] failed to write header.\n", __func__);
+ print_error_errno(cmd, "failed to write header to \"%s\"", out);
sam_close(fpout);
return -1;
}
@@ -1356,7 +1366,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
}
if (sam_write1(fpout, hout, b) < 0) {
- fprintf(pysam_stderr, "[%s] failed to write to output file.\n", __func__);
+ print_error_errno(cmd, "failed writing to \"%s\"", out);
sam_close(fpout);
return -1;
}
@@ -1369,8 +1379,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
bam_destroy1(heap->b);
heap->b = NULL;
} else {
- fprintf(pysam_stderr, "[bam_merge_core] error: '%s' is truncated.\n",
- fn[heap->i]);
+ print_error(cmd, "\"%s\" is truncated", fn[heap->i]);
goto fail;
}
ks_heapadjust(heap, 0, n, heap);
@@ -1392,13 +1401,13 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
free_merged_header(merged_hdr);
free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
if (sam_close(fpout) < 0) {
- fprintf(pysam_stderr, "[bam_merge_core] error closing output file\n");
+ print_error(cmd, "error closing output file");
return -1;
}
return 0;
mem_fail:
- fprintf(pysam_stderr, "[bam_merge_core] Out of memory\n");
+ print_error(cmd, "Out of memory");
fail:
if (flag & MERGE_RG) {
@@ -1432,7 +1441,7 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch
strcpy(mode, "wb");
if (flag & MERGE_UNCOMP) strcat(mode, "0");
else if (flag & MERGE_LEVEL1) strcat(mode, "1");
- return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, NULL, NULL);
+ return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, "merge", NULL, NULL);
}
static void merge_usage(FILE *to)
@@ -1452,15 +1461,13 @@ static void merge_usage(FILE *to)
" -c Combine @RG headers with colliding IDs [alter IDs to be distinct]\n"
" -p Combine @PG headers with colliding IDs [alter IDs to be distinct]\n"
" -s VALUE Override random seed\n"
-" -b FILE List of input BAM filenames, one per line [null]\n"
-" -@, --threads INT\n"
-" Number of BAM/CRAM compression threads [0]\n");
- sam_global_opt_help(to, "-.O..");
+" -b FILE List of input BAM filenames, one per line [null]\n");
+ sam_global_opt_help(to, "-.O..@");
}
int bam_merge(int argc, char *argv[])
{
- int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1;
+ int c, is_by_qname = 0, flag = 0, ret = 0, level = -1;
char *fn_headers = NULL, *reg = NULL, mode[12];
long random_seed = (long)time(NULL);
char** fn = NULL;
@@ -1468,7 +1475,7 @@ int bam_merge(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ "threads", required_argument, NULL, '@' },
{ NULL, 0, NULL, 0 }
};
@@ -1488,7 +1495,6 @@ int bam_merge(int argc, char *argv[])
case 'u': flag |= MERGE_UNCOMP; level = 0; break;
case 'R': reg = strdup(optarg); break;
case 'l': level = atoi(optarg); break;
- case '@': n_threads = atoi(optarg); break;
case 'c': flag |= MERGE_COMBINE_RG; break;
case 'p': flag |= MERGE_COMBINE_PG; break;
case 's': random_seed = atol(optarg); break;
@@ -1502,9 +1508,10 @@ int bam_merge(int argc, char *argv[])
if (fn == NULL) { ret = 1; goto end; }
memcpy(fn+fn_size, fn_read, nfiles * sizeof(char*));
fn_size += nfiles;
+ free(fn_read);
}
else {
- fprintf(pysam_stderr, "[%s] Invalid file list \"%s\"\n", __func__, optarg);
+ print_error("merge", "Invalid file list \"%s\"", optarg);
ret = 1;
}
break;
@@ -1516,7 +1523,7 @@ int bam_merge(int argc, char *argv[])
}
}
if ( argc - optind < 1 ) {
- fprintf(pysam_stderr, "You must at least specify the output file.\n");
+ print_error("merge", "You must at least specify the output file");
merge_usage(pysam_stderr);
return 1;
}
@@ -1539,7 +1546,7 @@ int bam_merge(int argc, char *argv[])
memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*));
}
if (fn_size+nargcfiles < 1) {
- fprintf(pysam_stderr, "You must specify at least one (and usually two or more) input files.\n");
+ print_error("merge", "You must specify at least one (and usually two or more) input files");
merge_usage(pysam_stderr);
return 1;
}
@@ -1547,8 +1554,8 @@ int bam_merge(int argc, char *argv[])
sam_open_mode(mode+1, argv[optind], NULL);
if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
if (bam_merge_core2(is_by_qname, argv[optind], mode, fn_headers,
- fn_size+nargcfiles, fn, flag, reg, n_threads,
- &ga.in, &ga.out) < 0)
+ fn_size+nargcfiles, fn, flag, reg, ga.nthreads,
+ "merge", &ga.in, &ga.out) < 0)
ret = 1;
end:
@@ -1653,18 +1660,30 @@ static void *worker(void *data)
name = (char*)calloc(strlen(w->prefix) + 20, 1);
if (!name) { w->error = errno; return 0; }
sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
- if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0)
- w->error = errno;
-
-// Consider using CRAM temporary files if the final output is CRAM.
-// Typically it is comparable speed while being smaller.
-// hts_opt opt[2] = {
-// {"version=3.0", CRAM_OPT_VERSION, {"3.0"}, NULL},
-// {"no_ref", CRAM_OPT_NO_REF, {1}, NULL}
-// };
-// opt[0].next = &opt[1];
-// if (write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt) < 0)
-// w->error = errno;
+
+ uint32_t max_ncigar = 0;
+ int i;
+ for (i = 0; i < w->buf_len; i++) {
+ uint32_t nc = w->buf[i]->core.n_cigar;
+ if (max_ncigar < nc)
+ max_ncigar = nc;
+ }
+
+ if (max_ncigar > 65535) {
+ htsFormat fmt;
+ memset(&fmt, 0, sizeof(fmt));
+ if (hts_parse_format(&fmt, "cram,version=3.0,no_ref,seqs_per_slice=1000") < 0) {
+ w->error = errno;
+ free(name);
+ return 0;
+ }
+
+ if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt) < 0)
+ w->error = errno;
+ } else {
+ if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0)
+ w->error = errno;
+ }
free(name);
return 0;
@@ -1699,7 +1718,8 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
for (i = 0; i < n_threads; ++i) {
pthread_join(tid[i], 0);
if (w[i].error != 0) {
- fprintf(pysam_stderr, "[bam_sort_core] failed to create temporary file \"%s.%.4d.bam\": %s\n", prefix, w[i].index, strerror(w[i].error));
+ errno = w[i].error;
+ print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index);
n_failed++;
}
}
@@ -1743,17 +1763,23 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
buf = NULL;
fp = sam_open_format(fn, "r", in_fmt);
if (fp == NULL) {
- const char *message = strerror(errno);
- fprintf(pysam_stderr, "[bam_sort_core] fail to open '%s': %s\n", fn, message);
+ print_error_errno("sort", "can't open \"%s\"", fn);
return -2;
}
header = sam_hdr_read(fp);
if (header == NULL) {
- fprintf(pysam_stderr, "[bam_sort_core] failed to read header for '%s'\n", fn);
+ print_error("sort", "failed to read header from \"%s\"", fn);
goto err;
}
if (is_by_qname) change_SO(header, "queryname");
else change_SO(header, "coordinate");
+
+ // No gain to using the thread pool here as the flow of this code
+ // is such that we are *either* reading *or* sorting. Hence a shared
+ // pool makes no real difference except to reduce the thread count a little.
+ if (n_threads > 1)
+ hts_set_threads(fp, n_threads);
+
// write sub files
for (;;) {
if (k == max_k) {
@@ -1782,7 +1808,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
}
}
if (ret != -1) {
- fprintf(pysam_stderr, "[bam_sort_core] truncated file. Aborting.\n");
+ print_error("sort", "truncated file. Aborting");
ret = -1;
goto err;
}
@@ -1791,7 +1817,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
if (n_files == 0) { // a single block
ks_mergesort(sort, k, buf, 0);
if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) {
- fprintf(pysam_stderr, "[bam_sort_core] failed to create \"%s\": %s\n", fnout, strerror(errno));
+ print_error_errno("sort", "failed to create \"%s\"", fnout);
ret = -1;
goto err;
}
@@ -1810,7 +1836,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
}
if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns,
MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO,
- NULL, n_threads, in_fmt, out_fmt) < 0) {
+ NULL, n_threads, "sort", in_fmt, out_fmt) < 0) {
// Propagate bam_merge_core2() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
goto err;
@@ -1853,23 +1879,38 @@ static void sort_usage(FILE *fp)
" -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
" -n Sort by read name\n"
" -o FILE Write final output to FILE rather than standard output\n"
-" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"
-" -@, --threads INT\n"
-" Set number of sorting and compression threads [1]\n");
- sam_global_opt_help(fp, "-.O..");
+" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n");
+ sam_global_opt_help(fp, "-.O..@");
+}
+
+static void complain_about_memory_setting(size_t max_mem) {
+ char *suffix = "";
+ const size_t nine_k = 9<<10;
+ if (max_mem > nine_k) { max_mem >>= 10; suffix = "K"; }
+ if (max_mem > nine_k) { max_mem >>= 10; suffix = "M"; }
+
+ fprintf(pysam_stderr,
+"[bam_sort] -m setting (%zu%s bytes) is less than the minimum required (%zuM).\n\n"
+"Trying to run with -m too small can lead to the creation of a very large number\n"
+"of temporary files. This may make sort fail due to it exceeding limits on the\n"
+"number of files it can have open at the same time.\n\n"
+"Please check your -m parameter. It should be an integer followed by one of the\n"
+"letters K (for kilobytes), M (megabytes) or G (gigabytes). You should ensure it\n"
+"is at least the minimum above, and much higher if you are sorting a large file.\n",
+ max_mem, suffix, SORT_MIN_MEGS_PER_THREAD);
}
int bam_sort(int argc, char *argv[])
{
- size_t max_mem = 768<<20; // 512MB
- int c, nargs, is_by_qname = 0, ret, o_seen = 0, n_threads = 0, level = -1;
+ size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20;
+ int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1;
char *fnout = "-", modeout[12];
kstring_t tmpprefix = { 0, 0, NULL };
struct stat st;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ "threads", required_argument, NULL, '@' },
{ NULL, 0, NULL, 0 }
};
@@ -1887,7 +1928,6 @@ int bam_sort(int argc, char *argv[])
break;
}
case 'T': kputs(optarg, &tmpprefix); break;
- case '@': n_threads = atoi(optarg); break;
case 'l': level = atoi(optarg); break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
@@ -1912,6 +1952,12 @@ int bam_sort(int argc, char *argv[])
goto sort_end;
}
+ if (max_mem < (SORT_MIN_MEGS_PER_THREAD << 20)) {
+ complain_about_memory_setting(max_mem);
+ ret = EXIT_FAILURE;
+ goto sort_end;
+ }
+
strcpy(modeout, "wb");
sam_open_mode(modeout+1, fnout, NULL);
if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9);
@@ -1927,7 +1973,7 @@ int bam_sort(int argc, char *argv[])
}
ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-",
- tmpprefix.s, fnout, modeout, max_mem, n_threads,
+ tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
&ga.in, &ga.out);
if (ret >= 0)
ret = EXIT_SUCCESS;
diff --git a/samtools/bam_split.c b/samtools/bam_split.c
index 9a2998a..9bb2030 100644
--- a/samtools/bam_split.c
+++ b/samtools/bam_split.c
@@ -1,6 +1,6 @@
/* bam_split.c -- split subcommand.
- Copyright (C) 2013-2015 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Martin Pollard <mp15 at sanger.ac.uk>
@@ -34,7 +34,10 @@ DEALINGS IN THE SOFTWARE. */
#include <regex.h>
#include <htslib/khash.h>
#include <htslib/kstring.h>
+#include <htslib/cram.h>
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
+#include "samtools.h"
KHASH_MAP_INIT_STR(c2i, int)
@@ -61,6 +64,7 @@ struct state {
samFile** rg_output_file;
bam_hdr_t** rg_output_header;
kh_c2i_t* rg_hash;
+ htsThreadPool p;
};
typedef struct state state_t;
@@ -78,7 +82,7 @@ static void usage(FILE *write_to)
" -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n"
" -u FILE1:FILE2 ...and override the header with FILE2\n"
" -v verbose output\n");
- sam_global_opt_help(write_to, "-....");
+ sam_global_opt_help(write_to, "-....@");
fprintf(write_to,
"\n"
"Format string expansions:\n"
@@ -95,11 +99,11 @@ static parsed_opts_t* parse_args(int argc, char** argv)
{
if (argc == 1) { usage(stdout); return NULL; }
- const char* optstring = "vf:u:";
+ const char* optstring = "vf:u:@:";
char* delim;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
@@ -143,7 +147,7 @@ static parsed_opts_t* parse_args(int argc, char** argv)
argv += optind;
if (argc != 1) {
- fprintf(stderr, "Invalid number of arguments: %d\n", argc);
+ print_error("split", "Invalid number of arguments: %d", argc);
usage(stderr);
free(retval);
return NULL;
@@ -270,7 +274,7 @@ static bool count_RG(bam_hdr_t* hdr, size_t* count, char*** output_name)
// Filters a header of @RG lines where ID != id_keep
// TODO: strip @PG's descended from other RGs and their descendants
-static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep)
+static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep, const char *arg_list)
{
kstring_t str = {0, 0, NULL};
@@ -315,28 +319,52 @@ static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep)
free(hdr->text);
hdr->text = ks_release(&str);
+ // Add the PG line
+ SAM_hdr *sh = sam_hdr_parse_(hdr->text, hdr->l_text);
+ if (sam_hdr_add_PG(sh, "samtools",
+ "VN", samtools_version(),
+ arg_list ? "CL": NULL,
+ arg_list ? arg_list : NULL,
+ NULL) != 0)
+ return -1;
+
+ free(hdr->text);
+ hdr->text = strdup(sam_hdr_str(sh));
+ hdr->l_text = sam_hdr_length(sh);
+ if (!hdr->text)
+ return false;
+ sam_hdr_free(sh);
+
return true;
}
// Set the initial state
-static state_t* init(parsed_opts_t* opts)
+static state_t* init(parsed_opts_t* opts, const char *arg_list)
{
state_t* retval = calloc(sizeof(state_t), 1);
if (!retval) {
- fprintf(stderr, "Out of memory");
+ print_error_errno("split", "Initialisation failed");
return NULL;
}
+ if (opts->ga.nthreads > 0) {
+ if (!(retval->p.pool = hts_tpool_init(opts->ga.nthreads))) {
+ fprintf(stderr, "Error creating thread pool\n");
+ return NULL;
+ }
+ }
+
retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in);
if (!retval->merged_input_file) {
- fprintf(stderr, "Could not open input file (%s)\n", opts->merged_input_name);
+ print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name);
free(retval);
return NULL;
}
+ if (retval->p.pool)
+ hts_set_opt(retval->merged_input_file, HTS_OPT_THREAD_POOL, &retval->p);
retval->merged_input_header = sam_hdr_read(retval->merged_input_file);
if (retval->merged_input_header == NULL) {
- fprintf(stderr, "Could not read header for file '%s'\n",
- opts->merged_input_name);
+ print_error("split", "Could not read header from \"%s\"", opts->merged_input_name);
cleanup_state(retval, false);
return NULL;
}
@@ -345,14 +373,13 @@ static state_t* init(parsed_opts_t* opts)
if (opts->unaccounted_header_name) {
samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in);
if (!hdr_load) {
- fprintf(stderr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name);
+ print_error_errno("split", "Could not open unaccounted header file \"%s\"", opts->unaccounted_header_name);
cleanup_state(retval, false);
return NULL;
}
retval->unaccounted_header = sam_hdr_read(hdr_load);
if (retval->unaccounted_header == NULL) {
- fprintf(stderr, "Could not read header for file '%s'\n",
- opts->unaccounted_header_name);
+ print_error("split", "Could not read header from \"%s\"", opts->unaccounted_header_name);
cleanup_state(retval, false);
return NULL;
}
@@ -363,10 +390,12 @@ static state_t* init(parsed_opts_t* opts)
retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
if (retval->unaccounted_file == NULL) {
- fprintf(stderr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name);
+ print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name);
cleanup_state(retval, false);
return NULL;
}
+ if (retval->p.pool)
+ hts_set_opt(retval->unaccounted_file, HTS_OPT_THREAD_POOL, &retval->p);
}
// Open output files for RGs
@@ -378,7 +407,7 @@ static state_t* init(parsed_opts_t* opts)
retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*));
retval->rg_hash = kh_init_c2i();
if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) {
- fprintf(stderr, "Could not allocate memory for output file array. Out of memory?");
+ print_error_errno("split", "Could not initialise output file array");
cleanup_state(retval, false);
return NULL;
}
@@ -386,7 +415,7 @@ static state_t* init(parsed_opts_t* opts)
char* dirsep = strrchr(opts->merged_input_name, '/');
char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
if (!input_base_name) {
- fprintf(stderr, "Out of memory\n");
+ print_error_errno("split", "Filename manipulation failed");
cleanup_state(retval, false);
return NULL;
}
@@ -403,7 +432,7 @@ static state_t* init(parsed_opts_t* opts)
&opts->ga.out);
if ( output_filename == NULL ) {
- fprintf(stderr, "Error expanding output filename format string.\n");
+ print_error("split", "Error expanding output filename format string");
cleanup_state(retval, false);
free(input_base_name);
return NULL;
@@ -412,11 +441,13 @@ static state_t* init(parsed_opts_t* opts)
retval->rg_output_file_name[i] = output_filename;
retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
if (retval->rg_output_file[i] == NULL) {
- fprintf(stderr, "Could not open output file: %s\n", output_filename);
+ print_error_errno("split", "Could not open \"%s\"", output_filename);
cleanup_state(retval, false);
free(input_base_name);
return NULL;
}
+ if (retval->p.pool)
+ hts_set_opt(retval->rg_output_file[i], HTS_OPT_THREAD_POOL, &retval->p);
// Record index in hash
int ret;
@@ -425,8 +456,8 @@ static state_t* init(parsed_opts_t* opts)
// Set and edit header
retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header);
- if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i]) ) {
- fprintf(stderr, "Could not rewrite header for file: %s\n", output_filename);
+ if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i], arg_list) ) {
+ print_error("split", "Could not rewrite header for \"%s\"", output_filename);
cleanup_state(retval, false);
free(input_base_name);
return NULL;
@@ -441,14 +472,13 @@ static state_t* init(parsed_opts_t* opts)
static bool split(state_t* state)
{
if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
- fprintf(stderr, "Could not write output file header\n");
+ print_error_errno("split", "Could not write output file header");
return false;
}
size_t i;
for (i = 0; i < state->output_count; i++) {
if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
- fprintf(stderr, "Could not write output file header for '%s'\n",
- state->rg_output_file_name[i]);
+ print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]);
return false;
}
}
@@ -461,7 +491,7 @@ static bool split(state_t* state)
bam_destroy1(file_read);
file_read = NULL;
if (r < -1) {
- fprintf(stderr, "Could not read first input record\n");
+ print_error("split", "Could not read first input record");
return false;
}
}
@@ -482,8 +512,7 @@ static bool split(state_t* state)
// if found write to the appropriate untangled bam
int i = kh_val(state->rg_hash,iter);
if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
- fprintf(stderr, "Could not write to output file '%s'\n",
- state->rg_output_file_name[i]);
+ print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]);
bam_destroy1(file_read);
return false;
}
@@ -499,7 +528,7 @@ static bool split(state_t* state)
return false;
} else {
if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
- fprintf(stderr, "Could not write to unaccounted output file\n");
+ print_error_errno("split", "Could not write to unaccounted output file");
bam_destroy1(file_read);
return false;
}
@@ -512,7 +541,7 @@ static bool split(state_t* state)
bam_destroy1(file_read);
file_read = NULL;
if (r < -1) {
- fprintf(stderr, "Could not read input record\n");
+ print_error("split", "Could not read input record");
return false;
}
}
@@ -529,7 +558,7 @@ static int cleanup_state(state_t* status, bool check_close)
if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header);
if (status->unaccounted_file) {
if (sam_close(status->unaccounted_file) < 0 && check_close) {
- fprintf(stderr, "Error on closing unaccounted file\n");
+ print_error("split", "Error on closing unaccounted file");
ret = -1;
}
}
@@ -540,8 +569,7 @@ static int cleanup_state(state_t* status, bool check_close)
bam_hdr_destroy(status->rg_output_header[i]);
if (status->rg_output_file && status->rg_output_file[i]) {
if (sam_close(status->rg_output_file[i]) < 0 && check_close) {
- fprintf(stderr, "Error on closing output file '%s'\n",
- status->rg_output_file_name[i]);
+ print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]);
ret = -1;
}
}
@@ -557,6 +585,9 @@ static int cleanup_state(state_t* status, bool check_close)
free(status->rg_id);
free(status);
+ if (status->p.pool)
+ hts_tpool_destroy(status->p.pool);
+
return ret;
}
@@ -574,9 +605,10 @@ static void cleanup_opts(parsed_opts_t* opts)
int main_split(int argc, char** argv)
{
int ret = 1;
+ char *arg_list = stringify_argv(argc+1, argv-1);
parsed_opts_t* opts = parse_args(argc, argv);
if (!opts) goto cleanup_opts;
- state_t* status = init(opts);
+ state_t* status = init(opts, arg_list);
if (!status) goto cleanup_opts;
if (!split(status)) {
@@ -588,6 +620,7 @@ int main_split(int argc, char** argv)
cleanup_opts:
cleanup_opts(opts);
+ free(arg_list);
return ret;
}
diff --git a/samtools/bam_split.c.pysam.c b/samtools/bam_split.c.pysam.c
index 2348f48..8a584ed 100644
--- a/samtools/bam_split.c.pysam.c
+++ b/samtools/bam_split.c.pysam.c
@@ -2,7 +2,7 @@
/* bam_split.c -- split subcommand.
- Copyright (C) 2013-2015 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Martin Pollard <mp15 at sanger.ac.uk>
@@ -36,7 +36,10 @@ DEALINGS IN THE SOFTWARE. */
#include <regex.h>
#include <htslib/khash.h>
#include <htslib/kstring.h>
+#include <htslib/cram.h>
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
+#include "samtools.h"
KHASH_MAP_INIT_STR(c2i, int)
@@ -63,6 +66,7 @@ struct state {
samFile** rg_output_file;
bam_hdr_t** rg_output_header;
kh_c2i_t* rg_hash;
+ htsThreadPool p;
};
typedef struct state state_t;
@@ -80,7 +84,7 @@ static void usage(FILE *write_to)
" -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n"
" -u FILE1:FILE2 ...and override the header with FILE2\n"
" -v verbose output\n");
- sam_global_opt_help(write_to, "-....");
+ sam_global_opt_help(write_to, "-....@");
fprintf(write_to,
"\n"
"Format string expansions:\n"
@@ -97,11 +101,11 @@ static parsed_opts_t* parse_args(int argc, char** argv)
{
if (argc == 1) { usage(pysam_stdout); return NULL; }
- const char* optstring = "vf:u:";
+ const char* optstring = "vf:u:@:";
char* delim;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
@@ -145,7 +149,7 @@ static parsed_opts_t* parse_args(int argc, char** argv)
argv += optind;
if (argc != 1) {
- fprintf(pysam_stderr, "Invalid number of arguments: %d\n", argc);
+ print_error("split", "Invalid number of arguments: %d", argc);
usage(pysam_stderr);
free(retval);
return NULL;
@@ -272,7 +276,7 @@ static bool count_RG(bam_hdr_t* hdr, size_t* count, char*** output_name)
// Filters a header of @RG lines where ID != id_keep
// TODO: strip @PG's descended from other RGs and their descendants
-static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep)
+static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep, const char *arg_list)
{
kstring_t str = {0, 0, NULL};
@@ -317,28 +321,52 @@ static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep)
free(hdr->text);
hdr->text = ks_release(&str);
+ // Add the PG line
+ SAM_hdr *sh = sam_hdr_parse_(hdr->text, hdr->l_text);
+ if (sam_hdr_add_PG(sh, "samtools",
+ "VN", samtools_version(),
+ arg_list ? "CL": NULL,
+ arg_list ? arg_list : NULL,
+ NULL) != 0)
+ return -1;
+
+ free(hdr->text);
+ hdr->text = strdup(sam_hdr_str(sh));
+ hdr->l_text = sam_hdr_length(sh);
+ if (!hdr->text)
+ return false;
+ sam_hdr_free(sh);
+
return true;
}
// Set the initial state
-static state_t* init(parsed_opts_t* opts)
+static state_t* init(parsed_opts_t* opts, const char *arg_list)
{
state_t* retval = calloc(sizeof(state_t), 1);
if (!retval) {
- fprintf(pysam_stderr, "Out of memory");
+ print_error_errno("split", "Initialisation failed");
return NULL;
}
+ if (opts->ga.nthreads > 0) {
+ if (!(retval->p.pool = hts_tpool_init(opts->ga.nthreads))) {
+ fprintf(pysam_stderr, "Error creating thread pool\n");
+ return NULL;
+ }
+ }
+
retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in);
if (!retval->merged_input_file) {
- fprintf(pysam_stderr, "Could not open input file (%s)\n", opts->merged_input_name);
+ print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name);
free(retval);
return NULL;
}
+ if (retval->p.pool)
+ hts_set_opt(retval->merged_input_file, HTS_OPT_THREAD_POOL, &retval->p);
retval->merged_input_header = sam_hdr_read(retval->merged_input_file);
if (retval->merged_input_header == NULL) {
- fprintf(pysam_stderr, "Could not read header for file '%s'\n",
- opts->merged_input_name);
+ print_error("split", "Could not read header from \"%s\"", opts->merged_input_name);
cleanup_state(retval, false);
return NULL;
}
@@ -347,14 +375,13 @@ static state_t* init(parsed_opts_t* opts)
if (opts->unaccounted_header_name) {
samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in);
if (!hdr_load) {
- fprintf(pysam_stderr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name);
+ print_error_errno("split", "Could not open unaccounted header file \"%s\"", opts->unaccounted_header_name);
cleanup_state(retval, false);
return NULL;
}
retval->unaccounted_header = sam_hdr_read(hdr_load);
if (retval->unaccounted_header == NULL) {
- fprintf(pysam_stderr, "Could not read header for file '%s'\n",
- opts->unaccounted_header_name);
+ print_error("split", "Could not read header from \"%s\"", opts->unaccounted_header_name);
cleanup_state(retval, false);
return NULL;
}
@@ -365,10 +392,12 @@ static state_t* init(parsed_opts_t* opts)
retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
if (retval->unaccounted_file == NULL) {
- fprintf(pysam_stderr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name);
+ print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name);
cleanup_state(retval, false);
return NULL;
}
+ if (retval->p.pool)
+ hts_set_opt(retval->unaccounted_file, HTS_OPT_THREAD_POOL, &retval->p);
}
// Open output files for RGs
@@ -380,7 +409,7 @@ static state_t* init(parsed_opts_t* opts)
retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*));
retval->rg_hash = kh_init_c2i();
if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) {
- fprintf(pysam_stderr, "Could not allocate memory for output file array. Out of memory?");
+ print_error_errno("split", "Could not initialise output file array");
cleanup_state(retval, false);
return NULL;
}
@@ -388,7 +417,7 @@ static state_t* init(parsed_opts_t* opts)
char* dirsep = strrchr(opts->merged_input_name, '/');
char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
if (!input_base_name) {
- fprintf(pysam_stderr, "Out of memory\n");
+ print_error_errno("split", "Filename manipulation failed");
cleanup_state(retval, false);
return NULL;
}
@@ -405,7 +434,7 @@ static state_t* init(parsed_opts_t* opts)
&opts->ga.out);
if ( output_filename == NULL ) {
- fprintf(pysam_stderr, "Error expanding output filename format string.\n");
+ print_error("split", "Error expanding output filename format string");
cleanup_state(retval, false);
free(input_base_name);
return NULL;
@@ -414,11 +443,13 @@ static state_t* init(parsed_opts_t* opts)
retval->rg_output_file_name[i] = output_filename;
retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
if (retval->rg_output_file[i] == NULL) {
- fprintf(pysam_stderr, "Could not open output file: %s\n", output_filename);
+ print_error_errno("split", "Could not open \"%s\"", output_filename);
cleanup_state(retval, false);
free(input_base_name);
return NULL;
}
+ if (retval->p.pool)
+ hts_set_opt(retval->rg_output_file[i], HTS_OPT_THREAD_POOL, &retval->p);
// Record index in hash
int ret;
@@ -427,8 +458,8 @@ static state_t* init(parsed_opts_t* opts)
// Set and edit header
retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header);
- if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i]) ) {
- fprintf(pysam_stderr, "Could not rewrite header for file: %s\n", output_filename);
+ if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i], arg_list) ) {
+ print_error("split", "Could not rewrite header for \"%s\"", output_filename);
cleanup_state(retval, false);
free(input_base_name);
return NULL;
@@ -443,14 +474,13 @@ static state_t* init(parsed_opts_t* opts)
static bool split(state_t* state)
{
if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
- fprintf(pysam_stderr, "Could not write output file header\n");
+ print_error_errno("split", "Could not write output file header");
return false;
}
size_t i;
for (i = 0; i < state->output_count; i++) {
if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
- fprintf(pysam_stderr, "Could not write output file header for '%s'\n",
- state->rg_output_file_name[i]);
+ print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]);
return false;
}
}
@@ -463,7 +493,7 @@ static bool split(state_t* state)
bam_destroy1(file_read);
file_read = NULL;
if (r < -1) {
- fprintf(pysam_stderr, "Could not read first input record\n");
+ print_error("split", "Could not read first input record");
return false;
}
}
@@ -484,8 +514,7 @@ static bool split(state_t* state)
// if found write to the appropriate untangled bam
int i = kh_val(state->rg_hash,iter);
if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
- fprintf(pysam_stderr, "Could not write to output file '%s'\n",
- state->rg_output_file_name[i]);
+ print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]);
bam_destroy1(file_read);
return false;
}
@@ -501,7 +530,7 @@ static bool split(state_t* state)
return false;
} else {
if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
- fprintf(pysam_stderr, "Could not write to unaccounted output file\n");
+ print_error_errno("split", "Could not write to unaccounted output file");
bam_destroy1(file_read);
return false;
}
@@ -514,7 +543,7 @@ static bool split(state_t* state)
bam_destroy1(file_read);
file_read = NULL;
if (r < -1) {
- fprintf(pysam_stderr, "Could not read input record\n");
+ print_error("split", "Could not read input record");
return false;
}
}
@@ -531,7 +560,7 @@ static int cleanup_state(state_t* status, bool check_close)
if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header);
if (status->unaccounted_file) {
if (sam_close(status->unaccounted_file) < 0 && check_close) {
- fprintf(pysam_stderr, "Error on closing unaccounted file\n");
+ print_error("split", "Error on closing unaccounted file");
ret = -1;
}
}
@@ -542,8 +571,7 @@ static int cleanup_state(state_t* status, bool check_close)
bam_hdr_destroy(status->rg_output_header[i]);
if (status->rg_output_file && status->rg_output_file[i]) {
if (sam_close(status->rg_output_file[i]) < 0 && check_close) {
- fprintf(pysam_stderr, "Error on closing output file '%s'\n",
- status->rg_output_file_name[i]);
+ print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]);
ret = -1;
}
}
@@ -559,6 +587,9 @@ static int cleanup_state(state_t* status, bool check_close)
free(status->rg_id);
free(status);
+ if (status->p.pool)
+ hts_tpool_destroy(status->p.pool);
+
return ret;
}
@@ -576,9 +607,10 @@ static void cleanup_opts(parsed_opts_t* opts)
int main_split(int argc, char** argv)
{
int ret = 1;
+ char *arg_list = stringify_argv(argc+1, argv-1);
parsed_opts_t* opts = parse_args(argc, argv);
if (!opts) goto cleanup_opts;
- state_t* status = init(opts);
+ state_t* status = init(opts, arg_list);
if (!status) goto cleanup_opts;
if (!split(status)) {
@@ -590,6 +622,7 @@ int main_split(int argc, char** argv)
cleanup_opts:
cleanup_opts(opts);
+ free(arg_list);
return ret;
}
diff --git a/samtools/bam_stat.c b/samtools/bam_stat.c
index f6cf1d5..aa5f8d3 100644
--- a/samtools/bam_stat.c
+++ b/samtools/bam_stat.c
@@ -34,6 +34,7 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/sam.h"
#include "samtools.h"
+#include "sam_opts.h"
typedef struct {
long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2];
@@ -94,7 +95,8 @@ static const char *percent(char *buffer, long long n, long long total)
static void usage_exit(FILE *fp, int exit_status)
{
- fprintf(fp, "Usage: samtools flagstat [--input-fmt-option OPT=VAL] <in.bam>\n");
+ fprintf(fp, "Usage: samtools flagstat [options] <in.bam>\n");
+ sam_global_opt_help(fp, "-.---@");
exit(exit_status);
}
@@ -104,25 +106,23 @@ int bam_flagstat(int argc, char *argv[])
bam_hdr_t *header;
bam_flagstat_t *s;
char b0[16], b1[16];
- hts_opt *in_opts = NULL;
int c;
enum {
INPUT_FMT_OPTION = CHAR_MAX+1,
};
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- {"input-fmt-option", required_argument, NULL, INPUT_FMT_OPTION},
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'),
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) {
switch (c) {
- case INPUT_FMT_OPTION:
- if (hts_opt_add(&in_opts, optarg) < 0)
- usage_exit(stderr, EXIT_FAILURE);
- break;
- default:
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?':
usage_exit(stderr, EXIT_FAILURE);
}
}
@@ -131,15 +131,13 @@ int bam_flagstat(int argc, char *argv[])
if (argc == optind) usage_exit(stdout, EXIT_SUCCESS);
else usage_exit(stderr, EXIT_FAILURE);
}
- fp = sam_open(argv[optind], "r");
+ fp = sam_open_format(argv[optind], "r", &ga.in);
if (fp == NULL) {
print_error_errno("flagstat", "Cannot open input file \"%s\"", argv[optind]);
return 1;
}
- if (hts_opt_apply(fp, in_opts)) {
- fprintf(stderr, "Failed to apply input-fmt-options\n");
- return 1;
- }
+ if (ga.nthreads > 0)
+ hts_set_threads(fp, ga.nthreads);
if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS,
SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) {
@@ -174,6 +172,6 @@ int bam_flagstat(int argc, char *argv[])
free(s);
bam_hdr_destroy(header);
sam_close(fp);
- hts_opt_free(in_opts);
+ sam_global_args_free(&ga);
return 0;
}
diff --git a/samtools/bam_stat.c.pysam.c b/samtools/bam_stat.c.pysam.c
index cdca4dd..bbfe602 100644
--- a/samtools/bam_stat.c.pysam.c
+++ b/samtools/bam_stat.c.pysam.c
@@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/sam.h"
#include "samtools.h"
+#include "sam_opts.h"
typedef struct {
long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2];
@@ -94,10 +95,11 @@ static const char *percent(char *buffer, long long n, long long total)
return buffer;
}
-static void usage_exit(FILE *fp, int exit_status)
+static int usage_exit(FILE *fp, int exit_status)
{
- fprintf(fp, "Usage: samtools flagstat [--input-fmt-option OPT=VAL] <in.bam>\n");
- exit(exit_status);
+ fprintf(fp, "Usage: samtools flagstat [options] <in.bam>\n");
+ sam_global_opt_help(fp, "-.---@");
+ return(exit_status);
}
int bam_flagstat(int argc, char *argv[])
@@ -106,42 +108,38 @@ int bam_flagstat(int argc, char *argv[])
bam_hdr_t *header;
bam_flagstat_t *s;
char b0[16], b1[16];
- hts_opt *in_opts = NULL;
int c;
enum {
INPUT_FMT_OPTION = CHAR_MAX+1,
};
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- {"input-fmt-option", required_argument, NULL, INPUT_FMT_OPTION},
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'),
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) {
switch (c) {
- case INPUT_FMT_OPTION:
- if (hts_opt_add(&in_opts, optarg) < 0)
- usage_exit(pysam_stderr, EXIT_FAILURE);
- break;
- default:
- usage_exit(pysam_stderr, EXIT_FAILURE);
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?':
+ return(usage_exit(pysam_stderr, EXIT_FAILURE));
}
}
if (argc != optind+1) {
- if (argc == optind) usage_exit(pysam_stdout, EXIT_SUCCESS);
- else usage_exit(pysam_stderr, EXIT_FAILURE);
+ if (argc == optind) return(usage_exit(pysam_stdout, EXIT_SUCCESS));
+ else return(usage_exit(pysam_stderr, EXIT_FAILURE));
}
- fp = sam_open(argv[optind], "r");
+ fp = sam_open_format(argv[optind], "r", &ga.in);
if (fp == NULL) {
print_error_errno("flagstat", "Cannot open input file \"%s\"", argv[optind]);
return 1;
}
- if (hts_opt_apply(fp, in_opts)) {
- fprintf(pysam_stderr, "Failed to apply input-fmt-options\n");
- return 1;
- }
+ if (ga.nthreads > 0)
+ hts_set_threads(fp, ga.nthreads);
if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS,
SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) {
@@ -176,6 +174,6 @@ int bam_flagstat(int argc, char *argv[])
free(s);
bam_hdr_destroy(header);
sam_close(fp);
- hts_opt_free(in_opts);
+ sam_global_args_free(&ga);
return 0;
}
diff --git a/samtools/bam_tview.c b/samtools/bam_tview.c
deleted file mode 100644
index f1f0cc7..0000000
--- a/samtools/bam_tview.c
+++ /dev/null
@@ -1,441 +0,0 @@
-/* bam_tview.c -- tview subcommand.
-
- Copyright (C) 2008-2015 Genome Research Ltd.
- Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Heng Li <lh3 at sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notices and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <regex.h>
-#include <assert.h>
-#include "bam_tview.h"
-#include <htslib/faidx.h>
-#include <htslib/sam.h>
-#include <htslib/bgzf.h>
-#include "sam_opts.h"
-
-khash_t(kh_rg)* get_rg_sample(const char* header, const char* sample)
-{
- khash_t(kh_rg)* rg_hash = kh_init(kh_rg);
- // given sample id return all the RD ID's
- const char rg_regex[] = "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)";
-
- regex_t rg_id;
- regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t));
- if (matches == NULL) { perror("out of memory"); exit(-1); }
- regcomp(&rg_id, rg_regex, REG_EXTENDED|REG_NEWLINE);
- char* text = strdup(header);
- char* end = text + strlen(header);
- char* tofree = text;
- while (end > text && regexec(&rg_id, text, 2, matches, 0) == 0) { // foreach rg id in header
- int ret;
- text[matches[1].rm_eo] = '\0';
- kh_put(kh_rg, rg_hash, strdup(text+matches[1].rm_so), &ret); // Add the RG to the list
- text += matches[0].rm_eo + 1; // Move search pointer forward
- }
- free(tofree);
- return rg_hash;
-}
-
-int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt)
-{
- assert(tv!=NULL);
- assert(fn!=NULL);
- tv->mrow = 24; tv->mcol = 80;
- tv->color_for = TV_COLOR_MAPQ;
- tv->is_dot = 1;
-
- tv->fp = sam_open_format(fn, "r", fmt);
- if(tv->fp == NULL)
- {
- fprintf(stderr,"sam_open %s. %s\n", fn,fn_fa);
- exit(EXIT_FAILURE);
- }
- // TODO bgzf_set_cache_size(tv->fp->fp.bgzf, 8 * 1024 *1024);
- assert(tv->fp);
-
- tv->header = sam_hdr_read(tv->fp);
- if(tv->header == NULL)
- {
- fprintf(stderr,"Cannot read '%s'.\n", fn);
- exit(EXIT_FAILURE);
- }
- tv->idx = sam_index_load(tv->fp, fn);
- if (tv->idx == NULL)
- {
- fprintf(stderr,"Cannot read index for '%s'.\n", fn);
- exit(EXIT_FAILURE);
- }
- tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv);
- if (fn_fa) tv->fai = fai_load(fn_fa);
- tv->bca = bcf_call_init(0.83, 13);
- tv->ins = 1;
-
- // If the user has asked for specific samples find out create a list of readgroups make up these samples
- if ( samples )
- {
- tv->rg_hash = get_rg_sample(tv->header->text, samples); // Init the list of rg's
- }
-
- return 0;
-}
-
-
-void base_tv_destroy(tview_t* tv)
-{
- bam_lplbuf_destroy(tv->lplbuf);
- bcf_call_destroy(tv->bca);
- hts_idx_destroy(tv->idx);
- if (tv->fai) fai_destroy(tv->fai);
- free(tv->ref);
- bam_hdr_destroy(tv->header);
- sam_close(tv->fp);
-}
-
-
-int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
-{
- tview_t *tv = (tview_t*)data;
- int i, j, c, rb, attr, max_ins = 0;
- uint32_t call = 0;
- if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen
- // print reference
- rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N';
- for (i = tv->last_pos + 1; i < pos; ++i) {
- if (i%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", i+1);
- c = tv->ref? tv->ref[i - tv->left_pos] : 'N';
- tv->my_mvaddch(tv,1, tv->ccol++, c);
- }
- if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1);
- { // call consensus
- bcf_callret1_t bcr;
- memset(&bcr, 0, sizeof bcr);
- int qsum[4], a1, a2, tmp;
- double p[3], prior = 30;
- bcf_call_glfgen(n, pl, seq_nt16_table[rb], tv->bca, &bcr);
- for (i = 0; i < 4; ++i) qsum[i] = ((int)bcr.qsum[i])<<2 | i;
- for (i = 1; i < 4; ++i) // insertion sort
- for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j)
- tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp;
- a1 = qsum[0]&3; a2 = qsum[1]&3;
- p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2];
- if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3;
- if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3;
- if (p[0] < p[1] && p[0] < p[2]) call = (1<<a1)<<16 | (int)((p[1]<p[2]?p[1]:p[2]) - p[0] + .499);
- else if (p[2] < p[1] && p[2] < p[0]) call = (1<<a2)<<16 | (int)((p[0]<p[1]?p[0]:p[1]) - p[2] + .499);
- else call = (1<<a1|1<<a2)<<16 | (int)((p[0]<p[2]?p[0]:p[2]) - p[1] + .499);
- }
- attr = tv->my_underline(tv);
- c = ",ACMGRSVTWYHKDBN"[call>>16&0xf];
- i = (call&0xffff)/10+1;
- if (i > 4) i = 4;
- attr |= tv->my_colorpair(tv,i);
- if (c == toupper(rb)) c = '.';
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,2, tv->ccol, c);
- tv->my_attroff(tv,attr);
- if(tv->ins) {
- // calculate maximum insert
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel;
- }
- }
- // core loop
- for (j = 0; j <= max_ins; ++j) {
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- int row = TV_MIN_ALNROW + p->level - tv->row_shift;
- if (j == 0) {
- if (!p->is_del) {
- if (tv->base_for == TV_BASE_COLOR_SPACE &&
- (c = bam_aux_getCSi(p->b, p->qpos))) {
- // assume that if we found one color, we will be able to get the color error
- if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam_is_rev(p->b)? ',' : '.';
- } else {
- if (tv->show_name) {
- char *name = bam_get_qname(p->b);
- c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos];
- } else {
- c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)];
- if (tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.';
- }
- }
- } else c = p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*';
- } else { // padding
- if (j > p->indel) c = '*';
- else { // insertion
- if (tv->base_for == TV_BASE_NUCL) {
- if (tv->show_name) {
- char *name = bam_get_qname(p->b);
- c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j];
- } else {
- c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)];
- if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.';
- }
- } else {
- c = bam_aux_getCSi(p->b, p->qpos + j);
- if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam_is_rev(p->b)? ',' : '.';
- }
- }
- }
- if (row > TV_MIN_ALNROW && row < tv->mrow) {
- int x;
- attr = 0;
- if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR))
- || (p->b->core.flag & BAM_FSECONDARY)) attr |= tv->my_underline(tv);
- if (tv->color_for == TV_COLOR_BASEQ) {
- x = bam_get_qual(p->b)[p->qpos]/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- } else if (tv->color_for == TV_COLOR_MAPQ) {
- x = p->b->core.qual/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- } else if (tv->color_for == TV_COLOR_NUCL) {
- x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)] + 5;
- attr |= tv->my_colorpair(tv,x);
- } else if(tv->color_for == TV_COLOR_COL) {
- x = 0;
- switch(bam_aux_getCSi(p->b, p->qpos)) {
- case '0': x = 0; break;
- case '1': x = 1; break;
- case '2': x = 2; break;
- case '3': x = 3; break;
- case '4': x = 4; break;
- default: x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)]; break;
- }
- x+=5;
- attr |= tv->my_colorpair(tv,x);
- } else if(tv->color_for == TV_COLOR_COLQ) {
- x = bam_aux_getCQi(p->b, p->qpos);
- if(0 == x) x = bam_get_qual(p->b)[p->qpos];
- x = x/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- }
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,row, tv->ccol, bam_is_rev(p->b)? tolower(c) : toupper(c));
- tv->my_attroff(tv,attr);
- }
- }
- c = j? '*' : rb;
- if (c == '*') {
- attr = tv->my_colorpair(tv,8);
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,1, tv->ccol++, c);
- tv->my_attroff(tv,attr);
- } else tv->my_mvaddch(tv,1, tv->ccol++, c);
- }
- tv->last_pos = pos;
- return 0;
-}
-
-
-
-
-static int tv_push_aln(const bam1_t *b, tview_t *tv)
-{
- /* If we are restricted to specific readgroups check RG is in the list */
- if ( tv->rg_hash )
- {
- const uint8_t *rg = bam_aux_get(b, "RG");
- if ( !rg ) return 0; // If we don't have an RG tag exclude read
- khiter_t k = kh_get(kh_rg, tv->rg_hash, (const char*)(rg + 1));
- if ( k == kh_end(tv->rg_hash) ) return 0; // if RG tag is not in list of allowed tags exclude read
- }
- if (tv->no_skip) {
- uint32_t *cigar = bam_get_cigar(b); // this is cheating...
- int i;
- for (i = 0; i <b->core.n_cigar; ++i) {
- if ((cigar[i]&0xf) == BAM_CREF_SKIP)
- cigar[i] = cigar[i]>>4<<4 | BAM_CDEL;
- }
- }
- bam_lplbuf_push(b, tv->lplbuf);
- return 0;
-}
-
-int base_draw_aln(tview_t *tv, int tid, int pos)
-{
- assert(tv!=NULL);
- // reset
- tv->my_clear(tv);
- tv->curr_tid = tid; tv->left_pos = pos;
- tv->last_pos = tv->left_pos - 1;
- tv->ccol = 0;
- // print ref and consensus
- if (tv->fai) {
- char *str;
- if (tv->ref) free(tv->ref);
- assert(tv->curr_tid>=0);
-
- str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1);
- assert(str!=NULL);
- sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol);
- tv->ref = fai_fetch(tv->fai, str, &tv->l_ref);
- free(str);
- if ( !tv->ref )
- {
- fprintf(stderr,"Could not read the reference sequence. Is it seekable (plain text or compressed + .gzi indexed with bgzip)?\n");
- exit(1);
- }
- }
- // draw aln
- bam_lplbuf_reset(tv->lplbuf);
- hts_itr_t *iter = sam_itr_queryi(tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol);
- bam1_t *b = bam_init1();
- while (sam_itr_next(tv->fp, iter, b) >= 0) tv_push_aln(b, tv);
- bam_destroy1(b);
- hts_itr_destroy(iter);
- bam_lplbuf_push(0, tv->lplbuf);
-
- while (tv->ccol < tv->mcol) {
- int pos = tv->last_pos + 1;
- if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1);
- tv->my_mvaddch(tv,1, tv->ccol++, (tv->ref && pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N');
- ++tv->last_pos;
- }
- return 0;
-}
-
-
-
-
-static void error(const char *format, ...)
-{
- if ( !format )
- {
- fprintf(stderr,
-"Usage: samtools tview [options] <aln.bam> [ref.fasta]\n"
-"Options:\n"
-" -d display output as (H)tml or (C)urses or (T)ext \n"
-" -p chr:pos go directly to this position\n"
-" -s STR display only reads from this sample or group\n");
- sam_global_opt_help(stderr, "-.--.");
- }
- else
- {
- va_list ap;
- va_start(ap, format);
- vfprintf(stderr, format, ap);
- va_end(ap);
- }
- exit(-1);
-}
-
-enum dipsay_mode {display_ncurses,display_html,display_text};
-extern tview_t* curses_tv_init(const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-extern tview_t* html_tv_init(const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-
-int bam_tview_main(int argc, char *argv[])
-{
- int view_mode=display_ncurses;
- tview_t* tv=NULL;
- char *samples=NULL, *position=NULL, *ref;
- int c;
-
- sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
- static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
- { NULL, 0, NULL, 0 }
- };
-
- while ((c = getopt_long(argc, argv, "s:p:d:", lopts, NULL)) >= 0) {
- switch (c) {
- case 's': samples=optarg; break;
- case 'p': position=optarg; break;
- case 'd':
- {
- switch(optarg[0])
- {
- case 'H': case 'h': view_mode=display_html;break;
- case 'T': case 't': view_mode=display_text;break;
- case 'C': case 'c': view_mode=display_ncurses;break;
- default: view_mode=display_ncurses;break;
- }
- break;
- }
- default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
- /* else fall-through */
- case '?': error(NULL);
- }
- }
- if (argc==optind) error(NULL);
-
- ref = (optind+1>=argc)? ga.reference : argv[optind+1];
-
- switch(view_mode)
- {
- case display_ncurses:
- tv = curses_tv_init(argv[optind], ref, samples, &ga.in);
- break;
-
- case display_text:
- tv = text_tv_init(argv[optind], ref, samples, &ga.in);
- break;
-
- case display_html:
- tv = html_tv_init(argv[optind], ref, samples, &ga.in);
- break;
- }
- if (tv==NULL)
- {
- error("cannot create view");
- return EXIT_FAILURE;
- }
-
- if ( position )
- {
- int tid, beg, end;
- char *name_lim = (char *) hts_parse_reg(position, &beg, &end);
- if (name_lim) *name_lim = '\0';
- else beg = 0; // region parsing failed, but possibly a seq named "foo:a"
- tid = bam_name2id(tv->header, position);
- if (tid >= 0) { tv->curr_tid = tid; tv->left_pos = beg; }
- }
- else if ( tv->fai )
- {
- // find the first sequence present in both BAM and the reference file
- int i;
- for (i=0; i<tv->header->n_targets; i++)
- {
- if ( faidx_has_seq(tv->fai, tv->header->target_name[i]) ) break;
- }
- if ( i==tv->header->n_targets )
- {
- fprintf(stderr,"None of the BAM sequence names present in the fasta file\n");
- exit(EXIT_FAILURE);
- }
- tv->curr_tid = i;
- }
- tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
- tv->my_loop(tv);
- tv->my_destroy(tv);
-
- return EXIT_SUCCESS;
-}
diff --git a/samtools/bam_tview.c.pysam.c b/samtools/bam_tview.c.pysam.c
deleted file mode 100644
index a47bced..0000000
--- a/samtools/bam_tview.c.pysam.c
+++ /dev/null
@@ -1,443 +0,0 @@
-#include "pysam.h"
-
-/* bam_tview.c -- tview subcommand.
-
- Copyright (C) 2008-2015 Genome Research Ltd.
- Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Heng Li <lh3 at sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notices and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <regex.h>
-#include <assert.h>
-#include "bam_tview.h"
-#include <htslib/faidx.h>
-#include <htslib/sam.h>
-#include <htslib/bgzf.h>
-#include "sam_opts.h"
-
-khash_t(kh_rg)* get_rg_sample(const char* header, const char* sample)
-{
- khash_t(kh_rg)* rg_hash = kh_init(kh_rg);
- // given sample id return all the RD ID's
- const char rg_regex[] = "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)";
-
- regex_t rg_id;
- regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t));
- if (matches == NULL) { perror("out of memory"); exit(-1); }
- regcomp(&rg_id, rg_regex, REG_EXTENDED|REG_NEWLINE);
- char* text = strdup(header);
- char* end = text + strlen(header);
- char* tofree = text;
- while (end > text && regexec(&rg_id, text, 2, matches, 0) == 0) { // foreach rg id in header
- int ret;
- text[matches[1].rm_eo] = '\0';
- kh_put(kh_rg, rg_hash, strdup(text+matches[1].rm_so), &ret); // Add the RG to the list
- text += matches[0].rm_eo + 1; // Move search pointer forward
- }
- free(tofree);
- return rg_hash;
-}
-
-int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt)
-{
- assert(tv!=NULL);
- assert(fn!=NULL);
- tv->mrow = 24; tv->mcol = 80;
- tv->color_for = TV_COLOR_MAPQ;
- tv->is_dot = 1;
-
- tv->fp = sam_open_format(fn, "r", fmt);
- if(tv->fp == NULL)
- {
- fprintf(pysam_stderr,"sam_open %s. %s\n", fn,fn_fa);
- exit(EXIT_FAILURE);
- }
- // TODO bgzf_set_cache_size(tv->fp->fp.bgzf, 8 * 1024 *1024);
- assert(tv->fp);
-
- tv->header = sam_hdr_read(tv->fp);
- if(tv->header == NULL)
- {
- fprintf(pysam_stderr,"Cannot read '%s'.\n", fn);
- exit(EXIT_FAILURE);
- }
- tv->idx = sam_index_load(tv->fp, fn);
- if (tv->idx == NULL)
- {
- fprintf(pysam_stderr,"Cannot read index for '%s'.\n", fn);
- exit(EXIT_FAILURE);
- }
- tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv);
- if (fn_fa) tv->fai = fai_load(fn_fa);
- tv->bca = bcf_call_init(0.83, 13);
- tv->ins = 1;
-
- // If the user has asked for specific samples find out create a list of readgroups make up these samples
- if ( samples )
- {
- tv->rg_hash = get_rg_sample(tv->header->text, samples); // Init the list of rg's
- }
-
- return 0;
-}
-
-
-void base_tv_destroy(tview_t* tv)
-{
- bam_lplbuf_destroy(tv->lplbuf);
- bcf_call_destroy(tv->bca);
- hts_idx_destroy(tv->idx);
- if (tv->fai) fai_destroy(tv->fai);
- free(tv->ref);
- bam_hdr_destroy(tv->header);
- sam_close(tv->fp);
-}
-
-
-int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
-{
- tview_t *tv = (tview_t*)data;
- int i, j, c, rb, attr, max_ins = 0;
- uint32_t call = 0;
- if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen
- // print reference
- rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N';
- for (i = tv->last_pos + 1; i < pos; ++i) {
- if (i%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", i+1);
- c = tv->ref? tv->ref[i - tv->left_pos] : 'N';
- tv->my_mvaddch(tv,1, tv->ccol++, c);
- }
- if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1);
- { // call consensus
- bcf_callret1_t bcr;
- memset(&bcr, 0, sizeof bcr);
- int qsum[4], a1, a2, tmp;
- double p[3], prior = 30;
- bcf_call_glfgen(n, pl, seq_nt16_table[rb], tv->bca, &bcr);
- for (i = 0; i < 4; ++i) qsum[i] = ((int)bcr.qsum[i])<<2 | i;
- for (i = 1; i < 4; ++i) // insertion sort
- for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j)
- tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp;
- a1 = qsum[0]&3; a2 = qsum[1]&3;
- p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2];
- if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3;
- if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3;
- if (p[0] < p[1] && p[0] < p[2]) call = (1<<a1)<<16 | (int)((p[1]<p[2]?p[1]:p[2]) - p[0] + .499);
- else if (p[2] < p[1] && p[2] < p[0]) call = (1<<a2)<<16 | (int)((p[0]<p[1]?p[0]:p[1]) - p[2] + .499);
- else call = (1<<a1|1<<a2)<<16 | (int)((p[0]<p[2]?p[0]:p[2]) - p[1] + .499);
- }
- attr = tv->my_underline(tv);
- c = ",ACMGRSVTWYHKDBN"[call>>16&0xf];
- i = (call&0xffff)/10+1;
- if (i > 4) i = 4;
- attr |= tv->my_colorpair(tv,i);
- if (c == toupper(rb)) c = '.';
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,2, tv->ccol, c);
- tv->my_attroff(tv,attr);
- if(tv->ins) {
- // calculate maximum insert
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel;
- }
- }
- // core loop
- for (j = 0; j <= max_ins; ++j) {
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- int row = TV_MIN_ALNROW + p->level - tv->row_shift;
- if (j == 0) {
- if (!p->is_del) {
- if (tv->base_for == TV_BASE_COLOR_SPACE &&
- (c = bam_aux_getCSi(p->b, p->qpos))) {
- // assume that if we found one color, we will be able to get the color error
- if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam_is_rev(p->b)? ',' : '.';
- } else {
- if (tv->show_name) {
- char *name = bam_get_qname(p->b);
- c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos];
- } else {
- c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)];
- if (tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.';
- }
- }
- } else c = p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*';
- } else { // padding
- if (j > p->indel) c = '*';
- else { // insertion
- if (tv->base_for == TV_BASE_NUCL) {
- if (tv->show_name) {
- char *name = bam_get_qname(p->b);
- c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j];
- } else {
- c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)];
- if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.';
- }
- } else {
- c = bam_aux_getCSi(p->b, p->qpos + j);
- if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam_is_rev(p->b)? ',' : '.';
- }
- }
- }
- if (row > TV_MIN_ALNROW && row < tv->mrow) {
- int x;
- attr = 0;
- if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR))
- || (p->b->core.flag & BAM_FSECONDARY)) attr |= tv->my_underline(tv);
- if (tv->color_for == TV_COLOR_BASEQ) {
- x = bam_get_qual(p->b)[p->qpos]/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- } else if (tv->color_for == TV_COLOR_MAPQ) {
- x = p->b->core.qual/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- } else if (tv->color_for == TV_COLOR_NUCL) {
- x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)] + 5;
- attr |= tv->my_colorpair(tv,x);
- } else if(tv->color_for == TV_COLOR_COL) {
- x = 0;
- switch(bam_aux_getCSi(p->b, p->qpos)) {
- case '0': x = 0; break;
- case '1': x = 1; break;
- case '2': x = 2; break;
- case '3': x = 3; break;
- case '4': x = 4; break;
- default: x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)]; break;
- }
- x+=5;
- attr |= tv->my_colorpair(tv,x);
- } else if(tv->color_for == TV_COLOR_COLQ) {
- x = bam_aux_getCQi(p->b, p->qpos);
- if(0 == x) x = bam_get_qual(p->b)[p->qpos];
- x = x/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- }
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,row, tv->ccol, bam_is_rev(p->b)? tolower(c) : toupper(c));
- tv->my_attroff(tv,attr);
- }
- }
- c = j? '*' : rb;
- if (c == '*') {
- attr = tv->my_colorpair(tv,8);
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,1, tv->ccol++, c);
- tv->my_attroff(tv,attr);
- } else tv->my_mvaddch(tv,1, tv->ccol++, c);
- }
- tv->last_pos = pos;
- return 0;
-}
-
-
-
-
-static int tv_push_aln(const bam1_t *b, tview_t *tv)
-{
- /* If we are restricted to specific readgroups check RG is in the list */
- if ( tv->rg_hash )
- {
- const uint8_t *rg = bam_aux_get(b, "RG");
- if ( !rg ) return 0; // If we don't have an RG tag exclude read
- khiter_t k = kh_get(kh_rg, tv->rg_hash, (const char*)(rg + 1));
- if ( k == kh_end(tv->rg_hash) ) return 0; // if RG tag is not in list of allowed tags exclude read
- }
- if (tv->no_skip) {
- uint32_t *cigar = bam_get_cigar(b); // this is cheating...
- int i;
- for (i = 0; i <b->core.n_cigar; ++i) {
- if ((cigar[i]&0xf) == BAM_CREF_SKIP)
- cigar[i] = cigar[i]>>4<<4 | BAM_CDEL;
- }
- }
- bam_lplbuf_push(b, tv->lplbuf);
- return 0;
-}
-
-int base_draw_aln(tview_t *tv, int tid, int pos)
-{
- assert(tv!=NULL);
- // reset
- tv->my_clear(tv);
- tv->curr_tid = tid; tv->left_pos = pos;
- tv->last_pos = tv->left_pos - 1;
- tv->ccol = 0;
- // print ref and consensus
- if (tv->fai) {
- char *str;
- if (tv->ref) free(tv->ref);
- assert(tv->curr_tid>=0);
-
- str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1);
- assert(str!=NULL);
- sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol);
- tv->ref = fai_fetch(tv->fai, str, &tv->l_ref);
- free(str);
- if ( !tv->ref )
- {
- fprintf(pysam_stderr,"Could not read the reference sequence. Is it seekable (plain text or compressed + .gzi indexed with bgzip)?\n");
- exit(1);
- }
- }
- // draw aln
- bam_lplbuf_reset(tv->lplbuf);
- hts_itr_t *iter = sam_itr_queryi(tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol);
- bam1_t *b = bam_init1();
- while (sam_itr_next(tv->fp, iter, b) >= 0) tv_push_aln(b, tv);
- bam_destroy1(b);
- hts_itr_destroy(iter);
- bam_lplbuf_push(0, tv->lplbuf);
-
- while (tv->ccol < tv->mcol) {
- int pos = tv->last_pos + 1;
- if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1);
- tv->my_mvaddch(tv,1, tv->ccol++, (tv->ref && pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N');
- ++tv->last_pos;
- }
- return 0;
-}
-
-
-
-
-static void error(const char *format, ...)
-{
- if ( !format )
- {
- fprintf(pysam_stderr,
-"Usage: samtools tview [options] <aln.bam> [ref.fasta]\n"
-"Options:\n"
-" -d display output as (H)tml or (C)urses or (T)ext \n"
-" -p chr:pos go directly to this position\n"
-" -s STR display only reads from this sample or group\n");
- sam_global_opt_help(pysam_stderr, "-.--.");
- }
- else
- {
- va_list ap;
- va_start(ap, format);
- vfprintf(pysam_stderr, format, ap);
- va_end(ap);
- }
- exit(-1);
-}
-
-enum dipsay_mode {display_ncurses,display_html,display_text};
-extern tview_t* curses_tv_init(const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-extern tview_t* html_tv_init(const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-
-int bam_tview_main(int argc, char *argv[])
-{
- int view_mode=display_ncurses;
- tview_t* tv=NULL;
- char *samples=NULL, *position=NULL, *ref;
- int c;
-
- sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
- static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
- { NULL, 0, NULL, 0 }
- };
-
- while ((c = getopt_long(argc, argv, "s:p:d:", lopts, NULL)) >= 0) {
- switch (c) {
- case 's': samples=optarg; break;
- case 'p': position=optarg; break;
- case 'd':
- {
- switch(optarg[0])
- {
- case 'H': case 'h': view_mode=display_html;break;
- case 'T': case 't': view_mode=display_text;break;
- case 'C': case 'c': view_mode=display_ncurses;break;
- default: view_mode=display_ncurses;break;
- }
- break;
- }
- default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
- /* else fall-through */
- case '?': error(NULL);
- }
- }
- if (argc==optind) error(NULL);
-
- ref = (optind+1>=argc)? ga.reference : argv[optind+1];
-
- switch(view_mode)
- {
- case display_ncurses:
- tv = curses_tv_init(argv[optind], ref, samples, &ga.in);
- break;
-
- case display_text:
- tv = text_tv_init(argv[optind], ref, samples, &ga.in);
- break;
-
- case display_html:
- tv = html_tv_init(argv[optind], ref, samples, &ga.in);
- break;
- }
- if (tv==NULL)
- {
- error("cannot create view");
- return EXIT_FAILURE;
- }
-
- if ( position )
- {
- int tid, beg, end;
- char *name_lim = (char *) hts_parse_reg(position, &beg, &end);
- if (name_lim) *name_lim = '\0';
- else beg = 0; // region parsing failed, but possibly a seq named "foo:a"
- tid = bam_name2id(tv->header, position);
- if (tid >= 0) { tv->curr_tid = tid; tv->left_pos = beg; }
- }
- else if ( tv->fai )
- {
- // find the first sequence present in both BAM and the reference file
- int i;
- for (i=0; i<tv->header->n_targets; i++)
- {
- if ( faidx_has_seq(tv->fai, tv->header->target_name[i]) ) break;
- }
- if ( i==tv->header->n_targets )
- {
- fprintf(pysam_stderr,"None of the BAM sequence names present in the fasta file\n");
- exit(EXIT_FAILURE);
- }
- tv->curr_tid = i;
- }
- tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
- tv->my_loop(tv);
- tv->my_destroy(tv);
-
- return EXIT_SUCCESS;
-}
diff --git a/samtools/bam_tview.h b/samtools/bam_tview.h
deleted file mode 100644
index e11e39d..0000000
--- a/samtools/bam_tview.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/* bam_tview.h -- tview subcommand.
-
- Copyright (C) 2008, 2013 Genome Research Ltd.
- Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Heng Li <lh3 at sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notices and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#ifndef BAM_TVIEW_H
-#define BAM_TVIEW_H
-
-#include <ctype.h>
-#include <string.h>
-#include <math.h>
-#include <unistd.h>
-#include <stdarg.h>
-#include <htslib/sam.h>
-#include "bam2bcf.h"
-#include <htslib/khash.h>
-#include <htslib/hts.h>
-#include <htslib/faidx.h>
-#include "bam_lpileup.h"
-
-
-KHASH_MAP_INIT_STR(kh_rg, const char *)
-
-/* Holds state of Tview */
-typedef struct AbstractTview {
- int mrow, mcol;
-
- hts_idx_t* idx;
- bam_lplbuf_t* lplbuf;
- bam_hdr_t* header;
- samFile* fp;
- int curr_tid, left_pos;
- faidx_t* fai;
- bcf_callaux_t* bca;
-
- int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins;
- int no_skip, show_name, inverse;
- char *ref;
- /* maps @RG ID => SM (sample), in practice only used to determine whether a particular RG is in the list of allowed ones */
- khash_t(kh_rg) *rg_hash;
- /* callbacks */
- void (*my_destroy)(struct AbstractTview* );
- void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
- void (*my_mvaddch)(struct AbstractTview*,int,int,int);
- void (*my_attron)(struct AbstractTview*,int);
- void (*my_attroff)(struct AbstractTview*,int);
- void (*my_clear)(struct AbstractTview*);
- int (*my_colorpair)(struct AbstractTview*,int);
- int (*my_drawaln)(struct AbstractTview*,int,int);
- int (*my_loop)(struct AbstractTview*);
- int (*my_underline)(struct AbstractTview*);
-} tview_t;
-
-
-char bam_aux_getCEi(bam1_t *b, int i);
-char bam_aux_getCSi(bam1_t *b, int i);
-char bam_aux_getCQi(bam1_t *b, int i);
-
-#define TV_MIN_ALNROW 2
-#define TV_MAX_GOTO 40
-#define TV_LOW_MAPQ 10
-
-#define TV_COLOR_MAPQ 0
-#define TV_COLOR_BASEQ 1
-#define TV_COLOR_NUCL 2
-#define TV_COLOR_COL 3
-#define TV_COLOR_COLQ 4
-
-#define TV_BASE_NUCL 0
-#define TV_BASE_COLOR_SPACE 1
-
-int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
-int base_tv_init(tview_t*,const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-void base_tv_destroy(tview_t*);
-int base_draw_aln(tview_t *tv, int tid, int pos);
-
-typedef struct Tixel
- {
- int ch;
- int attributes;
- }tixel_t;
-
-#endif
-
diff --git a/samtools/bam_tview_curses.c b/samtools/bam_tview_curses.c
deleted file mode 100644
index d7edfe8..0000000
--- a/samtools/bam_tview_curses.c
+++ /dev/null
@@ -1,352 +0,0 @@
-/* bam_tview_curses.c -- curses tview implementation.
-
- Copyright (C) 2008-2013 Genome Research Ltd.
- Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Heng Li <lh3 at sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notices and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include "bam_tview.h"
-
-#ifdef HAVE_CURSES
-
-#if defined HAVE_NCURSESW_CURSES_H
-#include <ncursesw/curses.h>
-#elif defined HAVE_NCURSESW_H
-#include <ncursesw.h>
-#elif defined HAVE_NCURSES_CURSES_H
-#include <ncurses/curses.h>
-#elif defined HAVE_NCURSES_H
-#include <ncurses.h>
-#elif defined HAVE_CURSES_H
-#include <curses.h>
-#endif
-
-typedef struct CursesTview {
- tview_t view;
- WINDOW *wgoto, *whelp;
- } curses_tview_t;
-
-#define FROM_TV(ptr) ((curses_tview_t*)ptr)
-
-static void curses_destroy(tview_t* base)
- {
- curses_tview_t* tv=(curses_tview_t*)base;
-
-
- delwin(tv->wgoto); delwin(tv->whelp);
- endwin();
-
- base_tv_destroy(base);
-
- free(tv);
- }
-
-/*
- void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
- void (*my_)(struct AbstractTview*,int,int,int);
- void (*my_attron)(struct AbstractTview*,int);
- void (*my_attroff)(struct AbstractTview*,int);
- void (*my_clear)(struct AbstractTview*);
- int (*my_colorpair)(struct AbstractTview*,int);
-*/
-
-static void curses_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...)
- {
- unsigned int size=tv->mcol+2;
- char* str=malloc(size);
- if(str==0) exit(EXIT_FAILURE);
- va_list argptr;
- va_start(argptr, fmt);
- vsnprintf(str,size, fmt, argptr);
- va_end(argptr);
- mvprintw(y,x,str);
- free(str);
- }
-
-static void curses_mvaddch(struct AbstractTview* tv,int y,int x,int ch)
- {
- mvaddch(y,x,ch);
- }
-
-static void curses_attron(struct AbstractTview* tv,int flag)
- {
- attron(flag);
- }
-static void curses_attroff(struct AbstractTview* tv,int flag)
- {
- attroff(flag);
- }
-static void curses_clear(struct AbstractTview* tv)
- {
- clear();
- }
-
-static int curses_init_colors(int inverse)
-{
- if (inverse) {
- init_pair(1, COLOR_WHITE, COLOR_BLUE);
- init_pair(2, COLOR_BLACK, COLOR_GREEN);
- init_pair(3, COLOR_BLACK, COLOR_YELLOW);
- init_pair(4, COLOR_BLACK, COLOR_WHITE);
- init_pair(5, COLOR_BLACK, COLOR_GREEN);
- init_pair(6, COLOR_BLACK, COLOR_CYAN);
- init_pair(7, COLOR_WHITE, COLOR_MAGENTA);
- init_pair(8, COLOR_WHITE, COLOR_RED);
- init_pair(9, COLOR_WHITE, COLOR_BLUE);
- } else {
- init_pair(1, COLOR_BLUE, COLOR_BLACK);
- init_pair(2, COLOR_GREEN, COLOR_BLACK);
- init_pair(3, COLOR_YELLOW, COLOR_BLACK);
- init_pair(4, COLOR_WHITE, COLOR_BLACK);
- init_pair(5, COLOR_GREEN, COLOR_BLACK);
- init_pair(6, COLOR_CYAN, COLOR_BLACK);
- init_pair(7, COLOR_MAGENTA, COLOR_BLACK);
- init_pair(8, COLOR_RED, COLOR_BLACK);
- init_pair(9, COLOR_BLUE, COLOR_BLACK);
- }
-
- return 0;
-}
-
-static int curses_colorpair(struct AbstractTview* tv,int flag)
- {
- return COLOR_PAIR(flag);
- }
-
-static int curses_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- return base_draw_aln(tv, tid, pos);
- }
-
-
-
-static void tv_win_goto(curses_tview_t *tv, int *tid, int *pos)
- {
- char str[256], *p;
- int i, l = 0;
- tview_t *base=(tview_t*)tv;
- wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+');
- mvwprintw(tv->wgoto, 1, 2, "Goto: ");
- for (;;) {
- int invalid = 0;
- int c = wgetch(tv->wgoto);
- wrefresh(tv->wgoto);
- if (c == KEY_BACKSPACE || c == '\010' || c == '\177') {
- if(l > 0) --l;
- } else if (c == KEY_ENTER || c == '\012' || c == '\015') {
- int _tid = -1, _beg, _end;
- if (str[0] == '=') {
- _beg = strtol(str+1, &p, 10) - 1;
- if (_beg > 0) {
- *pos = _beg;
- return;
- }
- } else {
- char *name_lim = (char *) hts_parse_reg(str, &_beg, &_end);
- if (name_lim) {
- char name_terminator = *name_lim;
- *name_lim = '\0';
- _tid = bam_name2id(base->header, str);
- *name_lim = name_terminator;
- }
- else {
- // Unparsable region, but possibly a sequence named "foo:a"
- _tid = bam_name2id(base->header, str);
- _beg = 0;
- }
-
- if (_tid >= 0) {
- *tid = _tid; *pos = _beg;
- return;
- }
- }
-
- // If we get here, the region string is invalid
- invalid = 1;
- } else if (isgraph(c)) {
- if (l < TV_MAX_GOTO) str[l++] = c;
- } else if (c == '\027') l = 0;
- else if (c == '\033') return;
- str[l] = '\0';
- for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' ');
- if (invalid) mvwprintw(tv->wgoto, 1, TV_MAX_GOTO - 1, "[Invalid]");
- mvwprintw(tv->wgoto, 1, 8, "%s", str);
- }
-}
-
-
-
-
-static void tv_win_help(curses_tview_t *tv) {
- int r = 1;
- tview_t* base=(tview_t*)base;
- WINDOW *win = tv->whelp;
- wborder(win, '|', '|', '-', '-', '+', '+', '+', '+');
- mvwprintw(win, r++, 2, " -=- Help -=- ");
- r++;
- mvwprintw(win, r++, 2, "? This window");
- mvwprintw(win, r++, 2, "Arrows Small scroll movement");
- mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement");
- mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement");
- mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left");
- mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right");
- mvwprintw(win, r++, 2, "space Scroll one screen");
- mvwprintw(win, r++, 2, "backspace Scroll back one screen");
- mvwprintw(win, r++, 2, "g Go to specific location");
- mvwprintw(win, r++, 2, "m Color for mapping qual");
- mvwprintw(win, r++, 2, "n Color for nucleotide");
- mvwprintw(win, r++, 2, "b Color for base quality");
- mvwprintw(win, r++, 2, "c Color for cs color");
- mvwprintw(win, r++, 2, "z Color for cs qual");
- mvwprintw(win, r++, 2, ". Toggle on/off dot view");
- mvwprintw(win, r++, 2, "s Toggle on/off ref skip");
- mvwprintw(win, r++, 2, "r Toggle on/off rd name");
- mvwprintw(win, r++, 2, "N Turn on nt view");
- mvwprintw(win, r++, 2, "C Turn on cs view");
- mvwprintw(win, r++, 2, "i Toggle on/off ins");
- mvwprintw(win, r++, 2, "v Inverse video");
- mvwprintw(win, r++, 2, "q Exit");
- r++;
- mvwprintw(win, r++, 2, "Underline: Secondary or orphan");
- mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19");
- mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30");
- wrefresh(win);
- wgetch(win);
-}
-
-static int curses_underline(tview_t* tv)
- {
- return A_UNDERLINE;
- }
-
-static int curses_loop(tview_t* tv)
- {
- int tid, pos;
- curses_tview_t *CTV=(curses_tview_t *)tv;
- tid = tv->curr_tid; pos = tv->left_pos;
- while (1) {
- int c = getch();
- switch (c) {
- case '?': tv_win_help(CTV); break;
- case '\033':
- case 'q': goto end_loop;
- case '/':
- case 'g': tv_win_goto(CTV, &tid, &pos); break;
- case 'm': tv->color_for = TV_COLOR_MAPQ; break;
- case 'b': tv->color_for = TV_COLOR_BASEQ; break;
- case 'n': tv->color_for = TV_COLOR_NUCL; break;
- case 'c': tv->color_for = TV_COLOR_COL; break;
- case 'z': tv->color_for = TV_COLOR_COLQ; break;
- case 'v': curses_init_colors(tv->inverse = !tv->inverse); break;
- case 's': tv->no_skip = !tv->no_skip; break;
- case 'r': tv->show_name = !tv->show_name; break;
- case KEY_LEFT:
- case 'h': --pos; break;
- case KEY_RIGHT:
- case 'l': ++pos; break;
- case KEY_SLEFT:
- case 'H': pos -= 20; break;
- case KEY_SRIGHT:
- case 'L': pos += 20; break;
- case '.': tv->is_dot = !tv->is_dot; break;
- case 'N': tv->base_for = TV_BASE_NUCL; break;
- case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break;
- case 'i': tv->ins = !tv->ins; break;
- case '\010': pos -= 1000; break;
- case '\014': pos += 1000; break;
- case ' ': pos += tv->mcol; break;
- case KEY_UP:
- case 'j': --tv->row_shift; break;
- case KEY_DOWN:
- case 'k': ++tv->row_shift; break;
- case KEY_BACKSPACE:
- case '\177': pos -= tv->mcol; break;
- case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break;
- default: continue;
- }
- if (pos < 0) pos = 0;
- if (tv->row_shift < 0) tv->row_shift = 0;
- tv->my_drawaln(tv, tid, pos);
- }
-end_loop:
- return 0;
-}
-
-
-
-
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- curses_tview_t *tv = (curses_tview_t*)calloc(1, sizeof(curses_tview_t));
- tview_t* base=(tview_t*)tv;
- if(tv==0)
- {
- fprintf(stderr,"Calloc failed\n");
- return 0;
- }
-
- base_tv_init(base,fn,fn_fa,samples,fmt);
- /* initialize callbacks */
-#define SET_CALLBACK(fun) base->my_##fun=curses_##fun;
- SET_CALLBACK(destroy);
- SET_CALLBACK(mvprintw);
- SET_CALLBACK(mvaddch);
- SET_CALLBACK(attron);
- SET_CALLBACK(attroff);
- SET_CALLBACK(clear);
- SET_CALLBACK(colorpair);
- SET_CALLBACK(drawaln);
- SET_CALLBACK(loop);
- SET_CALLBACK(underline);
-#undef SET_CALLBACK
-
- initscr();
- keypad(stdscr, TRUE);
- clear();
- noecho();
- cbreak();
-
- getmaxyx(stdscr, base->mrow, base->mcol);
- tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5);
- tv->whelp = newwin(30, 40, 5, 5);
-
- start_color();
- curses_init_colors(0);
- return base;
- }
-
-#else // !HAVE_CURSES
-
-#warning "No curses library is available; tview with curses is disabled."
-
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt);
-
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- return text_tv_init(fn,fn_fa,samples,fmt);
- }
-
-#endif
diff --git a/samtools/bam_tview_curses.c.pysam.c b/samtools/bam_tview_curses.c.pysam.c
deleted file mode 100644
index 90a8335..0000000
--- a/samtools/bam_tview_curses.c.pysam.c
+++ /dev/null
@@ -1,354 +0,0 @@
-#include "pysam.h"
-
-/* bam_tview_curses.c -- curses tview implementation.
-
- Copyright (C) 2008-2013 Genome Research Ltd.
- Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Heng Li <lh3 at sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notices and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include "bam_tview.h"
-
-#ifdef HAVE_CURSES
-
-#if defined HAVE_NCURSESW_CURSES_H
-#include <ncursesw/curses.h>
-#elif defined HAVE_NCURSESW_H
-#include <ncursesw.h>
-#elif defined HAVE_NCURSES_CURSES_H
-#include <ncurses/curses.h>
-#elif defined HAVE_NCURSES_H
-#include <ncurses.h>
-#elif defined HAVE_CURSES_H
-#include <curses.h>
-#endif
-
-typedef struct CursesTview {
- tview_t view;
- WINDOW *wgoto, *whelp;
- } curses_tview_t;
-
-#define FROM_TV(ptr) ((curses_tview_t*)ptr)
-
-static void curses_destroy(tview_t* base)
- {
- curses_tview_t* tv=(curses_tview_t*)base;
-
-
- delwin(tv->wgoto); delwin(tv->whelp);
- endwin();
-
- base_tv_destroy(base);
-
- free(tv);
- }
-
-/*
- void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
- void (*my_)(struct AbstractTview*,int,int,int);
- void (*my_attron)(struct AbstractTview*,int);
- void (*my_attroff)(struct AbstractTview*,int);
- void (*my_clear)(struct AbstractTview*);
- int (*my_colorpair)(struct AbstractTview*,int);
-*/
-
-static void curses_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...)
- {
- unsigned int size=tv->mcol+2;
- char* str=malloc(size);
- if(str==0) exit(EXIT_FAILURE);
- va_list argptr;
- va_start(argptr, fmt);
- vsnprintf(str,size, fmt, argptr);
- va_end(argptr);
- mvprintw(y,x,str);
- free(str);
- }
-
-static void curses_mvaddch(struct AbstractTview* tv,int y,int x,int ch)
- {
- mvaddch(y,x,ch);
- }
-
-static void curses_attron(struct AbstractTview* tv,int flag)
- {
- attron(flag);
- }
-static void curses_attroff(struct AbstractTview* tv,int flag)
- {
- attroff(flag);
- }
-static void curses_clear(struct AbstractTview* tv)
- {
- clear();
- }
-
-static int curses_init_colors(int inverse)
-{
- if (inverse) {
- init_pair(1, COLOR_WHITE, COLOR_BLUE);
- init_pair(2, COLOR_BLACK, COLOR_GREEN);
- init_pair(3, COLOR_BLACK, COLOR_YELLOW);
- init_pair(4, COLOR_BLACK, COLOR_WHITE);
- init_pair(5, COLOR_BLACK, COLOR_GREEN);
- init_pair(6, COLOR_BLACK, COLOR_CYAN);
- init_pair(7, COLOR_WHITE, COLOR_MAGENTA);
- init_pair(8, COLOR_WHITE, COLOR_RED);
- init_pair(9, COLOR_WHITE, COLOR_BLUE);
- } else {
- init_pair(1, COLOR_BLUE, COLOR_BLACK);
- init_pair(2, COLOR_GREEN, COLOR_BLACK);
- init_pair(3, COLOR_YELLOW, COLOR_BLACK);
- init_pair(4, COLOR_WHITE, COLOR_BLACK);
- init_pair(5, COLOR_GREEN, COLOR_BLACK);
- init_pair(6, COLOR_CYAN, COLOR_BLACK);
- init_pair(7, COLOR_MAGENTA, COLOR_BLACK);
- init_pair(8, COLOR_RED, COLOR_BLACK);
- init_pair(9, COLOR_BLUE, COLOR_BLACK);
- }
-
- return 0;
-}
-
-static int curses_colorpair(struct AbstractTview* tv,int flag)
- {
- return COLOR_PAIR(flag);
- }
-
-static int curses_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- return base_draw_aln(tv, tid, pos);
- }
-
-
-
-static void tv_win_goto(curses_tview_t *tv, int *tid, int *pos)
- {
- char str[256], *p;
- int i, l = 0;
- tview_t *base=(tview_t*)tv;
- wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+');
- mvwprintw(tv->wgoto, 1, 2, "Goto: ");
- for (;;) {
- int invalid = 0;
- int c = wgetch(tv->wgoto);
- wrefresh(tv->wgoto);
- if (c == KEY_BACKSPACE || c == '\010' || c == '\177') {
- if(l > 0) --l;
- } else if (c == KEY_ENTER || c == '\012' || c == '\015') {
- int _tid = -1, _beg, _end;
- if (str[0] == '=') {
- _beg = strtol(str+1, &p, 10) - 1;
- if (_beg > 0) {
- *pos = _beg;
- return;
- }
- } else {
- char *name_lim = (char *) hts_parse_reg(str, &_beg, &_end);
- if (name_lim) {
- char name_terminator = *name_lim;
- *name_lim = '\0';
- _tid = bam_name2id(base->header, str);
- *name_lim = name_terminator;
- }
- else {
- // Unparsable region, but possibly a sequence named "foo:a"
- _tid = bam_name2id(base->header, str);
- _beg = 0;
- }
-
- if (_tid >= 0) {
- *tid = _tid; *pos = _beg;
- return;
- }
- }
-
- // If we get here, the region string is invalid
- invalid = 1;
- } else if (isgraph(c)) {
- if (l < TV_MAX_GOTO) str[l++] = c;
- } else if (c == '\027') l = 0;
- else if (c == '\033') return;
- str[l] = '\0';
- for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' ');
- if (invalid) mvwprintw(tv->wgoto, 1, TV_MAX_GOTO - 1, "[Invalid]");
- mvwprintw(tv->wgoto, 1, 8, "%s", str);
- }
-}
-
-
-
-
-static void tv_win_help(curses_tview_t *tv) {
- int r = 1;
- tview_t* base=(tview_t*)base;
- WINDOW *win = tv->whelp;
- wborder(win, '|', '|', '-', '-', '+', '+', '+', '+');
- mvwprintw(win, r++, 2, " -=- Help -=- ");
- r++;
- mvwprintw(win, r++, 2, "? This window");
- mvwprintw(win, r++, 2, "Arrows Small scroll movement");
- mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement");
- mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement");
- mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left");
- mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right");
- mvwprintw(win, r++, 2, "space Scroll one screen");
- mvwprintw(win, r++, 2, "backspace Scroll back one screen");
- mvwprintw(win, r++, 2, "g Go to specific location");
- mvwprintw(win, r++, 2, "m Color for mapping qual");
- mvwprintw(win, r++, 2, "n Color for nucleotide");
- mvwprintw(win, r++, 2, "b Color for base quality");
- mvwprintw(win, r++, 2, "c Color for cs color");
- mvwprintw(win, r++, 2, "z Color for cs qual");
- mvwprintw(win, r++, 2, ". Toggle on/off dot view");
- mvwprintw(win, r++, 2, "s Toggle on/off ref skip");
- mvwprintw(win, r++, 2, "r Toggle on/off rd name");
- mvwprintw(win, r++, 2, "N Turn on nt view");
- mvwprintw(win, r++, 2, "C Turn on cs view");
- mvwprintw(win, r++, 2, "i Toggle on/off ins");
- mvwprintw(win, r++, 2, "v Inverse video");
- mvwprintw(win, r++, 2, "q Exit");
- r++;
- mvwprintw(win, r++, 2, "Underline: Secondary or orphan");
- mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19");
- mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30");
- wrefresh(win);
- wgetch(win);
-}
-
-static int curses_underline(tview_t* tv)
- {
- return A_UNDERLINE;
- }
-
-static int curses_loop(tview_t* tv)
- {
- int tid, pos;
- curses_tview_t *CTV=(curses_tview_t *)tv;
- tid = tv->curr_tid; pos = tv->left_pos;
- while (1) {
- int c = getch();
- switch (c) {
- case '?': tv_win_help(CTV); break;
- case '\033':
- case 'q': goto end_loop;
- case '/':
- case 'g': tv_win_goto(CTV, &tid, &pos); break;
- case 'm': tv->color_for = TV_COLOR_MAPQ; break;
- case 'b': tv->color_for = TV_COLOR_BASEQ; break;
- case 'n': tv->color_for = TV_COLOR_NUCL; break;
- case 'c': tv->color_for = TV_COLOR_COL; break;
- case 'z': tv->color_for = TV_COLOR_COLQ; break;
- case 'v': curses_init_colors(tv->inverse = !tv->inverse); break;
- case 's': tv->no_skip = !tv->no_skip; break;
- case 'r': tv->show_name = !tv->show_name; break;
- case KEY_LEFT:
- case 'h': --pos; break;
- case KEY_RIGHT:
- case 'l': ++pos; break;
- case KEY_SLEFT:
- case 'H': pos -= 20; break;
- case KEY_SRIGHT:
- case 'L': pos += 20; break;
- case '.': tv->is_dot = !tv->is_dot; break;
- case 'N': tv->base_for = TV_BASE_NUCL; break;
- case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break;
- case 'i': tv->ins = !tv->ins; break;
- case '\010': pos -= 1000; break;
- case '\014': pos += 1000; break;
- case ' ': pos += tv->mcol; break;
- case KEY_UP:
- case 'j': --tv->row_shift; break;
- case KEY_DOWN:
- case 'k': ++tv->row_shift; break;
- case KEY_BACKSPACE:
- case '\177': pos -= tv->mcol; break;
- case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break;
- default: continue;
- }
- if (pos < 0) pos = 0;
- if (tv->row_shift < 0) tv->row_shift = 0;
- tv->my_drawaln(tv, tid, pos);
- }
-end_loop:
- return 0;
-}
-
-
-
-
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- curses_tview_t *tv = (curses_tview_t*)calloc(1, sizeof(curses_tview_t));
- tview_t* base=(tview_t*)tv;
- if(tv==0)
- {
- fprintf(pysam_stderr,"Calloc failed\n");
- return 0;
- }
-
- base_tv_init(base,fn,fn_fa,samples,fmt);
- /* initialize callbacks */
-#define SET_CALLBACK(fun) base->my_##fun=curses_##fun;
- SET_CALLBACK(destroy);
- SET_CALLBACK(mvprintw);
- SET_CALLBACK(mvaddch);
- SET_CALLBACK(attron);
- SET_CALLBACK(attroff);
- SET_CALLBACK(clear);
- SET_CALLBACK(colorpair);
- SET_CALLBACK(drawaln);
- SET_CALLBACK(loop);
- SET_CALLBACK(underline);
-#undef SET_CALLBACK
-
- initscr();
- keypad(stdscr, TRUE);
- clear();
- noecho();
- cbreak();
-
- getmaxyx(stdscr, base->mrow, base->mcol);
- tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5);
- tv->whelp = newwin(30, 40, 5, 5);
-
- start_color();
- curses_init_colors(0);
- return base;
- }
-
-#else // !HAVE_CURSES
-
-#warning "No curses library is available; tview with curses is disabled."
-
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt);
-
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- return text_tv_init(fn,fn_fa,samples,fmt);
- }
-
-#endif
diff --git a/samtools/bam_tview_html.c b/samtools/bam_tview_html.c
deleted file mode 100644
index e3aecda..0000000
--- a/samtools/bam_tview_html.c
+++ /dev/null
@@ -1,377 +0,0 @@
-/* bam_tview_html.c -- HTML tview output.
-
- Copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Pierre Lindenbaum <plindenbaum at yahoo.fr>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <unistd.h>
-#include "bam_tview.h"
-
-#define UNDERLINE_FLAG 10
-
-typedef struct HtmlTview {
- tview_t view;
- int row_count;
- tixel_t** screen;
- FILE* out;
- int attributes;/* color... */
- } html_tview_t;
-
-#define FROM_TV(ptr) ((html_tview_t*)ptr)
-
-static void html_destroy(tview_t* base)
- {
- int i;
- html_tview_t* tv=(html_tview_t*)base;
- if(tv->screen!=NULL)
- {
- for(i=0;i< tv->row_count;++i) free(tv->screen[i]);
- free(tv->screen);
- }
- base_tv_destroy(base);
- free(tv);
- }
-
-/*
- void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
- void (*my_)(struct AbstractTview*,int,int,int);
- void (*my_attron)(struct AbstractTview*,int);
- void (*my_attroff)(struct AbstractTview*,int);
- void (*my_clear)(struct AbstractTview*);
- int (*my_colorpair)(struct AbstractTview*,int);
-*/
-
-static void html_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...)
- {
- int i,nchars=0;
- unsigned int size=tv->mcol+2;
- char* str=malloc(size);
- if(str==0) exit(EXIT_FAILURE);
- va_list argptr;
- va_start(argptr, fmt);
- nchars=vsnprintf(str,size, fmt, argptr);
- va_end(argptr);
-
- for(i=0;i< nchars;++i)
- {
- tv->my_mvaddch(tv,y,x+i,str[i]);
- }
- free(str);
- }
-
-static void html_mvaddch(struct AbstractTview* tv,int y,int x,int ch)
- {
- tixel_t* row=NULL;
- html_tview_t* ptr=FROM_TV(tv);
- if( x >= tv->mcol ) return; //out of screen
- while(ptr->row_count<=y)
- {
- int x;
- row=(tixel_t*)calloc(tv->mcol,sizeof(tixel_t));
- if(row==0) exit(EXIT_FAILURE);
- for(x=0;x<tv->mcol;++x) {row[x].ch=' ';row[x].attributes=0;}
- ptr->screen=(tixel_t**)realloc(ptr->screen,sizeof(tixel_t*)*(ptr->row_count+1));
- ptr->screen[ptr->row_count++]=row;
- }
- row=ptr->screen[y];
- row[x].ch=ch;
- row[x].attributes=ptr->attributes;
- }
-
-static void html_attron(struct AbstractTview* tv,int flag)
- {
- html_tview_t* ptr=FROM_TV(tv);
- ptr->attributes |= flag;
-
-
- }
-
-static void html_attroff(struct AbstractTview* tv,int flag)
- {
- html_tview_t* ptr=FROM_TV(tv);
- ptr->attributes &= ~(flag);
- }
-
-static void html_clear(struct AbstractTview* tv)
- {
- html_tview_t* ptr=FROM_TV(tv);
- if(ptr->screen!=NULL)
- {
- int i;
- for(i=0;i< ptr->row_count;++i) free(ptr->screen[i]);
- free(ptr->screen);
- ptr->screen=NULL;
- }
- ptr->row_count=0;
- ptr->attributes=0;
- }
-
-static int html_colorpair(struct AbstractTview* tv,int flag)
- {
- return (1 << (flag));
- }
-
-static int html_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- int y,x;
- html_tview_t* ptr=FROM_TV(tv);
- html_clear(tv);
- base_draw_aln(tv, tid, pos);
- fputs("<html><head>",ptr->out);
- fprintf(ptr->out,"<title>%s:%d</title>",
- tv->header->target_name[tid],
- pos+1
- );
- //style
-
- fputs("<style type='text/css'>\n",ptr->out);
- fputs(".tviewbody { margin:5px; background-color:white;text-align:center;}\n",ptr->out);
- fputs(".tviewtitle {text-align:center;}\n",ptr->out);
- fputs(".tviewpre { margin:5px; background-color:white;}\n",ptr->out);
- #define CSS(id,col) fprintf(ptr->out,".tviewc%d {color:%s;}\n.tviewcu%d {color:%s;text-decoration:underline;}\n",id,col,id,col);
- CSS(0, "black");
- CSS(1, "blue");
- CSS(2, "green");
- CSS(3, "yellow");
- CSS(4, "black");
- CSS(5, "green");
- CSS(6, "cyan");
- CSS(7, "yellow");
- CSS(8, "red");
- CSS(9, "blue");
- #undef CSS
- fputs("</style>",ptr->out);
-
- fputs("</head><body>",ptr->out);
-
- fprintf(ptr->out,"<div class='tviewbody'><div class='tviewtitle'>%s:%d</div>",
- tv->header->target_name[tid],
- pos+1
- );
-
- fputs("<pre class='tviewpre'>",ptr->out);
- for(y=0;y< ptr->row_count;++y)
- {
-
- for(x=0;x< tv->mcol;++x)
- {
-
-
- if(x== 0 || ptr->screen[y][x].attributes != ptr->screen[y][x-1].attributes)
- {
- int css=0;
- fprintf(ptr->out,"<span");
- while(css<32)
- {
- //if(y>1) fprintf(stderr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes);
- if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
- {
-
- fprintf(ptr->out," class='tviewc%s%d'",
- (( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)) )!=0?"u":""),
- css);
- break;
- }
- ++css;
- }
-
-
- fputs(">",ptr->out);
- }
-
- int ch=ptr->screen[y][x].ch;
- switch(ch)
- {
- case '<': fputs("<",ptr->out);break;
- case '>': fputs(">",ptr->out);break;
- case '&': fputs("&",ptr->out);break;
- default: fputc(ch,ptr->out); break;
- }
-
-
- if(x+1 == tv->mcol || ptr->screen[y][x].attributes!=ptr->screen[y][x+1].attributes)
- {
- fputs("</span>",ptr->out);
- }
- }
- if(y+1 < ptr->row_count) fputs("<br/>",ptr->out);
- }
- fputs("</pre></div></body></html>",ptr->out);
- return 0;
- }
-
-
-#define ANSI_COLOR_RED "\x1b[31m"
-#define ANSI_COLOR_GREEN "\x1b[32m"
-#define ANSI_COLOR_YELLOW "\x1b[33m"
-#define ANSI_COLOR_BLUE "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN "\x1b[36m"
-#define ANSI_COLOR_BLACK "\x1b[0m"
-#define ANSI_COLOR_RESET ANSI_COLOR_BLACK
-
-#define ANSI_UNDERLINE_SET "\033[4m"
-#define ANSI_UNDERLINE_UNSET "\033[0m"
-
-static int text_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- int y,x;
- html_tview_t* ptr=FROM_TV(tv);
- html_clear(tv);
- base_draw_aln(tv, tid, pos);
- int is_term= isatty(fileno(ptr->out));
-
- for(y=0;y< ptr->row_count;++y)
- {
- for(x=0;x< tv->mcol;++x)
- {
- if(is_term)
- {
- int css=0;
- while(css<32)
- {
- if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
- {
- break;
- }
- ++css;
- }
- switch(css)
- {
- //CSS(0, "black");
- case 1: fputs(ANSI_COLOR_BLUE,ptr->out); break;
- case 2: fputs(ANSI_COLOR_GREEN,ptr->out); break;
- case 3: fputs(ANSI_COLOR_YELLOW,ptr->out); break;
- //CSS(4, "black");
- case 5: fputs(ANSI_COLOR_GREEN,ptr->out); break;
- case 6: fputs(ANSI_COLOR_CYAN,ptr->out); break;
- case 7: fputs(ANSI_COLOR_YELLOW,ptr->out); break;
- case 8: fputs(ANSI_COLOR_RED,ptr->out); break;
- case 9: fputs(ANSI_COLOR_BLUE,ptr->out); break;
- default:break;
- }
- if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0)
- {
- fputs(ANSI_UNDERLINE_SET,ptr->out);
- }
-
- }
-
-
- int ch=ptr->screen[y][x].ch;
-
- fputc(ch,ptr->out);
- if(is_term)
- {
- fputs(ANSI_COLOR_RESET,ptr->out);
- if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0)
- {
- fputs(ANSI_UNDERLINE_UNSET,ptr->out);
- }
- }
- }
- fputc('\n',ptr->out);
- }
- return 0;
- }
-
-
-static int html_loop(tview_t* tv)
- {
- //tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
- return 0;
- }
-
-static int html_underline(tview_t* tv)
- {
- return (1 << UNDERLINE_FLAG);
- }
-
-/*
-static void init_pair(html_tview_t *tv,int id_ge_1, const char* pen, const char* paper)
- {
-
- }
-*/
-
-tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- char* colstr=getenv("COLUMNS");
- html_tview_t *tv = (html_tview_t*)calloc(1, sizeof(html_tview_t));
- tview_t* base=(tview_t*)tv;
- if(tv==0)
- {
- fprintf(stderr,"Calloc failed\n");
- return 0;
- }
- tv->row_count=0;
- tv->screen=NULL;
- tv->out=stdout;
- tv->attributes=0;
- base_tv_init(base,fn,fn_fa,samples,fmt);
- /* initialize callbacks */
-#define SET_CALLBACK(fun) base->my_##fun=html_##fun;
- SET_CALLBACK(destroy);
- SET_CALLBACK(mvprintw);
- SET_CALLBACK(mvaddch);
- SET_CALLBACK(attron);
- SET_CALLBACK(attroff);
- SET_CALLBACK(clear);
- SET_CALLBACK(colorpair);
- SET_CALLBACK(drawaln);
- SET_CALLBACK(loop);
- SET_CALLBACK(underline);
-#undef SET_CALLBACK
-
-
- if(colstr!=0)
- {
- base->mcol=atoi(colstr);
- if(base->mcol<10) base->mcol=80;
- }
- base->mrow=99999;
-
-/*
- init_pair(tv,1, "blue", "white");
- init_pair(tv,2, "green", "white");
- init_pair(tv,3, "yellow", "white");
- init_pair(tv,4, "white", "white");
- init_pair(tv,5, "green", "white");
- init_pair(tv,6, "cyan", "white");
- init_pair(tv,7, "yellow", "white");
- init_pair(tv,8, "red", "white");
- init_pair(tv,9, "blue", "white");
- */
- return base;
- }
-
-
-tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- tview_t* tv=html_tv_init(fn,fn_fa,samples,fmt);
- tv->my_drawaln=text_drawaln;
- return tv;
- }
-
diff --git a/samtools/bam_tview_html.c.pysam.c b/samtools/bam_tview_html.c.pysam.c
deleted file mode 100644
index 164e33d..0000000
--- a/samtools/bam_tview_html.c.pysam.c
+++ /dev/null
@@ -1,379 +0,0 @@
-#include "pysam.h"
-
-/* bam_tview_html.c -- HTML tview output.
-
- Copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Pierre Lindenbaum <plindenbaum at yahoo.fr>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <unistd.h>
-#include "bam_tview.h"
-
-#define UNDERLINE_FLAG 10
-
-typedef struct HtmlTview {
- tview_t view;
- int row_count;
- tixel_t** screen;
- FILE* out;
- int attributes;/* color... */
- } html_tview_t;
-
-#define FROM_TV(ptr) ((html_tview_t*)ptr)
-
-static void html_destroy(tview_t* base)
- {
- int i;
- html_tview_t* tv=(html_tview_t*)base;
- if(tv->screen!=NULL)
- {
- for(i=0;i< tv->row_count;++i) free(tv->screen[i]);
- free(tv->screen);
- }
- base_tv_destroy(base);
- free(tv);
- }
-
-/*
- void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
- void (*my_)(struct AbstractTview*,int,int,int);
- void (*my_attron)(struct AbstractTview*,int);
- void (*my_attroff)(struct AbstractTview*,int);
- void (*my_clear)(struct AbstractTview*);
- int (*my_colorpair)(struct AbstractTview*,int);
-*/
-
-static void html_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...)
- {
- int i,nchars=0;
- unsigned int size=tv->mcol+2;
- char* str=malloc(size);
- if(str==0) exit(EXIT_FAILURE);
- va_list argptr;
- va_start(argptr, fmt);
- nchars=vsnprintf(str,size, fmt, argptr);
- va_end(argptr);
-
- for(i=0;i< nchars;++i)
- {
- tv->my_mvaddch(tv,y,x+i,str[i]);
- }
- free(str);
- }
-
-static void html_mvaddch(struct AbstractTview* tv,int y,int x,int ch)
- {
- tixel_t* row=NULL;
- html_tview_t* ptr=FROM_TV(tv);
- if( x >= tv->mcol ) return; //out of screen
- while(ptr->row_count<=y)
- {
- int x;
- row=(tixel_t*)calloc(tv->mcol,sizeof(tixel_t));
- if(row==0) exit(EXIT_FAILURE);
- for(x=0;x<tv->mcol;++x) {row[x].ch=' ';row[x].attributes=0;}
- ptr->screen=(tixel_t**)realloc(ptr->screen,sizeof(tixel_t*)*(ptr->row_count+1));
- ptr->screen[ptr->row_count++]=row;
- }
- row=ptr->screen[y];
- row[x].ch=ch;
- row[x].attributes=ptr->attributes;
- }
-
-static void html_attron(struct AbstractTview* tv,int flag)
- {
- html_tview_t* ptr=FROM_TV(tv);
- ptr->attributes |= flag;
-
-
- }
-
-static void html_attroff(struct AbstractTview* tv,int flag)
- {
- html_tview_t* ptr=FROM_TV(tv);
- ptr->attributes &= ~(flag);
- }
-
-static void html_clear(struct AbstractTview* tv)
- {
- html_tview_t* ptr=FROM_TV(tv);
- if(ptr->screen!=NULL)
- {
- int i;
- for(i=0;i< ptr->row_count;++i) free(ptr->screen[i]);
- free(ptr->screen);
- ptr->screen=NULL;
- }
- ptr->row_count=0;
- ptr->attributes=0;
- }
-
-static int html_colorpair(struct AbstractTview* tv,int flag)
- {
- return (1 << (flag));
- }
-
-static int html_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- int y,x;
- html_tview_t* ptr=FROM_TV(tv);
- html_clear(tv);
- base_draw_aln(tv, tid, pos);
- fputs("<html><head>",ptr->out);
- fprintf(ptr->out,"<title>%s:%d</title>",
- tv->header->target_name[tid],
- pos+1
- );
- //style
-
- fputs("<style type='text/css'>\n",ptr->out);
- fputs(".tviewbody { margin:5px; background-color:white;text-align:center;}\n",ptr->out);
- fputs(".tviewtitle {text-align:center;}\n",ptr->out);
- fputs(".tviewpre { margin:5px; background-color:white;}\n",ptr->out);
- #define CSS(id,col) fprintf(ptr->out,".tviewc%d {color:%s;}\n.tviewcu%d {color:%s;text-decoration:underline;}\n",id,col,id,col);
- CSS(0, "black");
- CSS(1, "blue");
- CSS(2, "green");
- CSS(3, "yellow");
- CSS(4, "black");
- CSS(5, "green");
- CSS(6, "cyan");
- CSS(7, "yellow");
- CSS(8, "red");
- CSS(9, "blue");
- #undef CSS
- fputs("</style>",ptr->out);
-
- fputs("</head><body>",ptr->out);
-
- fprintf(ptr->out,"<div class='tviewbody'><div class='tviewtitle'>%s:%d</div>",
- tv->header->target_name[tid],
- pos+1
- );
-
- fputs("<pre class='tviewpre'>",ptr->out);
- for(y=0;y< ptr->row_count;++y)
- {
-
- for(x=0;x< tv->mcol;++x)
- {
-
-
- if(x== 0 || ptr->screen[y][x].attributes != ptr->screen[y][x-1].attributes)
- {
- int css=0;
- fprintf(ptr->out,"<span");
- while(css<32)
- {
- //if(y>1) fprintf(pysam_stderr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes);
- if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
- {
-
- fprintf(ptr->out," class='tviewc%s%d'",
- (( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)) )!=0?"u":""),
- css);
- break;
- }
- ++css;
- }
-
-
- fputs(">",ptr->out);
- }
-
- int ch=ptr->screen[y][x].ch;
- switch(ch)
- {
- case '<': fputs("<",ptr->out);break;
- case '>': fputs(">",ptr->out);break;
- case '&': fputs("&",ptr->out);break;
- default: fputc(ch,ptr->out); break;
- }
-
-
- if(x+1 == tv->mcol || ptr->screen[y][x].attributes!=ptr->screen[y][x+1].attributes)
- {
- fputs("</span>",ptr->out);
- }
- }
- if(y+1 < ptr->row_count) fputs("<br/>",ptr->out);
- }
- fputs("</pre></div></body></html>",ptr->out);
- return 0;
- }
-
-
-#define ANSI_COLOR_RED "\x1b[31m"
-#define ANSI_COLOR_GREEN "\x1b[32m"
-#define ANSI_COLOR_YELLOW "\x1b[33m"
-#define ANSI_COLOR_BLUE "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN "\x1b[36m"
-#define ANSI_COLOR_BLACK "\x1b[0m"
-#define ANSI_COLOR_RESET ANSI_COLOR_BLACK
-
-#define ANSI_UNDERLINE_SET "\033[4m"
-#define ANSI_UNDERLINE_UNSET "\033[0m"
-
-static int text_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- int y,x;
- html_tview_t* ptr=FROM_TV(tv);
- html_clear(tv);
- base_draw_aln(tv, tid, pos);
- int is_term= isatty(fileno(ptr->out));
-
- for(y=0;y< ptr->row_count;++y)
- {
- for(x=0;x< tv->mcol;++x)
- {
- if(is_term)
- {
- int css=0;
- while(css<32)
- {
- if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
- {
- break;
- }
- ++css;
- }
- switch(css)
- {
- //CSS(0, "black");
- case 1: fputs(ANSI_COLOR_BLUE,ptr->out); break;
- case 2: fputs(ANSI_COLOR_GREEN,ptr->out); break;
- case 3: fputs(ANSI_COLOR_YELLOW,ptr->out); break;
- //CSS(4, "black");
- case 5: fputs(ANSI_COLOR_GREEN,ptr->out); break;
- case 6: fputs(ANSI_COLOR_CYAN,ptr->out); break;
- case 7: fputs(ANSI_COLOR_YELLOW,ptr->out); break;
- case 8: fputs(ANSI_COLOR_RED,ptr->out); break;
- case 9: fputs(ANSI_COLOR_BLUE,ptr->out); break;
- default:break;
- }
- if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0)
- {
- fputs(ANSI_UNDERLINE_SET,ptr->out);
- }
-
- }
-
-
- int ch=ptr->screen[y][x].ch;
-
- fputc(ch,ptr->out);
- if(is_term)
- {
- fputs(ANSI_COLOR_RESET,ptr->out);
- if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0)
- {
- fputs(ANSI_UNDERLINE_UNSET,ptr->out);
- }
- }
- }
- fputc('\n',ptr->out);
- }
- return 0;
- }
-
-
-static int html_loop(tview_t* tv)
- {
- //tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
- return 0;
- }
-
-static int html_underline(tview_t* tv)
- {
- return (1 << UNDERLINE_FLAG);
- }
-
-/*
-static void init_pair(html_tview_t *tv,int id_ge_1, const char* pen, const char* paper)
- {
-
- }
-*/
-
-tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- char* colstr=getenv("COLUMNS");
- html_tview_t *tv = (html_tview_t*)calloc(1, sizeof(html_tview_t));
- tview_t* base=(tview_t*)tv;
- if(tv==0)
- {
- fprintf(pysam_stderr,"Calloc failed\n");
- return 0;
- }
- tv->row_count=0;
- tv->screen=NULL;
- tv->out=pysam_stdout;
- tv->attributes=0;
- base_tv_init(base,fn,fn_fa,samples,fmt);
- /* initialize callbacks */
-#define SET_CALLBACK(fun) base->my_##fun=html_##fun;
- SET_CALLBACK(destroy);
- SET_CALLBACK(mvprintw);
- SET_CALLBACK(mvaddch);
- SET_CALLBACK(attron);
- SET_CALLBACK(attroff);
- SET_CALLBACK(clear);
- SET_CALLBACK(colorpair);
- SET_CALLBACK(drawaln);
- SET_CALLBACK(loop);
- SET_CALLBACK(underline);
-#undef SET_CALLBACK
-
-
- if(colstr!=0)
- {
- base->mcol=atoi(colstr);
- if(base->mcol<10) base->mcol=80;
- }
- base->mrow=99999;
-
-/*
- init_pair(tv,1, "blue", "white");
- init_pair(tv,2, "green", "white");
- init_pair(tv,3, "yellow", "white");
- init_pair(tv,4, "white", "white");
- init_pair(tv,5, "green", "white");
- init_pair(tv,6, "cyan", "white");
- init_pair(tv,7, "yellow", "white");
- init_pair(tv,8, "red", "white");
- init_pair(tv,9, "blue", "white");
- */
- return base;
- }
-
-
-tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- tview_t* tv=html_tv_init(fn,fn_fa,samples,fmt);
- tv->my_drawaln=text_drawaln;
- return tv;
- }
-
diff --git a/samtools/bamshuf.c b/samtools/bamshuf.c
index 044bc4e..e24689e 100644
--- a/samtools/bamshuf.c
+++ b/samtools/bamshuf.c
@@ -34,6 +34,7 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/hts.h"
#include "htslib/ksort.h"
#include "samtools.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#define DEF_CLEVEL 1
@@ -86,6 +87,14 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
bam_hdr_t *h = NULL;
int64_t j, max_cnt = 0, *cnt = NULL;
elem_t *a = NULL;
+ htsThreadPool p = {NULL, 0};
+
+ if (ga->nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga->nthreads))) {
+ print_error_errno("collate", "Error creating thread pool\n");
+ return 1;
+ }
+ }
// Read input, distribute reads pseudo-randomly into n_files temporary
// files.
@@ -94,6 +103,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
print_error_errno("collate", "Cannot open input file \"%s\"", fn);
return 1;
}
+ if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
h = sam_hdr_read(fp);
if (h == NULL) {
@@ -173,6 +183,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre);
goto fail;
}
+ if (p.pool) hts_set_opt(fpw, HTS_OPT_THREAD_POOL, &p);
if (sam_hdr_write(fpw, h) < 0) {
print_error_errno("collate", "Couldn't write header");
@@ -193,6 +204,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
print_error_errno("collate", "Couldn't open \"%s\"", fnt[i]);
goto fail;
}
+ if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header
// Slurp in one of the split files
@@ -228,6 +240,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
return 1;
}
+ if (p.pool) hts_tpool_destroy(p.pool);
return 0;
mem_fail:
@@ -249,13 +262,14 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
free(fnt);
free(fpt);
free(cnt);
+ if (p.pool) hts_tpool_destroy(p.pool);
sam_global_args_free(ga);
return 1;
}
static int usage(FILE *fp, int n_files) {
fprintf(fp,
- "Usage: samtools collate [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n"
+ "Usage: samtools collate [-Ou] [-n nFiles] [-l cLevel] <in.bam> <out.prefix>\n\n"
"Options:\n"
" -O output to stdout\n"
" -u uncompressed BAM output\n"
@@ -263,7 +277,7 @@ static int usage(FILE *fp, int n_files) {
" -n INT number of temporary files [%d]\n", // n_files
DEF_CLEVEL, n_files);
- sam_global_opt_help(fp, "-....");
+ sam_global_opt_help(fp, "-....@");
return 1;
}
@@ -273,11 +287,11 @@ int main_bamshuf(int argc, char *argv[])
int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "n:l:uO", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "n:l:uO@:", lopts, NULL)) >= 0) {
switch (c) {
case 'n': n_files = atoi(optarg); break;
case 'l': clevel = atoi(optarg); break;
diff --git a/samtools/bamshuf.c.pysam.c b/samtools/bamshuf.c.pysam.c
index fb1a5ac..04cd37b 100644
--- a/samtools/bamshuf.c.pysam.c
+++ b/samtools/bamshuf.c.pysam.c
@@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/hts.h"
#include "htslib/ksort.h"
#include "samtools.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#define DEF_CLEVEL 1
@@ -88,6 +89,14 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
bam_hdr_t *h = NULL;
int64_t j, max_cnt = 0, *cnt = NULL;
elem_t *a = NULL;
+ htsThreadPool p = {NULL, 0};
+
+ if (ga->nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga->nthreads))) {
+ print_error_errno("collate", "Error creating thread pool\n");
+ return 1;
+ }
+ }
// Read input, distribute reads pseudo-randomly into n_files temporary
// files.
@@ -96,6 +105,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
print_error_errno("collate", "Cannot open input file \"%s\"", fn);
return 1;
}
+ if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
h = sam_hdr_read(fp);
if (h == NULL) {
@@ -175,6 +185,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre);
goto fail;
}
+ if (p.pool) hts_set_opt(fpw, HTS_OPT_THREAD_POOL, &p);
if (sam_hdr_write(fpw, h) < 0) {
print_error_errno("collate", "Couldn't write header");
@@ -195,6 +206,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
print_error_errno("collate", "Couldn't open \"%s\"", fnt[i]);
goto fail;
}
+ if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header
// Slurp in one of the split files
@@ -230,6 +242,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
return 1;
}
+ if (p.pool) hts_tpool_destroy(p.pool);
return 0;
mem_fail:
@@ -251,13 +264,14 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
free(fnt);
free(fpt);
free(cnt);
+ if (p.pool) hts_tpool_destroy(p.pool);
sam_global_args_free(ga);
return 1;
}
static int usage(FILE *fp, int n_files) {
fprintf(fp,
- "Usage: samtools collate [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n"
+ "Usage: samtools collate [-Ou] [-n nFiles] [-l cLevel] <in.bam> <out.prefix>\n\n"
"Options:\n"
" -O output to pysam_stdout\n"
" -u uncompressed BAM output\n"
@@ -265,7 +279,7 @@ static int usage(FILE *fp, int n_files) {
" -n INT number of temporary files [%d]\n", // n_files
DEF_CLEVEL, n_files);
- sam_global_opt_help(fp, "-....");
+ sam_global_opt_help(fp, "-....@");
return 1;
}
@@ -275,11 +289,11 @@ int main_bamshuf(int argc, char *argv[])
int c, n_files = 64, clevel = DEF_CLEVEL, is_pysam_stdout = 0, is_un = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "n:l:uO", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "n:l:uO@:", lopts, NULL)) >= 0) {
switch (c) {
case 'n': n_files = atoi(optarg); break;
case 'l': clevel = atoi(optarg); break;
diff --git a/samtools/bamtk.c b/samtools/bamtk.c
index 5c1c60d..bd520b6 100644
--- a/samtools/bamtk.c
+++ b/samtools/bamtk.c
@@ -1,6 +1,6 @@
/* bamtk.c -- main samtools command front-end.
- Copyright (C) 2008-2016 Genome Research Ltd.
+ Copyright (C) 2008-2017 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -27,9 +27,8 @@ DEALINGS IN THE SOFTWARE. */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
-#include <stdarg.h>
#include <string.h>
-#include <errno.h>
+
#include "htslib/hts.h"
#include "samtools.h"
#include "version.h"
@@ -69,34 +68,6 @@ const char *samtools_version()
return SAMTOOLS_VERSION;
}
-static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
-{
- fflush(stdout);
- if (subcommand && *subcommand) fprintf(stderr, "samtools %s: ", subcommand);
- else fprintf(stderr, "samtools: ");
- vfprintf(stderr, format, args);
- if (extra) fprintf(stderr, ": %s\n", extra);
- else fprintf(stderr, "\n");
- fflush(stderr);
-}
-
-void print_error(const char *subcommand, const char *format, ...)
-{
- va_list args;
- va_start(args, format);
- vprint_error_core(subcommand, format, args, NULL);
- va_end(args);
-}
-
-void print_error_errno(const char *subcommand, const char *format, ...)
-{
- int err = errno;
- va_list args;
- va_start(args, format);
- vprint_error_core(subcommand, format, args, strerror(err));
- va_end(args);
-}
-
static void usage(FILE *fp)
{
/* Please improve the grouping */
@@ -215,7 +186,7 @@ int main(int argc, char *argv[])
printf(
"samtools %s\n"
"Using htslib %s\n"
-"Copyright (C) 2016 Genome Research Ltd.\n",
+"Copyright (C) 2017 Genome Research Ltd.\n",
samtools_version(), hts_version());
}
else if (strcmp(argv[1], "--version-only") == 0) {
diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c
index 1f3d938..8956b1f 100644
--- a/samtools/bamtk.c.pysam.c
+++ b/samtools/bamtk.c.pysam.c
@@ -2,7 +2,7 @@
/* bamtk.c -- main samtools command front-end.
- Copyright (C) 2008-2016 Genome Research Ltd.
+ Copyright (C) 2008-2017 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -29,9 +29,8 @@ DEALINGS IN THE SOFTWARE. */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
-#include <stdarg.h>
#include <string.h>
-#include <errno.h>
+
#include "htslib/hts.h"
#include "samtools.h"
#include "version.h"
@@ -41,7 +40,7 @@ int bam_mpileup(int argc, char *argv[]);
int bam_merge(int argc, char *argv[]);
int bam_index(int argc, char *argv[]);
int bam_sort(int argc, char *argv[]);
-int bam_tview_main(int argc, char *argv[]);
+/* AH: int bam_tview_main(int argc, char *argv[]); */
int bam_mating(int argc, char *argv[]);
int bam_rmdup(int argc, char *argv[]);
int bam_flagstat(int argc, char *argv[]);
@@ -71,34 +70,6 @@ const char *samtools_version()
return SAMTOOLS_VERSION;
}
-static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
-{
- fflush(pysam_stdout);
- if (subcommand && *subcommand) fprintf(pysam_stderr, "samtools %s: ", subcommand);
- else fprintf(pysam_stderr, "samtools: ");
- vfprintf(pysam_stderr, format, args);
- if (extra) fprintf(pysam_stderr, ": %s\n", extra);
- else fprintf(pysam_stderr, "\n");
- fflush(pysam_stderr);
-}
-
-void print_error(const char *subcommand, const char *format, ...)
-{
- va_list args;
- va_start(args, format);
- vprint_error_core(subcommand, format, args, NULL);
- va_end(args);
-}
-
-void print_error_errno(const char *subcommand, const char *format, ...)
-{
- int err = errno;
- va_list args;
- va_start(args, format);
- vprint_error_core(subcommand, format, args, strerror(err));
- va_end(args);
-}
-
static void usage(FILE *fp)
{
/* Please improve the grouping */
@@ -212,12 +183,12 @@ int samtools_main(int argc, char *argv[])
fprintf(pysam_stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
return 1;
}
- else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1);
+/* AH: else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); */
else if (strcmp(argv[1], "--version") == 0) {
fprintf(pysam_stdout,
"samtools %s\n"
"Using htslib %s\n"
-"Copyright (C) 2016 Genome Research Ltd.\n",
+"Copyright (C) 2017 Genome Research Ltd.\n",
samtools_version(), hts_version());
}
else if (strcmp(argv[1], "--version-only") == 0) {
diff --git a/samtools/bedcov.c b/samtools/bedcov.c
index d4dceee..1113e17 100644
--- a/samtools/bedcov.c
+++ b/samtools/bedcov.c
@@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
#include "htslib/kstring.h"
#include "htslib/sam.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "htslib/kseq.h"
@@ -74,7 +75,7 @@ int main_bedcov(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
{ NULL, 0, NULL, 0 }
};
@@ -89,8 +90,9 @@ int main_bedcov(int argc, char *argv[])
}
if (usage || optind + 2 > argc) {
fprintf(stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
- fprintf(stderr, " -Q INT Only count bases of at least INT quality [0]\n");
- sam_global_opt_help(stderr, "-.--.");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -Q <int> mapping quality threshold [0]\n");
+ sam_global_opt_help(stderr, "-.--.-");
return 1;
}
memset(&str, 0, sizeof(kstring_t));
diff --git a/samtools/bedcov.c.pysam.c b/samtools/bedcov.c.pysam.c
index 25fdffc..3fd6d4c 100644
--- a/samtools/bedcov.c.pysam.c
+++ b/samtools/bedcov.c.pysam.c
@@ -35,6 +35,7 @@ DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
#include "htslib/kstring.h"
#include "htslib/sam.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "htslib/kseq.h"
@@ -76,7 +77,7 @@ int main_bedcov(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
{ NULL, 0, NULL, 0 }
};
@@ -91,8 +92,9 @@ int main_bedcov(int argc, char *argv[])
}
if (usage || optind + 2 > argc) {
fprintf(pysam_stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
- fprintf(pysam_stderr, " -Q INT Only count bases of at least INT quality [0]\n");
- sam_global_opt_help(pysam_stderr, "-.--.");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -Q <int> mapping quality threshold [0]\n");
+ sam_global_opt_help(pysam_stderr, "-.--.-");
return 1;
}
memset(&str, 0, sizeof(kstring_t));
diff --git a/samtools/cut_target.c b/samtools/cut_target.c
index 71a6c85..7d541fa 100644
--- a/samtools/cut_target.c
+++ b/samtools/cut_target.c
@@ -1,7 +1,7 @@
/* cut_target.c -- targetcut subcommand.
Copyright (C) 2011 Broad Institute.
- Copyright (C) 2012-2013, 2015 Genome Research Ltd.
+ Copyright (C) 2012-2013, 2015, 2016 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -28,9 +28,10 @@ DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
+#include "htslib/hts.h"
#include "htslib/sam.h"
-#include "errmod.h"
#include "htslib/faidx.h"
+#include "samtools.h"
#include "sam_opts.h"
#define ERR_DEP 0.83
@@ -146,7 +147,6 @@ static void process_cns(bam_hdr_t *h, int tid, int l, uint16_t *cns)
static int read_aln(void *data, bam1_t *b)
{
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag);
ct_t *g = (ct_t*)data;
int ret;
while (1)
@@ -160,7 +160,7 @@ static int read_aln(void *data, bam1_t *b)
g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &g->len);
g->tid = b->core.tid;
}
- bam_prob_realn_core(b, g->ref, g->len, 1<<1|1);
+ sam_prob_realn(b, g->ref, g->len, 1<<1|1);
}
break;
}
@@ -177,7 +177,7 @@ int main_cut_target(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f'),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f', '-'),
{ NULL, 0, NULL, 0 }
};
@@ -201,14 +201,19 @@ int main_cut_target(int argc, char *argv[])
}
if (usage || argc == optind) {
fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] <in.bam>\n");
- sam_global_opt_help(stderr, "-.--f");
+ sam_global_opt_help(stderr, "-.--f-");
return 1;
}
l = max_l = 0; cns = 0;
g.fp = sam_open_format(argv[optind], "r", &ga.in);
+ if (g.fp == NULL) {
+ print_error_errno("targetcut", "can't open \"%s\"", argv[optind]);
+ return 1;
+ }
+
g.h = sam_hdr_read(g.fp);
if (g.h == NULL) {
- fprintf(stderr, "Couldn't read header for '%s'\n", argv[optind]);
+ print_error("targetcut", "couldn't read header for \"%s\"", argv[optind]);
sam_close(g.fp);
return 1;
}
diff --git a/samtools/cut_target.c.pysam.c b/samtools/cut_target.c.pysam.c
index 82a4c4c..e55f749 100644
--- a/samtools/cut_target.c.pysam.c
+++ b/samtools/cut_target.c.pysam.c
@@ -3,7 +3,7 @@
/* cut_target.c -- targetcut subcommand.
Copyright (C) 2011 Broad Institute.
- Copyright (C) 2012-2013, 2015 Genome Research Ltd.
+ Copyright (C) 2012-2013, 2015, 2016 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -30,9 +30,10 @@ DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
+#include "htslib/hts.h"
#include "htslib/sam.h"
-#include "errmod.h"
#include "htslib/faidx.h"
+#include "samtools.h"
#include "sam_opts.h"
#define ERR_DEP 0.83
@@ -148,7 +149,6 @@ static void process_cns(bam_hdr_t *h, int tid, int l, uint16_t *cns)
static int read_aln(void *data, bam1_t *b)
{
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag);
ct_t *g = (ct_t*)data;
int ret;
while (1)
@@ -162,7 +162,7 @@ static int read_aln(void *data, bam1_t *b)
g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &g->len);
g->tid = b->core.tid;
}
- bam_prob_realn_core(b, g->ref, g->len, 1<<1|1);
+ sam_prob_realn(b, g->ref, g->len, 1<<1|1);
}
break;
}
@@ -179,7 +179,7 @@ int main_cut_target(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f'),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f', '-'),
{ NULL, 0, NULL, 0 }
};
@@ -203,14 +203,19 @@ int main_cut_target(int argc, char *argv[])
}
if (usage || argc == optind) {
fprintf(pysam_stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] <in.bam>\n");
- sam_global_opt_help(pysam_stderr, "-.--f");
+ sam_global_opt_help(pysam_stderr, "-.--f-");
return 1;
}
l = max_l = 0; cns = 0;
g.fp = sam_open_format(argv[optind], "r", &ga.in);
+ if (g.fp == NULL) {
+ print_error_errno("targetcut", "can't open \"%s\"", argv[optind]);
+ return 1;
+ }
+
g.h = sam_hdr_read(g.fp);
if (g.h == NULL) {
- fprintf(pysam_stderr, "Couldn't read header for '%s'\n", argv[optind]);
+ print_error("targetcut", "couldn't read header for \"%s\"", argv[optind]);
sam_close(g.fp);
return 1;
}
diff --git a/samtools/errmod.c b/samtools/errmod.c
deleted file mode 100644
index c37c6d1..0000000
--- a/samtools/errmod.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/* errmod.c -- revised MAQ error model.
-
- Copyright (C) 2010 Broad Institute.
- Copyright (C) 2012, 2013 Genome Research Ltd.
-
- Author: Heng Li <lh3 at sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <math.h>
-#include "errmod.h"
-#include "htslib/ksort.h"
-KSORT_INIT_GENERIC(uint16_t)
-
-/* table of constants generated for given depcorr and eta */
-typedef struct __errmod_coef_t {
- double *fk, *beta, *lhet;
-} errmod_coef_t;
-
-typedef struct {
- double fsum[16], bsum[16];
- uint32_t c[16];
-} call_aux_t;
-
-/* \Gamma(n) = (n-1)! */
-#define lfact(n) lgamma(n+1)
-
-/* generates a success * trials table of bionomial probability densities (log transformed) */
-static double* logbinomial_table( const int n_size )
-{
- /* prob distribution for binom var is p(k) = {n! \over k! (n-k)! } p^k (1-p)^{n-k} */
- /* this calcs p(k) = {log(n!) - log(k!) - log((n-k)!) */
- int k, n;
- double *logbinom = (double*)calloc(n_size * n_size, sizeof(double));
- for (n = 1; n < n_size; ++n) {
- double lfn = lfact(n);
- for (k = 1; k <= n; ++k)
- logbinom[n<<8|k] = lfn - lfact(k) - lfact(n-k);
- }
- return logbinom;
-}
-
-static errmod_coef_t *cal_coef(double depcorr, double eta)
-{
- int k, n, q;
- long double sum, sum1;
- double *lC;
- errmod_coef_t *ec;
-
- ec = calloc(1, sizeof(errmod_coef_t));
- // initialize ->fk
- ec->fk = (double*)calloc(256, sizeof(double));
- ec->fk[0] = 1.0;
- for (n = 1; n < 256; ++n)
- ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta;
- // initialize ->coef
- ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double));
-
- lC = logbinomial_table( 256 );
-
- for (q = 1; q < 64; ++q) {
- double e = pow(10.0, -q/10.0);
- double le = log(e);
- double le1 = log(1.0 - e);
- for (n = 1; n <= 255; ++n) {
- double *beta = ec->beta + (q<<16|n<<8);
- sum1 = sum = 0.0;
- for (k = n; k >= 0; --k, sum1 = sum) {
- sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1);
- beta[k] = -10. / M_LN10 * logl(sum1 / sum);
- }
- }
- }
- // initialize ->lhet
- ec->lhet = (double*)calloc(256 * 256, sizeof(double));
- for (n = 0; n < 256; ++n)
- for (k = 0; k < 256; ++k)
- ec->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n;
- free(lC);
- return ec;
-}
-
-/**
- * Create errmod_t object with obj.depcorr set to depcorr and initialise
- */
-errmod_t *errmod_init(double depcorr)
-{
- errmod_t *em;
- em = (errmod_t*)calloc(1, sizeof(errmod_t));
- em->depcorr = depcorr;
- em->coef = cal_coef(depcorr, 0.03);
- return em;
-}
-
-/**
- * Deallocate an errmod_t object
- */
-void errmod_destroy(errmod_t *em)
-{
- if (em == 0) return;
- free(em->coef->lhet); free(em->coef->fk); free(em->coef->beta);
- free(em->coef); free(em);
-}
-
-//
-// em: error model to fit to data
-// m: number of alleles across all samples
-// n: number of bases observed in sample
-// bases[i]: bases observed in pileup [6 bit quality|1 bit strand|4 bit base]
-// q[i*m+j]: (Output) phred-scaled likelihood of each genotype (i,j)
-int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
-{
- // Aux
- // aux.c is total count of each base observed (ignoring strand)
- call_aux_t aux;
- // Loop variables
- int i, j, k;
- // The total count of each base observed per strand
- int w[32];
-
- memset(q, 0, m * m * sizeof(float)); // initialise q to 0
- if (n == 0) return 0;
- // This section randomly downsamples to 255 depth so as not to go beyond our precalculated matrix
- if (n > 255) { // if we exceed 255 bases observed then shuffle them to sample and only keep the first 255
- ks_shuffle(uint16_t, n, bases);
- n = 255;
- }
- ks_introsort(uint16_t, n, bases);
- /* zero out w and aux */
- memset(w, 0, 32 * sizeof(int));
- memset(&aux, 0, sizeof(call_aux_t));
-
- for (j = n - 1; j >= 0; --j) { // calculate esum and fsum
- uint16_t b = bases[j];
- /* extract quality and cap at 63 */
- int qual = b>>5 < 4? 4 : b>>5;
- if (qual > 63) qual = 63;
- /* extract base ORed with strand */
- int basestrand = b&0x1f;
- /* extract base */
- int base = b&0xf;
- aux.fsum[base] += em->coef->fk[w[basestrand]];
- aux.bsum[base] += em->coef->fk[w[basestrand]] * em->coef->beta[qual<<16|n<<8|aux.c[base]];
- ++aux.c[base];
- ++w[basestrand];
- }
-
- // generate likelihood
- for (j = 0; j < m; ++j) {
- float tmp1, tmp3;
- int tmp2;
- // homozygous
- for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k < m; ++k) {
- if (k == j) continue;
- tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k];
- }
- if (tmp2) {
- q[j*m+j] = tmp1;
- }
- // heterozygous
- for (k = j + 1; k < m; ++k) {
- int cjk = aux.c[j] + aux.c[k];
- for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) {
- if (i == j || i == k) continue;
- tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i];
- }
- if (tmp2) {
- q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1;
- } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k
- }
- /* clamp to greater than 0 */
- for (k = 0; k < m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0;
- }
-
- return 0;
-}
diff --git a/samtools/errmod.c.pysam.c b/samtools/errmod.c.pysam.c
deleted file mode 100644
index 12176cf..0000000
--- a/samtools/errmod.c.pysam.c
+++ /dev/null
@@ -1,196 +0,0 @@
-#include "pysam.h"
-
-/* errmod.c -- revised MAQ error model.
-
- Copyright (C) 2010 Broad Institute.
- Copyright (C) 2012, 2013 Genome Research Ltd.
-
- Author: Heng Li <lh3 at sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <math.h>
-#include "errmod.h"
-#include "htslib/ksort.h"
-KSORT_INIT_GENERIC(uint16_t)
-
-/* table of constants generated for given depcorr and eta */
-typedef struct __errmod_coef_t {
- double *fk, *beta, *lhet;
-} errmod_coef_t;
-
-typedef struct {
- double fsum[16], bsum[16];
- uint32_t c[16];
-} call_aux_t;
-
-/* \Gamma(n) = (n-1)! */
-#define lfact(n) lgamma(n+1)
-
-/* generates a success * trials table of bionomial probability densities (log transformed) */
-static double* logbinomial_table( const int n_size )
-{
- /* prob distribution for binom var is p(k) = {n! \over k! (n-k)! } p^k (1-p)^{n-k} */
- /* this calcs p(k) = {log(n!) - log(k!) - log((n-k)!) */
- int k, n;
- double *logbinom = (double*)calloc(n_size * n_size, sizeof(double));
- for (n = 1; n < n_size; ++n) {
- double lfn = lfact(n);
- for (k = 1; k <= n; ++k)
- logbinom[n<<8|k] = lfn - lfact(k) - lfact(n-k);
- }
- return logbinom;
-}
-
-static errmod_coef_t *cal_coef(double depcorr, double eta)
-{
- int k, n, q;
- long double sum, sum1;
- double *lC;
- errmod_coef_t *ec;
-
- ec = calloc(1, sizeof(errmod_coef_t));
- // initialize ->fk
- ec->fk = (double*)calloc(256, sizeof(double));
- ec->fk[0] = 1.0;
- for (n = 1; n < 256; ++n)
- ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta;
- // initialize ->coef
- ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double));
-
- lC = logbinomial_table( 256 );
-
- for (q = 1; q < 64; ++q) {
- double e = pow(10.0, -q/10.0);
- double le = log(e);
- double le1 = log(1.0 - e);
- for (n = 1; n <= 255; ++n) {
- double *beta = ec->beta + (q<<16|n<<8);
- sum1 = sum = 0.0;
- for (k = n; k >= 0; --k, sum1 = sum) {
- sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1);
- beta[k] = -10. / M_LN10 * logl(sum1 / sum);
- }
- }
- }
- // initialize ->lhet
- ec->lhet = (double*)calloc(256 * 256, sizeof(double));
- for (n = 0; n < 256; ++n)
- for (k = 0; k < 256; ++k)
- ec->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n;
- free(lC);
- return ec;
-}
-
-/**
- * Create errmod_t object with obj.depcorr set to depcorr and initialise
- */
-errmod_t *errmod_init(double depcorr)
-{
- errmod_t *em;
- em = (errmod_t*)calloc(1, sizeof(errmod_t));
- em->depcorr = depcorr;
- em->coef = cal_coef(depcorr, 0.03);
- return em;
-}
-
-/**
- * Deallocate an errmod_t object
- */
-void errmod_destroy(errmod_t *em)
-{
- if (em == 0) return;
- free(em->coef->lhet); free(em->coef->fk); free(em->coef->beta);
- free(em->coef); free(em);
-}
-
-//
-// em: error model to fit to data
-// m: number of alleles across all samples
-// n: number of bases observed in sample
-// bases[i]: bases observed in pileup [6 bit quality|1 bit strand|4 bit base]
-// q[i*m+j]: (Output) phred-scaled likelihood of each genotype (i,j)
-int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
-{
- // Aux
- // aux.c is total count of each base observed (ignoring strand)
- call_aux_t aux;
- // Loop variables
- int i, j, k;
- // The total count of each base observed per strand
- int w[32];
-
- memset(q, 0, m * m * sizeof(float)); // initialise q to 0
- if (n == 0) return 0;
- // This section randomly downsamples to 255 depth so as not to go beyond our precalculated matrix
- if (n > 255) { // if we exceed 255 bases observed then shuffle them to sample and only keep the first 255
- ks_shuffle(uint16_t, n, bases);
- n = 255;
- }
- ks_introsort(uint16_t, n, bases);
- /* zero out w and aux */
- memset(w, 0, 32 * sizeof(int));
- memset(&aux, 0, sizeof(call_aux_t));
-
- for (j = n - 1; j >= 0; --j) { // calculate esum and fsum
- uint16_t b = bases[j];
- /* extract quality and cap at 63 */
- int qual = b>>5 < 4? 4 : b>>5;
- if (qual > 63) qual = 63;
- /* extract base ORed with strand */
- int basestrand = b&0x1f;
- /* extract base */
- int base = b&0xf;
- aux.fsum[base] += em->coef->fk[w[basestrand]];
- aux.bsum[base] += em->coef->fk[w[basestrand]] * em->coef->beta[qual<<16|n<<8|aux.c[base]];
- ++aux.c[base];
- ++w[basestrand];
- }
-
- // generate likelihood
- for (j = 0; j < m; ++j) {
- float tmp1, tmp3;
- int tmp2;
- // homozygous
- for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k < m; ++k) {
- if (k == j) continue;
- tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k];
- }
- if (tmp2) {
- q[j*m+j] = tmp1;
- }
- // heterozygous
- for (k = j + 1; k < m; ++k) {
- int cjk = aux.c[j] + aux.c[k];
- for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) {
- if (i == j || i == k) continue;
- tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i];
- }
- if (tmp2) {
- q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1;
- } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k
- }
- /* clamp to greater than 0 */
- for (k = 0; k < m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0;
- }
-
- return 0;
-}
diff --git a/samtools/faidx.c b/samtools/faidx.c
index 336bde5..c5c9ed6 100644
--- a/samtools/faidx.c
+++ b/samtools/faidx.c
@@ -1,6 +1,6 @@
/* faidx.c -- faidx subcommand.
- Copyright (C) 2008, 2009, 2013 Genome Research Ltd.
+ Copyright (C) 2008, 2009, 2013, 2016 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -25,34 +25,19 @@ DEALINGS IN THE SOFTWARE. */
#include <config.h>
-#include <ctype.h>
-#include <string.h>
#include <stdlib.h>
#include <stdio.h>
-#include <stdint.h>
#include <unistd.h>
-#include <stdarg.h>
+
#include <htslib/faidx.h>
+#include "samtools.h"
-static void error(const char *format, ...)
+static int usage(FILE *fp, int exit_status)
{
- if ( format )
- {
- va_list ap;
- va_start(ap, format);
- vfprintf(stderr, format, ap);
- va_end(ap);
- }
- else
- {
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]\n");
- fprintf(stderr, "\n");
- }
- exit(-1);
+ fprintf(fp, "Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]\n");
+ return exit_status;
}
-
int faidx_main(int argc, char *argv[])
{
int c;
@@ -61,39 +46,60 @@ int faidx_main(int argc, char *argv[])
switch(c)
{
case 'h':
+ return usage(stdout, EXIT_SUCCESS);
+
default:
- error(NULL);
+ return usage(stderr, EXIT_FAILURE);
}
}
if ( argc==optind )
- error(NULL);
+ return usage(stdout, EXIT_SUCCESS);
if ( argc==2 )
{
if (fai_build(argv[optind]) != 0) {
- error("Could not build fai index %s.fai\n", argv[optind]);
+ fprintf(stderr, "Could not build fai index %s.fai\n", argv[optind]);
+ return EXIT_FAILURE;
}
return 0;
}
faidx_t *fai = fai_load(argv[optind]);
- if ( !fai ) error("Could not load fai index of %s\n", argv[optind]);
+ if ( !fai ) {
+ fprintf(stderr, "Could not load fai index of %s\n", argv[optind]);
+ return EXIT_FAILURE;
+ }
+
+ int exit_status = EXIT_SUCCESS;
- while ( ++optind<argc )
+ while ( ++optind<argc && exit_status == EXIT_SUCCESS)
{
printf(">%s\n", argv[optind]);
- int i, j, seq_len;
+ int seq_len;
char *seq = fai_fetch(fai, argv[optind], &seq_len);
- if ( seq_len < 0 ) error("Failed to fetch sequence in %s\n", argv[optind]);
- for (i=0; i<seq_len; i+=60)
+ if ( seq_len < 0 ) {
+ fprintf(stderr, "Failed to fetch sequence in %s\n", argv[optind]);
+ exit_status = EXIT_FAILURE;
+ break;
+ }
+ size_t i, seq_sz = seq_len;
+ for (i=0; i<seq_sz; i+=60)
{
- for (j=0; j<60 && i+j<seq_len; j++)
- putchar(seq[i+j]);
- putchar('\n');
+ size_t len = i + 60 < seq_sz ? 60 : seq_sz - i;
+ if (fwrite(seq + i, 1, len, stdout) < len ||
+ putchar('\n') == EOF) {
+ print_error_errno("faidx", "failed to write output");
+ exit_status = EXIT_FAILURE;
+ break;
+ }
}
free(seq);
}
fai_destroy(fai);
- return 0;
-}
+ if (fflush(stdout) == EOF) {
+ print_error_errno("faidx", "failed to flush output");
+ exit_status = EXIT_FAILURE;
+ }
+ return exit_status;
+}
diff --git a/samtools/faidx.c.pysam.c b/samtools/faidx.c.pysam.c
index ac06647..ec8c90f 100644
--- a/samtools/faidx.c.pysam.c
+++ b/samtools/faidx.c.pysam.c
@@ -2,7 +2,7 @@
/* faidx.c -- faidx subcommand.
- Copyright (C) 2008, 2009, 2013 Genome Research Ltd.
+ Copyright (C) 2008, 2009, 2013, 2016 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -27,34 +27,19 @@ DEALINGS IN THE SOFTWARE. */
#include <config.h>
-#include <ctype.h>
-#include <string.h>
#include <stdlib.h>
#include <stdio.h>
-#include <stdint.h>
#include <unistd.h>
-#include <stdarg.h>
+
#include <htslib/faidx.h>
+#include "samtools.h"
-static void error(const char *format, ...)
+static int usage(FILE *fp, int exit_status)
{
- if ( format )
- {
- va_list ap;
- va_start(ap, format);
- vfprintf(pysam_stderr, format, ap);
- va_end(ap);
- }
- else
- {
- fprintf(pysam_stderr, "\n");
- fprintf(pysam_stderr, "Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]\n");
- fprintf(pysam_stderr, "\n");
- }
- exit(-1);
+ fprintf(fp, "Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]\n");
+ return exit_status;
}
-
int faidx_main(int argc, char *argv[])
{
int c;
@@ -63,39 +48,60 @@ int faidx_main(int argc, char *argv[])
switch(c)
{
case 'h':
+ return usage(pysam_stdout, EXIT_SUCCESS);
+
default:
- error(NULL);
+ return usage(pysam_stderr, EXIT_FAILURE);
}
}
if ( argc==optind )
- error(NULL);
+ return usage(pysam_stdout, EXIT_SUCCESS);
if ( argc==2 )
{
if (fai_build(argv[optind]) != 0) {
- error("Could not build fai index %s.fai\n", argv[optind]);
+ fprintf(pysam_stderr, "Could not build fai index %s.fai\n", argv[optind]);
+ return EXIT_FAILURE;
}
return 0;
}
faidx_t *fai = fai_load(argv[optind]);
- if ( !fai ) error("Could not load fai index of %s\n", argv[optind]);
+ if ( !fai ) {
+ fprintf(pysam_stderr, "Could not load fai index of %s\n", argv[optind]);
+ return EXIT_FAILURE;
+ }
+
+ int exit_status = EXIT_SUCCESS;
- while ( ++optind<argc )
+ while ( ++optind<argc && exit_status == EXIT_SUCCESS)
{
fprintf(pysam_stdout, ">%s\n", argv[optind]);
- int i, j, seq_len;
+ int seq_len;
char *seq = fai_fetch(fai, argv[optind], &seq_len);
- if ( seq_len < 0 ) error("Failed to fetch sequence in %s\n", argv[optind]);
- for (i=0; i<seq_len; i+=60)
+ if ( seq_len < 0 ) {
+ fprintf(pysam_stderr, "Failed to fetch sequence in %s\n", argv[optind]);
+ exit_status = EXIT_FAILURE;
+ break;
+ }
+ size_t i, seq_sz = seq_len;
+ for (i=0; i<seq_sz; i+=60)
{
- for (j=0; j<60 && i+j<seq_len; j++)
- fputc(seq[i+j], pysam_stdout);
- fputc('\n', pysam_stdout);
+ size_t len = i + 60 < seq_sz ? 60 : seq_sz - i;
+ if (fwrite(seq + i, 1, len, pysam_stdout) < len ||
+ fputc('\n', pysam_stdout) == EOF) {
+ print_error_errno("faidx", "failed to write output");
+ exit_status = EXIT_FAILURE;
+ break;
+ }
}
free(seq);
}
fai_destroy(fai);
- return 0;
-}
+ if (fflush(pysam_stdout) == EOF) {
+ print_error_errno("faidx", "failed to flush output");
+ exit_status = EXIT_FAILURE;
+ }
+ return exit_status;
+}
diff --git a/samtools/kprobaln.c b/samtools/kprobaln.c
deleted file mode 100644
index e319708..0000000
--- a/samtools/kprobaln.c
+++ /dev/null
@@ -1,282 +0,0 @@
-/* The MIT License
-
- Copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3lh3 at live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#include <config.h>
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-#include <math.h>
-#include "kprobaln.h"
-
-/*****************************************
- * Probabilistic banded glocal alignment *
- *****************************************/
-
-#define EI .25
-#define EM .33333333333
-
-static float g_qual2prob[256];
-
-#define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; }
-
-kpa_par_t kpa_par_def = { 0.001, 0.1, 10 };
-kpa_par_t kpa_par_alt = { 0.0001, 0.01, 10 };
-
-/*
- The topology of the profile HMM:
-
- /\ /\ /\ /\
- I[1] I[k-1] I[k] I[L]
- ^ \ \ ^ \ ^ \ \ ^
- | \ \ | \ | \ \ |
- M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1]
- \ \/ \/ \/ /
- \ /\ /\ /\ /
- -> D[k-1] -> D[k] ->
-
- M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1].
-
- On input, _ref is the reference sequence and _query is the query
- sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an
- ambiguous residue. iqual is the base quality. c sets the gap open
- probability, gap extension probability and band width.
-
- On output, state and q are arrays of length l_query. The higher 30
- bits give the reference position the query base is matched to and the
- lower two bits can be 0 (an alignment match) or 1 (an
- insertion). q[i] gives the phred scaled posterior probability of
- state[i] being wrong.
- */
-int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
- const kpa_par_t *c, int *state, uint8_t *q)
-{
- double **f, **b = 0, *s, m[9], sI, sM, bI, bM, pb;
- float *qual, *_qual;
- const uint8_t *ref, *query;
- int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr;
-
- if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents sefgault
-
- /*** initialization ***/
- is_backward = state && q? 1 : 0;
- ref = _ref - 1; query = _query - 1; // change to 1-based coordinate
- bw = l_ref > l_query? l_ref : l_query;
- if (bw > c->bw) bw = c->bw;
- if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query);
- bw2 = bw * 2 + 1;
- // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
- f = calloc(l_query+1, sizeof(double*));
- if (is_backward) b = calloc(l_query+1, sizeof(double*));
- for (i = 0; i <= l_query; ++i) { // FIXME: this will lead in segfault for l_query==0
- f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs
- if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double));
- }
- s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow
- // initialize qual
- _qual = calloc(l_query, sizeof(float));
- if (g_qual2prob[0] == 0)
- for (i = 0; i < 256; ++i)
- g_qual2prob[i] = pow(10, -i/10.);
- for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30];
- qual = _qual - 1;
- // initialize transition probability
- sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof
- m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM);
- m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.;
- m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e;
- bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1
- /*** forward ***/
- // f[0]
- set_u(k, bw, 0, 0);
- f[0][k] = s[0] = 1.;
- { // f[1]
- double *fi = f[1], sum;
- int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
- for (k = beg, sum = 0.; k <= end; ++k) {
- int u;
- double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
- set_u(u, bw, 1, k);
- fi[u+0] = e * bM; fi[u+1] = EI * bI;
- sum += fi[u] + fi[u+1];
- }
- // rescale
- s[1] = sum;
- set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2;
- for (k = _beg; k <= _end; ++k) fi[k] /= sum;
- }
- // f[2..l_query]
- for (i = 2; i <= l_query; ++i) {
- double *fi = f[i], *fi1 = f[i-1], sum, qli = qual[i];
- int beg = 1, end = l_ref, x, _beg, _end;
- uint8_t qyi = query[i];
- x = i - bw; beg = beg > x? beg : x; // band start
- x = i + bw; end = end < x? end : x; // band end
- for (k = beg, sum = 0.; k <= end; ++k) {
- int u, v11, v01, v10;
- double e;
- e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM;
- set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1);
- fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
- fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
- fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
- sum += fi[u] + fi[u+1] + fi[u+2];
-// fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
- }
- // rescale
- s[i] = sum;
- set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
- for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
- }
- { // f[l_query+1]
- double sum;
- for (k = 1, sum = 0.; k <= l_ref; ++k) {
- int u;
- set_u(u, bw, l_query, k);
- if (u < 3 || u >= bw2*3+3) continue;
- sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
- }
- s[l_query+1] = sum; // the last scaling factor
- }
- { // compute likelihood
- double p = 1., Pr1 = 0.;
- for (i = 0; i <= l_query + 1; ++i) {
- p *= s[i];
- if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.;
- }
- Pr1 += -4.343 * log(p * l_ref * l_query);
- Pr = (int)(Pr1 + .499);
- if (!is_backward) { // skip backward and MAP
- for (i = 0; i <= l_query; ++i) free(f[i]);
- free(f); free(s); free(_qual);
- return Pr;
- }
- }
- /*** backward ***/
- // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
- for (k = 1; k <= l_ref; ++k) {
- int u;
- double *bi = b[l_query];
- set_u(u, bw, l_query, k);
- if (u < 3 || u >= bw2*3+3) continue;
- bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
- }
- // b[l_query-1..1]
- for (i = l_query - 1; i >= 1; --i) {
- int beg = 1, end = l_ref, x, _beg, _end;
- double *bi = b[i], *bi1 = b[i+1], y = (i > 1), qli1 = qual[i+1];
- uint8_t qyi1 = query[i+1];
- x = i - bw; beg = beg > x? beg : x;
- x = i + bw; end = end < x? end : x;
- for (k = end; k >= beg; --k) {
- int u, v11, v01, v10;
- double e;
- set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1);
- e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11];
- bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e.
- bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1];
- bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
-// fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
- }
- // rescale
- set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
- for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
- }
- { // b[0]
- int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
- double sum = 0.;
- for (k = end; k >= beg; --k) {
- int u;
- double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
- set_u(u, bw, 1, k);
- if (u < 3 || u >= bw2*3+3) continue;
- sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI;
- }
- set_u(k, bw, 0, 0);
- pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0
- }
- is_diff = fabs(pb - 1.) > 1e-7? 1 : 0;
- /*** MAP ***/
- for (i = 1; i <= l_query; ++i) {
- double sum = 0., *fi = f[i], *bi = b[i], max = 0.;
- int beg = 1, end = l_ref, x, max_k = -1;
- x = i - bw; beg = beg > x? beg : x;
- x = i + bw; end = end < x? end : x;
- for (k = beg; k <= end; ++k) {
- int u;
- double z;
- set_u(u, bw, i, k);
- z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z;
- z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z;
- }
- max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
- if (state) state[i-1] = max_k;
- if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k;
-#ifdef _MAIN
- fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2,
- "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG
-#endif
- }
- /*** free ***/
- for (i = 0; i <= l_query; ++i) {
- free(f[i]); free(b[i]);
- }
- free(f); free(b); free(s); free(_qual);
- return Pr;
-}
-
-#ifdef _MAIN
-#include <unistd.h>
-int main(int argc, char *argv[])
-{
- uint8_t conv[256], *iqual, *ref, *query;
- int c, l_ref, l_query, i, q = 30, b = 10, P;
- while ((c = getopt(argc, argv, "b:q:")) >= 0) {
- switch (c) {
- case 'b': b = atoi(optarg); break;
- case 'q': q = atoi(optarg); break;
- }
- }
- if (optind + 2 > argc) {
- fprintf(stderr, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv[0], q, b); // example: acttc attc
- return 1;
- }
- memset(conv, 4, 256);
- conv['a'] = conv['A'] = 0; conv['c'] = conv['C'] = 1;
- conv['g'] = conv['G'] = 2; conv['t'] = conv['T'] = 3;
- ref = (uint8_t*)argv[optind]; query = (uint8_t*)argv[optind+1];
- l_ref = strlen((char*)ref); l_query = strlen((char*)query);
- for (i = 0; i < l_ref; ++i) ref[i] = conv[ref[i]];
- for (i = 0; i < l_query; ++i) query[i] = conv[query[i]];
- iqual = malloc(l_query);
- memset(iqual, q, l_query);
- kpa_par_def.bw = b;
- P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0);
- fprintf(stderr, "%d\n", P);
- free(iqual);
- return 0;
-}
-#endif
diff --git a/samtools/kprobaln.c.pysam.c b/samtools/kprobaln.c.pysam.c
deleted file mode 100644
index 630b730..0000000
--- a/samtools/kprobaln.c.pysam.c
+++ /dev/null
@@ -1,284 +0,0 @@
-#include "pysam.h"
-
-/* The MIT License
-
- Copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3lh3 at live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#include <config.h>
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-#include <math.h>
-#include "kprobaln.h"
-
-/*****************************************
- * Probabilistic banded glocal alignment *
- *****************************************/
-
-#define EI .25
-#define EM .33333333333
-
-static float g_qual2prob[256];
-
-#define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; }
-
-kpa_par_t kpa_par_def = { 0.001, 0.1, 10 };
-kpa_par_t kpa_par_alt = { 0.0001, 0.01, 10 };
-
-/*
- The topology of the profile HMM:
-
- /\ /\ /\ /\
- I[1] I[k-1] I[k] I[L]
- ^ \ \ ^ \ ^ \ \ ^
- | \ \ | \ | \ \ |
- M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1]
- \ \/ \/ \/ /
- \ /\ /\ /\ /
- -> D[k-1] -> D[k] ->
-
- M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1].
-
- On input, _ref is the reference sequence and _query is the query
- sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an
- ambiguous residue. iqual is the base quality. c sets the gap open
- probability, gap extension probability and band width.
-
- On output, state and q are arrays of length l_query. The higher 30
- bits give the reference position the query base is matched to and the
- lower two bits can be 0 (an alignment match) or 1 (an
- insertion). q[i] gives the phred scaled posterior probability of
- state[i] being wrong.
- */
-int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
- const kpa_par_t *c, int *state, uint8_t *q)
-{
- double **f, **b = 0, *s, m[9], sI, sM, bI, bM, pb;
- float *qual, *_qual;
- const uint8_t *ref, *query;
- int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr;
-
- if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents sefgault
-
- /*** initialization ***/
- is_backward = state && q? 1 : 0;
- ref = _ref - 1; query = _query - 1; // change to 1-based coordinate
- bw = l_ref > l_query? l_ref : l_query;
- if (bw > c->bw) bw = c->bw;
- if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query);
- bw2 = bw * 2 + 1;
- // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
- f = calloc(l_query+1, sizeof(double*));
- if (is_backward) b = calloc(l_query+1, sizeof(double*));
- for (i = 0; i <= l_query; ++i) { // FIXME: this will lead in segfault for l_query==0
- f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs
- if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double));
- }
- s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow
- // initialize qual
- _qual = calloc(l_query, sizeof(float));
- if (g_qual2prob[0] == 0)
- for (i = 0; i < 256; ++i)
- g_qual2prob[i] = pow(10, -i/10.);
- for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30];
- qual = _qual - 1;
- // initialize transition probability
- sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof
- m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM);
- m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.;
- m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e;
- bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1
- /*** forward ***/
- // f[0]
- set_u(k, bw, 0, 0);
- f[0][k] = s[0] = 1.;
- { // f[1]
- double *fi = f[1], sum;
- int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
- for (k = beg, sum = 0.; k <= end; ++k) {
- int u;
- double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
- set_u(u, bw, 1, k);
- fi[u+0] = e * bM; fi[u+1] = EI * bI;
- sum += fi[u] + fi[u+1];
- }
- // rescale
- s[1] = sum;
- set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2;
- for (k = _beg; k <= _end; ++k) fi[k] /= sum;
- }
- // f[2..l_query]
- for (i = 2; i <= l_query; ++i) {
- double *fi = f[i], *fi1 = f[i-1], sum, qli = qual[i];
- int beg = 1, end = l_ref, x, _beg, _end;
- uint8_t qyi = query[i];
- x = i - bw; beg = beg > x? beg : x; // band start
- x = i + bw; end = end < x? end : x; // band end
- for (k = beg, sum = 0.; k <= end; ++k) {
- int u, v11, v01, v10;
- double e;
- e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM;
- set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1);
- fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
- fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
- fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
- sum += fi[u] + fi[u+1] + fi[u+2];
-// fprintf(pysam_stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
- }
- // rescale
- s[i] = sum;
- set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
- for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
- }
- { // f[l_query+1]
- double sum;
- for (k = 1, sum = 0.; k <= l_ref; ++k) {
- int u;
- set_u(u, bw, l_query, k);
- if (u < 3 || u >= bw2*3+3) continue;
- sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
- }
- s[l_query+1] = sum; // the last scaling factor
- }
- { // compute likelihood
- double p = 1., Pr1 = 0.;
- for (i = 0; i <= l_query + 1; ++i) {
- p *= s[i];
- if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.;
- }
- Pr1 += -4.343 * log(p * l_ref * l_query);
- Pr = (int)(Pr1 + .499);
- if (!is_backward) { // skip backward and MAP
- for (i = 0; i <= l_query; ++i) free(f[i]);
- free(f); free(s); free(_qual);
- return Pr;
- }
- }
- /*** backward ***/
- // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
- for (k = 1; k <= l_ref; ++k) {
- int u;
- double *bi = b[l_query];
- set_u(u, bw, l_query, k);
- if (u < 3 || u >= bw2*3+3) continue;
- bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
- }
- // b[l_query-1..1]
- for (i = l_query - 1; i >= 1; --i) {
- int beg = 1, end = l_ref, x, _beg, _end;
- double *bi = b[i], *bi1 = b[i+1], y = (i > 1), qli1 = qual[i+1];
- uint8_t qyi1 = query[i+1];
- x = i - bw; beg = beg > x? beg : x;
- x = i + bw; end = end < x? end : x;
- for (k = end; k >= beg; --k) {
- int u, v11, v01, v10;
- double e;
- set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1);
- e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11];
- bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e.
- bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1];
- bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
-// fprintf(pysam_stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
- }
- // rescale
- set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
- for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
- }
- { // b[0]
- int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
- double sum = 0.;
- for (k = end; k >= beg; --k) {
- int u;
- double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
- set_u(u, bw, 1, k);
- if (u < 3 || u >= bw2*3+3) continue;
- sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI;
- }
- set_u(k, bw, 0, 0);
- pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0
- }
- is_diff = fabs(pb - 1.) > 1e-7? 1 : 0;
- /*** MAP ***/
- for (i = 1; i <= l_query; ++i) {
- double sum = 0., *fi = f[i], *bi = b[i], max = 0.;
- int beg = 1, end = l_ref, x, max_k = -1;
- x = i - bw; beg = beg > x? beg : x;
- x = i + bw; end = end < x? end : x;
- for (k = beg; k <= end; ++k) {
- int u;
- double z;
- set_u(u, bw, i, k);
- z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z;
- z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z;
- }
- max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
- if (state) state[i-1] = max_k;
- if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k;
-#ifdef _MAIN
- fprintf(pysam_stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2,
- "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG
-#endif
- }
- /*** free ***/
- for (i = 0; i <= l_query; ++i) {
- free(f[i]); free(b[i]);
- }
- free(f); free(b); free(s); free(_qual);
- return Pr;
-}
-
-#ifdef _MAIN
-#include <unistd.h>
-int samtools_kprobaln_main(int argc, char *argv[])
-{
- uint8_t conv[256], *iqual, *ref, *query;
- int c, l_ref, l_query, i, q = 30, b = 10, P;
- while ((c = getopt(argc, argv, "b:q:")) >= 0) {
- switch (c) {
- case 'b': b = atoi(optarg); break;
- case 'q': q = atoi(optarg); break;
- }
- }
- if (optind + 2 > argc) {
- fprintf(pysam_stderr, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv[0], q, b); // example: acttc attc
- return 1;
- }
- memset(conv, 4, 256);
- conv['a'] = conv['A'] = 0; conv['c'] = conv['C'] = 1;
- conv['g'] = conv['G'] = 2; conv['t'] = conv['T'] = 3;
- ref = (uint8_t*)argv[optind]; query = (uint8_t*)argv[optind+1];
- l_ref = strlen((char*)ref); l_query = strlen((char*)query);
- for (i = 0; i < l_ref; ++i) ref[i] = conv[ref[i]];
- for (i = 0; i < l_query; ++i) query[i] = conv[query[i]];
- iqual = malloc(l_query);
- memset(iqual, q, l_query);
- kpa_par_def.bw = b;
- P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0);
- fprintf(pysam_stderr, "%d\n", P);
- free(iqual);
- return 0;
-}
-#endif
diff --git a/samtools/kprobaln.h b/samtools/kprobaln.h
deleted file mode 100644
index 50ae77b..0000000
--- a/samtools/kprobaln.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* The MIT License
-
- Copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3 at live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#ifndef LH3_KPROBALN_H_
-#define LH3_KPROBALN_H_
-
-#include <stdint.h>
-
-typedef struct {
- float d, e;
- int bw;
-} kpa_par_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
- const kpa_par_t *c, int *state, uint8_t *q);
-
-#ifdef __cplusplus
-}
-#endif
-
-extern kpa_par_t kpa_par_def, kpa_par_alt;
-
-#endif
diff --git a/samtools/misc/ace2sam.c b/samtools/misc/ace2sam.c
index 77b9993..19727eb 100644
--- a/samtools/misc/ace2sam.c
+++ b/samtools/misc/ace2sam.c
@@ -161,7 +161,10 @@ int main(int argc, char *argv[])
}
if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
ks_getuntil(ks, '\n', &s, &dret); // skip the empty line
- if (write_cns) puts(t[4].s); t[4].l = 0;
+ if (write_cns) {
+ if (t[4].l) puts(t[4].s);
+ t[4].l = 0;
+ }
} else if (strcmp(s.s, "AF") == 0) { // padded read position
int reversed, neg, pos;
if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'");
diff --git a/samtools/misc/ace2sam.c.pysam.c b/samtools/misc/ace2sam.c.pysam.c
index a663399..02d2f58 100644
--- a/samtools/misc/ace2sam.c.pysam.c
+++ b/samtools/misc/ace2sam.c.pysam.c
@@ -163,7 +163,10 @@ int samtools_ace2sam_main(int argc, char *argv[])
}
if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
ks_getuntil(ks, '\n', &s, &dret); // skip the empty line
- if (write_cns) fputs(t[4].s, pysam_stdout) & fputc('\n', pysam_stdout); t[4].l = 0;
+ if (write_cns) {
+ if (t[4].l) fputs(t[4].s, pysam_stdout) & fputc('\n', pysam_stdout);
+ t[4].l = 0;
+ }
} else if (strcmp(s.s, "AF") == 0) { // padded read position
int reversed, neg, pos;
if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'");
diff --git a/samtools/padding.c b/samtools/padding.c
index cea79cf..2f10e86 100644
--- a/samtools/padding.c
+++ b/samtools/padding.c
@@ -491,7 +491,7 @@ int main_pad2unpad(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T'),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T', '-'),
{ NULL, 0, NULL, 0 }
};
@@ -603,7 +603,7 @@ static int usage(int is_long_help)
fprintf(stderr, " Padded reference sequence file [null]\n");
fprintf(stderr, " -o FILE Output file name [stdout]\n");
fprintf(stderr, " -? Longer help\n");
- sam_global_opt_help(stderr, "-...-");
+ sam_global_opt_help(stderr, "-...--");
if (is_long_help)
fprintf(stderr,
diff --git a/samtools/padding.c.pysam.c b/samtools/padding.c.pysam.c
index 9f85c95..a3461e4 100644
--- a/samtools/padding.c.pysam.c
+++ b/samtools/padding.c.pysam.c
@@ -493,7 +493,7 @@ int main_pad2unpad(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T'),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T', '-'),
{ NULL, 0, NULL, 0 }
};
@@ -605,7 +605,7 @@ static int usage(int is_long_help)
fprintf(pysam_stderr, " Padded reference sequence file [null]\n");
fprintf(pysam_stderr, " -o FILE Output file name [pysam_stdout]\n");
fprintf(pysam_stderr, " -? Longer help\n");
- sam_global_opt_help(pysam_stderr, "-...-");
+ sam_global_opt_help(pysam_stderr, "-...--");
if (is_long_help)
fprintf(pysam_stderr,
diff --git a/samtools/phase.c b/samtools/phase.c
index 6909912..584334d 100644
--- a/samtools/phase.c
+++ b/samtools/phase.c
@@ -31,9 +31,9 @@ DEALINGS IN THE SOFTWARE. */
#include <stdint.h>
#include <math.h>
#include <zlib.h>
+#include "htslib/hts.h"
#include "htslib/sam.h"
#include "htslib/kstring.h"
-#include "errmod.h"
#include "sam_opts.h"
#include "samtools.h"
@@ -580,7 +580,7 @@ int main_phase(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'),
{ NULL, 0, NULL, 0 }
};
@@ -620,7 +620,7 @@ int main_phase(int argc, char *argv[])
// fprintf(stderr, " -e do not discover SNPs (effective with -l)\n");
fprintf(stderr, "\n");
- sam_global_opt_help(stderr, "-....");
+ sam_global_opt_help(stderr, "-....-");
return 1;
}
diff --git a/samtools/phase.c.pysam.c b/samtools/phase.c.pysam.c
index 3babd37..4226c03 100644
--- a/samtools/phase.c.pysam.c
+++ b/samtools/phase.c.pysam.c
@@ -33,9 +33,9 @@ DEALINGS IN THE SOFTWARE. */
#include <stdint.h>
#include <math.h>
#include <zlib.h>
+#include "htslib/hts.h"
#include "htslib/sam.h"
#include "htslib/kstring.h"
-#include "errmod.h"
#include "sam_opts.h"
#include "samtools.h"
@@ -582,7 +582,7 @@ int main_phase(int argc, char *argv[])
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'),
{ NULL, 0, NULL, 0 }
};
@@ -622,7 +622,7 @@ int main_phase(int argc, char *argv[])
// fprintf(pysam_stderr, " -e do not discover SNPs (effective with -l)\n");
fprintf(pysam_stderr, "\n");
- sam_global_opt_help(pysam_stderr, "-....");
+ sam_global_opt_help(pysam_stderr, "-....-");
return 1;
}
diff --git a/samtools/sam.h b/samtools/sam.h
index 5130105..6545e64 100644
--- a/samtools/sam.h
+++ b/samtools/sam.h
@@ -50,7 +50,7 @@ typedef struct {
samFile *file;
struct { BGZF *bam; } x; // Hack so that fp->x.bam still works
bam_hdr_t *header;
- short is_write:1;
+ unsigned short is_write:1;
} samfile_t;
#ifdef __cplusplus
diff --git a/samtools/sam_opts.c b/samtools/sam_opts.c
index 9369145..9e7a8de 100644
--- a/samtools/sam_opts.c
+++ b/samtools/sam_opts.c
@@ -72,6 +72,9 @@ int parse_sam_global_opt(int c, const char *optarg, const struct option *lopt,
r |= hts_opt_add((hts_opt **)&ga->out.specific, ref);
free(ref);
break;
+ } else if (strcmp(lopt->name, "threads") == 0) {
+ ga->nthreads = atoi(optarg);
+ break;
// } else if (strcmp(lopt->name, "verbose") == 0) {
// ga->verbosity++;
// break;
@@ -100,7 +103,7 @@ void sam_global_opt_help(FILE *fp, const char *shortopts) {
int i = 0;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0),
+ SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0,0),
{ NULL, 0, NULL, 0 }
};
@@ -130,6 +133,9 @@ void sam_global_opt_help(FILE *fp, const char *shortopts) {
else if (strcmp(lopts[i].name, "reference") == 0)
fprintf(fp,"reference FILE\n"
" Reference sequence FASTA FILE [null]\n");
+ else if (strcmp(lopts[i].name, "threads") == 0)
+ fprintf(fp,"threads INT\n"
+ " Number of additional threads to use [0]\n");
// else if (strcmp(lopts[i].name, "verbose") == 0)
// fprintf(fp,"verbose\n"
// " Increment level of verbosity\n");
diff --git a/samtools/sam_opts.c.pysam.c b/samtools/sam_opts.c.pysam.c
index d0b56a3..aed4869 100644
--- a/samtools/sam_opts.c.pysam.c
+++ b/samtools/sam_opts.c.pysam.c
@@ -74,6 +74,9 @@ int parse_sam_global_opt(int c, const char *optarg, const struct option *lopt,
r |= hts_opt_add((hts_opt **)&ga->out.specific, ref);
free(ref);
break;
+ } else if (strcmp(lopt->name, "threads") == 0) {
+ ga->nthreads = atoi(optarg);
+ break;
// } else if (strcmp(lopt->name, "verbose") == 0) {
// ga->verbosity++;
// break;
@@ -102,7 +105,7 @@ void sam_global_opt_help(FILE *fp, const char *shortopts) {
int i = 0;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0),
+ SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0,0),
{ NULL, 0, NULL, 0 }
};
@@ -132,6 +135,9 @@ void sam_global_opt_help(FILE *fp, const char *shortopts) {
else if (strcmp(lopts[i].name, "reference") == 0)
fprintf(fp,"reference FILE\n"
" Reference sequence FASTA FILE [null]\n");
+ else if (strcmp(lopts[i].name, "threads") == 0)
+ fprintf(fp,"threads INT\n"
+ " Number of additional threads to use [0]\n");
// else if (strcmp(lopts[i].name, "verbose") == 0)
// fprintf(fp,"verbose\n"
// " Increment level of verbosity\n");
diff --git a/samtools/sam_opts.h b/samtools/sam_opts.h
index 25e9279..6edbf64 100644
--- a/samtools/sam_opts.h
+++ b/samtools/sam_opts.h
@@ -34,6 +34,7 @@ typedef struct sam_global_args {
htsFormat in;
htsFormat out;
char *reference;
+ int nthreads;
//int verbosity;
} sam_global_args;
@@ -45,6 +46,7 @@ enum {
SAM_OPT_OUTPUT_FMT,
SAM_OPT_OUTPUT_FMT_OPTION,
SAM_OPT_REFERENCE,
+ SAM_OPT_NTHREADS,
//SAM_OPT_VERBOSE
};
@@ -56,12 +58,13 @@ enum {
// 0 No short option has been assigned. Use --long-opt only.
// '-' Both long and short options are disabled.
// <c> Otherwise the equivalent short option is character <c>.
-#define SAM_OPT_GLOBAL_OPTIONS(o1, o2, o3, o4, o5) \
+#define SAM_OPT_GLOBAL_OPTIONS(o1, o2, o3, o4, o5, o6) \
{"input-fmt", required_argument, NULL, SAM_OPT_VAL(o1, SAM_OPT_INPUT_FMT)}, \
{"input-fmt-option", required_argument, NULL, SAM_OPT_VAL(o2, SAM_OPT_INPUT_FMT_OPTION)}, \
{"output-fmt", required_argument, NULL, SAM_OPT_VAL(o3, SAM_OPT_OUTPUT_FMT)}, \
{"output-fmt-option", required_argument, NULL, SAM_OPT_VAL(o4, SAM_OPT_OUTPUT_FMT_OPTION)}, \
- {"reference", required_argument, NULL, SAM_OPT_VAL(o5, SAM_OPT_REFERENCE)}
+ {"reference", required_argument, NULL, SAM_OPT_VAL(o5, SAM_OPT_REFERENCE)}, \
+ {"threads", required_argument, NULL, SAM_OPT_VAL(o6, SAM_OPT_NTHREADS)}
//{"verbose", no_argument, NULL, SAM_OPT_VERBOSE}
/*
diff --git a/samtools/test/test.c b/samtools/sam_utils.c
similarity index 52%
copy from samtools/test/test.c
copy to samtools/sam_utils.c
index 7ab38af..4f8964a 100644
--- a/samtools/test/test.c
+++ b/samtools/sam_utils.c
@@ -1,8 +1,8 @@
-/* test/test.c -- test harness utility routines.
+/* sam_utils.c -- various utilities internal to samtools.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014-2016 Genome Research Ltd.
- Author: Martin O. Pollard <mp15 at sanger.ac.uk>
+ Author: John Marshall <jm18 at sanger.ac.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -24,32 +24,37 @@ DEALINGS IN THE SOFTWARE. */
#include <config.h>
-#include <errno.h>
#include <stdio.h>
-#include <stdlib.h>
+#include <stdarg.h>
#include <string.h>
-#include <htslib/sam.h>
+#include <errno.h>
+
+#include "samtools.h"
-#include "test.h"
+static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
+{
+ fflush(stdout);
+ if (subcommand && *subcommand) fprintf(stderr, "samtools %s: ", subcommand);
+ else fprintf(stderr, "samtools: ");
+ vfprintf(stderr, format, args);
+ if (extra) fprintf(stderr, ": %s\n", extra);
+ else fprintf(stderr, "\n");
+ fflush(stderr);
+}
-void xfreopen(const char *path, const char *mode, FILE *stream)
+void print_error(const char *subcommand, const char *format, ...)
{
- if (freopen(path, mode, stream) == NULL) {
- fprintf(stderr, __FILE__": error reopening %s: %s\n",
- path, strerror(errno));
- exit(2);
- }
+ va_list args;
+ va_start(args, format);
+ vprint_error_core(subcommand, format, args, NULL);
+ va_end(args);
}
-void dump_hdr(const bam_hdr_t* hdr)
+void print_error_errno(const char *subcommand, const char *format, ...)
{
- printf("n_targets: %d\n", hdr->n_targets);
- printf("ignore_sam_err: %d\n", hdr->ignore_sam_err);
- printf("l_text: %u\n", hdr->l_text);
- printf("idx\ttarget_len\ttarget_name:\n");
- int32_t target;
- for (target = 0; target < hdr->n_targets; ++target) {
- printf("%d\t%u\t\"%s\"\n", target, hdr->target_len[target], hdr->target_name[target]);
- }
- printf("text: \"%s\"\n", hdr->text);
+ int err = errno;
+ va_list args;
+ va_start(args, format);
+ vprint_error_core(subcommand, format, args, err? strerror(err) : NULL);
+ va_end(args);
}
diff --git a/samtools/test/test.c.pysam.c b/samtools/sam_utils.c.pysam.c
similarity index 52%
copy from samtools/test/test.c.pysam.c
copy to samtools/sam_utils.c.pysam.c
index a8295b5..0a78619 100644
--- a/samtools/test/test.c.pysam.c
+++ b/samtools/sam_utils.c.pysam.c
@@ -1,10 +1,10 @@
#include "pysam.h"
-/* test/test.c -- test harness utility routines.
+/* sam_utils.c -- various utilities internal to samtools.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014-2016 Genome Research Ltd.
- Author: Martin O. Pollard <mp15 at sanger.ac.uk>
+ Author: John Marshall <jm18 at sanger.ac.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -26,32 +26,37 @@ DEALINGS IN THE SOFTWARE. */
#include <config.h>
-#include <errno.h>
#include <stdio.h>
-#include <stdlib.h>
+#include <stdarg.h>
#include <string.h>
-#include <htslib/sam.h>
+#include <errno.h>
+
+#include "samtools.h"
-#include "test.h"
+static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
+{
+ fflush(pysam_stdout);
+ if (subcommand && *subcommand) fprintf(pysam_stderr, "samtools %s: ", subcommand);
+ else fprintf(pysam_stderr, "samtools: ");
+ vfprintf(pysam_stderr, format, args);
+ if (extra) fprintf(pysam_stderr, ": %s\n", extra);
+ else fprintf(pysam_stderr, "\n");
+ fflush(pysam_stderr);
+}
-void xfreopen(const char *path, const char *mode, FILE *stream)
+void print_error(const char *subcommand, const char *format, ...)
{
- if (freopen(path, mode, stream) == NULL) {
- fprintf(pysam_stderr, __FILE__": error reopening %s: %s\n",
- path, strerror(errno));
- exit(2);
- }
+ va_list args;
+ va_start(args, format);
+ vprint_error_core(subcommand, format, args, NULL);
+ va_end(args);
}
-void dump_hdr(const bam_hdr_t* hdr)
+void print_error_errno(const char *subcommand, const char *format, ...)
{
- fprintf(pysam_stdout, "n_targets: %d\n", hdr->n_targets);
- fprintf(pysam_stdout, "ignore_sam_err: %d\n", hdr->ignore_sam_err);
- fprintf(pysam_stdout, "l_text: %u\n", hdr->l_text);
- fprintf(pysam_stdout, "idx\ttarget_len\ttarget_name:\n");
- int32_t target;
- for (target = 0; target < hdr->n_targets; ++target) {
- fprintf(pysam_stdout, "%d\t%u\t\"%s\"\n", target, hdr->target_len[target], hdr->target_name[target]);
- }
- fprintf(pysam_stdout, "text: \"%s\"\n", hdr->text);
+ int err = errno;
+ va_list args;
+ va_start(args, format);
+ vprint_error_core(subcommand, format, args, err? strerror(err) : NULL);
+ va_end(args);
}
diff --git a/samtools/sam_view.c b/samtools/sam_view.c
index 402e1d3..9c2d15b 100644
--- a/samtools/sam_view.c
+++ b/samtools/sam_view.c
@@ -1,6 +1,6 @@
/* sam_view.c -- SAM<->BAM<->CRAM conversion.
- Copyright (C) 2009-2015 Genome Research Ltd.
+ Copyright (C) 2009-2017 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE. */
#include <stdlib.h>
#include <string.h>
+#include <strings.h>
#include <stdio.h>
#include <unistd.h>
#include <math.h>
@@ -34,12 +35,18 @@ DEALINGS IN THE SOFTWARE. */
#include <stdbool.h>
#include <assert.h>
#include <getopt.h>
+#include <ctype.h>
#include "htslib/sam.h"
#include "htslib/faidx.h"
#include "htslib/kstring.h"
#include "htslib/khash.h"
+#include "htslib/thread_pool.h"
#include "samtools.h"
#include "sam_opts.h"
+
+#define DEFAULT_BARCODE_TAG "BC"
+#define DEFAULT_QUALITY_TAG "QT"
+
KHASH_SET_INIT_STR(rg)
typedef khash_t(rg) *rghash_t;
@@ -50,6 +57,7 @@ typedef struct samview_settings {
int min_mapQ;
int flag_on;
int flag_off;
+ int flag_alloff;
int min_qlen;
int remove_B;
uint32_t subsam_seed;
@@ -83,6 +91,8 @@ static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settin
}
if (b->core.qual < settings->min_mapQ || ((b->core.flag & settings->flag_on) != settings->flag_on) || (b->core.flag & settings->flag_off))
return 1;
+ if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff))
+ return 1;
if (settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b))))
return 1;
if (settings->subsam_frac > 0.) {
@@ -231,19 +241,22 @@ static void check_sam_close(const char *subcmd, samFile *fp, const char *fname,
int main_samview(int argc, char *argv[])
{
int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0;
- int is_long_help = 0, n_threads = 0;
+ int is_long_help = 0;
int64_t count = 0;
samFile *in = 0, *out = 0, *un_out=0;
+ FILE *fp_out = NULL;
bam_hdr_t *header = NULL;
char out_mode[5], out_un_mode[5], *out_format = "";
char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ htsThreadPool p = {NULL, 0};
samview_settings_t settings = {
.rghash = NULL,
.min_mapQ = 0,
.flag_on = 0,
.flag_off = 0,
+ .flag_alloff = 0,
.min_qlen = 0,
.remove_B = 0,
.subsam_seed = 0,
@@ -253,8 +266,7 @@ int main_samview(int argc, char *argv[])
};
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T'),
- { "threads", required_argument, NULL, '@' },
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'),
{ NULL, 0, NULL, 0 }
};
@@ -262,11 +274,13 @@ int main_samview(int argc, char *argv[])
strcpy(out_mode, "w");
strcpy(out_un_mode, "w");
while ((c = getopt_long(argc, argv,
- "SbBcCt:h1Ho:O:q:f:F:ul:r:?T:R:L:s:@:m:x:U:",
+ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:?T:R:L:s:@:m:x:U:",
lopts, NULL)) >= 0) {
switch (c) {
case 's':
if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) {
+ // Convert likely user input 0,1,2,... to pseudo-random
+ // values with more entropy and more bits set
srand(settings.subsam_seed);
settings.subsam_seed = rand();
}
@@ -284,6 +298,7 @@ int main_samview(int argc, char *argv[])
case 'U': fn_un_out = strdup(optarg); break;
case 'f': settings.flag_on |= strtol(optarg, 0, 0); break;
case 'F': settings.flag_off |= strtol(optarg, 0, 0); break;
+ case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break;
case 'q': settings.min_mapQ = atoi(optarg); break;
case 'u': compress_level = 0; break;
case '1': compress_level = 1; break;
@@ -313,7 +328,6 @@ int main_samview(int argc, char *argv[])
*/
case '?': is_long_help = 1; break;
case 'B': settings.remove_B = 1; break;
- case '@': n_threads = strtol(optarg, 0, 0); break;
case 'x':
{
if (strlen(optarg) != 2) {
@@ -425,8 +439,26 @@ int main_samview(int argc, char *argv[])
}
}
}
+ else {
+ if (fn_out) {
+ fp_out = fopen(fn_out, "w");
+ if (fp_out == NULL) {
+ print_error_errno("view", "can't create \"%s\"", fn_out);
+ ret = EXIT_FAILURE;
+ goto view_end;
+ }
+ }
+ }
- if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); }
+ if (ga.nthreads > 1) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(stderr, "Error creating thread pool\n");
+ ret = 1;
+ goto view_end;
+ }
+ hts_set_opt(in, HTS_OPT_THREAD_POOL, &p);
+ if (out) hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+ }
if (is_header_only) goto view_end; // no need to print alignments
if (optind + 1 >= argc) { // convert/print the entire file
@@ -487,13 +519,19 @@ int main_samview(int argc, char *argv[])
}
view_end:
- if (is_count && ret == 0)
- printf("%" PRId64 "\n", count);
+ if (is_count && ret == 0) {
+ if (fprintf(fn_out? fp_out : stdout, "%" PRId64 "\n", count) < 0) {
+ if (fn_out) print_error_errno("view", "writing to \"%s\" failed", fn_out);
+ else print_error_errno("view", "writing to standard output failed");
+ ret = EXIT_FAILURE;
+ }
+ }
// close files, free and return
if (in) check_sam_close("view", in, fn_in, "standard input", &ret);
if (out) check_sam_close("view", out, fn_out, "standard output", &ret);
if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret);
+ if (fp_out) fclose(fp_out);
free(fn_list); free(fn_out); free(settings.library); free(fn_un_out);
sam_global_args_free(&ga);
@@ -508,6 +546,10 @@ view_end:
if (settings.remove_aux_len) {
free(settings.remove_aux);
}
+
+ if (p.pool)
+ hts_tpool_destroy(p.pool);
+
return ret;
}
@@ -538,20 +580,19 @@ static int usage(FILE *fp, int exit_status, int is_long_help)
" -l STR only include reads in library STR [null]\n"
" -m INT only include reads with number of CIGAR operations consuming\n"
" query sequence >= INT [0]\n"
-" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
-" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
+" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
+" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0
+" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
+" -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n"
+" fraction of templates/read pairs to keep; INT part sets seed)\n"
// read processing
" -x STR read tag to strip (repeatable) [null]\n"
" -B collapse the backward CIGAR operation\n"
-" -s FLOAT integer part sets seed of random number generator [0];\n"
-" rest sets fraction of templates to subsample [no subsampling]\n"
// general options
-" -@, --threads INT\n"
-" number of BAM/CRAM compression threads [0]\n"
" -? print long help, including note about region specification\n"
" -S ignored (input format is auto-detected)\n");
- sam_global_opt_help(fp, "-.O.T");
+ sam_global_opt_help(fp, "-.O.T@");
fprintf(fp, "\n");
if (is_long_help)
@@ -620,21 +661,37 @@ static void bam2fq_usage(FILE *to, const char *command)
"Usage: samtools %s [options...] <in.bam>\n", command);
fprintf(to,
"Options:\n"
-" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n"
-" -1 FILE write paired reads flagged READ1 to FILE\n"
-" -2 FILE write paired reads flagged READ2 to FILE\n"
-" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
-" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
-" -n don't append /1 and /2 to the read name\n");
+" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n"
+" -1 FILE write paired reads flagged READ1 to FILE\n"
+" -2 FILE write paired reads flagged READ2 to FILE\n"
+" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
+" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0
+" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
+" -n don't append /1 and /2 to the read name\n"
+" -N always append /1 and /2 to the read name\n");
if (fq) fprintf(to,
-" -O output quality in the OQ tag if present\n");
+" -O output quality in the OQ tag if present\n");
fprintf(to,
-" -s FILE write singleton reads to FILE [assume single-end]\n"
-" -t copy RG, BC and QT tags to the %s header line\n",
+" -s FILE write singleton reads to FILE [assume single-end]\n"
+" -t copy RG, BC and QT tags to the %s header line\n",
fq ? "FASTQ" : "FASTA");
if (fq) fprintf(to,
-" -v INT default quality score if not given in file [1]\n");
- sam_global_opt_help(to, "-.--.");
+" -v INT default quality score if not given in file [1]\n"
+" --i1 FILE write first index reads to FILE\n"
+" --i2 FILE write second index reads to FILE\n"
+" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n"
+" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n"
+" --index-format STR How to parse barcode and quality tags\n\n");
+ sam_global_opt_help(to, "-.--.@");
+ fprintf(to,
+" \n"
+" The index-format string describes how to parse the barcode and quality tags, for example:\n"
+" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n"
+" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n"
+" If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n"
+" 'read until the separator or end of tag', for example:\n"
+" n*i* ignore the left part of the tag until the separator, then use the second part\n"
+" of the tag as index 1\n");
}
typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart;
@@ -643,24 +700,97 @@ typedef struct bam2fq_opts {
char *fnse;
char *fnr[3];
char *fn_input; // pointer to input filename in argv do not free
- bool has12, use_oq, copy_tags;
- int flag_on, flag_off;
+ bool has12, has12always, use_oq, copy_tags;
+ int flag_on, flag_off, flag_alloff;
sam_global_args ga;
fastfile filetype;
int def_qual;
+ char *barcode_tag;
+ char *quality_tag;
+ char *index_file[2];
+ char *index_format;
} bam2fq_opts_t;
typedef struct bam2fq_state {
samFile *fp;
FILE *fpse;
FILE *fpr[3];
+ FILE *fpi[2];
bam_hdr_t *h;
bool has12, use_oq, copy_tags;
- int flag_on, flag_off;
+ int flag_on, flag_off, flag_alloff;
fastfile filetype;
int def_qual;
} bam2fq_state_t;
+/*
+ * Get and decode the read from a BAM record.
+ *
+ * TODO: htslib really needs an interface for this. Consider this or perhaps
+ * bam_get_seq_str (current vs original orientation) and bam_get_qual_str
+ * functions as string formatted equivalents to bam_get_{seq,qual}?
+ */
+
+/*
+ * Reverse a string in place.
+ * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux.
+ * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik
+ */
+static char *reverse(char *str)
+{
+ int i = strlen(str)-1,j=0;
+ char ch;
+ while (i>j) {
+ ch = str[i];
+ str[i]= str[j];
+ str[j] = ch;
+ i--;
+ j++;
+ }
+ return str;
+}
+
+/* return the read, reverse complemented if necessary */
+static char *get_read(const bam1_t *rec)
+{
+ int len = rec->core.l_qseq + 1;
+ char *read = calloc(1, len);
+ char *seq = (char *)bam_get_seq(rec);
+ int n;
+
+ if (!read) return NULL;
+
+ for (n=0; n < rec->core.l_qseq; n++) {
+ if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]];
+ else read[n] = seq_nt16_str[bam_seqi(seq,n)];
+ }
+ if (rec->core.flag & BAM_FREVERSE) reverse(read);
+ return read;
+}
+
+/*
+ * get and decode the quality from a BAM record
+ */
+static char *get_quality(const bam1_t *rec)
+{
+ char *quality = calloc(1, rec->core.l_qseq + 1);
+ char *q = (char *)bam_get_qual(rec);
+ int n;
+
+ if (*q == '\xff') { free(quality); return NULL; }
+
+ for (n=0; n < rec->core.l_qseq; n++) {
+ quality[n] = q[n]+33;
+ }
+ if (rec->core.flag & BAM_FREVERSE) reverse(quality);
+ return quality;
+}
+
+//
+// End of htslib complaints
+//
+
+
static readpart which_readpart(const bam1_t *b)
{
if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
@@ -672,85 +802,60 @@ static readpart which_readpart(const bam1_t *b)
}
}
-// Transform a bam1_t record into a string with the FASTQ representation of it
-// @returns false for error, true for success
-static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
+/*
+ * parse the length part from the index-format string
+ */
+static int getLength(char **s)
{
- int i;
- int32_t qlen = b->core.l_qseq;
- assert(qlen >= 0);
- uint8_t *seq;
- uint8_t *qual = bam_get_qual(b);
- const uint8_t *oq = NULL;
- if (state->use_oq) {
- oq = bam_aux_get(b, "OQ");
- if (oq) oq++; // skip tag type
+ int n = 0;
+ while (**s) {
+ if (**s == '*') { n=-1; (*s)++; break; }
+ if ( !isdigit(**s)) break;
+ n = n*10 + ((**s)-'0');
+ (*s)++;
}
- bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality
+ return n;
+}
+
+static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state)
+{
+ int i;
linebuf->l = 0;
// Write read name
- readpart readpart = which_readpart(b);
kputc(state->filetype == FASTA? '>' : '@', linebuf);
- kputs(bam_get_qname(b), linebuf);
+ kputs(bam_get_qname(rec), linebuf);
// Add the /1 /2 if requested
if (state->has12) {
+ readpart readpart = which_readpart(rec);
if (readpart == READ_1) kputs("/1", linebuf);
else if (readpart == READ_2) kputs("/2", linebuf);
}
if (state->copy_tags) {
for (i = 0; copied_tags[i]; ++i) {
uint8_t *s;
- if ((s = bam_aux_get(b, copied_tags[i])) != 0) {
- kputc('\t', linebuf);
- kputsn(copied_tags[i], 2, linebuf);
- kputsn(":Z:", 3, linebuf);
- kputs(bam_aux2Z(s), linebuf);
+ if ((s = bam_aux_get(rec, copied_tags[i])) != 0) {
+ if (*s == 'Z') {
+ kputc('\t', linebuf);
+ kputsn(copied_tags[i], 2, linebuf);
+ kputsn(":Z:", 3, linebuf);
+ kputs(bam_aux2Z(s), linebuf);
+ }
}
}
}
kputc('\n', linebuf);
-
- seq = bam_get_seq(b);
-
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- for (i = qlen-1; i > -1; --i) {
- char c = seq_nt16_str[seq_comp_table[bam_seqi(seq,i)]];
- kputc(c, linebuf);
- }
- } else {
- for (i = 0; i < qlen; ++i) {
- char c = seq_nt16_str[bam_seqi(seq,i)];
- kputc(c, linebuf);
- }
- }
+ kputs(seq, linebuf);
kputc('\n', linebuf);
if (state->filetype == FASTQ) {
// Write quality
kputs("+\n", linebuf);
- if (has_qual) {
- if (state->use_oq && oq) {
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- for (i = qlen-1; i > -1; --i) {
- kputc(oq[i], linebuf);
- }
- } else {
- kputs((char*)oq, linebuf);
- }
- } else {
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- for (i = qlen-1; i > -1; --i) {
- kputc(33 + qual[i], linebuf);
- }
- } else {
- for (i = 0; i < qlen; ++i) {
- kputc(33 + qual[i], linebuf);
- }
- }
- }
+ if (qual && *qual) {
+ kputs(qual, linebuf);
} else {
- for (i = 0; i < qlen; ++i) {
+ int len = strlen(seq);
+ for (i = 0; i < len; ++i) {
kputc(33 + state->def_qual, linebuf);
}
}
@@ -759,49 +864,214 @@ static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t
return true;
}
+/*
+ * Create FASTQ lines from the barcode tag using the index-format
+ */
+static bool tags2fq(bam1_t *rec, const bam2fq_state_t *state, const bam2fq_opts_t* opts)
+{
+ uint8_t *p;
+ char *ifmt = opts->index_format;
+ char *tag = NULL;
+ char *qual = NULL;
+ int file_number = 0;
+ kstring_t linebuf = { 0, 0, NULL }; // Buffer
+
+ // read barcode tag
+ p = bam_aux_get(rec,opts->barcode_tag);
+ if (p) tag = bam_aux2Z(p);
+
+ if (!tag) return true; // there is no tag
+
+ // read quality tag
+ p = bam_aux_get(rec, opts->quality_tag);
+ if (p) qual = bam_aux2Z(p);
+
+ // Parse the index-format string
+ while (*ifmt) {
+ if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly
+ char action = *ifmt; // should be 'i' or 'n'
+ ifmt++; // skip over action
+ int index_len = getLength(&ifmt);
+
+ char *sub_tag = calloc(1, strlen(tag)+1);
+ char *sub_qual = calloc(1, strlen(tag)+1);
+ int n = 0;
+
+ if (index_len < 0) {
+ // read until separator
+ while (isalpha(*tag)) {
+ sub_tag[n] = *tag++;
+ if (qual) sub_qual[n] = *qual++;
+ n++;
+ }
+ if (*tag) { // skip separator
+ tag++;
+ if (qual) qual++;
+ }
+ } else {
+ // read index_len characters
+ while (index_len-- && *tag) {
+ sub_tag[n] = *tag++;
+ if (qual) sub_qual[n] = *qual++;
+ n++;
+ }
+ }
+
+ if (action=='i' && *sub_tag && state->fpi[file_number]) {
+ make_fq_line(rec, sub_tag, sub_qual, &linebuf, state);
+ fputs(linebuf.s, state->fpi[file_number++]);
+ }
+ free(sub_qual); free(sub_tag);
+
+ }
+
+ free(linebuf.s);
+ return true;
+}
+
+// Transform a bam1_t record into a string with the FASTQ representation of it
+// @returns false for error, true for success
+static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
+{
+ int32_t qlen = b->core.l_qseq;
+ assert(qlen >= 0);
+ const uint8_t *oq = NULL;
+ char *qual = NULL;
+
+ char *seq = get_read(b);
+
+ if (state->use_oq) {
+ oq = bam_aux_get(b, "OQ");
+ if (oq) {
+ oq++;
+ qual = strdup(bam_aux2Z(oq));
+ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
+ reverse(qual);
+ }
+ }
+ } else {
+ qual = get_quality(b);
+ }
+
+ make_fq_line(b, seq, qual, linebuf, state);
+
+ free(qual);
+ free(seq);
+ return true;
+}
+
+static void free_opts(bam2fq_opts_t *opts)
+{
+ free(opts->barcode_tag);
+ free(opts->quality_tag);
+ free(opts->index_format);
+ free(opts);
+}
+
// return true if valid
static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
{
// Parse args
bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t));
opts->has12 = true;
+ opts->has12always = false;
opts->filetype = FASTQ;
opts->def_qual = 1;
+ opts->barcode_tag = NULL;
+ opts->quality_tag = NULL;
+ opts->index_format = NULL;
+ opts->index_file[0] = NULL;
+ opts->index_file[1] = NULL;
int c;
sam_global_args_init(&opts->ga);
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
+ {"i1", required_argument, NULL, 1},
+ {"I1", required_argument, NULL, 1},
+ {"i2", required_argument, NULL, 2},
+ {"I2", required_argument, NULL, 2},
+ {"if", required_argument, NULL, 3},
+ {"IF", required_argument, NULL, 3},
+ {"index-format", required_argument, NULL, 3},
+ {"barcode-tag", required_argument, NULL, 'b'},
+ {"quality-tag", required_argument, NULL, 'q'},
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "0:1:2:f:F:nOs:tv:", lopts, NULL)) > 0) {
+ while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:nNOs:tv:@:", lopts, NULL)) > 0) {
switch (c) {
+ case 'b': opts->barcode_tag = strdup(optarg); break;
+ case 'q': opts->quality_tag = strdup(optarg); break;
+ case 1 : opts->index_file[0] = optarg; break;
+ case 2 : opts->index_file[1] = optarg; break;
+ case 3 : opts->index_format = strdup(optarg); break;
case '0': opts->fnr[0] = optarg; break;
case '1': opts->fnr[1] = optarg; break;
case '2': opts->fnr[2] = optarg; break;
case 'f': opts->flag_on |= strtol(optarg, 0, 0); break;
case 'F': opts->flag_off |= strtol(optarg, 0, 0); break;
+ case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
case 'n': opts->has12 = false; break;
+ case 'N': opts->has12always = true; break;
case 'O': opts->use_oq = true; break;
case 's': opts->fnse = optarg; break;
case 't': opts->copy_tags = true; break;
case 'v': opts->def_qual = atoi(optarg); break;
- case '?': bam2fq_usage(stderr, argv[0]); free(opts); return false;
+ case '?': bam2fq_usage(stderr, argv[0]); free_opts(opts); return false;
default:
if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) {
- bam2fq_usage(stderr, argv[0]); free(opts); return false;
+ bam2fq_usage(stderr, argv[0]); free_opts(opts); return false;
}
break;
}
}
if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false;
+ if (opts->has12always) opts->has12 = true;
+
+ if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG);
+ if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG);
+
+ int nIndex = 0;
+ if (opts->index_format) {
+ char *s;
+ for (s = opts->index_format; *s; s++) {
+ if (*s == 'i') nIndex++;
+ }
+ }
+ if (nIndex>2) {
+ fprintf(stderr,"Invalid index format: more than 2 indexes\n");
+ bam2fq_usage(stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
+ if (opts->index_file[1] && !opts->index_file[0]) {
+ fprintf(stderr, "Index one specified, but index two not given\n");
+ bam2fq_usage(stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
+ if (nIndex==2 && !opts->index_file[1]) {
+ fprintf(stderr, "index_format specifies two indexes, but only one index file given\n");
+ bam2fq_usage(stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
+ if (nIndex==1 && !opts->index_file[0]) {
+ fprintf(stderr, "index_format specifies an index, but no index file given\n");
+ bam2fq_usage(stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
if (opts->def_qual < 0 || 93 < opts->def_qual) {
fprintf(stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual);
bam2fq_usage(stderr, argv[0]);
- free(opts);
- return true;
+ free_opts(opts);
+ return false;
}
const char* type_str = argv[0];
@@ -812,20 +1082,21 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
} else {
print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str);
bam2fq_usage(stderr, argv[0]);
- free(opts);
+ free_opts(opts);
return false;
}
if ((argc - (optind)) == 0) {
+ fprintf(stderr, "No input file specified.\n");
bam2fq_usage(stdout, argv[0]);
- free(opts);
+ free_opts(opts);
return false;
}
if ((argc - (optind)) != 1) {
fprintf(stderr, "Too many arguments.\n");
bam2fq_usage(stderr, argv[0]);
- free(opts);
+ free_opts(opts);
return false;
}
opts->fn_input = argv[optind];
@@ -838,6 +1109,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
state->flag_on = opts->flag_on;
state->flag_off = opts->flag_off;
+ state->flag_alloff = opts->flag_alloff;
state->has12 = opts->has12;
state->use_oq = opts->use_oq;
state->copy_tags = opts->copy_tags;
@@ -850,6 +1122,8 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
free(state);
return false;
}
+ if (opts->ga.nthreads > 0)
+ hts_set_threads(state->fp, opts->ga.nthreads);
uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
if (opts->use_oq) rf |= SAM_AUX;
if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
@@ -884,6 +1158,17 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
state->fpr[i] = stdout;
}
}
+ for (i = 0; i < 2; i++) {
+ state->fpi[i] = NULL;
+ if (opts->index_file[i]) {
+ state->fpi[i] = fopen(opts->index_file[i], "w");
+ if (state->fpi[i] == NULL) {
+ print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", i+1, opts->index_file[i]);
+ free(state);
+ return false;
+ }
+ }
+ }
state->h = sam_hdr_read(state->fp);
if (state->h == NULL) {
@@ -906,6 +1191,12 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int*
for (i = 0; i < 3; ++i) {
if (state->fpr[i] != stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; }
}
+ for (i = 0; i < 2; i++) {
+ if (state->fpi[i] && fclose(state->fpi[i])) {
+ print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
+ valid = false;
+ }
+ }
free(state);
return valid;
}
@@ -914,11 +1205,12 @@ static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
{
return (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments
|| (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
- || (b->core.flag&(state->flag_off)) != 0);
+ || (b->core.flag&(state->flag_off)) != 0
+ || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff));
}
-static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state)
+static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t* opts)
{
bam1_t* b = bam_init1();
char *current_qname = NULL;
@@ -974,6 +1266,7 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state)
return false;
}
score[which_readpart(b)] = b_score;
+ if (state->fpi[0]) tags2fq(b, state, opts);
}
}
if (!valid)
@@ -991,7 +1284,7 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state)
return valid;
}
-static bool bam2fq_mainloop(bam2fq_state_t *state)
+static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
{
// process a name collated BAM into fastq
bam1_t* b = bam_init1();
@@ -1002,13 +1295,12 @@ static bool bam2fq_mainloop(bam2fq_state_t *state)
int64_t n_reads = 0; // Statistics
kstring_t linebuf = { 0, 0, NULL }; // Buffer
while (sam_read1(state->fp, state->h, b) >= 0) {
- if (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments
- || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
- || (b->core.flag&(state->flag_off)) != 0) continue;
+ if (filter_it_out(b, state)) continue;
++n_reads;
if (!bam1_to_fq(b, &linebuf, state)) return false;
fputs(linebuf.s, state->fpr[which_readpart(b)]);
+ if (state->fpi[0]) tags2fq(b, state, opts);
}
free(linebuf.s);
bam_destroy1(b);
@@ -1029,14 +1321,14 @@ int main_bam2fq(int argc, char *argv[])
if (!init_state(opts, &state)) return EXIT_FAILURE;
if (state->fpse) {
- if (!bam2fq_mainloop_singletontrack(state)) status = EXIT_FAILURE;
+ if (!bam2fq_mainloop_singletontrack(state,opts)) status = EXIT_FAILURE;
} else {
- if (!bam2fq_mainloop(state)) status = EXIT_FAILURE;
+ if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE;
}
if (!destroy_state(opts, state, &status)) return EXIT_FAILURE;
sam_global_args_free(&opts->ga);
- free(opts);
+ free_opts(opts);
return status;
}
diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c
index 8c883b0..6df47c9 100644
--- a/samtools/sam_view.c.pysam.c
+++ b/samtools/sam_view.c.pysam.c
@@ -2,7 +2,7 @@
/* sam_view.c -- SAM<->BAM<->CRAM conversion.
- Copyright (C) 2009-2015 Genome Research Ltd.
+ Copyright (C) 2009-2017 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */
#include <stdlib.h>
#include <string.h>
+#include <strings.h>
#include <stdio.h>
#include <unistd.h>
#include <math.h>
@@ -36,12 +37,18 @@ DEALINGS IN THE SOFTWARE. */
#include <stdbool.h>
#include <assert.h>
#include <getopt.h>
+#include <ctype.h>
#include "htslib/sam.h"
#include "htslib/faidx.h"
#include "htslib/kstring.h"
#include "htslib/khash.h"
+#include "htslib/thread_pool.h"
#include "samtools.h"
#include "sam_opts.h"
+
+#define DEFAULT_BARCODE_TAG "BC"
+#define DEFAULT_QUALITY_TAG "QT"
+
KHASH_SET_INIT_STR(rg)
typedef khash_t(rg) *rghash_t;
@@ -52,6 +59,7 @@ typedef struct samview_settings {
int min_mapQ;
int flag_on;
int flag_off;
+ int flag_alloff;
int min_qlen;
int remove_B;
uint32_t subsam_seed;
@@ -85,6 +93,8 @@ static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settin
}
if (b->core.qual < settings->min_mapQ || ((b->core.flag & settings->flag_on) != settings->flag_on) || (b->core.flag & settings->flag_off))
return 1;
+ if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff))
+ return 1;
if (settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b))))
return 1;
if (settings->subsam_frac > 0.) {
@@ -233,19 +243,22 @@ static void check_sam_close(const char *subcmd, samFile *fp, const char *fname,
int main_samview(int argc, char *argv[])
{
int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0;
- int is_long_help = 0, n_threads = 0;
+ int is_long_help = 0;
int64_t count = 0;
samFile *in = 0, *out = 0, *un_out=0;
+ FILE *fp_out = NULL;
bam_hdr_t *header = NULL;
char out_mode[5], out_un_mode[5], *out_format = "";
char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ htsThreadPool p = {NULL, 0};
samview_settings_t settings = {
.rghash = NULL,
.min_mapQ = 0,
.flag_on = 0,
.flag_off = 0,
+ .flag_alloff = 0,
.min_qlen = 0,
.remove_B = 0,
.subsam_seed = 0,
@@ -255,8 +268,7 @@ int main_samview(int argc, char *argv[])
};
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T'),
- { "threads", required_argument, NULL, '@' },
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'),
{ NULL, 0, NULL, 0 }
};
@@ -264,11 +276,13 @@ int main_samview(int argc, char *argv[])
strcpy(out_mode, "w");
strcpy(out_un_mode, "w");
while ((c = getopt_long(argc, argv,
- "SbBcCt:h1Ho:O:q:f:F:ul:r:?T:R:L:s:@:m:x:U:",
+ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:?T:R:L:s:@:m:x:U:",
lopts, NULL)) >= 0) {
switch (c) {
case 's':
if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) {
+ // Convert likely user input 0,1,2,... to pseudo-random
+ // values with more entropy and more bits set
srand(settings.subsam_seed);
settings.subsam_seed = rand();
}
@@ -286,6 +300,7 @@ int main_samview(int argc, char *argv[])
case 'U': fn_un_out = strdup(optarg); break;
case 'f': settings.flag_on |= strtol(optarg, 0, 0); break;
case 'F': settings.flag_off |= strtol(optarg, 0, 0); break;
+ case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break;
case 'q': settings.min_mapQ = atoi(optarg); break;
case 'u': compress_level = 0; break;
case '1': compress_level = 1; break;
@@ -315,7 +330,6 @@ int main_samview(int argc, char *argv[])
*/
case '?': is_long_help = 1; break;
case 'B': settings.remove_B = 1; break;
- case '@': n_threads = strtol(optarg, 0, 0); break;
case 'x':
{
if (strlen(optarg) != 2) {
@@ -427,8 +441,26 @@ int main_samview(int argc, char *argv[])
}
}
}
+ else {
+ if (fn_out) {
+ fp_out = fopen(fn_out, "w");
+ if (fp_out == NULL) {
+ print_error_errno("view", "can't create \"%s\"", fn_out);
+ ret = EXIT_FAILURE;
+ goto view_end;
+ }
+ }
+ }
- if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); }
+ if (ga.nthreads > 1) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(pysam_stderr, "Error creating thread pool\n");
+ ret = 1;
+ goto view_end;
+ }
+ hts_set_opt(in, HTS_OPT_THREAD_POOL, &p);
+ if (out) hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+ }
if (is_header_only) goto view_end; // no need to print alignments
if (optind + 1 >= argc) { // convert/print the entire file
@@ -489,13 +521,19 @@ int main_samview(int argc, char *argv[])
}
view_end:
- if (is_count && ret == 0)
- fprintf(pysam_stdout, "%" PRId64 "\n", count);
-
+ if (is_count && ret == 0) {
+ if (fprintf(fn_out? fp_out : pysam_stdout, "%" PRId64 "\n", count) < 0) {
+ if (fn_out) print_error_errno("view", "writing to \"%s\" failed", fn_out);
+ else print_error_errno("view", "writing to standard output failed");
+ ret = EXIT_FAILURE;
+ }
+ }
+
// close files, free and return
if (in) check_sam_close("view", in, fn_in, "standard input", &ret);
if (out) check_sam_close("view", out, fn_out, "standard output", &ret);
if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret);
+ if (fp_out) fclose(fp_out);
free(fn_list); free(fn_out); free(settings.library); free(fn_un_out);
sam_global_args_free(&ga);
@@ -510,6 +548,10 @@ view_end:
if (settings.remove_aux_len) {
free(settings.remove_aux);
}
+
+ if (p.pool)
+ hts_tpool_destroy(p.pool);
+
return ret;
}
@@ -540,20 +582,19 @@ static int usage(FILE *fp, int exit_status, int is_long_help)
" -l STR only include reads in library STR [null]\n"
" -m INT only include reads with number of CIGAR operations consuming\n"
" query sequence >= INT [0]\n"
-" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
-" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
+" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
+" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0
+" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
+" -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n"
+" fraction of templates/read pairs to keep; INT part sets seed)\n"
// read processing
" -x STR read tag to strip (repeatable) [null]\n"
" -B collapse the backward CIGAR operation\n"
-" -s FLOAT integer part sets seed of random number generator [0];\n"
-" rest sets fraction of templates to subsample [no subsampling]\n"
// general options
-" -@, --threads INT\n"
-" number of BAM/CRAM compression threads [0]\n"
" -? print long help, including note about region specification\n"
" -S ignored (input format is auto-detected)\n");
- sam_global_opt_help(fp, "-.O.T");
+ sam_global_opt_help(fp, "-.O.T@");
fprintf(fp, "\n");
if (is_long_help)
@@ -622,21 +663,37 @@ static void bam2fq_usage(FILE *to, const char *command)
"Usage: samtools %s [options...] <in.bam>\n", command);
fprintf(to,
"Options:\n"
-" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n"
-" -1 FILE write paired reads flagged READ1 to FILE\n"
-" -2 FILE write paired reads flagged READ2 to FILE\n"
-" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
-" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
-" -n don't append /1 and /2 to the read name\n");
+" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n"
+" -1 FILE write paired reads flagged READ1 to FILE\n"
+" -2 FILE write paired reads flagged READ2 to FILE\n"
+" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
+" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0
+" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
+" -n don't append /1 and /2 to the read name\n"
+" -N always append /1 and /2 to the read name\n");
if (fq) fprintf(to,
-" -O output quality in the OQ tag if present\n");
+" -O output quality in the OQ tag if present\n");
fprintf(to,
-" -s FILE write singleton reads to FILE [assume single-end]\n"
-" -t copy RG, BC and QT tags to the %s header line\n",
+" -s FILE write singleton reads to FILE [assume single-end]\n"
+" -t copy RG, BC and QT tags to the %s header line\n",
fq ? "FASTQ" : "FASTA");
if (fq) fprintf(to,
-" -v INT default quality score if not given in file [1]\n");
- sam_global_opt_help(to, "-.--.");
+" -v INT default quality score if not given in file [1]\n"
+" --i1 FILE write first index reads to FILE\n"
+" --i2 FILE write second index reads to FILE\n"
+" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n"
+" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n"
+" --index-format STR How to parse barcode and quality tags\n\n");
+ sam_global_opt_help(to, "-.--.@");
+ fprintf(to,
+" \n"
+" The index-format string describes how to parse the barcode and quality tags, for example:\n"
+" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n"
+" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n"
+" If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n"
+" 'read until the separator or end of tag', for example:\n"
+" n*i* ignore the left part of the tag until the separator, then use the second part\n"
+" of the tag as index 1\n");
}
typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart;
@@ -645,24 +702,97 @@ typedef struct bam2fq_opts {
char *fnse;
char *fnr[3];
char *fn_input; // pointer to input filename in argv do not free
- bool has12, use_oq, copy_tags;
- int flag_on, flag_off;
+ bool has12, has12always, use_oq, copy_tags;
+ int flag_on, flag_off, flag_alloff;
sam_global_args ga;
fastfile filetype;
int def_qual;
+ char *barcode_tag;
+ char *quality_tag;
+ char *index_file[2];
+ char *index_format;
} bam2fq_opts_t;
typedef struct bam2fq_state {
samFile *fp;
FILE *fpse;
FILE *fpr[3];
+ FILE *fpi[2];
bam_hdr_t *h;
bool has12, use_oq, copy_tags;
- int flag_on, flag_off;
+ int flag_on, flag_off, flag_alloff;
fastfile filetype;
int def_qual;
} bam2fq_state_t;
+/*
+ * Get and decode the read from a BAM record.
+ *
+ * TODO: htslib really needs an interface for this. Consider this or perhaps
+ * bam_get_seq_str (current vs original orientation) and bam_get_qual_str
+ * functions as string formatted equivalents to bam_get_{seq,qual}?
+ */
+
+/*
+ * Reverse a string in place.
+ * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux.
+ * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik
+ */
+static char *reverse(char *str)
+{
+ int i = strlen(str)-1,j=0;
+ char ch;
+ while (i>j) {
+ ch = str[i];
+ str[i]= str[j];
+ str[j] = ch;
+ i--;
+ j++;
+ }
+ return str;
+}
+
+/* return the read, reverse complemented if necessary */
+static char *get_read(const bam1_t *rec)
+{
+ int len = rec->core.l_qseq + 1;
+ char *read = calloc(1, len);
+ char *seq = (char *)bam_get_seq(rec);
+ int n;
+
+ if (!read) return NULL;
+
+ for (n=0; n < rec->core.l_qseq; n++) {
+ if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]];
+ else read[n] = seq_nt16_str[bam_seqi(seq,n)];
+ }
+ if (rec->core.flag & BAM_FREVERSE) reverse(read);
+ return read;
+}
+
+/*
+ * get and decode the quality from a BAM record
+ */
+static char *get_quality(const bam1_t *rec)
+{
+ char *quality = calloc(1, rec->core.l_qseq + 1);
+ char *q = (char *)bam_get_qual(rec);
+ int n;
+
+ if (*q == '\xff') { free(quality); return NULL; }
+
+ for (n=0; n < rec->core.l_qseq; n++) {
+ quality[n] = q[n]+33;
+ }
+ if (rec->core.flag & BAM_FREVERSE) reverse(quality);
+ return quality;
+}
+
+//
+// End of htslib complaints
+//
+
+
static readpart which_readpart(const bam1_t *b)
{
if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
@@ -674,85 +804,60 @@ static readpart which_readpart(const bam1_t *b)
}
}
-// Transform a bam1_t record into a string with the FASTQ representation of it
-// @returns false for error, true for success
-static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
+/*
+ * parse the length part from the index-format string
+ */
+static int getLength(char **s)
{
- int i;
- int32_t qlen = b->core.l_qseq;
- assert(qlen >= 0);
- uint8_t *seq;
- uint8_t *qual = bam_get_qual(b);
- const uint8_t *oq = NULL;
- if (state->use_oq) {
- oq = bam_aux_get(b, "OQ");
- if (oq) oq++; // skip tag type
+ int n = 0;
+ while (**s) {
+ if (**s == '*') { n=-1; (*s)++; break; }
+ if ( !isdigit(**s)) break;
+ n = n*10 + ((**s)-'0');
+ (*s)++;
}
- bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality
+ return n;
+}
+
+static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state)
+{
+ int i;
linebuf->l = 0;
// Write read name
- readpart readpart = which_readpart(b);
kputc(state->filetype == FASTA? '>' : '@', linebuf);
- kputs(bam_get_qname(b), linebuf);
+ kputs(bam_get_qname(rec), linebuf);
// Add the /1 /2 if requested
if (state->has12) {
+ readpart readpart = which_readpart(rec);
if (readpart == READ_1) kputs("/1", linebuf);
else if (readpart == READ_2) kputs("/2", linebuf);
}
if (state->copy_tags) {
for (i = 0; copied_tags[i]; ++i) {
uint8_t *s;
- if ((s = bam_aux_get(b, copied_tags[i])) != 0) {
- kputc('\t', linebuf);
- kputsn(copied_tags[i], 2, linebuf);
- kputsn(":Z:", 3, linebuf);
- kputs(bam_aux2Z(s), linebuf);
+ if ((s = bam_aux_get(rec, copied_tags[i])) != 0) {
+ if (*s == 'Z') {
+ kputc('\t', linebuf);
+ kputsn(copied_tags[i], 2, linebuf);
+ kputsn(":Z:", 3, linebuf);
+ kputs(bam_aux2Z(s), linebuf);
+ }
}
}
}
kputc('\n', linebuf);
-
- seq = bam_get_seq(b);
-
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- for (i = qlen-1; i > -1; --i) {
- char c = seq_nt16_str[seq_comp_table[bam_seqi(seq,i)]];
- kputc(c, linebuf);
- }
- } else {
- for (i = 0; i < qlen; ++i) {
- char c = seq_nt16_str[bam_seqi(seq,i)];
- kputc(c, linebuf);
- }
- }
+ kputs(seq, linebuf);
kputc('\n', linebuf);
if (state->filetype == FASTQ) {
// Write quality
kputs("+\n", linebuf);
- if (has_qual) {
- if (state->use_oq && oq) {
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- for (i = qlen-1; i > -1; --i) {
- kputc(oq[i], linebuf);
- }
- } else {
- kputs((char*)oq, linebuf);
- }
- } else {
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- for (i = qlen-1; i > -1; --i) {
- kputc(33 + qual[i], linebuf);
- }
- } else {
- for (i = 0; i < qlen; ++i) {
- kputc(33 + qual[i], linebuf);
- }
- }
- }
+ if (qual && *qual) {
+ kputs(qual, linebuf);
} else {
- for (i = 0; i < qlen; ++i) {
+ int len = strlen(seq);
+ for (i = 0; i < len; ++i) {
kputc(33 + state->def_qual, linebuf);
}
}
@@ -761,49 +866,214 @@ static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t
return true;
}
+/*
+ * Create FASTQ lines from the barcode tag using the index-format
+ */
+static bool tags2fq(bam1_t *rec, const bam2fq_state_t *state, const bam2fq_opts_t* opts)
+{
+ uint8_t *p;
+ char *ifmt = opts->index_format;
+ char *tag = NULL;
+ char *qual = NULL;
+ int file_number = 0;
+ kstring_t linebuf = { 0, 0, NULL }; // Buffer
+
+ // read barcode tag
+ p = bam_aux_get(rec,opts->barcode_tag);
+ if (p) tag = bam_aux2Z(p);
+
+ if (!tag) return true; // there is no tag
+
+ // read quality tag
+ p = bam_aux_get(rec, opts->quality_tag);
+ if (p) qual = bam_aux2Z(p);
+
+ // Parse the index-format string
+ while (*ifmt) {
+ if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly
+ char action = *ifmt; // should be 'i' or 'n'
+ ifmt++; // skip over action
+ int index_len = getLength(&ifmt);
+
+ char *sub_tag = calloc(1, strlen(tag)+1);
+ char *sub_qual = calloc(1, strlen(tag)+1);
+ int n = 0;
+
+ if (index_len < 0) {
+ // read until separator
+ while (isalpha(*tag)) {
+ sub_tag[n] = *tag++;
+ if (qual) sub_qual[n] = *qual++;
+ n++;
+ }
+ if (*tag) { // skip separator
+ tag++;
+ if (qual) qual++;
+ }
+ } else {
+ // read index_len characters
+ while (index_len-- && *tag) {
+ sub_tag[n] = *tag++;
+ if (qual) sub_qual[n] = *qual++;
+ n++;
+ }
+ }
+
+ if (action=='i' && *sub_tag && state->fpi[file_number]) {
+ make_fq_line(rec, sub_tag, sub_qual, &linebuf, state);
+ fputs(linebuf.s, state->fpi[file_number++]);
+ }
+ free(sub_qual); free(sub_tag);
+
+ }
+
+ free(linebuf.s);
+ return true;
+}
+
+// Transform a bam1_t record into a string with the FASTQ representation of it
+// @returns false for error, true for success
+static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
+{
+ int32_t qlen = b->core.l_qseq;
+ assert(qlen >= 0);
+ const uint8_t *oq = NULL;
+ char *qual = NULL;
+
+ char *seq = get_read(b);
+
+ if (state->use_oq) {
+ oq = bam_aux_get(b, "OQ");
+ if (oq) {
+ oq++;
+ qual = strdup(bam_aux2Z(oq));
+ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
+ reverse(qual);
+ }
+ }
+ } else {
+ qual = get_quality(b);
+ }
+
+ make_fq_line(b, seq, qual, linebuf, state);
+
+ free(qual);
+ free(seq);
+ return true;
+}
+
+static void free_opts(bam2fq_opts_t *opts)
+{
+ free(opts->barcode_tag);
+ free(opts->quality_tag);
+ free(opts->index_format);
+ free(opts);
+}
+
// return true if valid
static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
{
// Parse args
bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t));
opts->has12 = true;
+ opts->has12always = false;
opts->filetype = FASTQ;
opts->def_qual = 1;
+ opts->barcode_tag = NULL;
+ opts->quality_tag = NULL;
+ opts->index_format = NULL;
+ opts->index_file[0] = NULL;
+ opts->index_file[1] = NULL;
int c;
sam_global_args_init(&opts->ga);
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
+ {"i1", required_argument, NULL, 1},
+ {"I1", required_argument, NULL, 1},
+ {"i2", required_argument, NULL, 2},
+ {"I2", required_argument, NULL, 2},
+ {"if", required_argument, NULL, 3},
+ {"IF", required_argument, NULL, 3},
+ {"index-format", required_argument, NULL, 3},
+ {"barcode-tag", required_argument, NULL, 'b'},
+ {"quality-tag", required_argument, NULL, 'q'},
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "0:1:2:f:F:nOs:tv:", lopts, NULL)) > 0) {
+ while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:nNOs:tv:@:", lopts, NULL)) > 0) {
switch (c) {
+ case 'b': opts->barcode_tag = strdup(optarg); break;
+ case 'q': opts->quality_tag = strdup(optarg); break;
+ case 1 : opts->index_file[0] = optarg; break;
+ case 2 : opts->index_file[1] = optarg; break;
+ case 3 : opts->index_format = strdup(optarg); break;
case '0': opts->fnr[0] = optarg; break;
case '1': opts->fnr[1] = optarg; break;
case '2': opts->fnr[2] = optarg; break;
case 'f': opts->flag_on |= strtol(optarg, 0, 0); break;
case 'F': opts->flag_off |= strtol(optarg, 0, 0); break;
+ case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
case 'n': opts->has12 = false; break;
+ case 'N': opts->has12always = true; break;
case 'O': opts->use_oq = true; break;
case 's': opts->fnse = optarg; break;
case 't': opts->copy_tags = true; break;
case 'v': opts->def_qual = atoi(optarg); break;
- case '?': bam2fq_usage(pysam_stderr, argv[0]); free(opts); return false;
+ case '?': bam2fq_usage(pysam_stderr, argv[0]); free_opts(opts); return false;
default:
if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) {
- bam2fq_usage(pysam_stderr, argv[0]); free(opts); return false;
+ bam2fq_usage(pysam_stderr, argv[0]); free_opts(opts); return false;
}
break;
}
}
if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false;
+ if (opts->has12always) opts->has12 = true;
+
+ if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG);
+ if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG);
+
+ int nIndex = 0;
+ if (opts->index_format) {
+ char *s;
+ for (s = opts->index_format; *s; s++) {
+ if (*s == 'i') nIndex++;
+ }
+ }
+ if (nIndex>2) {
+ fprintf(pysam_stderr,"Invalid index format: more than 2 indexes\n");
+ bam2fq_usage(pysam_stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
+ if (opts->index_file[1] && !opts->index_file[0]) {
+ fprintf(pysam_stderr, "Index one specified, but index two not given\n");
+ bam2fq_usage(pysam_stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
+ if (nIndex==2 && !opts->index_file[1]) {
+ fprintf(pysam_stderr, "index_format specifies two indexes, but only one index file given\n");
+ bam2fq_usage(pysam_stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
+ if (nIndex==1 && !opts->index_file[0]) {
+ fprintf(pysam_stderr, "index_format specifies an index, but no index file given\n");
+ bam2fq_usage(pysam_stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
if (opts->def_qual < 0 || 93 < opts->def_qual) {
fprintf(pysam_stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual);
bam2fq_usage(pysam_stderr, argv[0]);
- free(opts);
- return true;
+ free_opts(opts);
+ return false;
}
const char* type_str = argv[0];
@@ -814,20 +1084,21 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
} else {
print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str);
bam2fq_usage(pysam_stderr, argv[0]);
- free(opts);
+ free_opts(opts);
return false;
}
if ((argc - (optind)) == 0) {
+ fprintf(pysam_stderr, "No input file specified.\n");
bam2fq_usage(pysam_stdout, argv[0]);
- free(opts);
+ free_opts(opts);
return false;
}
if ((argc - (optind)) != 1) {
fprintf(pysam_stderr, "Too many arguments.\n");
bam2fq_usage(pysam_stderr, argv[0]);
- free(opts);
+ free_opts(opts);
return false;
}
opts->fn_input = argv[optind];
@@ -840,6 +1111,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
state->flag_on = opts->flag_on;
state->flag_off = opts->flag_off;
+ state->flag_alloff = opts->flag_alloff;
state->has12 = opts->has12;
state->use_oq = opts->use_oq;
state->copy_tags = opts->copy_tags;
@@ -852,6 +1124,8 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
free(state);
return false;
}
+ if (opts->ga.nthreads > 0)
+ hts_set_threads(state->fp, opts->ga.nthreads);
uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
if (opts->use_oq) rf |= SAM_AUX;
if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
@@ -886,6 +1160,17 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
state->fpr[i] = pysam_stdout;
}
}
+ for (i = 0; i < 2; i++) {
+ state->fpi[i] = NULL;
+ if (opts->index_file[i]) {
+ state->fpi[i] = fopen(opts->index_file[i], "w");
+ if (state->fpi[i] == NULL) {
+ print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", i+1, opts->index_file[i]);
+ free(state);
+ return false;
+ }
+ }
+ }
state->h = sam_hdr_read(state->fp);
if (state->h == NULL) {
@@ -908,6 +1193,12 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int*
for (i = 0; i < 3; ++i) {
if (state->fpr[i] != pysam_stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; }
}
+ for (i = 0; i < 2; i++) {
+ if (state->fpi[i] && fclose(state->fpi[i])) {
+ print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
+ valid = false;
+ }
+ }
free(state);
return valid;
}
@@ -916,11 +1207,12 @@ static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
{
return (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments
|| (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
- || (b->core.flag&(state->flag_off)) != 0);
+ || (b->core.flag&(state->flag_off)) != 0
+ || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff));
}
-static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state)
+static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t* opts)
{
bam1_t* b = bam_init1();
char *current_qname = NULL;
@@ -976,6 +1268,7 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state)
return false;
}
score[which_readpart(b)] = b_score;
+ if (state->fpi[0]) tags2fq(b, state, opts);
}
}
if (!valid)
@@ -993,7 +1286,7 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state)
return valid;
}
-static bool bam2fq_mainloop(bam2fq_state_t *state)
+static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
{
// process a name collated BAM into fastq
bam1_t* b = bam_init1();
@@ -1004,13 +1297,12 @@ static bool bam2fq_mainloop(bam2fq_state_t *state)
int64_t n_reads = 0; // Statistics
kstring_t linebuf = { 0, 0, NULL }; // Buffer
while (sam_read1(state->fp, state->h, b) >= 0) {
- if (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments
- || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
- || (b->core.flag&(state->flag_off)) != 0) continue;
+ if (filter_it_out(b, state)) continue;
++n_reads;
if (!bam1_to_fq(b, &linebuf, state)) return false;
fputs(linebuf.s, state->fpr[which_readpart(b)]);
+ if (state->fpi[0]) tags2fq(b, state, opts);
}
free(linebuf.s);
bam_destroy1(b);
@@ -1031,14 +1323,14 @@ int main_bam2fq(int argc, char *argv[])
if (!init_state(opts, &state)) return EXIT_FAILURE;
if (state->fpse) {
- if (!bam2fq_mainloop_singletontrack(state)) status = EXIT_FAILURE;
+ if (!bam2fq_mainloop_singletontrack(state,opts)) status = EXIT_FAILURE;
} else {
- if (!bam2fq_mainloop(state)) status = EXIT_FAILURE;
+ if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE;
}
if (!destroy_state(opts, state, &status)) return EXIT_FAILURE;
sam_global_args_free(&opts->ga);
- free(opts);
+ free_opts(opts);
return status;
}
diff --git a/samtools/stats.c b/samtools/stats.c
index eb6bb52..35574ed 100644
--- a/samtools/stats.c
+++ b/samtools/stats.c
@@ -828,8 +828,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
// reads. Mates mapped to different chromosomes have isize==0.
int32_t isize = bam_line->core.isize;
if ( isize<0 ) isize = -isize;
- if ( stats->info->nisize > 0 && isize >= stats->info->nisize )
- isize = stats->info->nisize-1;
+ if ( stats->info->nisize > 0 && isize > stats->info->nisize )
+ isize = stats->info->nisize;
if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
{
int pos_fst = bam_line->core.mpos - bam_line->core.pos;
@@ -1263,7 +1263,7 @@ void init_regions(stats_t *stats, const char *file)
stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
}
- if ( (sscanf(&line.s[i+1],"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]);
+ if ( (sscanf(&line.s[i+1],"%u %u",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]);
if ( prev_tid==-1 || prev_tid!=tid )
{
prev_tid = tid;
@@ -1375,7 +1375,7 @@ static void error(const char *format, ...)
printf(" -S, --split <tag> Also write statistics to separate files split by tagged field.\n");
printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
printf(" -x, --sparse Suppress outputting IS rows where there are no insertions.\n");
- sam_global_opt_help(stdout, "-.--.");
+ sam_global_opt_help(stdout, "-.--.@");
printf("\n");
}
else
@@ -1481,13 +1481,13 @@ int init_stat_info_fname(stats_info_t* info, const char* bam_fname, const htsFor
// .. bam
samFile* sam;
if ((sam = sam_open_format(bam_fname, "r", in_fmt)) == 0) {
- error("Failed to open: %s\n", bam_fname);
+ print_error_errno("stats", "failed to open \"%s\"", bam_fname);
return 1;
}
info->sam = sam;
info->sam_header = sam_hdr_read(sam);
if (info->sam_header == NULL) {
- error("Failed to read header for '%s'\n", bam_fname);
+ print_error("stats", "failed to read header for \"%s\"", bam_fname);
return 1;
}
return 0;
@@ -1537,7 +1537,7 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr
stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
- stats->isize = init_isize_t(info->nisize);
+ stats->isize = init_isize_t(info->nisize ?info->nisize+1 :0);
stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
stats->mpc_buf = info->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
stats->acgtno_cycles = calloc(stats->nbases,sizeof(acgtno_count_t));
@@ -1596,7 +1596,7 @@ int main_stats(int argc, char *argv[])
static const struct option loptions[] =
{
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
{"help", no_argument, NULL, 'h'},
{"remove-dups", no_argument, NULL, 'd'},
{"sam", no_argument, NULL, 's'},
@@ -1618,7 +1618,7 @@ int main_stats(int argc, char *argv[])
};
int opt;
- while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:",loptions,NULL))>0 )
+ while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:@:",loptions,NULL))>0 )
{
switch (opt)
{
@@ -1662,6 +1662,8 @@ int main_stats(int argc, char *argv[])
}
if (init_stat_info_fname(info, bam_fname, &ga.in)) return 1;
+ if (ga.nthreads > 0)
+ hts_set_threads(info->sam, ga.nthreads);
stats_t *all_stats = stats_init();
stats_t *curr_stats = NULL;
diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c
index da187ac..8ebb52a 100644
--- a/samtools/stats.c.pysam.c
+++ b/samtools/stats.c.pysam.c
@@ -220,7 +220,7 @@ typedef struct
stats_t;
KHASH_MAP_INIT_STR(c2stats, stats_t*)
-static void error(const char *format, ...);
+static int error(const char *format, ...);
int is_in_regions(bam1_t *bam_line, stats_t *stats);
void realloc_buffers(stats_t *stats, int seq_len);
@@ -830,8 +830,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
// reads. Mates mapped to different chromosomes have isize==0.
int32_t isize = bam_line->core.isize;
if ( isize<0 ) isize = -isize;
- if ( stats->info->nisize > 0 && isize >= stats->info->nisize )
- isize = stats->info->nisize-1;
+ if ( stats->info->nisize > 0 && isize > stats->info->nisize )
+ isize = stats->info->nisize;
if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
{
int pos_fst = bam_line->core.mpos - bam_line->core.pos;
@@ -1265,7 +1265,7 @@ void init_regions(stats_t *stats, const char *file)
stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
}
- if ( (sscanf(&line.s[i+1],"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]);
+ if ( (sscanf(&line.s[i+1],"%u %u",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]);
if ( prev_tid==-1 || prev_tid!=tid )
{
prev_tid = tid;
@@ -1352,7 +1352,7 @@ void init_group_id(stats_t *stats, const char *id)
}
-static void error(const char *format, ...)
+static int error(const char *format, ...)
{
if ( !format )
{
@@ -1377,8 +1377,9 @@ static void error(const char *format, ...)
fprintf(pysam_stdout, " -S, --split <tag> Also write statistics to separate files split by tagged field.\n");
fprintf(pysam_stdout, " -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
fprintf(pysam_stdout, " -x, --sparse Suppress outputting IS rows where there are no insertions.\n");
- sam_global_opt_help(pysam_stdout, "-.--.");
+ sam_global_opt_help(pysam_stdout, "-.--.@");
fprintf(pysam_stdout, "\n");
+ return(0);
}
else
{
@@ -1483,13 +1484,13 @@ int init_stat_info_fname(stats_info_t* info, const char* bam_fname, const htsFor
// .. bam
samFile* sam;
if ((sam = sam_open_format(bam_fname, "r", in_fmt)) == 0) {
- error("Failed to open: %s\n", bam_fname);
+ print_error_errno("stats", "failed to open \"%s\"", bam_fname);
return 1;
}
info->sam = sam;
info->sam_header = sam_hdr_read(sam);
if (info->sam_header == NULL) {
- error("Failed to read header for '%s'\n", bam_fname);
+ print_error("stats", "failed to read header for \"%s\"", bam_fname);
return 1;
}
return 0;
@@ -1539,7 +1540,7 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr
stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
- stats->isize = init_isize_t(info->nisize);
+ stats->isize = init_isize_t(info->nisize ?info->nisize+1 :0);
stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
stats->mpc_buf = info->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
stats->acgtno_cycles = calloc(stats->nbases,sizeof(acgtno_count_t));
@@ -1598,7 +1599,7 @@ int main_stats(int argc, char *argv[])
static const struct option loptions[] =
{
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
{"help", no_argument, NULL, 'h'},
{"remove-dups", no_argument, NULL, 'd'},
{"sam", no_argument, NULL, 's'},
@@ -1620,7 +1621,7 @@ int main_stats(int argc, char *argv[])
};
int opt;
- while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:",loptions,NULL))>0 )
+ while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:@:",loptions,NULL))>0 )
{
switch (opt)
{
@@ -1646,7 +1647,7 @@ int main_stats(int argc, char *argv[])
case 'S': info->split_tag = optarg; break;
case 'P': info->split_prefix = optarg; break;
case '?':
- case 'h': error(NULL);
+ case 'h': return(error(NULL));
default:
if (parse_sam_global_opt(opt, optarg, loptions, &ga) != 0)
error("Unknown argument: %s\n", optarg);
@@ -1659,11 +1660,13 @@ int main_stats(int argc, char *argv[])
if ( !bam_fname )
{
if ( isatty(STDIN_FILENO) )
- error(NULL);
+ return(error(NULL));
bam_fname = "-";
}
if (init_stat_info_fname(info, bam_fname, &ga.in)) return 1;
+ if (ga.nthreads > 0)
+ hts_set_threads(info->sam, ga.nthreads);
stats_t *all_stats = stats_init();
stats_t *curr_stats = NULL;
diff --git a/samtools/test/split/test_filter_header_rg.c b/samtools/test/split/test_filter_header_rg.c
index d9505d6..cccf0e9 100644
--- a/samtools/test/split/test_filter_header_rg.c
+++ b/samtools/test/split/test_filter_header_rg.c
@@ -42,7 +42,8 @@ void setup_test_1(bam_hdr_t** hdr_in)
bool check_test_1(const bam_hdr_t* hdr) {
const char *test1_res =
"@HD\tVN:1.4\n"
- "@SQ\tSN:blah\n";
+ "@SQ\tSN:blah\n"
+ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n";
if (strcmp(hdr->text, test1_res)) {
return false;
@@ -65,7 +66,8 @@ bool check_test_2(const bam_hdr_t* hdr) {
const char *test2_res =
"@HD\tVN:1.4\n"
"@SQ\tSN:blah\n"
- "@RG\tID:fish\n";
+ "@RG\tID:fish\n"
+ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n";
if (strcmp(hdr->text, test2_res)) {
return false;
@@ -73,7 +75,7 @@ bool check_test_2(const bam_hdr_t* hdr) {
return true;
}
-int main(int argc, char**argv)
+int main(int argc, char *argv[])
{
// test state
const int NUM_TESTS = 2;
@@ -82,6 +84,8 @@ int main(int argc, char**argv)
int failure = 0;
int getopt_char;
+ char *test_argv[] = { "test_filter_header_rg", "foo\tbar", "baz" };
+ char *arg_list = stringify_argv(3, test_argv);
while ((getopt_char = getopt(argc, argv, "v")) != -1) {
switch (getopt_char) {
case 'v':
@@ -116,7 +120,7 @@ int main(int argc, char**argv)
// test
xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe
- bool result_1 = filter_header_rg(hdr1, id_to_keep_1);
+ bool result_1 = filter_header_rg(hdr1, id_to_keep_1, arg_list);
fclose(stderr);
if (verbose) printf("END RUN test 1\n");
@@ -155,7 +159,7 @@ int main(int argc, char**argv)
// test
xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe
- bool result_2 = filter_header_rg(hdr2, id_to_keep_2);
+ bool result_2 = filter_header_rg(hdr2, id_to_keep_2, arg_list);
fclose(stderr);
if (verbose) printf("END RUN test 2\n");
@@ -185,6 +189,7 @@ int main(int argc, char**argv)
// Cleanup
free(res.s);
+ free(arg_list);
remove(tempfname);
if (failure > 0)
fprintf(orig_stderr, "%d failures %d successes\n", failure, success);
diff --git a/samtools/test/split/test_filter_header_rg.c.pysam.c b/samtools/test/split/test_filter_header_rg.c.pysam.c
index 97b3573..c9284f6 100644
--- a/samtools/test/split/test_filter_header_rg.c.pysam.c
+++ b/samtools/test/split/test_filter_header_rg.c.pysam.c
@@ -44,7 +44,8 @@ void setup_test_1(bam_hdr_t** hdr_in)
bool check_test_1(const bam_hdr_t* hdr) {
const char *test1_res =
"@HD\tVN:1.4\n"
- "@SQ\tSN:blah\n";
+ "@SQ\tSN:blah\n"
+ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n";
if (strcmp(hdr->text, test1_res)) {
return false;
@@ -67,7 +68,8 @@ bool check_test_2(const bam_hdr_t* hdr) {
const char *test2_res =
"@HD\tVN:1.4\n"
"@SQ\tSN:blah\n"
- "@RG\tID:fish\n";
+ "@RG\tID:fish\n"
+ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n";
if (strcmp(hdr->text, test2_res)) {
return false;
@@ -75,7 +77,7 @@ bool check_test_2(const bam_hdr_t* hdr) {
return true;
}
-int samtools_test_filter_header_rg_main(int argc, char**argv)
+int samtools_test_filter_header_rg_main(int argc, char *argv[])
{
// test state
const int NUM_TESTS = 2;
@@ -84,6 +86,8 @@ int samtools_test_filter_header_rg_main(int argc, char**argv)
int failure = 0;
int getopt_char;
+ char *test_argv[] = { "test_filter_header_rg", "foo\tbar", "baz" };
+ char *arg_list = stringify_argv(3, test_argv);
while ((getopt_char = getopt(argc, argv, "v")) != -1) {
switch (getopt_char) {
case 'v':
@@ -118,7 +122,7 @@ int samtools_test_filter_header_rg_main(int argc, char**argv)
// test
xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
- bool result_1 = filter_header_rg(hdr1, id_to_keep_1);
+ bool result_1 = filter_header_rg(hdr1, id_to_keep_1, arg_list);
fclose(pysam_stderr);
if (verbose) fprintf(pysam_stdout, "END RUN test 1\n");
@@ -157,7 +161,7 @@ int samtools_test_filter_header_rg_main(int argc, char**argv)
// test
xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
- bool result_2 = filter_header_rg(hdr2, id_to_keep_2);
+ bool result_2 = filter_header_rg(hdr2, id_to_keep_2, arg_list);
fclose(pysam_stderr);
if (verbose) fprintf(pysam_stdout, "END RUN test 2\n");
@@ -187,6 +191,7 @@ int samtools_test_filter_header_rg_main(int argc, char**argv)
// Cleanup
free(res.s);
+ free(arg_list);
remove(tempfname);
if (failure > 0)
fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success);
diff --git a/samtools/test/test.c b/samtools/test/test.c
index 7ab38af..fb0b549 100644
--- a/samtools/test/test.c
+++ b/samtools/test/test.c
@@ -1,6 +1,6 @@
/* test/test.c -- test harness utility routines.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014, 2016 Genome Research Ltd.
Author: Martin O. Pollard <mp15 at sanger.ac.uk>
@@ -53,3 +53,9 @@ void dump_hdr(const bam_hdr_t* hdr)
}
printf("text: \"%s\"\n", hdr->text);
}
+
+// For tests, just return a constant that can be embedded in expected output.
+const char *samtools_version(void)
+{
+ return "x.y.test";
+}
diff --git a/samtools/test/test.c.pysam.c b/samtools/test/test.c.pysam.c
index a8295b5..bf460e8 100644
--- a/samtools/test/test.c.pysam.c
+++ b/samtools/test/test.c.pysam.c
@@ -2,7 +2,7 @@
/* test/test.c -- test harness utility routines.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014, 2016 Genome Research Ltd.
Author: Martin O. Pollard <mp15 at sanger.ac.uk>
@@ -55,3 +55,9 @@ void dump_hdr(const bam_hdr_t* hdr)
}
fprintf(pysam_stdout, "text: \"%s\"\n", hdr->text);
}
+
+// For tests, just return a constant that can be embedded in expected output.
+const char *samtools_version(void)
+{
+ return "x.y.test";
+}
diff --git a/samtools/version.h b/samtools/version.h
index ec46e67..004d7ed 100644
--- a/samtools/version.h
+++ b/samtools/version.h
@@ -1 +1 @@
-#define SAMTOOLS_VERSION "1.3.1"
+#define SAMTOOLS_VERSION "1.4.1"
diff --git a/setup.py b/setup.py
index 6d52617..5b23d20 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ This module provides a low-level wrapper around the htslib C-API as
using cython and a high-level API for convenient access to the data
within standard genomic file formats.
-The current version wraps htslib-1.3.1, samtools-1.3.1 and bcftools-1.3.1.
+The current version wraps htslib-1.4.1, samtools-1.4.1 and bcftools-1.4.1.
See:
http://www.htslib.org
@@ -78,6 +78,11 @@ def configure_library(library_dir, env_options=None, options=[]):
configure_script = os.path.join(library_dir, "configure")
+ on_rtd = os.environ.get("READTHEDOCS") == "True"
+ # RTD has no bzip2 development libraries installed:
+ if on_rtd:
+ env_options = "--disable-bz2"
+
if not os.path.exists(configure_script):
raise ValueError(
"configure script {} does not exist".format(configure_script))
@@ -246,8 +251,8 @@ elif HTSLIB_MODE == 'shared':
# htslib built from sources included in the pysam
# package.
htslib_library_dirs = [
- 'pysam',
- ".",
+ "pysam", # when using setup.py develop?
+ ".", # when using setup.py develop?
os.path.join("build", distutils_dir_name("lib"), "pysam")]
htslib_include_dirs = ['htslib']
@@ -255,7 +260,15 @@ elif HTSLIB_MODE == 'shared':
else:
raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE)
-internal_htslib_libraries = [os.path.splitext("chtslib{}".format(sysconfig.get_config_var('SO')))[0]]
+suffix = sysconfig.get_config_var('EXT_SUFFIX')
+if not suffix:
+ suffix = sysconfig.get_config_var('SO')
+internal_htslib_libraries = [os.path.splitext("chtslib{}".format(suffix))[0]]
+
+internal_tools_libraries = [
+ os.path.splitext("csamtools{}".format(suffix))[0],
+ os.path.splitext("cbcftools{}".format(suffix))[0],
+ ]
# build config.py
with open(os.path.join("pysam", "config.py"), "w") as outf:
@@ -268,7 +281,7 @@ with open(os.path.join("pysam", "config.py"), "w") as outf:
if line.startswith("#define"):
key, value = re.match(
"#define (\S+)\s+(\S+)", line).groups()
- config_values[key] = int(value)
+ config_values[key] = value
for key in ["ENABLE_PLUGINS",
"HAVE_COMMONCRYPTO",
"HAVE_GMTIME_R",
@@ -353,7 +366,6 @@ chtslib = Extension(
shared_htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
- runtime_library_dirs=htslib_library_dirs,
include_dirs=["pysam", "."] + include_os + htslib_include_dirs,
libraries=external_htslib_libraries,
language="c",
@@ -369,8 +381,7 @@ csamfile = Extension(
"pysam.libcsamfile",
[source_pattern % "samfile",
"pysam/htslib_util.c",
- "pysam/samfile_util.c",
- "samtools/kprobaln.c"] +
+ "pysam/samfile_util.c"] +
htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
@@ -389,8 +400,7 @@ calignmentfile = Extension(
"pysam.libcalignmentfile",
[source_pattern % "alignmentfile",
"pysam/htslib_util.c",
- "pysam/samfile_util.c",
- "samtools/kprobaln.c"] +
+ "pysam/samfile_util.c"] +
htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
@@ -409,8 +419,7 @@ calignedsegment = Extension(
"pysam.libcalignedsegment",
[source_pattern % "alignedsegment",
"pysam/htslib_util.c",
- "pysam/samfile_util.c",
- "samtools/kprobaln.c"] +
+ "pysam/samfile_util.c"] +
htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
@@ -435,17 +444,45 @@ ctabix = Extension(
define_macros=define_macros
)
+
+
cutils = Extension(
"pysam.libcutils",
[source_pattern % "utils", "pysam/pysam_util.c"] +
+ htslib_sources +
+ os_c_files,
+ library_dirs=["pysam"] + htslib_library_dirs,
+ include_dirs=["pysam", "."] +
+ include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries + internal_tools_libraries,
+ language="c",
+ extra_compile_args=extra_compile_args,
+ define_macros=define_macros
+)
+
+csamtools = Extension(
+ "pysam.libcsamtools",
+ [source_pattern % "samtools"] +
glob.glob(os.path.join("samtools", "*.pysam.c")) +
- # glob.glob(os.path.join("samtools", "*", "*.pysam.c")) +
+ htslib_sources +
+ os_c_files,
+ library_dirs=["pysam"] + htslib_library_dirs,
+ include_dirs=["samtools", "pysam", "."] +
+ include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries,
+ language="c",
+ extra_compile_args=extra_compile_args,
+ define_macros=define_macros
+)
+
+cbcftools = Extension(
+ "pysam.libcbcftools",
+ [source_pattern % "bcftools"] +
glob.glob(os.path.join("bcftools", "*.pysam.c")) +
- # glob.glob(os.path.join("bcftools", "*", "*.pysam.c")) +
htslib_sources +
os_c_files,
library_dirs=["pysam"] + htslib_library_dirs,
- include_dirs=["samtools", "bcftools", "pysam", "."] +
+ include_dirs=["bcftools", "pysam", "."] +
include_os + htslib_include_dirs,
libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
@@ -538,6 +575,8 @@ metadata = {
cbcf,
cbgzf,
cfaidx,
+ csamtools,
+ cbcftools,
cutils],
'cmdclass': cmdclass,
'package_dir': package_dirs,
diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py
index b0a3466..6d9101c 100644
--- a/tests/AlignedSegment_test.py
+++ b/tests/AlignedSegment_test.py
@@ -234,20 +234,46 @@ class TestAlignedSegment(ReadTest):
def test_infer_query_length(self):
'''Test infer_query_length on M|=|X|I|D|H|S cigar ops'''
a = self.buildRead()
- a.cigarstring = '15M'
- self.assertEqual(a.infer_query_length(), 15)
- a.cigarstring = '15='
- self.assertEqual(a.infer_query_length(), 15)
- a.cigarstring = '15X'
- self.assertEqual(a.infer_query_length(), 15)
- a.cigarstring = '5M5I5M'
- self.assertEqual(a.infer_query_length(), 15)
- a.cigarstring = '5M5D5M'
- self.assertEqual(a.infer_query_length(), 10)
- a.cigarstring = '5H10M'
- self.assertEqual(a.infer_query_length(), 15)
- a.cigarstring = '5S10M'
- self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '40M'
+ self.assertEqual(a.infer_query_length(), 40)
+ a.cigarstring = '40='
+ self.assertEqual(a.infer_query_length(), 40)
+ a.cigarstring = '40X'
+ self.assertEqual(a.infer_query_length(), 40)
+ a.cigarstring = '20M5I20M'
+ self.assertEqual(a.infer_query_length(), 45)
+ a.cigarstring = '20M5D20M'
+ self.assertEqual(a.infer_query_length(), 40)
+ a.cigarstring = '5H35M'
+ self.assertEqual(a.infer_query_length(), 35)
+ a.cigarstring = '5S35M'
+ self.assertEqual(a.infer_query_length(), 40)
+ a.cigarstring = '35M5H'
+ self.assertEqual(a.infer_query_length(), 35)
+ a.cigarstring = '35M5S'
+ self.assertEqual(a.infer_query_length(), 40)
+
+ def test_infer_read_length(self):
+ '''Test infer_read_length on M|=|X|I|D|H|S cigar ops'''
+ a = self.buildRead()
+ a.cigarstring = '40M'
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '40='
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '40X'
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '20M5I20M'
+ self.assertEqual(a.infer_read_length(), 45)
+ a.cigarstring = '20M5D20M'
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '5H35M'
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '5S35M'
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '35M5H'
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '35M5S'
+ self.assertEqual(a.infer_read_length(), 40)
def test_get_aligned_pairs_soft_clipping(self):
a = self.buildRead()
@@ -388,22 +414,28 @@ class TestAlignedSegment(ReadTest):
self.assertEqual(a.query_alignment_length, 20)
a.cigarstring = "20M1S"
self.assertEqual(a.query_alignment_length, 20)
+ a.cigarstring = "20M1H"
+ self.assertEqual(a.query_alignment_length, 20)
a.cigarstring = "1S20M"
self.assertEqual(a.query_alignment_length, 20)
+ a.cigarstring = "1H20M"
+ self.assertEqual(a.query_alignment_length, 20)
a.cigarstring = "1S20M1S"
self.assertEqual(a.query_alignment_length, 20)
+ a.cigarstring = "1H20M1H"
+ self.assertEqual(a.query_alignment_length, 20)
def test_query_length_is_limited(self):
a = self.buildRead()
a.query_name = "A" * 1
- a.query_name = "A" * 254
+ a.query_name = "A" * 251
self.assertRaises(
ValueError,
setattr,
a,
"query_name",
- "A" * 255)
+ "A" * 252)
class TestCigarStats(ReadTest):
@@ -785,5 +817,34 @@ class TestAsString(unittest.TestCase):
self.assertEqual(s, p.tostring(pysamf))
+class TestEnums(unittest.TestCase):
+
+ def test_cigar_enums_are_defined(self):
+ self.assertEqual(pysam.CMATCH, 0)
+ self.assertEqual(pysam.CINS, 1)
+ self.assertEqual(pysam.CDEL, 2)
+ self.assertEqual(pysam.CREF_SKIP, 3)
+ self.assertEqual(pysam.CSOFT_CLIP, 4)
+ self.assertEqual(pysam.CHARD_CLIP, 5)
+ self.assertEqual(pysam.CPAD, 6)
+ self.assertEqual(pysam.CEQUAL, 7)
+ self.assertEqual(pysam.CDIFF, 8)
+ self.assertEqual(pysam.CBACK, 9)
+
+ def test_sam_flags_are_defined(self):
+ self.assertEqual(pysam.FPAIRED, 1)
+ self.assertEqual(pysam.FPROPER_PAIR, 2)
+ self.assertEqual(pysam.FUNMAP, 4)
+ self.assertEqual(pysam.FMUNMAP, 8)
+ self.assertEqual(pysam.FREVERSE, 16)
+ self.assertEqual(pysam.FMREVERSE, 32)
+ self.assertEqual(pysam.FREAD1, 64)
+ self.assertEqual(pysam.FREAD2, 128)
+ self.assertEqual(pysam.FSECONDARY, 256)
+ self.assertEqual(pysam.FQCFAIL, 512)
+ self.assertEqual(pysam.FDUP, 1024)
+ self.assertEqual(pysam.FSUPPLEMENTARY, 2048)
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py
index 18fb05b..a866881 100644
--- a/tests/AlignmentFile_test.py
+++ b/tests/AlignmentFile_test.py
@@ -439,10 +439,12 @@ class TestIO(unittest.TestCase):
input_filename,
reference_filename,
output_filename,
- input_mode, output_mode,
+ input_mode,
+ output_mode,
sequence_filename=None,
use_template=True,
- checkf=checkBinaryEqual):
+ checkf=checkBinaryEqual,
+ **kwargs):
'''iterate through *input_filename* writing to
*output_filename* and comparing the output to
*reference_filename*.
@@ -477,7 +479,7 @@ class TestIO(unittest.TestCase):
output_filename,
output_mode,
reference_filename=sequence_filename,
- template=infile)
+ template=infile, **kwargs)
else:
outfile = pysam.AlignmentFile(
output_filename,
@@ -485,7 +487,8 @@ class TestIO(unittest.TestCase):
reference_names=infile.references,
reference_lengths=infile.lengths,
reference_filename=sequence_filename,
- add_sq_text=False)
+ add_sq_text=False,
+ **kwargs)
iter = infile.fetch()
@@ -509,6 +512,13 @@ class TestIO(unittest.TestCase):
"tmp_ex2.sam",
"r", "wh")
+ def testSAM2SAMWithoutHeader(self):
+ self.checkEcho("ex2.sam",
+ "ex1.sam",
+ "tmp_ex2.sam",
+ "r", "w",
+ add_sam_header=False)
+
def testBAM2BAM(self):
self.checkEcho("ex2.bam",
"ex2.bam",
@@ -588,14 +598,6 @@ class TestIO(unittest.TestCase):
# self.checkEcho(input_filename, reference_filename, output_filename,
# "rb", "wb", use_template=False)
- # Release 0.8.0
- # no samfiles without header
- def testSAM2SAMWithoutHeader(self):
- self.checkEcho("ex2.sam",
- "ex1.sam",
- "tmp_ex2.sam",
- "r", "w")
-
def testReadSamWithoutTargetNames(self):
'''see issue 104.'''
input_filename = os.path.join(
@@ -614,14 +616,12 @@ class TestIO(unittest.TestCase):
input_filename, "r",
check_header=True)
- infile = pysam.AlignmentFile(
+ with pysam.AlignmentFile(
input_filename,
check_header=False,
- check_sq=False)
-
- # TODO
- # result = list(infile.fetch(until_eof=True))
- # self.assertEqual(2, len(result))
+ check_sq=False) as infile:
+ result = list(infile.fetch(until_eof=True))
+ self.assertEqual(2, len(result))
def testReadBamWithoutTargetNames(self):
'''see issue 104.'''
@@ -641,52 +641,43 @@ class TestIO(unittest.TestCase):
"r",
check_header=True)
- infile = pysam.AlignmentFile(
- input_filename, check_header=False, check_sq=False)
- result = list(infile.fetch(until_eof=True))
+ with pysam.AlignmentFile(
+ input_filename, check_sq=False) as infile:
+ result = list(infile.fetch(until_eof=True))
- # TODO
- def testReadSamWithoutHeader(self):
+ def test_fail_read_sam_without_header(self):
input_filename = os.path.join(DATADIR, "ex1.sam")
- # reading from a samfile without header is not
- # implemented
self.assertRaises(ValueError,
pysam.AlignmentFile,
input_filename,
"r")
- # TODO
- # without check_header header is no read
- # leading to segfault
- # self.assertRaises(ValueError,
- # pysam.AlignmentFile,
- # input_filename,
- # "r",
- # check_header=False)
+ def test_pass_read_sam_without_header_with_refs(self):
+ with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.sam"),
+ "r",
+ reference_names=["chr1", "chr2"],
+ reference_lengths=[1575, 1584]) as samfile:
+ self.assertEqual(len(list(samfile.fetch(until_eof=True))), 3270)
- # TODO
- # def testReadUnformattedFile(self):
- # '''test reading from a file that is not bam/sam formatted'''
- # input_filename = os.path.join(DATADIR, 'Makefile')
-
- # # bam - file raise error
- # self.assertRaises(ValueError,
- # pysam.AlignmentFile,
- # input_filename,
- # "rb")
-
- # # sam - file error, but can't fetch
- # self.assertRaises(ValueError,
- # pysam.AlignmentFile,
- # input_filename,
- # "r")
-
- # self.assertRaises(ValueError,
- # pysam.AlignmentFile,
- # input_filename,
- # "r",
- # check_header=False)
+ def test_pass_read_sam_with_header_without_header_check(self):
+ with pysam.AlignmentFile(os.path.join(DATADIR, "ex2.sam"),
+ "r", check_header=False) as samfile:
+ self.assertEqual(len(list(samfile.fetch(until_eof=True))), 3270)
+
+ def test_fail_when_reading_unformatted_files(self):
+ '''test reading from a file that is not bam/sam formatted'''
+ input_filename = os.path.join(DATADIR, 'Makefile')
+
+ self.assertRaises(ValueError,
+ pysam.AlignmentFile,
+ input_filename,
+ "rb")
+
+ self.assertRaises(ValueError,
+ pysam.AlignmentFile,
+ input_filename,
+ "r")
def testBAMWithoutAlignedSegments(self):
'''see issue 117'''
@@ -854,7 +845,23 @@ class TestIO(unittest.TestCase):
check_sq=False)
samfile.fetch('chr2')
-
+ def test_fetch_by_tid(self):
+ with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"), "rb") as samfile:
+ self.assertEqual(len(list(samfile.fetch('chr1'))),
+ len(list(samfile.fetch(tid=0))))
+ self.assertEqual(len(list(samfile.fetch('chr2'))),
+ len(list(samfile.fetch(tid=1))))
+ self.assertRaises(
+ IndexError,
+ samfile.fetch,
+ tid=2)
+ self.assertRaises(
+ IndexError,
+ samfile.fetch,
+ tid=-1)
+ self.assertEqual(len(list(samfile.fetch('chr1',start=1000, end=2000))),
+ len(list(samfile.fetch(tid=0, start=1000, end=2000))))
+
class TestAutoDetect(unittest.TestCase):
@@ -1761,7 +1768,7 @@ class TestDeNovoConstruction(unittest.TestCase):
# os.unlink(tmpfilename)
- def testBAMPerRead(self):
+ def test_pass_if_reads_binary_equal(self):
'''check if individual reads are binary equal.'''
infile = pysam.AlignmentFile(self.bamfile, "rb")
@@ -1846,25 +1853,17 @@ class TestTruncatedBAM(unittest.TestCase):
'''see pull request 50.'''
- def testTruncatedBam(self):
+ def testTruncatedBam2(self):
+ self.assertRaises(IOError,
+ pysam.AlignmentFile,
+ os.path.join(DATADIR, 'ex2_truncated.bam'))
- s = pysam.AlignmentFile(
- os.path.join(DATADIR, 'ex2_truncated.bam'))
+ def testTruncatedBam2(self):
+ s = pysam.AlignmentFile(os.path.join(DATADIR, 'ex2_truncated.bam'),
+ ignore_truncation=True)
iterall = lambda x: len([a for a in x])
self.assertRaises(IOError, iterall, s)
- def testTruncatedBamFetch(self):
- '''See comments for pull request at
- https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
- '''
- # Currently there is no way to detect truncated
- # files through hts_iter_fetch, so this test is
- # disabled
- return
- s = pysam.AlignmentFile(
- os.path.join(DATADIR, 'ex2_truncated.bam'))
- iterall = lambda x: len([a for a in x])
- self.assertRaises(IOError, iterall, s.fetch())
COMPARE_BTAG = [100, 1, 91, 0, 7, 101, 0, 201, 96, 204,
0, 0, 87, 109, 0, 7, 97, 112, 1, 12, 78,
diff --git a/tests/SamFile_test.py b/tests/SamFile_test.py
deleted file mode 100644
index ff13045..0000000
--- a/tests/SamFile_test.py
+++ /dev/null
@@ -1,1990 +0,0 @@
-#!/usr/bin/env python
-'''unit testing code for pysam.
-
-Execute in the :file:`tests` directory as it requires the Makefile
-and data files located there.
-'''
-
-import pysam
-import pysam.samtools
-import unittest
-import os
-import shutil
-import sys
-import collections
-import subprocess
-import logging
-import array
-from TestUtils import checkBinaryEqual, checkURL, force_str
-
-DATADIR = "pysam_data"
-
-
-class BasicTestBAMFetch(unittest.TestCase):
-
- '''basic first test - detailed testing
- if information in file is consistent
- with information in AlignedRead object.'''
-
- def setUp(self):
- self.samfile = pysam.Samfile(
- os.path.join(DATADIR, "ex3.bam"),
- "rb")
- self.reads = list(self.samfile.fetch())
-
- def testARqname(self):
- self.assertEqual(
- self.reads[0].qname,
- "read_28833_29006_6945",
- "read name mismatch in read 1: %s != %s" % (
- self.reads[0].qname, "read_28833_29006_6945"))
- self.assertEqual(
- self.reads[1].qname,
- "read_28701_28881_323b",
- "read name mismatch in read 2: %s != %s" % (
- self.reads[1].qname, "read_28701_28881_323b"))
-
- def testARflag(self):
- self.assertEqual(
- self.reads[0].flag, 99,
- "flag mismatch in read 1: %s != %s" % (
- self.reads[0].flag, 99))
- self.assertEqual(
- self.reads[1].flag, 147,
- "flag mismatch in read 2: %s != %s" % (
- self.reads[1].flag, 147))
-
- def testARrname(self):
- self.assertEqual(
- self.reads[0].rname, 0,
- "chromosome/target id mismatch in read 1: %s != %s" %
- (self.reads[0].rname, 0))
- self.assertEqual(
- self.reads[1].rname, 1,
- "chromosome/target id mismatch in read 2: %s != %s" %
- (self.reads[1].rname, 1))
-
- def testARpos(self):
- self.assertEqual(
- self.reads[0].pos, 33 - 1,
- "mapping position mismatch in read 1: %s != %s" %
- (self.reads[0].pos, 33 - 1))
- self.assertEqual(
- self.reads[1].pos, 88 - 1,
- "mapping position mismatch in read 2: %s != %s" %
- (self.reads[1].pos, 88 - 1))
-
- def testARmapq(self):
- self.assertEqual(
- self.reads[0].mapq, 20,
- "mapping quality mismatch in read 1: %s != %s" %
- (self.reads[0].mapq, 20))
- self.assertEqual(
- self.reads[1].mapq, 30,
- "mapping quality mismatch in read 2: %s != %s" % (
- self.reads[1].mapq, 30))
-
- def testARcigar(self):
- self.assertEqual(
- self.reads[0].cigar,
- [(0, 10), (2, 1), (0, 25)],
- "read name length mismatch in read 1: %s != %s" %
- (self.reads[0].cigar, [(0, 10), (2, 1), (0, 25)]))
- self.assertEqual(
- self.reads[1].cigar, [(0, 35)],
- "read name length mismatch in read 2: %s != %s" %
- (self.reads[1].cigar, [(0, 35)]))
-
- def testARcigarstring(self):
- self.assertEqual(self.reads[0].cigarstring, '10M1D25M')
- self.assertEqual(self.reads[1].cigarstring, '35M')
-
- def testARmrnm(self):
- self.assertEqual(
- self.reads[0].mrnm, 0,
- "mate reference sequence name mismatch in read 1: %s != %s" %
- (self.reads[0].mrnm, 0))
- self.assertEqual(
- self.reads[1].mrnm, 1,
- "mate reference sequence name mismatch in read 2: %s != %s" %
- (self.reads[1].mrnm, 1))
- self.assertEqual(
- self.reads[0].rnext, 0,
- "mate reference sequence name mismatch in read 1: %s != %s" %
- (self.reads[0].rnext, 0))
- self.assertEqual(
- self.reads[1].rnext, 1,
- "mate reference sequence name mismatch in read 2: %s != %s" %
- (self.reads[1].rnext, 1))
-
- def testARmpos(self):
- self.assertEqual(self.reads[
- 0].mpos, 200 - 1, "mate mapping position mismatch in read 1: %s != %s" % (self.reads[0].mpos, 200 - 1))
- self.assertEqual(self.reads[
- 1].mpos, 500 - 1, "mate mapping position mismatch in read 2: %s != %s" % (self.reads[1].mpos, 500 - 1))
- self.assertEqual(self.reads[
- 0].pnext, 200 - 1, "mate mapping position mismatch in read 1: %s != %s" % (self.reads[0].pnext, 200 - 1))
- self.assertEqual(self.reads[
- 1].pnext, 500 - 1, "mate mapping position mismatch in read 2: %s != %s" % (self.reads[1].pnext, 500 - 1))
-
- def testARisize(self):
- self.assertEqual(self.reads[0].isize, 167, "insert size mismatch in read 1: %s != %s" % (
- self.reads[0].isize, 167))
- self.assertEqual(self.reads[1].isize, 412, "insert size mismatch in read 2: %s != %s" % (
- self.reads[1].isize, 412))
- self.assertEqual(self.reads[0].tlen, 167, "insert size mismatch in read 1: %s != %s" % (
- self.reads[0].tlen, 167))
- self.assertEqual(self.reads[1].tlen, 412, "insert size mismatch in read 2: %s != %s" % (
- self.reads[1].tlen, 412))
-
- def testARseq(self):
- self.assertEqual(self.reads[0].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "sequence mismatch in read 1: %s != %s" % (
- self.reads[0].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"))
- self.assertEqual(self.reads[1].seq, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", "sequence size mismatch in read 2: %s != %s" % (
- self.reads[1].seq, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA"))
- self.assertEqual(self.reads[3].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "sequence mismatch in read 4: %s != %s" % (
- self.reads[3].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"))
-
- def testARqual(self):
- self.assertEqual(self.reads[0].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<",
- "quality string mismatch in read 1: %s != %s" % (self.reads[0].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"))
- self.assertEqual(self.reads[1].qual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<", "quality string mismatch in read 2: %s != %s" % (
- self.reads[1].qual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<"))
- self.assertEqual(self.reads[3].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<",
- "quality string mismatch in read 3: %s != %s" % (self.reads[3].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"))
-
- def testARquery(self):
- self.assertEqual(self.reads[0].query, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "query mismatch in read 1: %s != %s" % (
- self.reads[0].query, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"))
- self.assertEqual(self.reads[1].query, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", "query size mismatch in read 2: %s != %s" % (
- self.reads[1].query, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA"))
- self.assertEqual(self.reads[3].query, "TAGCTAGCTACCTATATCTTGGTCTT", "query mismatch in read 4: %s != %s" % (
- self.reads[3].query, "TAGCTAGCTACCTATATCTTGGTCTT"))
-
- def testARqqual(self):
- self.assertEqual(
- self.reads[0].qqual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<",
- "qquality string mismatch in read 1: %s != %s" %
- (self.reads[0].qqual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"))
- self.assertEqual(
- self.reads[1].qqual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<",
- "qquality string mismatch in read 2: %s != %s" %
- (self.reads[1].qqual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<"))
- self.assertEqual(
- self.reads[3].qqual, "<<<<<<<<<<<<<<<<<:<9/,&,22",
- "qquality string mismatch in read 3: %s != %s" %
- (self.reads[3].qqual, "<<<<<<<<<<<<<<<<<:<9/,&,22"))
-
- def testPresentOptionalFields(self):
- self.assertEqual(
- self.reads[0].opt('NM'), 1,
- "optional field mismatch in read 1, NM: %s != %s" %
- (self.reads[0].opt('NM'), 1))
- self.assertEqual(
- self.reads[0].opt('RG'), 'L1',
- "optional field mismatch in read 1, RG: %s != %s" %
- (self.reads[0].opt('RG'), 'L1'))
- self.assertEqual(
- self.reads[1].opt('RG'), 'L2',
- "optional field mismatch in read 2, RG: %s != %s" %
- (self.reads[1].opt('RG'), 'L2'))
- self.assertEqual(
- self.reads[1].opt('MF'), 18,
- "optional field mismatch in read 2, MF: %s != %s" %
- (self.reads[1].opt('MF'), 18))
-
- def testPairedBools(self):
- self.assertEqual(self.reads[0].is_paired, True,
- "is paired mismatch in read 1: %s != %s" % (
- self.reads[0].is_paired, True))
- self.assertEqual(self.reads[1].is_paired, True,
- "is paired mismatch in read 2: %s != %s" % (
- self.reads[1].is_paired, True))
- self.assertEqual(self.reads[0].is_proper_pair, True,
- "is proper pair mismatch in read 1: %s != %s" % (
- self.reads[0].is_proper_pair, True))
- self.assertEqual(self.reads[1].is_proper_pair, True,
- "is proper pair mismatch in read 2: %s != %s" % (
- self.reads[1].is_proper_pair, True))
-
- def testTags(self):
- self.assertEqual(self.reads[0].tags,
- [('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U')])
- self.assertEqual(self.reads[1].tags,
- [('MF', 18), ('RG', 'L2'),
- ('PG', 'P2'), ('XT', 'R')])
-
- def testAddTags(self):
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U')]))
-
- self.reads[0].setTag('X1', 'C')
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X1', 'C'), ('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ]))
- self.reads[0].setTag('X2', 5)
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X2', 5), ('X1', 'C'),
- ('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ]))
- # add with replacement
- self.reads[0].setTag('X2', 10)
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X2', 10), ('X1', 'C'),
- ('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ]))
-
- # add without replacement
- self.reads[0].setTag('X2', 5, replace=False)
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X2', 10), ('X1', 'C'),
- ('X2', 5),
- ('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ]))
-
- def testAddTagsType(self):
- self.reads[0].tags = None
- self.assertEqual(self.reads[0].tags, [])
-
- self.reads[0].setTag('X1', 5.0)
- self.reads[0].setTag('X2', "5.0")
- self.reads[0].setTag('X3', 5)
-
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X1', 5.0),
- ('X2', "5.0"),
- ('X3', 5)]))
-
- # test setting float for int value
- self.reads[0].setTag('X4', 5, value_type='d')
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X1', 5.0),
- ('X2', "5.0"),
- ('X3', 5),
- ('X4', 5.0)]))
-
- # test setting int for float value - the
- # value will be rounded.
- self.reads[0].setTag('X5', 5.2, value_type='i')
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X1', 5.0),
- ('X2', "5.0"),
- ('X3', 5),
- ('X4', 5.0),
- ('X5', 5)]))
-
- # test setting invalid type code
- self.assertRaises(ValueError, self.reads[0].setTag, 'X6', 5.2, 'g')
-
- def testTagsUpdatingFloat(self):
- self.assertEqual(self.reads[0].tags,
- [('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U')])
- self.reads[0].tags += [('XC', 5.0)]
- self.assertEqual(self.reads[0].tags,
- [('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ('XC', 5.0)])
-
- def testOpt(self):
- self.assertEqual(self.reads[0].opt("XT"), "U")
- self.assertEqual(self.reads[1].opt("XT"), "R")
-
- def testMissingOpt(self):
- self.assertRaises(KeyError, self.reads[0].opt, "XP")
-
- def testEmptyOpt(self):
- self.assertRaises(KeyError, self.reads[2].opt, "XT")
-
- def tearDown(self):
- self.samfile.close()
-
-
-class BasicTestBAMFile(BasicTestBAMFetch):
-
- def setUp(self):
- self.samfile = pysam.Samfile(
- os.path.join(DATADIR, "ex3.sam"),
- "r")
- self.reads = [r for r in self.samfile]
-
-
-class BasicTestSAMFile(BasicTestBAMFetch):
-
- def setUp(self):
- self.samfile = pysam.Samfile(
- os.path.join(DATADIR, "ex3.sam"),
- "r")
- self.reads = [r for r in self.samfile]
-
-
-class BasicTestSAMFetch(BasicTestBAMFetch):
-
- def setUp(self):
- self.samfile = pysam.Samfile(
- os.path.join(DATADIR, "ex3.sam"),
- "r")
- self.reads = list(self.samfile.fetch())
-
-
-# needs to be implemented
-# class TestAlignedReadFromSamWithoutHeader(TestAlignedReadFromBam):
-#
-# def setUp(self):
-# self.samfile=pysam.Samfile( "ex7.sam","r" )
-# self.reads=list(self.samfile.fetch())
-
-
-class TestIO(unittest.TestCase):
-
- '''check if reading samfile and writing a samfile are consistent.'''
-
- def checkEcho(self,
- input_filename,
- reference_filename,
- output_filename,
- input_mode, output_mode,
- use_template=True):
- '''iterate through *input_filename* writing to *output_filename* and
- comparing the output to *reference_filename*.
-
- The files are opened according to the *input_mode* and *output_mode*.
-
- If *use_template* is set, the header is copied from infile
- using the template mechanism, otherwise target names and
- lengths are passed explicitly.
-
- '''
-
- infile = pysam.Samfile(os.path.join(DATADIR, input_filename),
- input_mode)
- if use_template:
- outfile = pysam.Samfile(output_filename,
- output_mode,
- template=infile)
- else:
- outfile = pysam.Samfile(output_filename,
- output_mode,
- referencenames=infile.references,
- referencelengths=infile.lengths,
- add_sq_text=False)
-
- iter = infile.fetch()
-
- for x in iter:
- outfile.write(x)
- infile.close()
- outfile.close()
-
- self.assertTrue(
- checkBinaryEqual(os.path.join(DATADIR, reference_filename),
- output_filename),
- "files %s and %s are not the same" % (reference_filename,
- output_filename))
-
- def testReadWriteBam(self):
-
- input_filename = "ex1.bam"
- output_filename = "pysam_ex1.bam"
- reference_filename = "ex1.bam"
-
- self.checkEcho(input_filename, reference_filename, output_filename,
- "rb", "wb", use_template=True)
-
- # Disabled - should work, files are not binary equal, but are
- # non-binary equal:
- # diff <(samtools view pysam_ex1.bam) <(samtools view pysam_data/ex1.bam)
- # def testReadWriteBamWithTargetNames(self):
- # input_filename = "ex1.bam"
- # output_filename = "pysam_ex1.bam"
- # reference_filename = "ex1.bam"
-
- # self.checkEcho(input_filename, reference_filename, output_filename,
- # "rb", "wb", use_template=False)
-
- def testReadWriteSamWithHeader(self):
-
- input_filename = "ex2.sam"
- output_filename = "pysam_ex2.sam"
- reference_filename = "ex2.sam"
-
- self.checkEcho(input_filename,
- reference_filename,
- output_filename,
- "r", "wh")
-
- # Release 0.8.0
- # no samfiles without header
- def testReadWriteSamWithoutHeader(self):
-
- input_filename = "ex2.sam"
- output_filename = "pysam_ex2.sam"
- reference_filename = "ex1.sam"
-
- self.checkEcho(input_filename,
- reference_filename,
- output_filename,
- "r", "w")
-
- def testReadSamWithoutTargetNames(self):
- '''see issue 104.'''
- input_filename = os.path.join(DATADIR,
- "example_unmapped_reads_no_sq.sam")
-
- # raise exception in default mode
- self.assertRaises(ValueError, pysam.Samfile, input_filename, "r")
-
- # raise exception if no SQ files
- self.assertRaises(ValueError, pysam.Samfile,
- input_filename, "r",
- check_header=True)
-
- infile = pysam.Samfile(
- input_filename,
- check_header=False,
- check_sq=False)
-
- # TODO
- # result = list(infile.fetch(until_eof=True))
- # self.assertEqual(2, len(result))
-
- def testReadBamWithoutTargetNames(self):
- '''see issue 104.'''
- input_filename = os.path.join(
- DATADIR, "example_unmapped_reads_no_sq.bam")
-
- # raise exception in default mode
- self.assertRaises(ValueError, pysam.Samfile, input_filename, "r")
-
- # raise exception if no SQ files
- self.assertRaises(ValueError, pysam.Samfile, input_filename, "r",
- check_header=True)
-
- infile = pysam.Samfile(
- input_filename, check_header=False, check_sq=False)
- result = list(infile.fetch(until_eof=True))
-
- # TODO
- def testReadSamWithoutHeader(self):
- input_filename = os.path.join(DATADIR, "ex1.sam")
-
- # reading from a samfile without header is not
- # implemented
- self.assertRaises(ValueError,
- pysam.Samfile,
- input_filename,
- "r")
-
- # TODO
- # without check_header header is no read
- # leading to segfault
- # self.assertRaises(ValueError,
- # pysam.Samfile,
- # input_filename,
- # "r",
- # check_header=False)
-
- # TODO
- # def testReadUnformattedFile(self):
- # '''test reading from a file that is not bam/sam formatted'''
- # input_filename = os.path.join(DATADIR, 'Makefile')
-
- # # bam - file raise error
- # self.assertRaises(ValueError,
- # pysam.Samfile,
- # input_filename,
- # "rb")
-
- # # sam - file error, but can't fetch
- # self.assertRaises(ValueError,
- # pysam.Samfile,
- # input_filename,
- # "r")
-
- # self.assertRaises(ValueError,
- # pysam.Samfile,
- # input_filename,
- # "r",
- # check_header=False)
-
- def testBAMWithoutAlignedReads(self):
- '''see issue 117'''
- input_filename = os.path.join(DATADIR, "test_unaligned.bam")
- samfile = pysam.Samfile(input_filename, "rb", check_sq=False)
- samfile.fetch(until_eof=True)
-
- def testBAMWithShortBAI(self):
- '''see issue 116'''
- input_filename = os.path.join(DATADIR, "example_bai.bam")
- samfile = pysam.Samfile(input_filename, "rb", check_sq=False)
- samfile.fetch('chr2')
-
- def testFetchFromClosedFile(self):
-
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
- samfile.close()
- self.assertRaises(ValueError, samfile.fetch, 'chr1', 100, 120)
-
- def testClosedFile(self):
- '''test that access to a closed samfile raises ValueError.'''
-
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
- samfile.close()
- self.assertRaises(ValueError, samfile.fetch, 'chr1', 100, 120)
- self.assertRaises(ValueError, samfile.pileup, 'chr1', 100, 120)
- self.assertRaises(ValueError, samfile.getrname, 0)
- # TODO
- self.assertRaises(ValueError, samfile.tell)
- self.assertRaises(ValueError, samfile.seek, 0)
- self.assertRaises(ValueError, getattr, samfile, "nreferences")
- self.assertRaises(ValueError, getattr, samfile, "references")
- self.assertRaises(ValueError, getattr, samfile, "lengths")
- self.assertRaises(ValueError, getattr, samfile, "text")
- self.assertRaises(ValueError, getattr, samfile, "header")
-
- # write on closed file
- self.assertEqual(0, samfile.write(None))
-
- def testAutoDetection(self):
- '''test if autodetection works.'''
-
- # TODO
- # samfile = pysam.Samfile(os.path.join(DATADIR, "ex3.sam"))
- # self.assertRaises(ValueError, samfile.fetch, 'chr1')
- # samfile.close()
-
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex3.bam"))
- samfile.fetch('chr1')
- samfile.close()
-
- # TOOD
- # def testReadingFromSamFileWithoutHeader(self):
- # '''read from samfile without header.
- # '''
- # samfile = pysam.Samfile(os.path.join(DATADIR, "ex7.sam"),
- # check_header=False,
- # check_sq=False)
- # self.assertRaises(NotImplementedError, samfile.__iter__)
-
- def testReadingFromFileWithoutIndex(self):
- '''read from bam file without index.'''
-
- shutil.copyfile(os.path.join(DATADIR, "ex2.bam"), 'tmp_ex2.bam')
- samfile = pysam.Samfile('tmp_ex2.bam',
- "rb")
- self.assertRaises(ValueError, samfile.fetch)
- self.assertEqual(len(list(samfile.fetch(until_eof=True))),
- 3270)
- os.unlink('tmp_ex2.bam')
-
- # def testReadingUniversalFileMode(self):
- # '''read from samfile without header.
- # '''
-
- # input_filename = "ex2.sam"
- # output_filename = "pysam_ex2.sam"
- # reference_filename = "ex1.sam"
-
- # self.checkEcho(input_filename,
- # reference_filename,
- # output_filename,
- # "rU", "w")
-
- def testHead(self):
- '''test IteratorRowHead'''
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
- l10 = list(samfile.head(10))
- l100 = list(samfile.head(100))
- self.assertEqual(len(l10), 10)
- self.assertEqual(len(l100), 100)
- self.assertEqual(list(map(str, l10)),
- list(map(str, l100[:10])))
-
-
-class TestFloatTagBug(unittest.TestCase):
-
- '''see issue 71'''
-
- def testFloatTagBug(self):
- '''a float tag before another exposed a parsing bug in bam_aux_get.
-
- Fixed in 0.1.19
- '''
- samfile = pysam.Samfile(os.path.join(DATADIR, "tag_bug.bam"))
- read = next(samfile.fetch(until_eof=True))
- self.assertTrue(('XC', 1) in read.tags)
- self.assertEqual(read.opt('XC'), 1)
-
-
-class TestLargeFieldBug(unittest.TestCase):
-
- '''see issue 100'''
-
- def testLargeFileBug(self):
- '''when creating a read with a large entry in the tag field
- causes an errror:
- NotImplementedError: tags field too large
- '''
- samfile = pysam.Samfile(os.path.join(DATADIR, "issue100.bam"))
- read = next(samfile.fetch(until_eof=True))
- new_read = pysam.AlignedRead()
- new_read.tags = read.tags
- self.assertEqual(new_read.tags, read.tags)
-
-
-class TestTagParsing(unittest.TestCase):
-
- '''tests checking the accuracy of tag setting and retrieval.'''
-
- def makeRead(self):
- a = pysam.AlignedRead()
- a.qname = "read_12345"
- a.tid = 0
- a.seq = "ACGT" * 3
- a.flag = 0
- a.rname = 0
- a.pos = 1
- a.mapq = 20
- a.cigar = ((0, 10), (2, 1), (0, 25))
- a.mrnm = 0
- a.mpos = 200
- a.isize = 0
- a.qual = "1234" * 3
- # todo: create tags
- return a
-
- def testNegativeIntegers(self):
- x = -2
- aligned_read = self.makeRead()
- aligned_read.tags = [("XD", int(x))]
- # print (aligned_read.tags)
-
- def testNegativeIntegers2(self):
- x = -2
- r = self.makeRead()
- r.tags = [("XD", int(x))]
- outfile = pysam.Samfile("test.bam",
- "wb",
- referencenames=("chr1",),
- referencelengths = (1000,))
- outfile.write(r)
- outfile.close()
-
- def testCigarString(self):
- r = self.makeRead()
- self.assertEqual(r.cigarstring, "10M1D25M")
- r.cigarstring = "20M10D20M"
- self.assertEqual(r.cigar, [(0, 20), (2, 10), (0, 20)])
- # unsetting cigar string
- r.cigarstring = None
- self.assertEqual(r.cigarstring, None)
-
- def testCigar(self):
- r = self.makeRead()
- self.assertEqual(r.cigar, [(0, 10), (2, 1), (0, 25)])
- # unsetting cigar string
- r.cigar = None
- self.assertEqual(r.cigar, [])
-
- def testLongTags(self):
- '''see issue 115'''
-
- r = self.makeRead()
- rg = 'HS2000-899_199.L3'
- tags = [('XC', 85), ('XT', 'M'), ('NM', 5),
- ('SM', 29), ('AM', 29), ('XM', 1),
- ('XO', 1), ('XG', 4), ('MD', '37^ACCC29T18'),
- ('XA', '5,+11707,36M1I48M,2;21,-48119779,46M1I38M,2;hs37d5,-10060835,40M1D45M,3;5,+11508,36M1I48M,3;hs37d5,+6743812,36M1I48M,3;19,-59118894,46M1I38M,3;4,-191044002,6M1I78M,3;')]
-
- r.tags = tags
- r.tags += [("RG", rg)] * 100
- tags += [("RG", rg)] * 100
-
- self.assertEqual(tags, r.tags)
-
-
-class TestClipping(unittest.TestCase):
-
- def testClipping(self):
-
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "softclip.bam"),
- "rb")
- for read in self.samfile:
-
- if read.qname == "r001":
- self.assertEqual(read.seq, 'AAAAGATAAGGATA')
- self.assertEqual(read.query, 'AGATAAGGATA')
- self.assertEqual(read.qual, None)
- self.assertEqual(read.qqual, None)
-
- elif read.qname == "r002":
-
- self.assertEqual(read.seq, 'GCCTAAGCTAA')
- self.assertEqual(read.query, 'AGCTAA')
- self.assertEqual(read.qual, '01234567890')
- self.assertEqual(read.qqual, '567890')
-
- elif read.qname == "r003":
-
- self.assertEqual(read.seq, 'GCCTAAGCTAA')
- self.assertEqual(read.query, 'GCCTAA')
- self.assertEqual(read.qual, '01234567890')
- self.assertEqual(read.qqual, '012345')
-
- elif read.qname == "r004":
-
- self.assertEqual(read.seq, 'TAGGC')
- self.assertEqual(read.query, 'TAGGC')
- self.assertEqual(read.qual, '01234')
- self.assertEqual(read.qqual, '01234')
-
-
-class TestIteratorRow(unittest.TestCase):
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- def checkRange(self, rnge):
- '''compare results from iterator with those from samtools.'''
- ps = list(self.samfile.fetch(region=rnge))
- sa = force_str(
- pysam.samtools.view(
- os.path.join(DATADIR, "ex1.bam"),
- rnge,
- raw=True)).splitlines(True)
- self.assertEqual(
- len(ps), len(sa),
- "unequal number of results for range %s: %i != %i" %
- (rnge, len(ps), len(sa)))
- # check if the same reads are returned and in the same order
- for line, (a, b) in enumerate(list(zip(ps, sa))):
- d = b.split("\t")
- self.assertEqual(
- a.qname, d[0],
- "line %i: read id mismatch: %s != %s" %
- (line, a.rname, d[0]))
- self.assertEqual(
- a.pos, int(d[3]) - 1,
- "line %i: read position mismatch: %s != %s, "
- "\n%s\n%s\n" %
- (line, a.pos, int(d[3]) - 1,
- str(a), str(d)))
- qual = d[10]
- self.assertEqual(
- a.qual, qual,
- "line %i: quality mismatch: %s != %s, \n%s\n%s\n" %
- (line, a.qual, qual,
- str(a), str(d)))
-
- def testIteratePerContig(self):
- '''check random access per contig'''
- for contig in self.samfile.references:
- self.checkRange(contig)
-
- def testIterateRanges(self):
- '''check random access per range'''
- for contig, length in zip(self.samfile.references, self.samfile.lengths):
- for start in range(1, length, 90):
- # this includes empty ranges
- self.checkRange("%s:%i-%i" % (contig, start, start + 90))
-
- def tearDown(self):
- self.samfile.close()
-
-
-class TestIteratorRowAll(unittest.TestCase):
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- def testIterate(self):
- '''compare results from iterator with those from samtools.'''
- ps = list(self.samfile.fetch())
- sa = force_str(
- pysam.samtools.view(
- os.path.join(DATADIR, "ex1.bam"),
- raw=True)).splitlines(True)
-
- self.assertEqual(
- len(ps), len(sa), "unequal number of results: %i != %i" % (len(ps), len(sa)))
- # check if the same reads are returned
- for line, pair in enumerate(list(zip(ps, sa))):
- data = pair[1].split("\t")
- self.assertEqual(pair[0].qname, data[
- 0], "read id mismatch in line %i: %s != %s" % (line, pair[0].rname, data[0]))
-
- def tearDown(self):
- self.samfile.close()
-
-
-class TestIteratorColumn(unittest.TestCase):
-
- '''test iterator column against contents of ex4.bam.'''
-
- # note that samfile contains 1-based coordinates
- # 1D means deletion with respect to reference sequence
- #
- mCoverages = {'chr1': [0] * 20 + [1] * 36 + [0] * (100 - 20 - 35),
- 'chr2': [0] * 20 + [1] * 35 + [0] * (100 - 20 - 35),
- }
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex4.bam"),
- "rb")
-
- def checkRange(self, contig, start=None, end=None, truncate=False):
- '''compare results from iterator with those from samtools.'''
- # check if the same reads are returned and in the same order
- for column in self.samfile.pileup(contig, start, end,
- truncate=truncate):
- if truncate:
- self.assertGreaterEqual(column.pos, start)
- self.assertLess(column.pos, end)
- thiscov = len(column.pileups)
- refcov = self.mCoverages[
- self.samfile.getrname(column.tid)][column.pos]
- self.assertEqual(
- thiscov, refcov, "wrong coverage at pos %s:%i %i should be %i" % (
- self.samfile.getrname(column.tid), column.pos, thiscov, refcov))
-
- def testIterateAll(self):
- '''check random access per contig'''
- self.checkRange(None)
-
- def testIteratePerContig(self):
- '''check random access per contig'''
- for contig in self.samfile.references:
- self.checkRange(contig)
-
- def testIterateRanges(self):
- '''check random access per range'''
- for contig, length in zip(
- self.samfile.references, self.samfile.lengths):
- for start in range(1, length, 90):
- # this includes empty ranges
- self.checkRange(contig, start, start + 90)
-
- def testInverse(self):
- '''test the inverse, is point-wise pileup accurate.'''
- for contig, refseq in list(self.mCoverages.items()):
- refcolumns = sum(refseq)
- for pos, refcov in enumerate(refseq):
- columns = list(self.samfile.pileup(contig, pos, pos + 1))
- if refcov == 0:
- # if no read, no coverage
- self.assertEqual(
- len(columns),
- refcov,
- "wrong number of pileup columns returned for position %s:%i, %i should be %i" % (
- contig, pos,
- len(columns), refcov))
- elif refcov == 1:
- # one read, all columns of the read are returned
- self.assertEqual(
- len(columns),
- refcolumns,
- "pileup incomplete at position %i: got %i, expected %i " %
- (pos, len(columns), refcolumns))
-
- def testIterateTruncate(self):
- '''check random access per range'''
- for contig, length in zip(self.samfile.references, self.samfile.lengths):
- for start in range(1, length, 90):
- # this includes empty ranges
- self.checkRange(contig, start, start + 90, truncate=True)
-
- def tearDown(self):
- self.samfile.close()
-
-
-class TestIteratorColumn2(unittest.TestCase):
-
- '''test iterator column against contents of ex1.bam.'''
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- def testStart(self):
- # print self.samfile.fetch().next().pos
- # print self.samfile.pileup().next().pos
- pass
-
- def testTruncate(self):
- '''see issue 107.'''
- # note that ranges in regions start from 1
- p = self.samfile.pileup(region='chr1:170:172', truncate=True)
- columns = [x.pos for x in p]
- self.assertEqual(len(columns), 3)
- self.assertEqual(columns, [169, 170, 171])
-
- p = self.samfile.pileup('chr1', 169, 172, truncate=True)
- columns = [x.pos for x in p]
-
- self.assertEqual(len(columns), 3)
- self.assertEqual(columns, [169, 170, 171])
-
- def testAccessOnClosedIterator(self):
- '''see issue 131
-
- Accessing pileup data after iterator has closed.
- '''
- pcolumn = self.samfile.pileup('chr1', 170, 180).__next__()
- self.assertRaises(ValueError, getattr, pcolumn, "pileups")
-
-
-class TestHeaderSam(unittest.TestCase):
-
- header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'},
- {'LN': 1584, 'SN': 'chr2', 'AH': '*'}],
- 'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', 'PU': 'SC_1_10', "CN": "name:with:colon"},
- {'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', 'PU': 'SC_2_12', "CN": "name:with:colon"}],
- 'PG': [{'ID': 'P1', 'VN': '1.0'}, {'ID': 'P2', 'VN': '1.1'}],
- 'HD': {'VN': '1.0'},
- 'CO': ['this is a comment', 'this is another comment'],
- }
-
- def compareHeaders(self, a, b):
- '''compare two headers a and b.'''
- for ak, av in a.items():
- self.assertTrue(ak in b, "key '%s' not in '%s' " % (ak, b))
- self.assertEqual(av, b[ak])
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex3.sam"),
- "r")
-
- def testHeaders(self):
- self.compareHeaders(self.header, self.samfile.header)
- self.compareHeaders(self.samfile.header, self.header)
-
- def testNameMapping(self):
- for x, y in enumerate(("chr1", "chr2")):
- tid = self.samfile.gettid(y)
- ref = self.samfile.getrname(x)
- self.assertEqual(tid, x)
- self.assertEqual(ref, y)
-
- self.assertEqual(self.samfile.gettid("chr?"), -1)
- self.assertRaises(ValueError, self.samfile.getrname, 2)
-
- def tearDown(self):
- self.samfile.close()
-
-
-class TestHeaderBam(TestHeaderSam):
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex3.bam"),
- "rb")
-
-
-class TestHeaderFromRefs(unittest.TestCase):
-
- '''see issue 144
-
- reference names need to be converted to string for python 3
- '''
-
- # def testHeader( self ):
- # refs = ['chr1', 'chr2']
- # tmpfile = "tmp_%i" % id(self)
- # s = pysam.Samfile(tmpfile, 'wb',
- # referencenames=refs,
- # referencelengths=[100]*len(refs))
- # s.close()
-
- # self.assertTrue( checkBinaryEqual( 'issue144.bam', tmpfile ),
- # 'bam files differ')
- # os.unlink( tmpfile )
-
-
-class TestHeader1000Genomes(unittest.TestCase):
-
- '''see issue 110'''
- # bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase2b_alignment/data/NA07048/exome_alignment/NA07048.unmapped.ILLUMINA.bwa.CEU.exome.20120522_p2b.bam"
- bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase3_EX_or_LC_only_alignment/data/HG00104/alignment/HG00104.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam"
-
- def testRead(self):
-
- if not checkURL(self.bamfile):
- return
-
- f = pysam.Samfile(self.bamfile, "rb")
- data = f.header.copy()
- self.assertTrue(data)
-
-
-class TestUnmappedReads(unittest.TestCase):
-
- # TODO
- # def testSAM(self):
- # samfile = pysam.Samfile(os.path.join(DATADIR, "ex5.sam"),
- # "r")
- # self.assertEqual(len(list(samfile.fetch(until_eof=True))), 2)
- # samfile.close()
-
- def testBAM(self):
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex5.bam"),
- "rb")
- self.assertEqual(len(list(samfile.fetch(until_eof=True))), 2)
- samfile.close()
-
-
-class TestPileupObjects(unittest.TestCase):
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- def testPileupColumn(self):
- for pcolumn1 in self.samfile.pileup(region="chr1:105"):
- if pcolumn1.pos == 104:
- self.assertEqual(
- pcolumn1.tid, 0, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn1.tid, 0))
- self.assertEqual(
- pcolumn1.pos, 105 - 1, "position mismatch in position 1: %s != %s" % (pcolumn1.pos, 105 - 1))
- self.assertEqual(
- pcolumn1.n, 2, "# reads mismatch in position 1: %s != %s" % (pcolumn1.n, 2))
- for pcolumn2 in self.samfile.pileup(region="chr2:1480"):
- if pcolumn2.pos == 1479:
- self.assertEqual(
- pcolumn2.tid, 1, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn2.tid, 1))
- self.assertEqual(
- pcolumn2.pos, 1480 - 1, "position mismatch in position 1: %s != %s" % (pcolumn2.pos, 1480 - 1))
- self.assertEqual(
- pcolumn2.n, 12, "# reads mismatch in position 1: %s != %s" % (pcolumn2.n, 12))
-
- def testPileupRead(self):
- for pcolumn1 in self.samfile.pileup(region="chr1:105"):
- if pcolumn1.pos == 104:
- self.assertEqual(
- len(pcolumn1.pileups), 2,
- "# reads aligned to column mismatch in position 1"
- ": %s != %s" %
- (len(pcolumn1.pileups), 2))
-
-
-# self.assertEqual( pcolumn1.pileups[0] # need to test additional
-# properties here
-
- def tearDown(self):
- self.samfile.close()
-
- def testIteratorOutOfScope(self):
- '''test if exception is raised if pileup col is accessed after
- iterator is exhausted.'''
-
- for pileupcol in self.samfile.pileup():
- pass
-
- self.assertRaises(ValueError, getattr, pileupcol, "pileups")
-
-
-class TestContextManager(unittest.TestCase):
-
- def testManager(self):
- with pysam.Samfile(os.path.join(DATADIR, 'ex1.bam'),
- 'rb') as samfile:
- samfile.fetch()
- self.assertEqual(samfile.closed, True)
-
-
-class TestExceptions(unittest.TestCase):
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- def testMissingFile(self):
-
- self.assertRaises(IOError, pysam.Samfile, "exdoesntexist.bam", "rb")
- self.assertRaises(IOError, pysam.Samfile, "exdoesntexist.sam", "r")
- self.assertRaises(IOError, pysam.Samfile, "exdoesntexist.bam", "r")
- self.assertRaises(IOError, pysam.Samfile, "exdoesntexist.sam", "rb")
-
- def testBadContig(self):
- self.assertRaises(ValueError, self.samfile.fetch, "chr88")
-
- def testMeaninglessCrap(self):
- self.assertRaises(ValueError, self.samfile.fetch, "skljf")
-
- def testBackwardsOrderNewFormat(self):
- self.assertRaises(ValueError, self.samfile.fetch, 'chr1', 100, 10)
-
- def testBackwardsOrderOldFormat(self):
- self.assertRaises(ValueError, self.samfile.fetch, region="chr1:100-10")
-
- def testOutOfRangeNegativeNewFormat(self):
- self.assertRaises(ValueError, self.samfile.fetch, "chr1", 5, -10)
- self.assertRaises(ValueError, self.samfile.fetch, "chr1", 5, 0)
- self.assertRaises(ValueError, self.samfile.fetch, "chr1", -5, -10)
-
- self.assertRaises(ValueError, self.samfile.count, "chr1", 5, -10)
- self.assertRaises(ValueError, self.samfile.count, "chr1", 5, 0)
- self.assertRaises(ValueError, self.samfile.count, "chr1", -5, -10)
-
- def testOutOfRangeNegativeOldFormat(self):
- self.assertRaises(ValueError, self.samfile.fetch, region="chr1:-5-10")
- self.assertRaises(ValueError, self.samfile.fetch, region="chr1:-5-0")
- self.assertRaises(ValueError, self.samfile.fetch, region="chr1:-5--10")
-
- self.assertRaises(ValueError, self.samfile.count, region="chr1:-5-10")
- self.assertRaises(ValueError, self.samfile.count, region="chr1:-5-0")
- self.assertRaises(ValueError, self.samfile.count, region="chr1:-5--10")
-
- def testOutOfRangNewFormat(self):
- self.assertRaises(
- ValueError, self.samfile.fetch, "chr1", 9999999999, 99999999999)
- self.assertRaises(
- ValueError, self.samfile.count, "chr1", 9999999999, 99999999999)
-
- def testOutOfRangeLargeNewFormat(self):
- self.assertRaises(ValueError, self.samfile.fetch, "chr1",
- 9999999999999999999999999999999, 9999999999999999999999999999999999999999)
- self.assertRaises(ValueError, self.samfile.count, "chr1",
- 9999999999999999999999999999999, 9999999999999999999999999999999999999999)
-
- def testOutOfRangeLargeOldFormat(self):
- self.assertRaises(
- ValueError, self.samfile.fetch, "chr1:99999999999999999-999999999999999999")
- self.assertRaises(
- ValueError, self.samfile.count, "chr1:99999999999999999-999999999999999999")
-
- def testZeroToZero(self):
- '''see issue 44'''
- self.assertEqual(len(list(self.samfile.fetch('chr1', 0, 0))), 0)
-
- def tearDown(self):
- self.samfile.close()
-
-
-class TestWrongFormat(unittest.TestCase):
-
- '''test cases for opening files not in bam/sam format.'''
-
- def testOpenSamAsBam(self):
- self.assertRaises(ValueError,
- pysam.Samfile,
- os.path.join(DATADIR, 'ex1.sam'),
- 'rb')
-
- def testOpenBamAsSam(self):
- # test fails, needs to be implemented.
- # sam.fetch() fails on reading, not on opening
- # self.assertRaises( ValueError, pysam.Samfile, 'ex1.bam', 'r' )
- pass
-
- def testOpenFastaAsSam(self):
- # test fails, needs to be implemented.
- # sam.fetch() fails on reading, not on opening
- # self.assertRaises( ValueError, pysam.Samfile, 'ex1.fa', 'r' )
- pass
-
- def testOpenFastaAsBam(self):
- self.assertRaises(ValueError,
- pysam.Samfile,
- os.path.join(DATADIR, 'ex1.fa'),
- 'rb')
-
-
-class ReadTest(unittest.TestCase):
-
- def checkFieldEqual(self, read1, read2, exclude=[]):
- '''check if two reads are equal by comparing each field.'''
-
- # add the . for refactoring purposes.
- for x in (".qname", ".seq", ".flag",
- ".rname", ".pos", ".mapq", ".cigar",
- ".mrnm", ".mpos", ".isize",
- ".qual",
- ".bin",
- ".is_paired", ".is_proper_pair",
- ".is_unmapped", ".mate_is_unmapped",
- ".is_reverse", ".mate_is_reverse",
- ".is_read1", ".is_read2",
- ".is_secondary", ".is_qcfail",
- ".is_duplicate"):
- n = x[1:]
- if n in exclude:
- continue
- self.assertEqual(getattr(read1, n), getattr(read2, n),
- "attribute mismatch for %s: %s != %s" %
- (n, getattr(read1, n), getattr(read2, n)))
-
-
-class TestAlignedRead(ReadTest):
-
- '''tests to check if aligned read can be constructed
- and manipulated.
- '''
-
- def testEmpty(self):
- a = pysam.AlignedRead()
- self.assertEqual(a.qname, None)
- self.assertEqual(a.seq, None)
- self.assertEqual(a.qual, None)
- self.assertEqual(a.flag, 0)
- self.assertEqual(a.rname, -1)
- self.assertEqual(a.mapq, 0)
- self.assertEqual(a.cigar, [])
- self.assertEqual(a.tags, [])
- self.assertEqual(a.mrnm, -1)
- self.assertEqual(a.mpos, -1)
- self.assertEqual(a.isize, 0)
-
- def testStrOfEmptyRead(self):
- a = pysam.AlignedRead()
- s = str(a)
- self.assertEqual(
- "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]",
- s)
-
- def buildRead(self):
- '''build an example read.'''
-
- a = pysam.AlignedRead()
- a.qname = "read_12345"
- a.seq = "ACGT" * 10
- a.flag = 0
- a.rname = 0
- a.pos = 20
- a.mapq = 20
- a.cigar = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
- a.mrnm = 0
- a.mpos = 200
- a.isize = 167
- a.qual = "1234" * 10
- # todo: create tags
- return a
-
- def testUpdate(self):
- '''check if updating fields affects other variable length data
- '''
- a = self.buildRead()
- b = self.buildRead()
-
- # check qname
- b.qname = "read_123"
- self.checkFieldEqual(a, b, "qname")
- b.qname = "read_12345678"
- self.checkFieldEqual(a, b, "qname")
- b.qname = "read_12345"
- self.checkFieldEqual(a, b)
-
- # check cigar
- b.cigar = ((0, 10), )
- self.checkFieldEqual(a, b, "cigar")
- b.cigar = ((0, 10), (2, 1), (0, 10))
- self.checkFieldEqual(a, b, "cigar")
- b.cigar = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
- self.checkFieldEqual(a, b)
-
- # check seq
- b.seq = "ACGT"
- self.checkFieldEqual(a, b, ("seq", "qual"))
- b.seq = "ACGT" * 3
- self.checkFieldEqual(a, b, ("seq", "qual"))
- b.seq = "ACGT" * 10
- self.checkFieldEqual(a, b, ("qual",))
-
- # reset qual
- b = self.buildRead()
-
- # check flags:
- for x in (
- "is_paired", "is_proper_pair",
- "is_unmapped", "mate_is_unmapped",
- "is_reverse", "mate_is_reverse",
- "is_read1", "is_read2",
- "is_secondary", "is_qcfail",
- "is_duplicate"):
- setattr(b, x, True)
- self.assertEqual(getattr(b, x), True)
- self.checkFieldEqual(a, b, ("flag", x,))
- setattr(b, x, False)
- self.assertEqual(getattr(b, x), False)
- self.checkFieldEqual(a, b)
-
- def testUpdate2(self):
- '''issue 135: inplace update of sequence and quality score.
-
- This does not work as setting the sequence will erase
- the quality scores.
- '''
- a = self.buildRead()
- a.seq = a.seq[5:10]
- self.assertEqual(a.qual, None)
-
- a = self.buildRead()
- s = a.qual
- a.seq = a.seq[5:10]
- a.qual = s[5:10]
-
- self.assertEqual(a.qual, s[5:10])
-
- def testLargeRead(self):
- '''build an example read.'''
-
- a = pysam.AlignedRead()
- a.qname = "read_12345"
- a.seq = "ACGT" * 200
- a.flag = 0
- a.rname = 0
- a.pos = 20
- a.mapq = 20
- a.cigar = ((0, 4 * 200), )
- a.mrnm = 0
- a.mpos = 200
- a.isize = 167
- a.qual = "1234" * 200
-
- return a
-
- def testTagParsing(self):
- '''test for tag parsing
-
- see http://groups.google.com/group/pysam-user-group/browse_thread/thread/67ca204059ea465a
- '''
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex8.bam"),
- "rb")
-
- for entry in samfile:
- before = entry.tags
- entry.tags = entry.tags
- after = entry.tags
- self.assertEqual(after, before)
-
- def testUpdateTlen(self):
- '''check if updating tlen works'''
- a = self.buildRead()
- oldlen = a.tlen
- oldlen *= 2
- a.tlen = oldlen
- self.assertEqual(a.tlen, oldlen)
-
- def testPositions(self):
- a = self.buildRead()
- self.assertEqual(a.positions,
- [20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
- 31, 32, 33, 34, 35, 36, 37, 38, 39,
- 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
- 50, 51, 52, 53, 54, 55, 56, 57, 58, 59])
-
- self.assertEqual(a.aligned_pairs,
- [(0, 20), (1, 21), (2, 22), (3, 23), (4, 24),
- (5, 25), (6, 26), (7, 27), (8, 28), (9, 29),
- (None, 30),
- (10, 31), (11, 32), (12, 33), (13, 34), (14, 35),
- (15, 36), (16, 37), (17, 38), (18, 39), (19, None),
- (20, 40), (21, 41), (22, 42), (23, 43), (24, 44),
- (25, 45), (26, 46), (27, 47), (28, 48), (29, 49),
- (30, 50), (31, 51), (32, 52), (33, 53), (34, 54),
- (35, 55), (36, 56), (37, 57), (38, 58), (39, 59)])
-
- self.assertEqual(
- a.positions,
- [x[1] for x in a.aligned_pairs
- if x[0] is not None and x[1] is not None])
- # alen is the length of the aligned read in genome
- self.assertEqual(a.alen, a.aligned_pairs[-1][0] + 1)
- # aend points to one beyond last aligned base in ref
- self.assertEqual(a.positions[-1], a.aend - 1)
-
- def testBlocks(self):
- a = self.buildRead()
- self.assertEqual(a.blocks,
- [(20, 30), (31, 40), (40, 60)])
-
- # Disabled as not backwards compatible
- # def testFancyStr(self):
- # a = self.buildRead()
- # output = a.fancy_str()
- # self.assertEqual(len(output), 9)
-
-
-class TestDeNovoConstruction(ReadTest):
-
- '''check BAM/SAM file construction using ex6.sam
-
- (note these are +1 coordinates):
-
- read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1
- read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2
- '''
-
- header = {'HD': {'VN': '1.0'},
- 'SQ': [{'LN': 1575, 'SN': 'chr1'},
- {'LN': 1584, 'SN': 'chr2'}], }
-
- bamfile = os.path.join(DATADIR, "ex6.bam")
- samfile = os.path.join(DATADIR, "ex6.sam")
-
- def setUp(self):
-
- a = pysam.AlignedRead()
- a.qname = "read_28833_29006_6945"
- a.seq = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
- a.flag = 99
- a.rname = 0
- a.pos = 32
- a.mapq = 20
- a.cigar = ((0, 10), (2, 1), (0, 25))
- a.mrnm = 0
- a.mpos = 199
- a.isize = 167
- a.qual = "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"
- a.tags = (("NM", 1),
- ("RG", "L1"))
-
- b = pysam.AlignedRead()
- b.qname = "read_28701_28881_323b"
- b.seq = "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA"
- b.flag = 147
- b.rname = 1
- b.pos = 87
- b.mapq = 30
- b.cigar = ((0, 35), )
- b.mrnm = 1
- b.mpos = 499
- b.isize = 412
- b.qual = "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<"
- b.tags = (("MF", 18),
- ("RG", "L2"))
-
- self.reads = (a, b)
-
- # TODO
- # def testSAMWholeFile(self):
-
- # tmpfilename = "tmp_%i.sam" % id(self)
-
- # outfile = pysam.Samfile(tmpfilename,
- # "wh",
- # header=self.header)
-
- # for x in self.reads:
- # outfile.write(x)
- # outfile.close()
- # self.assertTrue(checkBinaryEqual(tmpfilename, self.samfile),
- # "mismatch when construction SAM file, see %s %s" % (tmpfilename, self.samfile))
-
- # os.unlink(tmpfilename)
-
- def testBAMPerRead(self):
- '''check if individual reads are binary equal.'''
- infile = pysam.Samfile(self.bamfile, "rb")
-
- others = list(infile)
- for denovo, other in zip(others, self.reads):
- self.checkFieldEqual(other, denovo)
- self.assertEqual(other.compare(denovo), 0)
-
- # TODO
- # def testSAMPerRead(self):
- # '''check if individual reads are binary equal.'''
- # infile = pysam.Samfile(self.samfile, "r")
-
- # others = list(infile)
- # for denovo, other in zip(others, self.reads):
- # self.checkFieldEqual(other, denovo)
- # self.assertEqual(other.compare(denovo), 0)
-
- def testBAMWholeFile(self):
-
- tmpfilename = "tmp_%i.bam" % id(self)
-
- outfile = pysam.Samfile(tmpfilename, "wb", header=self.header)
-
- for x in self.reads:
- outfile.write(x)
- outfile.close()
-
- self.assertTrue(checkBinaryEqual(tmpfilename, self.bamfile),
- "mismatch when construction BAM file, see %s %s" % (tmpfilename, self.bamfile))
-
- os.unlink(tmpfilename)
-
-
-class TestDeNovoConstructionUserTags(TestDeNovoConstruction):
-
- '''test de novo construction with a header that contains lower-case tags.'''
-
- header = {'HD': {'VN': '1.0'},
- 'SQ': [{'LN': 1575, 'SN': 'chr1'},
- {'LN': 1584, 'SN': 'chr2'}],
- 'x1': {'A': 2, 'B': 5},
- 'x3': {'A': 6, 'B': 5},
- 'x2': {'A': 4, 'B': 5}}
-
- bamfile = os.path.join(DATADIR, "example_user_header.bam")
- samfile = os.path.join(DATADIR, "example_user_header.sam")
-
-
-class TestEmptyHeader(unittest.TestCase):
-
- '''see issue 84.'''
-
- def testEmptyHeader(self):
-
- s = pysam.Samfile(os.path.join(DATADIR, 'example_empty_header.bam'))
- self.assertEqual(s.header, {'SQ': [{'LN': 1000, 'SN': 'chr1'}]})
-
-COMPARE_BTAG = [100, 1, 91, 0, 7, 101, 0, 201, 96, 204,
- 0, 0, 87, 109, 0, 7, 97, 112, 1, 12, 78,
- 197, 0, 7, 100, 95, 101, 202, 0, 6, 0, 1,
- 186, 0, 84, 0, 244, 0, 0, 324, 0, 107, 195,
- 101, 113, 0, 102, 0, 104, 3, 0, 101, 1, 0,
- 212, 6, 0, 0, 1, 0, 74, 1, 11, 0, 196, 2,
- 197, 103, 0, 108, 98, 2, 7, 0, 1, 2, 194,
- 0, 180, 0, 108, 0, 203, 104, 16, 5, 205,
- 0, 0, 0, 1, 1, 100, 98, 0, 0, 204, 6, 0,
- 79, 0, 0, 101, 7, 109, 90, 265, 1, 27, 10,
- 109, 102, 9, 0, 292, 0, 110, 0, 0, 102,
- 112, 0, 0, 84, 100, 103, 2, 81, 126, 0, 2,
- 90, 0, 15, 96, 15, 1, 0, 2, 0, 107, 92, 0,
- 0, 101, 3, 98, 15, 102, 13, 116, 116, 90, 93,
- 198, 0, 0, 0, 199, 92, 26, 495, 100, 5, 0,
- 100, 5, 209, 0, 92, 107, 90, 0, 0, 0, 0, 109,
- 194, 7, 94, 200, 0, 40, 197, 0, 11, 0, 0, 112,
- 110, 6, 4, 200, 28, 0, 196, 0, 203, 1, 129,
- 0, 0, 1, 0, 94, 0, 1, 0, 107, 5, 201, 3, 3, 100,
- 0, 121, 0, 7, 0, 1, 105, 306, 3, 86, 8, 183, 0,
- 12, 163, 17, 83, 22, 0, 0, 1, 8, 109, 103, 0, 0,
- 295, 0, 200, 16, 172, 3, 16, 182, 3, 11, 0, 0,
- 223, 111, 103, 0, 5, 225, 0, 95]
-
-
-class TestBTagSam(unittest.TestCase):
-
- '''see issue 81.'''
-
- compare = [COMPARE_BTAG,
- [-100, 200, -300, -400],
- [-100, 12],
- [12, 15],
- [-1.0, 5.0, 2.5]]
-
- filename = os.path.join(DATADIR, 'example_btag.sam')
-
- read0 = [('RG', 'QW85I'),
- ('PG', 'tmap'),
- ('MD', '140'),
- ('NM', 0),
- ('AS', 140),
- ('FZ', array.array('H', COMPARE_BTAG)),
- ('XA', 'map2-1'),
- ('XS', 53),
- ('XT', 38),
- ('XF', 1),
- ('XE', 0)]
-
- def testReadTags(self):
-
- s = pysam.Samfile(self.filename)
- for x, read in enumerate(s):
- tags = read.tags
- if x == 0:
- self.assertEqual(tags, self.read0)
-
- fz = list(dict(tags)["FZ"])
- self.assertEqual(fz, self.compare[x])
- self.assertEqual(list(read.opt("FZ")), self.compare[x])
- self.assertEqual(tags, read.get_tags())
- for tag, value in tags:
- self.assertEqual(value, read.get_tag(tag))
-
- def testReadWriteTags(self):
-
- s = pysam.Samfile(self.filename)
- for read in s:
- before = read.tags
- read.tags = before
- self.assertEqual(read.tags, before)
-
- read.set_tags(before)
- self.assertEqual(read.tags, before)
-
- for tag, value in before:
- read.set_tag(tag, value)
- self.assertEqual(value, read.get_tag(tag))
-
-
-class TestBTagBam(TestBTagSam):
- filename = os.path.join(DATADIR, 'example_btag.bam')
-
-
-class TestDoubleFetch(unittest.TestCase):
-
- '''check if two iterators on the same bamfile are independent.'''
-
- filename = os.path.join(DATADIR, 'ex1.bam')
-
- def testDoubleFetch(self):
-
- samfile1 = pysam.Samfile(self.filename, 'rb')
-
- for a, b in zip(samfile1.fetch(multiple_iterators=True),
- samfile1.fetch(multiple_iterators=True)):
- self.assertEqual(a.compare(b), 0)
-
- def testDoubleFetchWithRegion(self):
-
- samfile1 = pysam.Samfile(self.filename, 'rb')
- chr, start, stop = 'chr1', 200, 3000000
- # just making sure the test has something to catch
- self.assertTrue(len(list(samfile1.fetch(chr, start, stop))) > 0)
-
- for a, b in zip(samfile1.fetch(chr, start, stop),
- samfile1.fetch(chr, start, stop,
- multiple_iterators=True)):
- self.assertEqual(a.compare(b), 0)
-
- def testDoubleFetchUntilEOF(self):
-
- samfile1 = pysam.Samfile(self.filename, 'rb')
-
- for a, b in zip(samfile1.fetch(until_eof=True),
- samfile1.fetch(until_eof=True,
- multiple_iterators=True)):
- self.assertEqual(a.compare(b), 0)
-
-
-class TestRemoteFileFTP(unittest.TestCase):
-
- '''test remote access.
-
- '''
-
- # Need to find an ftp server without password on standard
- # port.
-
- url = "ftp://ftp.sanger.ac.uk/pub/rd/humanSequences/CV.bam"
- region = "1:1-1000"
-
- def testFTPView(self):
- return
- if not checkURL(self.url):
- return
-
- result = pysam.samtools.view(self.url, self.region)
- self.assertEqual(len(result), 36)
-
- def testFTPFetch(self):
- return
- if not checkURL(self.url):
- return
-
- samfile = pysam.Samfile(self.url, "rb")
- result = list(samfile.fetch(region=self.region))
- self.assertEqual(len(result), 36)
-
-
-class TestRemoteFileHTTP(unittest.TestCase):
-
- url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/ex1.bam"
- region = "chr1:1-1000"
- local = os.path.join(DATADIR, "ex1.bam")
-
- def testView(self):
- if not checkURL(self.url):
- return
-
- samfile_local = pysam.Samfile(self.local, "rb")
- ref = list(samfile_local.fetch(region=self.region))
-
- result = pysam.samtools.view(
- self.url, self.region).splitlines(True)
- self.assertEqual(len(result), len(ref))
-
- def testFetch(self):
- if not checkURL(self.url):
- return
-
- samfile = pysam.Samfile(self.url, "rb")
- result = list(samfile.fetch(region=self.region))
- samfile_local = pysam.Samfile(self.local, "rb")
- ref = list(samfile_local.fetch(region=self.region))
-
- self.assertEqual(len(ref), len(result))
- for x, y in zip(result, ref):
- self.assertEqual(x.compare(y), 0)
-
- def testFetchAll(self):
- if not checkURL(self.url):
- return
-
- samfile = pysam.Samfile(self.url, "rb")
- result = list(samfile.fetch())
- samfile_local = pysam.Samfile(self.local, "rb")
- ref = list(samfile_local.fetch())
-
- self.assertEqual(len(ref), len(result))
- for x, y in zip(result, ref):
- self.assertEqual(x.compare(y), 0)
-
-
-class TestLargeOptValues(unittest.TestCase):
-
- ints = (65536, 214748, 2147484, 2147483647)
- floats = (65536.0, 214748.0, 2147484.0)
-
- def check(self, samfile):
-
- i = samfile.fetch()
- for exp in self.ints:
- rr = next(i)
- obs = rr.opt("ZP")
- self.assertEqual(exp, obs,
- "expected %s, got %s\n%s" %
- (str(exp), str(obs), str(rr)))
-
- for exp in [-x for x in self.ints]:
- rr = next(i)
- obs = rr.opt("ZP")
- self.assertEqual(exp, obs,
- "expected %s, got %s\n%s" %
- (str(exp), str(obs), str(rr)))
-
- for exp in self.floats:
- rr = next(i)
- obs = rr.opt("ZP")
- self.assertEqual(exp, obs,
- "expected %s, got %s\n%s" %
- (str(exp), str(obs), str(rr)))
-
- for exp in [-x for x in self.floats]:
- rr = next(i)
- obs = rr.opt("ZP")
- self.assertEqual(exp, obs, "expected %s, got %s\n%s" %
- (str(exp), str(obs), str(rr)))
-
- def testSAM(self):
- samfile = pysam.Samfile(
- os.path.join(DATADIR, "ex10.sam"),
- "r")
- self.check(samfile)
-
- def testBAM(self):
- samfile = pysam.Samfile(
- os.path.join(DATADIR, "ex10.bam"),
- "rb")
- self.check(samfile)
-
-
-class TestPileup(unittest.TestCase):
-
- '''test pileup functionality.'''
-
- samfilename = "pysam_data/ex1.bam"
- fastafilename = "pysam_data/ex1.fa"
-
- def setUp(self):
-
- self.samfile = pysam.Samfile(self.samfilename)
- self.fastafile = pysam.Fastafile(self.fastafilename)
-
- def checkEqual(self, references, iterator):
-
- for x, column in enumerate(iterator):
- (contig, pos, reference_base,
- read_bases, read_qualities, alignment_mapping_qualities) \
- = references[x][:-1].split("\t")
- self.assertEqual(int(pos) - 1, column.pos)
-
- def testSamtoolsStepper(self):
- refs = force_str(
- pysam.samtools.mpileup(
- "-f", self.fastafilename,
- self.samfilename)).splitlines(True)
- iterator = self.samfile.pileup(
- stepper="samtools",
- fastafile=self.fastafile)
- self.checkEqual(refs, iterator)
-
- def testAllStepper(self):
- refs = force_str(
- pysam.samtools.mpileup(
- "-f", self.fastafilename,
- "-A", "-B",
- self.samfilename)).splitlines(True)
-
- iterator = self.samfile.pileup(
- stepper="all",
- fastafile=self.fastafile)
- self.checkEqual(refs, iterator)
-
-
-class TestLogging(unittest.TestCase):
-
- '''test around bug issue 42,
-
- failed in versions < 0.4
- '''
-
- def check(self, bamfile, log):
-
- if log:
- logger = logging.getLogger('franklin')
- logger.setLevel(logging.INFO)
- formatter = logging.Formatter(
- '%(asctime)s %(levelname)s %(message)s')
- log_hand = logging.FileHandler('log.txt')
- log_hand.setFormatter(formatter)
- logger.addHandler(log_hand)
-
- bam = pysam.Samfile(bamfile, 'rb')
- cols = bam.pileup()
- self.assertTrue(True)
-
- def testFail1(self):
- self.check(os.path.join(DATADIR, "ex9_fail.bam"),
- False)
- self.check(os.path.join(DATADIR, "ex9_fail.bam"),
- True)
-
- def testNoFail1(self):
- self.check(os.path.join(DATADIR, "ex9_nofail.bam"),
- False)
- self.check(os.path.join(DATADIR, "ex9_nofail.bam"),
- True)
-
- def testNoFail2(self):
- self.check(os.path.join(DATADIR, "ex9_nofail.bam"),
- True)
- self.check(os.path.join(DATADIR, "ex9_nofail.bam"),
- True)
-
-# TODOS
-# 1. finish testing all properties within pileup objects
-# 2. check exceptions and bad input problems (missing files, optional fields that aren't present, etc...)
-# 3. check: presence of sequence
-
-
-class TestSamfileUtilityFunctions(unittest.TestCase):
-
- def testCount(self):
-
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- for contig in ("chr1", "chr2"):
- for start in range(0, 2000, 100):
- end = start + 1
- self.assertEqual(
- len(list(samfile.fetch(contig, start, end))),
- samfile.count(contig, start, end),
- 'number mismatch for %s:%i-%i %i != %i' % (
- contig, start, end,
- len(list(samfile.fetch(contig, start, end))),
- samfile.count(contig, start, end)))
-
- # test empty intervals
- self.assertEqual(
- len(list(samfile.fetch(contig, start, start))),
- samfile.count(contig, start, start),
- 'number mismatch for %s:%i-%i %i != %i' % (
- contig, start, start,
- len(list(samfile.fetch(contig, start, start))),
- samfile.count(contig, start, start)))
-
- # test half empty intervals
- self.assertEqual(len(list(samfile.fetch(contig, start))),
- samfile.count(contig, start))
-
- self.assertEqual(
- len(list(samfile.fetch(contig, start))),
- samfile.count(contig, start),
- 'number mismatch for %s:%i %i != %i' % (
- contig, start,
- len(list(samfile.fetch(contig, start))),
- samfile.count(contig, start)))
-
- def testMate(self):
- '''test mate access.'''
-
- with open(os.path.join(DATADIR, "ex1.sam"), "rb") as inf:
- readnames = [x.split(b"\t")[0] for x in inf.readlines()]
- if sys.version_info[0] >= 3:
- readnames = [name.decode('ascii') for name in readnames]
-
- counts = collections.defaultdict(int)
- for x in readnames:
- counts[x] += 1
-
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- for read in samfile.fetch():
- if not read.is_paired:
- self.assertRaises(ValueError, samfile.mate, read)
- elif read.mate_is_unmapped:
- self.assertRaises(ValueError, samfile.mate, read)
- else:
- if counts[read.qname] == 1:
- self.assertRaises(ValueError, samfile.mate, read)
- else:
- mate = samfile.mate(read)
- self.assertEqual(read.qname, mate.qname)
- self.assertEqual(read.is_read1, mate.is_read2)
- self.assertEqual(read.is_read2, mate.is_read1)
- self.assertEqual(read.pos, mate.mpos)
- self.assertEqual(read.mpos, mate.pos)
-
- def testIndexStats(self):
- '''test if total number of mapped/unmapped reads is correct.'''
-
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
- self.assertEqual(samfile.mapped, 3235)
- self.assertEqual(samfile.unmapped, 35)
- self.assertEqual(samfile.nocoordinate, 0)
-
-
-class TestSamtoolsProxy(unittest.TestCase):
-
- '''tests for sanity checking access to samtools functions.'''
-
- def testIndex(self):
- self.assertRaises(IOError, pysam.samtools.index, "missing_file")
-
- def testView(self):
- # note that view still echos "open: No such file or directory"
- self.assertRaises(pysam.SamtoolsError, pysam.samtools.view, "missing_file")
-
- def testSort(self):
- self.assertRaises(pysam.SamtoolsError, pysam.samtools.sort, "missing_file")
-
-
-class TestSamfileIndex(unittest.TestCase):
-
- def testIndex(self):
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
- index = pysam.IndexedReads(samfile)
- index.build()
- reads = collections.defaultdict(int)
-
- for read in samfile:
- reads[read.qname] += 1
-
- for qname, counts in reads.items():
- found = list(index.find(qname))
- self.assertEqual(len(found), counts)
- for x in found:
- self.assertEqual(x.qname, qname)
-
-
-if __name__ == "__main__":
- # build data files
- print ("building data files")
- subprocess.call("make -C %s" % DATADIR, shell=True)
- print ("starting tests")
- unittest.main()
- print ("completed tests")
diff --git a/tests/StreamFiledescriptors_test.py b/tests/StreamFiledescriptors_test.py
index ce59da7..de54de5 100644
--- a/tests/StreamFiledescriptors_test.py
+++ b/tests/StreamFiledescriptors_test.py
@@ -1,4 +1,5 @@
import os
+import sys
import subprocess
import threading
import errno
@@ -6,6 +7,8 @@ import unittest
from pysam import AlignmentFile
+IS_PYTHON2 = sys.version_info[0] == 2
+
DATADIR = os.path.abspath(os.path.join(
os.path.dirname(__file__),
"pysam_data"))
@@ -13,7 +16,7 @@ DATADIR = os.path.abspath(os.path.join(
def alignmentfile_writer_thread(infile, outfile):
def _writer_thread(infile, outfile):
- """read from infile and write to outfile"""
+ """read from infile and write to outfile"""
try:
i = 0
for record in infile:
@@ -41,42 +44,48 @@ class StreamTest(unittest.TestCase):
read += 1
return 0, read
+ @unittest.skipIf(IS_PYTHON2, "no context manager in py2")
def test_text_processing(self):
- proc = subprocess.Popen('head -n200',
- stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- shell=True)
+ with subprocess.Popen('head -n200',
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ shell=True) as proc:
- in_stream = AlignmentFile('pysam_data/ex1.bam')
- out_stream = AlignmentFile(proc.stdin, 'wh', header=in_stream.header)
- writer = alignmentfile_writer_thread(in_stream,
- out_stream)
+ in_stream = AlignmentFile('pysam_data/ex1.bam')
+ out_stream = AlignmentFile(proc.stdin, 'wh', header=in_stream.header)
+ writer = alignmentfile_writer_thread(in_stream,
+ out_stream)
- written, read = self.stream_process(proc,
- in_stream,
- out_stream,
- writer)
- self.assertEqual(read, 198)
+ written, read = self.stream_process(proc,
+ in_stream,
+ out_stream,
+ writer)
+ self.assertEqual(read, 198)
+ @unittest.skip("test contains bug")
def test_samtools_processing(self):
-
- proc = subprocess.Popen('samtools view -b -f 4',
- stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- shell=True)
-
- in_stream = AlignmentFile('pysam_data/ex1.bam')
- out_stream = AlignmentFile(proc.stdin, 'wb', header=in_stream.header)
- writer = alignmentfile_writer_thread(in_stream,
- out_stream)
-
- written, read = self.stream_process(proc,
- in_stream,
- out_stream,
- writer)
- self.assertEqual(read, 35)
-
+
+ # The following test causes the suite to hang
+ # as the stream_processor raises:
+ # ValueError: file has no sequences defined (mode='r') - is it SAM/BAM format?
+ # The whole setup then hangs during exception handling.
+ with subprocess.Popen('samtools view -b -f 4',
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ shell=True) as proc:
+
+ in_stream = AlignmentFile('pysam_data/ex1.bam')
+ out_stream = AlignmentFile(proc.stdin, 'wb', header=in_stream.header)
+ writer = alignmentfile_writer_thread(in_stream,
+ out_stream)
+
+ written, read = self.stream_process(proc,
+ in_stream,
+ out_stream,
+ writer)
+ self.assertEqual(read, 35)
+
if __name__ == "__main__":
unittest.main()
diff --git a/tests/TestUtils.py b/tests/TestUtils.py
index 71ab22a..1168926 100644
--- a/tests/TestUtils.py
+++ b/tests/TestUtils.py
@@ -78,7 +78,6 @@ def check_samtools_view_equal(
'''return true if the two files are equal in their
content through samtools view.
'''
-
# strip MD and NM tags, as not preserved in CRAM files
args = ["-x", "MD", "-x", "NM"]
if not without_header:
@@ -161,8 +160,10 @@ def check_lines_equal(cls, a, b, sort=False, filter_f=None, msg=None):
filter_f:
remover lines in both a and b where expression is True
"""
- aa = openfile(a).readlines()
- bb = openfile(b).readlines()
+ with openfile(a) as inf:
+ aa = inf.readlines()
+ with openfile(b) as inf:
+ bb = inf.readlines()
if filter_f is not None:
aa = [x for x in aa if not filter_f(x)]
@@ -183,3 +184,28 @@ def get_temp_filename(suffix=""):
dir=".")
f.close()
return f.name
+
+
+def load_and_convert(filename, encode=True):
+ '''load data from filename and convert all fields to string.
+
+ Filename can be either plain or compressed (ending in .gz).
+ '''
+ data = []
+ if filename.endswith(".gz"):
+ with gzip.open(filename) as inf:
+ for line in inf:
+ line = line.decode("ascii")
+ if line.startswith("#"):
+ continue
+ d = line.strip().split("\t")
+ data.append(d)
+ else:
+ with open(filename) as f:
+ for line in f:
+ if line.startswith("#"):
+ continue
+ d = line.strip().split("\t")
+ data.append(d)
+
+ return data
diff --git a/tests/VariantFile_test.py b/tests/VariantFile_test.py
index aa82c66..93307e9 100644
--- a/tests/VariantFile_test.py
+++ b/tests/VariantFile_test.py
@@ -10,10 +10,9 @@ try:
except ImportError:
Path = None
-from TestUtils import get_temp_filename, check_lines_equal
+from TestUtils import get_temp_filename, check_lines_equal, load_and_convert
DATADIR="cbcf_data"
-from tabix_test import loadAndConvert
def read_header(filename):
@@ -37,7 +36,7 @@ class TestMissingGenotypes(unittest.TestCase):
filename = "missing_genotypes.vcf"
def setUp(self):
- self.compare = loadAndConvert(
+ self.compare = load_and_convert(
os.path.join(DATADIR, self.filename),
encode=False)
diff --git a/tests/faidx_test.py b/tests/faidx_test.py
index a123550..c87394d 100644
--- a/tests/faidx_test.py
+++ b/tests/faidx_test.py
@@ -222,15 +222,27 @@ class TestRemoteFileFTP(unittest.TestCase):
url = "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa"
-
def testFTPView(self):
if not checkURL(self.url):
return
+
with pysam.Fastafile(self.url) as f:
self.assertEqual(
len(f.fetch("chr1", 0, 1000)),
1000)
+ def test_sequence_lengths_are_available(self):
+ if not checkURL(self.url):
+ return
+
+ with pysam.Fastafile(self.url) as f:
+ self.assertEqual(len(f.references), 3366)
+ self.assertTrue("chr1" in f.references)
+ self.assertEqual(f.lengths[0],
+ 248956422)
+ self.assertEqual(f.get_reference_length("chr1"),
+ 248956422)
+
if __name__ == "__main__":
unittest.main()
diff --git a/tests/samtools_test.py b/tests/samtools_test.py
index aa4c554..7eec832 100644
--- a/tests/samtools_test.py
+++ b/tests/samtools_test.py
@@ -63,7 +63,10 @@ class SamtoolsTest(unittest.TestCase):
"ex1.fa", "ex1.fa.fai",
"ex1.sam.gz",
"ex1.bam", "ex1.bam.bai",
- "ex1.sam", "ex2.bam",
+ "ex1.sam",
+ "ex1.sam",
+ "ex2.bam",
+ "ex2.sam",
"ex1.bed"]
# a list of statements to test
@@ -92,7 +95,7 @@ class SamtoolsTest(unittest.TestCase):
# unknow option
# "rmdup -s ex1.bam %(out)s_ex1.rmdup.bam",
# "merge -f %(out)s_ex1.merge.bam ex1.bam ex1.bam",
- "reheader ex1.sam ex1.bam > %(out)s_ex1.reheader",
+ "reheader ex2.sam ex1.bam > %(out)s_ex1.reheader.bam",
"cat -o %(out)s_ex1.cat.bam ex1.bam ex1.bam",
"targetcut ex1.bam > %(out)s_ex1.targetcut",
"phase ex1.bam > %(out)s_ex1.phase",
@@ -143,7 +146,6 @@ class SamtoolsTest(unittest.TestCase):
files.
'''
-
self.check_version()
if not os.path.exists(WORKDIR):
@@ -158,14 +160,23 @@ class SamtoolsTest(unittest.TestCase):
return
+ def get_command(self, statement, map_to_internal=True):
+ """return samtools command from statement"""
+ parts = statement.split(" ")
+ command = parts[0]
+ if map_to_internal:
+ return self.map_command.get(command, command)
+ else:
+ return command
+
def check_statement(self, statement):
parts = statement.split(" ")
r_samtools = {"out": self.executable}
r_pysam = {"out": "pysam"}
- command = parts[0]
- command = self.map_command.get(command, command)
+ command = self.get_command(statement)
+
# self.assertTrue(command in pysam.SAMTOOLS_DISPATCH)
targets = [x for x in parts if "%(out)s" in x]
@@ -217,9 +228,10 @@ class SamtoolsTest(unittest.TestCase):
check_samtools_view_equal(
s, p, without_header=True),
error_msg)
- check_lines_equal(
- self, s, p,
- filter_f=lambda x: x.startswith("#"),
+ else:
+ check_lines_equal(
+ self, s, p,
+ filter_f=lambda x: x.startswith("#"),
msg=error_msg)
def testStatements(self):
@@ -232,6 +244,22 @@ class SamtoolsTest(unittest.TestCase):
continue
self.check_statement(statement)
+ @unittest.skipIf(sys.platform == "darwin", "not supported, pattern does not match")
+ def testUsage(self):
+ if self.executable == "bcftools":
+ # bcftools usage messages end with exit(1)
+ return
+
+ for statement in self.statements:
+ command = self.get_command(statement, map_to_internal=False)
+ if command == "bam2fq":
+ continue
+ mapped_command = self.get_command(statement, map_to_internal=True)
+ pysam_method = getattr(self.module, mapped_command)
+ usage_msg = pysam_method.usage()
+ expected = "Usage:\s+{} {}".format(self.executable, command)
+ self.assertTrue(re.search(expected, usage_msg) is not None)
+
def tearDown(self):
if os.path.exists(WORKDIR):
shutil.rmtree(WORKDIR)
@@ -342,7 +370,8 @@ class BcftoolsTest(SamtoolsTest):
# "filter -s A ex1.vcf.gz > %(out)s_ex1.filter",
# exit
# "gtcheck -s A ex1.vcf.gz > %(out)s_ex1.gtcheck",
- "roh -s A ex1.vcf.gz > %(out)s_ex1.roh",
+ # segfauld, used to work wit bcftools 1.3
+ # "roh -s A ex1.vcf.gz > %(out)s_ex1.roh",
"stats ex1.vcf.gz > %(out)s_ex1.stats",
]
diff --git a/tests/tabix_data/example.gff2.gz b/tests/tabix_data/example.gff2.gz
new file mode 100644
index 0000000..4084a74
Binary files /dev/null and b/tests/tabix_data/example.gff2.gz differ
diff --git a/tests/tabix_data/example.gff2.gz.tbi b/tests/tabix_data/example.gff2.gz.tbi
new file mode 100644
index 0000000..30d39ae
Binary files /dev/null and b/tests/tabix_data/example.gff2.gz.tbi differ
diff --git a/tests/tabix_data/example.gff3.gz b/tests/tabix_data/example.gff3.gz
new file mode 100644
index 0000000..b42b41b
Binary files /dev/null and b/tests/tabix_data/example.gff3.gz differ
diff --git a/tests/tabix_data/example.gff3.gz.tbi b/tests/tabix_data/example.gff3.gz.tbi
new file mode 100644
index 0000000..855e139
Binary files /dev/null and b/tests/tabix_data/example.gff3.gz.tbi differ
diff --git a/tests/tabix_test.py b/tests/tabix_test.py
index ec1e37e..87de282 100644
--- a/tests/tabix_test.py
+++ b/tests/tabix_test.py
@@ -14,7 +14,7 @@ import unittest
import glob
import re
import copy
-from TestUtils import checkURL
+from TestUtils import checkURL, load_and_convert
DATADIR = 'tabix_data'
@@ -35,31 +35,6 @@ def myzip_open(infile, mode="r"):
return gzip.open(mode)
-def loadAndConvert(filename, encode=True):
- '''load data from filename and convert all fields to string.
-
- Filename can be either plain or compressed (ending in .gz).
- '''
- data = []
- if filename.endswith(".gz"):
- with gzip.open(filename) as inf:
- for line in inf:
- line = line.decode("ascii")
- if line.startswith("#"):
- continue
- d = line.strip().split("\t")
- data.append(d)
- else:
- with open(filename) as f:
- for line in f:
- if line.startswith("#"):
- continue
- d = line.strip().split("\t")
- data.append(d)
-
- return data
-
-
def splitToBytes(s):
'''split string and return list of bytes.'''
return [x.encode("ascii") for x in s.split("\t")]
@@ -396,150 +371,8 @@ class TestIterationWithComments(TestIterationWithoutComments):
TestIterationWithoutComments.setUp(self)
-class TestParser(unittest.TestCase):
-
- filename = os.path.join(DATADIR, "example.gtf.gz")
-
- def setUp(self):
-
- self.tabix = pysam.TabixFile(self.filename)
- self.compare = loadAndConvert(self.filename)
-
- def tearDown(self):
- self.tabix.close()
-
- def testRead(self):
-
- for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
- c = self.compare[x]
- self.assertEqual(c, list(r))
- self.assertEqual(len(c), len(r))
-
- # test indexing
- for y in range(0, len(r)):
- self.assertEqual(c[y], r[y])
-
- # test slicing access
- for y in range(0, len(r) - 1):
- for cc in range(y + 1, len(r)):
- self.assertEqual(c[y:cc],
- r[y:cc])
- self.assertEqual("\t".join(map(str, c)),
- str(r))
-
- def testWrite(self):
-
- for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
- self.assertEqual(self.compare[x], list(r))
- c = list(r)
- for y in range(len(r)):
- r[y] = "test_%05i" % y
- c[y] = "test_%05i" % y
- self.assertEqual([x for x in c], list(r))
- self.assertEqual("\t".join(c), str(r))
- # check second assignment
- for y in range(len(r)):
- r[y] = "test_%05i" % y
- self.assertEqual([x for x in c], list(r))
- self.assertEqual("\t".join(c), str(r))
-
- def testUnset(self):
- for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
- self.assertEqual(self.compare[x], list(r))
- c = list(r)
- e = list(r)
- for y in range(len(r)):
- r[y] = None
- c[y] = None
- e[y] = ""
- self.assertEqual(c, list(r))
- self.assertEqual("\t".join(e), str(r))
-
- def testIteratorCompressed(self):
- '''test iteration from compressed file.'''
- with gzip.open(self.filename) as infile:
- for x, r in enumerate(pysam.tabix_iterator(
- infile, pysam.asTuple())):
- self.assertEqual(self.compare[x], list(r))
- self.assertEqual(len(self.compare[x]), len(r))
-
- # test indexing
- for c in range(0, len(r)):
- self.assertEqual(self.compare[x][c], r[c])
-
- # test slicing access
- for c in range(0, len(r) - 1):
- for cc in range(c + 1, len(r)):
- self.assertEqual(self.compare[x][c:cc],
- r[c:cc])
-
- def testIteratorUncompressed(self):
- '''test iteration from uncompressed file.'''
- tmpfilename = 'tmp_testIteratorUncompressed'
- with gzip.open(self.filename, "rb") as infile, \
- open(tmpfilename, "wb") as outfile:
- outfile.write(infile.read())
-
- with open(tmpfilename) as infile:
- for x, r in enumerate(pysam.tabix_iterator(
- infile, pysam.asTuple())):
- self.assertEqual(self.compare[x], list(r))
- self.assertEqual(len(self.compare[x]), len(r))
-
- # test indexing
- for c in range(0, len(r)):
- self.assertEqual(self.compare[x][c], r[c])
-
- # test slicing access
- for c in range(0, len(r) - 1):
- for cc in range(c + 1, len(r)):
- self.assertEqual(self.compare[x][c:cc],
- r[c:cc])
-
- os.unlink(tmpfilename)
-
- def testCopy(self):
- a = self.tabix.fetch(parser=pysam.asTuple()).next()
- b = copy.copy(a)
- self.assertEqual(a, b)
-
- a = self.tabix.fetch(parser=pysam.asGTF()).next()
- b = copy.copy(a)
- self.assertEqual(a, b)
-
-
-class TestGTF(TestParser):
-
- def testRead(self):
-
- for x, r in enumerate(self.tabix.fetch(parser=pysam.asGTF())):
- c = self.compare[x]
- self.assertEqual(len(c), len(r))
- self.assertEqual(list(c), list(r))
- self.assertEqual(c, str(r).split("\t"))
- self.assertTrue(r.gene_id.startswith("ENSG"))
- if r.feature != 'gene':
- self.assertTrue(r.transcript_id.startswith("ENST"))
- self.assertEqual(c[0], r.contig)
- self.assertEqual("\t".join(map(str, c)),
- str(r))
-
- def testSetting(self):
-
- for r in self.tabix.fetch(parser=pysam.asGTF()):
- r.contig = r.contig + "_test"
- r.source = r.source + "_test"
- r.feature = r.feature + "_test"
- r.start += 10
- r.end += 10
- r.score = 20
- r.strand = "+"
- r.frame = 0
- r.attributes = 'gene_id "0001";'
-
-
+
class TestIterators(unittest.TestCase):
-
filename = os.path.join(DATADIR, "example.gtf.gz")
iterator = pysam.tabix_generic_iterator
@@ -549,7 +382,7 @@ class TestIterators(unittest.TestCase):
def setUp(self):
self.tabix = pysam.TabixFile(self.filename)
- self.compare = loadAndConvert(self.filename)
+ self.compare = load_and_convert(self.filename)
self.tmpfilename_uncompressed = 'tmp_TestIterators'
with gzip.open(self.filename, "rb") as infile, \
open(self.tmpfilename_uncompressed, "wb") as outfile:
@@ -622,7 +455,6 @@ class TestIterationMalformattedGTFFiles(unittest.TestCase):
'''test reading from malformatted gtf files.'''
- parser = pysam.asGTF
iterator = pysam.tabix_generic_iterator
parser = pysam.asGTF
@@ -653,7 +485,7 @@ class TestBed(unittest.TestCase):
def setUp(self):
self.tabix = pysam.TabixFile(self.filename)
- self.compare = loadAndConvert(self.filename)
+ self.compare = load_and_convert(self.filename)
def tearDown(self):
self.tabix.close()
@@ -751,7 +583,7 @@ class TestVCFFromTabix(TestVCF):
TestVCF.setUp(self)
self.tabix = pysam.TabixFile(self.tmpfilename + ".gz")
- self.compare = loadAndConvert(self.filename)
+ self.compare = load_and_convert(self.filename)
def tearDown(self):
self.tabix.close()
@@ -858,42 +690,44 @@ class TestVCFFromVCF(TestVCF):
TestVCF.setUp(self)
self.vcf = pysam.VCF()
- self.compare = loadAndConvert(self.filename, encode=False)
+ self.compare = load_and_convert(self.filename, encode=False)
def tearDown(self):
self.vcf.close()
- def testConnecting(self):
+ def open_vcf(self, fn):
+ return self.vcf.connect(fn)
+
+ def get_failure_stage(self):
fn = os.path.basename(self.filename)
for x, msg in self.fail_on_opening:
- if "%i.vcf" % x == fn:
- self.assertRaises(ValueError,
- self.vcf.connect,
- self.tmpfilename + ".gz")
- else:
- self.vcf.connect(self.tmpfilename + ".gz")
+ if "{}.vcf".format(x) == fn:
+ return "opening"
+
+ for x, msg in self.fail_on_parsing:
+ if "{}.vcf".format(x) == fn:
+ return "parsing"
+
+ for x, msg in self.fail_on_samples:
+ if "{}.vcf".format(x) == fn:
+ return "samples"
+
+ return None
+
+ def testConnecting(self):
+
+ if self.get_failure_stage() == "opening":
+ self.assertRaises(ValueError,
+ self.open_vcf,
+ self.tmpfilename + ".gz")
+ else:
+ self.open_vcf(self.tmpfilename + ".gz")
def get_iterator(self):
with open(self.filename) as f:
fn = os.path.basename(self.filename)
-
- for x, msg in self.fail_on_opening:
- if "%i.vcf" % x == fn:
- self.assertRaises(ValueError, self.vcf.parse, f)
- return
-
- for vcf_code, msg in self.fail_on_parsing:
- if "%i.vcf" % vcf_code == fn:
- self.assertRaises((ValueError,
- AssertionError),
- list, self.vcf.parse(f))
- return
- # python 2.7
- # self.assertRaisesRegexp(
- # ValueError, re.compile(msg), self.vcf.parse, f)
-
return list(self.vcf.parse(f))
def get_field_value(self, record, field):
@@ -918,22 +752,15 @@ class TestVCFFromVCF(TestVCF):
def testParsing(self):
+ if self.get_failure_stage() in ("opening", "parsing"):
+ return
+
itr = self.get_iterator()
if itr is None:
return
fn = os.path.basename(self.filename)
- for vcf_code, msg in self.fail_on_parsing:
- if "%i.vcf" % vcf_code == fn:
- self.assertRaises((ValueError,
- AssertionError),
- list, itr)
- return
- # python 2.7
- # self.assertRaisesRegexp(
- # ValueError, re.compile(msg), self.vcf.parse, f)
-
check_samples = self.check_samples
for vcf_code, msg in self.fail_on_samples:
if "%i.vcf" % vcf_code == fn:
@@ -1079,8 +906,14 @@ class TestVCFFromVariantFile(TestVCFFromVCF):
"ref", "alts", "qual",
"filter", "info", "format")
- fail_on_parsing = []
- fail_on_opening = []
+ fail_on_parsing = [
+ (24, "Could not parse the header, sample line not found"),
+ ("issue85", "empty VCF"),
+ ]
+ fail_on_opening = [
+ (24, "Could not parse the header, sample line not found"),
+ ("issue85", "empty VCF"),
+ ]
coordinate_offset = 0
check_samples = True
fail_on_samples = [
@@ -1134,7 +967,7 @@ class TestVCFFromVariantFile(TestVCFFromVCF):
def setUp(self):
TestVCF.setUp(self)
- self.compare = loadAndConvert(self.filename, encode=False)
+ self.compare = load_and_convert(self.filename, encode=False)
def tearDown(self):
if self.vcf:
@@ -1148,9 +981,14 @@ class TestVCFFromVariantFile(TestVCFFromVCF):
def get_field_value(self, record, field):
return getattr(record, field)
+ def open_vcf(self, fn):
+ with pysam.VariantFile(fn) as inf:
+ pass
+
for vcf_file in vcf_files:
- n = "TestVCFFromVariantFile_%s" % os.path.basename(vcf_file[:-4])
+ p = os.path.basename(vcf_file[:-4])
+ n = "TestVCFFromVariantFile_%s" % p
globals()[n] = type(n, (TestVCFFromVariantFile,), dict(filename=vcf_file,))
@@ -1241,7 +1079,7 @@ class TestBackwardsCompatibility(unittest.TestCase):
def check(self, filename, raises=None):
with pysam.TabixFile(filename) as tf:
- ref = loadAndConvert(filename)
+ ref = load_and_convert(filename)
if raises is None:
self.assertEqual(len(list(tf.fetch())), len(ref))
else:
diff --git a/tests/tabixproxies_test.py b/tests/tabixproxies_test.py
new file mode 100644
index 0000000..cff0e59
--- /dev/null
+++ b/tests/tabixproxies_test.py
@@ -0,0 +1,318 @@
+import unittest
+import pysam
+import os
+import sys
+import re
+import copy
+import gzip
+from TestUtils import load_and_convert
+
+DATADIR = 'tabix_data'
+
+
+class TestParser(unittest.TestCase):
+
+ filename = os.path.join(DATADIR, "example.gtf.gz")
+
+ def setUp(self):
+
+ self.tabix = pysam.TabixFile(self.filename)
+ self.compare = load_and_convert(self.filename)
+
+ def tearDown(self):
+ self.tabix.close()
+
+ def testRead(self):
+
+ for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
+ c = self.compare[x]
+ self.assertEqual(c, list(r))
+ self.assertEqual(len(c), len(r))
+
+ # test indexing
+ for y in range(0, len(r)):
+ self.assertEqual(c[y], r[y])
+
+ # test slicing access
+ for y in range(0, len(r) - 1):
+ for cc in range(y + 1, len(r)):
+ self.assertEqual(c[y:cc],
+ r[y:cc])
+ self.assertEqual("\t".join(map(str, c)),
+ str(r))
+
+ def testWrite(self):
+
+ for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
+ self.assertEqual(self.compare[x], list(r))
+ c = list(r)
+ for y in range(len(r)):
+ r[y] = "test_%05i" % y
+ c[y] = "test_%05i" % y
+ self.assertEqual([x for x in c], list(r))
+ self.assertEqual("\t".join(c), str(r))
+ # check second assignment
+ for y in range(len(r)):
+ r[y] = "test_%05i" % y
+ self.assertEqual([x for x in c], list(r))
+ self.assertEqual("\t".join(c), str(r))
+
+ def testUnset(self):
+ for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
+ self.assertEqual(self.compare[x], list(r))
+ c = list(r)
+ e = list(r)
+ for y in range(len(r)):
+ r[y] = None
+ c[y] = None
+ e[y] = ""
+ self.assertEqual(c, list(r))
+ self.assertEqual("\t".join(e), str(r))
+
+ def testIteratorCompressed(self):
+ '''test iteration from compressed file.'''
+ with gzip.open(self.filename) as infile:
+ for x, r in enumerate(pysam.tabix_iterator(
+ infile, pysam.asTuple())):
+ self.assertEqual(self.compare[x], list(r))
+ self.assertEqual(len(self.compare[x]), len(r))
+
+ # test indexing
+ for c in range(0, len(r)):
+ self.assertEqual(self.compare[x][c], r[c])
+
+ # test slicing access
+ for c in range(0, len(r) - 1):
+ for cc in range(c + 1, len(r)):
+ self.assertEqual(self.compare[x][c:cc],
+ r[c:cc])
+
+ def testIteratorUncompressed(self):
+ '''test iteration from uncompressed file.'''
+ tmpfilename = 'tmp_testIteratorUncompressed'
+ with gzip.open(self.filename, "rb") as infile, \
+ open(tmpfilename, "wb") as outfile:
+ outfile.write(infile.read())
+
+ with open(tmpfilename) as infile:
+ for x, r in enumerate(pysam.tabix_iterator(
+ infile, pysam.asTuple())):
+ self.assertEqual(self.compare[x], list(r))
+ self.assertEqual(len(self.compare[x]), len(r))
+
+ # test indexing
+ for c in range(0, len(r)):
+ self.assertEqual(self.compare[x][c], r[c])
+
+ # test slicing access
+ for c in range(0, len(r) - 1):
+ for cc in range(c + 1, len(r)):
+ self.assertEqual(self.compare[x][c:cc],
+ r[c:cc])
+
+ os.unlink(tmpfilename)
+
+ def testCopy(self):
+ a = self.tabix.fetch(parser=pysam.asTuple()).next()
+ b = copy.copy(a)
+ self.assertEqual(a, b)
+
+ a = self.tabix.fetch(parser=pysam.asGTF()).next()
+ b = copy.copy(a)
+ self.assertEqual(a, b)
+
+
+class TestGTF(TestParser):
+
+ parser = pysam.asGTF
+
+ def testRead(self):
+
+ for x, r in enumerate(self.tabix.fetch(parser=self.parser())):
+ c = self.compare[x]
+ self.assertEqual(len(c), len(r))
+ self.assertEqual(list(c), list(r))
+ self.assertEqual(c, str(r).split("\t"))
+ self.assertTrue(r.gene_id.startswith("ENSG"))
+ if r.feature != 'gene':
+ self.assertTrue(r.transcript_id.startswith("ENST"))
+ self.assertEqual(c[0], r.contig)
+ self.assertEqual("\t".join(map(str, c)),
+ str(r))
+
+ def testSetting(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+
+ r.contig = r.contig + "_test_contig"
+ r.source = r.source + "_test_source"
+ r.feature = r.feature + "_test_feature"
+ r.start += 10
+ r.end += 10
+ r.score = 20
+ r.strand = "+"
+ r.frame = 0
+ r.attributes = 'gene_id "0001";'
+ r.transcript_id = "0002"
+ sr = str(r)
+ self.assertTrue("_test_contig" in sr)
+ self.assertTrue("_test_source" in sr)
+ self.assertTrue("_test_feature" in sr)
+ self.assertTrue("gene_id \"0001\"" in sr)
+ self.assertTrue("transcript_id \"0002\"" in sr)
+
+ def test_added_attribute_is_output(self):
+ r = self.tabix.fetch(parser=self.parser()).next()
+
+ r.new_int_attribute = 12
+ self.assertTrue("new_int_attribute 12" in str(r).split("\t")[8])
+
+ r.new_float_attribute = 12.0
+ self.assertTrue("new_float_attribute 12.0" in str(r).split("\t")[8])
+
+ r.new_text_attribute = "abc"
+ self.assertTrue("new_text_attribute \"abc\"" in str(r).split("\t")[8])
+
+ def test_setting_start_is_one_based(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.start = 1800
+ self.assertEqual(r.start, 1800)
+ self.assertEqual(str(r).split("\t")[3], "1801")
+
+ def test_setting_end_is_one_based(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.end = 2100
+ self.assertEqual(r.end, 2100)
+ self.assertEqual(str(r).split("\t")[4], "2100")
+
+ def test_setting_frame_to_none_produces_dot(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.frame = None
+ self.assertEqual(str(r).split("\t")[7], ".")
+
+ r.frame = 2
+ self.assertEqual(str(r).split("\t")[7], "2")
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.frame = "."
+ self.assertEqual(r.frame, None)
+ self.assertEqual(str(r).split("\t")[7], ".")
+
+ def test_setting_source_to_none_produces_dot(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.source = None
+ self.assertEqual(str(r).split("\t")[1], ".")
+
+ r.source = "source"
+ self.assertEqual(str(r).split("\t")[1], "source")
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.source = "."
+ self.assertEqual(r.source, None)
+ self.assertEqual(str(r).split("\t")[1], ".")
+
+ def test_setting_feature_to_none_produces_dot(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.feature = None
+ self.assertEqual(str(r).split("\t")[2], ".")
+
+ r.feature = "feature"
+ self.assertEqual(str(r).split("\t")[2], "feature")
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.feature = "."
+ self.assertEqual(r.feature, None)
+ self.assertEqual(str(r).split("\t")[2], ".")
+
+ def test_setting_strand_to_none_produces_dot(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.strand = None
+ self.assertEqual(str(r).split("\t")[6], ".")
+
+ r.strand = "-"
+ self.assertEqual(str(r).split("\t")[6], "-")
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.strand = "."
+ self.assertEqual(r.strand, None)
+ self.assertEqual(str(r).split("\t")[6], ".")
+
+ def test_setting_score_to_none_produces_dot(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.score = None
+ self.assertEqual(str(r).split("\t")[5], ".")
+
+ r.score = 12.0
+ self.assertEqual(str(r).split("\t")[5], "12.0")
+
+ r.score = -12.0
+ self.assertEqual(str(r).split("\t")[5], "-12.0")
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.score = "."
+ self.assertEqual(r.score, None)
+ self.assertEqual(str(r).split("\t")[5], ".")
+
+ r.score = 12
+ self.assertEqual(str(r).split("\t")[5], "12")
+
+ r.score = -12
+ self.assertEqual(str(r).split("\t")[5], "-12")
+
+
+class TestGFF3(TestGTF):
+
+ parser = pysam.asGFF3
+ filename = os.path.join(DATADIR, "example.gff3.gz")
+
+ def testRead(self):
+ for x, r in enumerate(self.tabix.fetch(parser=self.parser())):
+ c = self.compare[x]
+ self.assertEqual(len(c), len(r))
+ self.assertEqual(list(c), list(r))
+ self.assertEqual(c, str(r).split("\t"))
+ self.assertEqual(c[0], r.contig)
+ self.assertEqual("\t".join(map(str, c)),
+ str(r))
+ self.assertTrue(r.ID.startswith("MI00"))
+
+ def testSetting(self):
+
+ for r in self.tabix.fetch(parser=self.parser()):
+ r.contig = r.contig + "_test_contig"
+ r.source = "test_source"
+ r.feature = "test_feature"
+ r.start += 10
+ r.end += 10
+ r.score = 20
+ r.strand = "+"
+ r.frame = 0
+ r.ID="test"
+ sr = str(r)
+ self.assertTrue("test_contig" in sr)
+ self.assertTrue("test_source" in sr)
+ self.assertTrue("test_feature" in sr)
+ self.assertTrue("ID=test" in sr)
+
+ def test_added_attribute_is_output(self):
+ r = self.tabix.fetch(parser=self.parser()).next()
+
+ r.new_int_attribute = 12
+ self.assertTrue("new_int_attribute=12" in str(r).split("\t")[8])
+
+ r.new_float_attribute = 12.0
+ self.assertTrue("new_float_attribute=12.0" in str(r).split("\t")[8])
+
+ r.new_text_attribute = "abc"
+ self.assertTrue("new_text_attribute=abc" in str(r).split("\t")[8])
+
+
+if __name__ == "__main__":
+ unittest.main()
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pysam.git
More information about the debian-med-commit
mailing list