[med-svn] [python-pysam] 01/05: New upstream version 0.13.0+ds
Andreas Tille
tille at debian.org
Wed Dec 13 13:09:29 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository python-pysam.
commit f3bd90ac692a53de9dee562e82a11c5cda33b542
Author: Andreas Tille <tille at debian.org>
Date: Wed Dec 13 12:59:12 2017 +0100
New upstream version 0.13.0+ds
---
.travis.yml | 2 +
INSTALL | 117 ++++--
MANIFEST.in | 5 +
doc/release.rst => NEWS | 19 +
bcftools/consensus.c | 129 ++++--
bcftools/consensus.c.pysam.c | 129 ++++--
bcftools/csq.c | 133 ++++---
bcftools/csq.c.pysam.c | 133 ++++---
bcftools/filter.c | 652 ++++++++++++++++++++++---------
bcftools/filter.c.pysam.c | 652 ++++++++++++++++++++++---------
bcftools/kheap.h | 5 +-
bcftools/main.c | 5 +
bcftools/main.c.pysam.c | 5 +
bcftools/mpileup.c | 9 +-
bcftools/mpileup.c.pysam.c | 9 +-
bcftools/vcfcnv.c | 6 +
bcftools/vcfcnv.c.pysam.c | 6 +
bcftools/vcfconvert.c | 5 +-
bcftools/vcfconvert.c.pysam.c | 5 +-
bcftools/vcfindex.c | 7 +
bcftools/vcfindex.c.pysam.c | 7 +
bcftools/vcfisec.c | 14 +-
bcftools/vcfisec.c.pysam.c | 14 +-
bcftools/vcfmerge.c | 14 +-
bcftools/vcfmerge.c.pysam.c | 14 +-
bcftools/vcfnorm.c | 18 +
bcftools/vcfnorm.c.pysam.c | 18 +
bcftools/vcfquery.c | 17 +
bcftools/vcfquery.c.pysam.c | 17 +
bcftools/vcfsort.c | 306 +++++++++++++++
bcftools/vcfsort.c.pysam.c | 308 +++++++++++++++
bcftools/vcfstats.c | 63 ++-
bcftools/vcfstats.c.pysam.c | 63 ++-
bcftools/version.h | 2 +-
doc/installation.rst | 23 ++
doc/release.rst | 11 +
import.py | 2 +
pysam/libcalignedsegment.pyx | 380 ++++++++++++------
pysam/libcalignmentfile.pxd | 4 +-
pysam/libcalignmentfile.pyx | 2 +-
pysam/libctabix.pyx | 99 +++--
pysam/libctabixproxies.pyx | 1 +
pysam/version.py | 8 +-
run_tests_travis.sh | 11 +-
samtools/bam.h | 2 +-
samtools/bam_lpileup.c | 1 +
samtools/bam_lpileup.c.pysam.c | 1 +
samtools/bam_markdup.c | 844 +++++++++++++++++++++++++++++++++++++++
samtools/bam_markdup.c.pysam.c | 846 ++++++++++++++++++++++++++++++++++++++++
samtools/bam_mate.c | 50 ++-
samtools/bam_mate.c.pysam.c | 50 ++-
samtools/bam_plcmd.c | 22 ++
samtools/bam_plcmd.c.pysam.c | 22 ++
samtools/bam_reheader.c.pysam.c | 2 +-
samtools/bam_sort.c | 494 +++++++++++++++++------
samtools/bam_sort.c.pysam.c | 494 +++++++++++++++++------
samtools/bamtk.c | 15 +
samtools/bamtk.c.pysam.c | 22 +-
samtools/bedidx.c | 4 -
samtools/bedidx.c.pysam.c | 4 -
samtools/dict.c | 4 +
samtools/dict.c.pysam.c | 4 +
samtools/padding.c | 31 +-
samtools/padding.c.pysam.c | 31 +-
samtools/phase.c | 1 +
samtools/phase.c.pysam.c | 1 +
samtools/sam_view.c | 33 +-
samtools/sam_view.c.pysam.c | 33 +-
samtools/version.h | 2 +-
setup.py | 12 +-
tests/AlignedSegment_test.py | 261 +++++++++++--
tests/AlignmentFile_test.py | 137 +------
tests/TestUtils.py | 10 +-
tests/linking_test.py | 16 +-
tests/samtools_test.py | 16 +-
tests/tabix_test.py | 53 ++-
tests/tabixproxies_test.py | 22 +-
77 files changed, 5692 insertions(+), 1297 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index bfc5d1c..f874a90 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,6 +11,8 @@ env:
- CONDA_PY=3.4
- CONDA_PY=3.5
- CONDA_PY=3.6
+ global:
+ - PYSAM_LINKING_TEST=1
addons:
apt:
diff --git a/INSTALL b/INSTALL
index a1edd45..9636125 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,57 +1,102 @@
-System Requirements
-===================
+An online version of the installation instructions can be found here:
+http://pysam.readthedocs.io/en/latest/installation.html
-SAMtools depends on the zlib library <http://www.zlib.net>. The latest
-version 1.2.3 is preferred and with the latest version you can compile
-razip and use it to compress a FASTA file. SAMtools' faidx is able to
-index a razip-compressed FASTA file to save diskspace. Older zlib also
-works with SAMtools, but razip cannot be compiled.
+================
+Installing pysam
+================
-The text-based viewer (tview) requires the GNU ncurses library
-<http://www.gnu.org/software/ncurses/>, which comes with Mac OS X and
-most of the modern Linux/Unix distributions. If you do not have this
-library installed, you can still compile the rest of SAMtools by
-manually modifying one line in Makefile.
+Pysam can be installed through conda_, pypi_ and from the repository.
+The recommended way to install pysam is through conda/bioconda.
-curl
+Conda installation
+==================
-Pysam requires Python (2.7 or greater) and Cython (0.22 or greater).
-It has not been tested on many other platforms.
+To install pysam in your current conda_ environment, type::
-Windows support does not work yet.
+ conda config --add channels r
+ conda config --add channels bioconda
+ conda install pysam
-Compilation
-===========
+This will install pysam from the bioconda_ channel and automatically
+makes sure that dependencies are installed. Also, compilation flags
+will be set automatically, which will potentially save a lot of
+trouble on OS X.
-Unpack the distribution and enter the pysam directory. Type
+Pypi installation
+=================
-python setup.py build
+Pysam provides a python interface to the functionality contained
+within the htslib_ C library. There are two ways that these two
+can be combined, ``builtin`` and ``external``.
-to compile.
+Builtin
+-------
-Installation
-============
+The typical installation will be through pypi_::
+
+ pip install pysam
+
+This will compile the ``builtin`` htslib source code within pysam.
+
+htslib_ can be configured at compilation to turn on additional
+features such support using encrypted configurations, enable plugins,
+and more. See the htslib_ project for more information on these.
+
+Pysam will attempt to configure htslib_ to turn on some advanced
+features. If these fail, for example due to missing library
+dependencies (`libcurl`, `libcrypto`), it will fall back to
+conservative defaults.
+
+Options can be passed to the configure script explicitely by
+setting the environment variable `HTSLIB_CONFIGURE_OPTIONS`.
+For example::
-Type
+ export HTSLIB_CONFIGURE_OPTIONS=--enable-plugins
+ pip install pysam
- python setup.py install
+External
+--------
-to install it within the site-packages directory of your python
-distribution. Type
+pysam can be combined with an externally installed htslib_
+library. This is a good way to avoid duplication of libraries. To link
+against an externally installed library, set the environment variables
+`HTSLIB_LIBRARY_DIR` and `HTSLIB_INCLUDE_DIR` before installing::
- python setup.py install --help
+ export HTSLIB_LIBRARY_DIR=/usr/local/lib
+ export HTSLIB_INCLUDE_DIR=/usr/local/include
+ pip install pysam
-for more options.
+Note that the location of the file :file:`libhts.so` needs to be known
+to the linker once you run pysam, for example by setting the
+environment-varirable `LD_LIBRARY_PATH`.
-Build the documentation
-=======================
+Note that generally the pysam and htslib version need to be compatible. See
+the release notes for more information.
-Install a version of Sphinx that matches your Python version (2 or 3) and run
+Installation from repository
+============================
- python setup.py build_sphinx
+pysam depends on cython_ to provide the connectivity to the htslib_ C
+library. The installation of the source tarball (:file:`.tar.gz`)
+contains pre-built C-files and cython needs not be present
+during installation. However, when installing from the repository,
+cython needs to be installed beforehand.
+
+To install from repository, type::
+
+ python setup.py install
+
+For compilation options, see the section on Pypi installation above.
+
+Requirements
+============
-or
+Depending on the installation method, requirements for building pysam differ.
- python3 setup.py build_sphinx
+When installing through conda_, dependencies will be resolved by the
+package manager. The pip_ installation and installation from source
+require a C compiler and its standard libraries as well as all
+requirements for building htslib. Htslib requirements are listed in
+the htslib/INSTALL file.
-The documentation will be put into build/sphinx.
+Installing from the repository will require cython_ to be installed.
diff --git a/MANIFEST.in b/MANIFEST.in
index 20b7777..4c431ec 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -5,6 +5,7 @@
#
include MANIFEST.in
include COPYING
+include NEWS
include INSTALL
include KNOWN_BUGS
include THANKS
@@ -31,6 +32,8 @@ exclude bcftools/config.h
# htslib
include htslib/*.c
include htslib/*.h
+include htslib/INSTALL
+include htslib/NEWS
exclude htslib/config.h
include htslib/Makefile
include htslib/htslib_vars.mk
@@ -41,6 +44,8 @@ include htslib/htslib.pc.in
include htslib/htslib/*.h
include htslib/cram/*.c
include htslib/cram/*.h
+include htslib/win/*.c
+include htslib/win/*.h
include cy_build.py
include pysam.py
include requirements.txt
diff --git a/doc/release.rst b/NEWS
similarity index 96%
copy from doc/release.rst
copy to NEWS
index 18af4ad..528d750 100644
--- a/doc/release.rst
+++ b/NEWS
@@ -1,7 +1,26 @@
+An online version of the installation instructions can be found here:
+http://pysam.readthedocs.io/en/latest/release.html
+
=============
Release notes
=============
+Release 0.13.0
+===============
+
+This release wraps htslib/samtools/bcftools versions 1.6.0 and
+contains a series of bugfixes.
+
+* [#544] reading header from remote TabixFiles now works.
+* [#531] add missing tag types H and A. A python float will now be
+ added as 'f' type instead of 'd' type.
+* [#543] use FastaFile instead of Fastafile in pileup.
+* [#546] set is_modified flag in setAttribute so updated attributes
+ are output.
+* [#537] allow tabix index files to be created in a custom location.
+* [#530] add get_index_statistics() method
+
+
Release 0.12.0.1
================
diff --git a/bcftools/consensus.c b/bcftools/consensus.c
index 258ef14..544eca6 100644
--- a/bcftools/consensus.c
+++ b/bcftools/consensus.c
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2014 Genome Research Ltd.
+ Copyright (c) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -39,6 +39,16 @@
#include "regidx.h"
#include "bcftools.h"
#include "rbuf.h"
+#include "filter.h"
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+#define PICK_REF 1
+#define PICK_ALT 2
+#define PICK_LONG 4
+#define PICK_SHORT 8
typedef struct
{
@@ -75,12 +85,16 @@ typedef struct
chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
// Note that the chain is re-initialised for each chromosome/seq_region
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+
bcf_srs_t *files;
bcf_hdr_t *hdr;
FILE *fp_out;
FILE *fp_chain;
char **argv;
- int argc, output_iupac, haplotype, isample;
+ int argc, output_iupac, haplotype, allele, isample;
char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname;
}
args_t;
@@ -195,7 +209,7 @@ static void init_data(args_t *args)
args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
if ( args->isample<0 ) error("No such sample: %s\n", args->sample);
}
- if ( args->haplotype && args->isample<0 )
+ if ( (args->haplotype || args->allele) && args->isample<0 )
{
if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
args->isample = 0;
@@ -220,10 +234,14 @@ static void init_data(args_t *args)
if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
}
else args->fp_out = stdout;
+ if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records\n");
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
}
static void destroy_data(args_t *args)
{
+ if (args->filter) filter_destroy(args->filter);
bcf_sr_destroy(args->files);
int i;
for (i=0; i<args->vcf_rbuf.m; i++)
@@ -287,9 +305,16 @@ static bcf1_t **next_vcf_line(args_t *args)
int i = rbuf_shift(&args->vcf_rbuf);
return &args->vcf_buf[i];
}
- else if ( bcf_sr_next_line(args->files) )
+ while ( bcf_sr_next_line(args->files) )
+ {
+ if ( args->filter )
+ {
+ int is_ok = filter_test(args->filter, bcf_sr_get_line(args->files,0), NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1;
+ if ( !is_ok ) continue;
+ }
return &args->files->readers[0].buffer[0];
-
+ }
return NULL;
}
static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
@@ -358,33 +383,36 @@ static void apply_variant(args_t *args, bcf1_t *rec)
int i, ialt = 1;
if ( args->isample >= 0 )
{
+ bcf_unpack(rec, BCF_UN_FMT);
bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
if ( !fmt ) return;
+
+ if ( fmt->type!=BCF_BT_INT8 )
+ error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1);
+ uint8_t *ptr = fmt->p + fmt->size*args->isample;
+
if ( args->haplotype )
{
if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
- uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1;
- ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ ialt = ptr[args->haplotype-1];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
ialt = bcf_gt_allele(ialt);
}
else if ( args->output_iupac )
{
- uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample;
- ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ ialt = ptr[0];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
ialt = bcf_gt_allele(ialt);
int jalt;
if ( fmt->n>1 )
{
- ptr = fmt->p + fmt->size*args->isample + 1;
- jalt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ jalt = ptr[1];
if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
else jalt = bcf_gt_allele(jalt);
}
else jalt = ialt;
- if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
{
char ial = rec->d.allele[ialt][0];
@@ -394,13 +422,40 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
else
{
+ int is_hom = 1;
for (i=0; i<fmt->n; i++)
{
- uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i;
- ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
- if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
- ialt = bcf_gt_allele(ialt);
- if ( ialt ) break;
+ if ( bcf_gt_is_missing(ptr[i]) ) return; // ignore missing or half-missing genotypes
+ if ( ptr[i]==bcf_int32_vector_end ) break;
+ ialt = bcf_gt_allele(ptr[i]);
+ if ( i>0 && ialt!=bcf_gt_allele(ptr[i-1]) ) { is_hom = 0; break; }
+ }
+ if ( !is_hom )
+ {
+ int prev_len = 0, jalt;
+ for (i=0; i<fmt->n; i++)
+ {
+ if ( ptr[i]==bcf_int32_vector_end ) break;
+ jalt = bcf_gt_allele(ptr[i]);
+ if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ if ( args->allele & (PICK_LONG|PICK_SHORT) )
+ {
+ int len = jalt==0 ? rec->rlen : strlen(rec->d.allele[jalt]);
+ if ( i==0 ) ialt = jalt, prev_len = len;
+ else if ( len == prev_len )
+ {
+ if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt, prev_len = len;
+ else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt, prev_len = len;
+ }
+ else if ( args->allele & PICK_LONG && len > prev_len ) ialt = jalt, prev_len = len;
+ else if ( args->allele & PICK_SHORT && len < prev_len ) ialt = jalt, prev_len = len;
+ }
+ else
+ {
+ if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt;
+ else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt;
+ }
+ }
}
}
if ( !ialt ) return; // ref allele
@@ -623,12 +678,21 @@ static void usage(args_t *args)
fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n");
fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -c, --chain <file> write a chain file for liftover\n");
+ fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
- fprintf(stderr, " -H, --haplotype <1|2> apply variants for the given haplotype\n");
- fprintf(stderr, " -i, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
+ fprintf(stderr, " -H, --haplotype <which> choose which allele to use from the FORMAT/GT field, note\n");
+ fprintf(stderr, " the codes are case-insensitive:\n");
+ fprintf(stderr, " 1: first allele from GT\n");
+ fprintf(stderr, " 2: second allele\n");
+ fprintf(stderr, " R: REF allele in het genotypes\n");
+ fprintf(stderr, " A: ALT allele\n");
+ fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n");
+ fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n");
+ fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
fprintf(stderr, " -m, --mask <file> replace regions with N\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(stderr, " -c, --chain <file> write a chain file for liftover\n");
fprintf(stderr, " -s, --sample <name> apply variants of the given sample\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
@@ -645,8 +709,10 @@ int main_consensus(int argc, char *argv[])
static struct option loptions[] =
{
+ {"exclude",required_argument,NULL,'e'},
+ {"include",required_argument,NULL,'i'},
{"sample",1,0,'s'},
- {"iupac-codes",0,0,'i'},
+ {"iupac-codes",0,0,'I'},
{"haplotype",1,0,'H'},
{"output",1,0,'o'},
{"fasta-ref",1,0,'f'},
@@ -655,19 +721,32 @@ int main_consensus(int argc, char *argv[])
{0,0,0,0}
};
int c;
- while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:",loptions,NULL)) >= 0)
{
switch (c)
{
case 's': args->sample = optarg; break;
case 'o': args->output_fname = optarg; break;
- case 'i': args->output_iupac = 1; break;
+ case 'I': args->output_iupac = 1; break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'f': args->ref_fname = optarg; break;
case 'm': args->mask_fname = optarg; break;
case 'c': args->chain_fname = optarg; break;
case 'H':
- args->haplotype = optarg[0] - '0';
- if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n");
+ if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF;
+ else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT;
+ else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF;
+ else if ( !strcasecmp(optarg,"S") ) args->allele |= PICK_SHORT|PICK_REF;
+ else if ( !strcasecmp(optarg,"LR") ) args->allele |= PICK_LONG|PICK_REF;
+ else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT;
+ else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF;
+ else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT;
+ else
+ {
+ args->haplotype = optarg[0] - '0';
+ if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n");
+ }
break;
default: usage(args); break;
}
diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c
index 86e855e..5250b4f 100644
--- a/bcftools/consensus.c.pysam.c
+++ b/bcftools/consensus.c.pysam.c
@@ -2,7 +2,7 @@
/* The MIT License
- Copyright (c) 2014 Genome Research Ltd.
+ Copyright (c) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -41,6 +41,16 @@
#include "regidx.h"
#include "bcftools.h"
#include "rbuf.h"
+#include "filter.h"
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+#define PICK_REF 1
+#define PICK_ALT 2
+#define PICK_LONG 4
+#define PICK_SHORT 8
typedef struct
{
@@ -77,12 +87,16 @@ typedef struct
chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
// Note that the chain is re-initialised for each chromosome/seq_region
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+
bcf_srs_t *files;
bcf_hdr_t *hdr;
FILE *fp_out;
FILE *fp_chain;
char **argv;
- int argc, output_iupac, haplotype, isample;
+ int argc, output_iupac, haplotype, allele, isample;
char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname;
}
args_t;
@@ -197,7 +211,7 @@ static void init_data(args_t *args)
args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
if ( args->isample<0 ) error("No such sample: %s\n", args->sample);
}
- if ( args->haplotype && args->isample<0 )
+ if ( (args->haplotype || args->allele) && args->isample<0 )
{
if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
args->isample = 0;
@@ -222,10 +236,14 @@ static void init_data(args_t *args)
if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
}
else args->fp_out = pysam_stdout;
+ if ( args->isample<0 ) fprintf(pysam_stderr,"Note: the --sample option not given, applying all records\n");
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
}
static void destroy_data(args_t *args)
{
+ if (args->filter) filter_destroy(args->filter);
bcf_sr_destroy(args->files);
int i;
for (i=0; i<args->vcf_rbuf.m; i++)
@@ -289,9 +307,16 @@ static bcf1_t **next_vcf_line(args_t *args)
int i = rbuf_shift(&args->vcf_rbuf);
return &args->vcf_buf[i];
}
- else if ( bcf_sr_next_line(args->files) )
+ while ( bcf_sr_next_line(args->files) )
+ {
+ if ( args->filter )
+ {
+ int is_ok = filter_test(args->filter, bcf_sr_get_line(args->files,0), NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1;
+ if ( !is_ok ) continue;
+ }
return &args->files->readers[0].buffer[0];
-
+ }
return NULL;
}
static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
@@ -360,33 +385,36 @@ static void apply_variant(args_t *args, bcf1_t *rec)
int i, ialt = 1;
if ( args->isample >= 0 )
{
+ bcf_unpack(rec, BCF_UN_FMT);
bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
if ( !fmt ) return;
+
+ if ( fmt->type!=BCF_BT_INT8 )
+ error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1);
+ uint8_t *ptr = fmt->p + fmt->size*args->isample;
+
if ( args->haplotype )
{
if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
- uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1;
- ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ ialt = ptr[args->haplotype-1];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
ialt = bcf_gt_allele(ialt);
}
else if ( args->output_iupac )
{
- uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample;
- ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ ialt = ptr[0];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
ialt = bcf_gt_allele(ialt);
int jalt;
if ( fmt->n>1 )
{
- ptr = fmt->p + fmt->size*args->isample + 1;
- jalt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ jalt = ptr[1];
if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
else jalt = bcf_gt_allele(jalt);
}
else jalt = ialt;
- if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
{
char ial = rec->d.allele[ialt][0];
@@ -396,13 +424,40 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
else
{
+ int is_hom = 1;
for (i=0; i<fmt->n; i++)
{
- uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i;
- ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
- if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
- ialt = bcf_gt_allele(ialt);
- if ( ialt ) break;
+ if ( bcf_gt_is_missing(ptr[i]) ) return; // ignore missing or half-missing genotypes
+ if ( ptr[i]==bcf_int32_vector_end ) break;
+ ialt = bcf_gt_allele(ptr[i]);
+ if ( i>0 && ialt!=bcf_gt_allele(ptr[i-1]) ) { is_hom = 0; break; }
+ }
+ if ( !is_hom )
+ {
+ int prev_len = 0, jalt;
+ for (i=0; i<fmt->n; i++)
+ {
+ if ( ptr[i]==bcf_int32_vector_end ) break;
+ jalt = bcf_gt_allele(ptr[i]);
+ if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ if ( args->allele & (PICK_LONG|PICK_SHORT) )
+ {
+ int len = jalt==0 ? rec->rlen : strlen(rec->d.allele[jalt]);
+ if ( i==0 ) ialt = jalt, prev_len = len;
+ else if ( len == prev_len )
+ {
+ if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt, prev_len = len;
+ else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt, prev_len = len;
+ }
+ else if ( args->allele & PICK_LONG && len > prev_len ) ialt = jalt, prev_len = len;
+ else if ( args->allele & PICK_SHORT && len < prev_len ) ialt = jalt, prev_len = len;
+ }
+ else
+ {
+ if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt;
+ else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt;
+ }
+ }
}
}
if ( !ialt ) return; // ref allele
@@ -625,12 +680,21 @@ static void usage(args_t *args)
fprintf(pysam_stderr, " information, such as INFO/AD or FORMAT/AD.\n");
fprintf(pysam_stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -c, --chain <file> write a chain file for liftover\n");
+ fprintf(pysam_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
fprintf(pysam_stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
- fprintf(pysam_stderr, " -H, --haplotype <1|2> apply variants for the given haplotype\n");
- fprintf(pysam_stderr, " -i, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
+ fprintf(pysam_stderr, " -H, --haplotype <which> choose which allele to use from the FORMAT/GT field, note\n");
+ fprintf(pysam_stderr, " the codes are case-insensitive:\n");
+ fprintf(pysam_stderr, " 1: first allele from GT\n");
+ fprintf(pysam_stderr, " 2: second allele\n");
+ fprintf(pysam_stderr, " R: REF allele in het genotypes\n");
+ fprintf(pysam_stderr, " A: ALT allele\n");
+ fprintf(pysam_stderr, " LR,LA: longer allele and REF/ALT if equal length\n");
+ fprintf(pysam_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n");
+ fprintf(pysam_stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
+ fprintf(pysam_stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
fprintf(pysam_stderr, " -m, --mask <file> replace regions with N\n");
fprintf(pysam_stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(pysam_stderr, " -c, --chain <file> write a chain file for liftover\n");
fprintf(pysam_stderr, " -s, --sample <name> apply variants of the given sample\n");
fprintf(pysam_stderr, "Examples:\n");
fprintf(pysam_stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
@@ -647,8 +711,10 @@ int main_consensus(int argc, char *argv[])
static struct option loptions[] =
{
+ {"exclude",required_argument,NULL,'e'},
+ {"include",required_argument,NULL,'i'},
{"sample",1,0,'s'},
- {"iupac-codes",0,0,'i'},
+ {"iupac-codes",0,0,'I'},
{"haplotype",1,0,'H'},
{"output",1,0,'o'},
{"fasta-ref",1,0,'f'},
@@ -657,19 +723,32 @@ int main_consensus(int argc, char *argv[])
{0,0,0,0}
};
int c;
- while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:",loptions,NULL)) >= 0)
{
switch (c)
{
case 's': args->sample = optarg; break;
case 'o': args->output_fname = optarg; break;
- case 'i': args->output_iupac = 1; break;
+ case 'I': args->output_iupac = 1; break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'f': args->ref_fname = optarg; break;
case 'm': args->mask_fname = optarg; break;
case 'c': args->chain_fname = optarg; break;
case 'H':
- args->haplotype = optarg[0] - '0';
- if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n");
+ if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF;
+ else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT;
+ else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF;
+ else if ( !strcasecmp(optarg,"S") ) args->allele |= PICK_SHORT|PICK_REF;
+ else if ( !strcasecmp(optarg,"LR") ) args->allele |= PICK_LONG|PICK_REF;
+ else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT;
+ else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF;
+ else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT;
+ else
+ {
+ args->haplotype = optarg[0] - '0';
+ if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n");
+ }
break;
default: usage(args); break;
}
diff --git a/bcftools/csq.c b/bcftools/csq.c
index b1db103..94ac442 100644
--- a/bcftools/csq.c
+++ b/bcftools/csq.c
@@ -164,17 +164,6 @@
#define N_SPLICE_REGION_EXON 3
#define N_SPLICE_REGION_INTRON 8
-// Ensembl ID format, e.g.
-// ENST00000423372 for human .. ENST%011d
-// ENSMUST00000120394 for mouse .. ENSMUST%011d
-char ENSID_BUF[32], *ENSID_FMT = NULL;
-static inline char *ENSID(uint32_t id)
-{
- sprintf(ENSID_BUF,ENSID_FMT,id);
- return ENSID_BUF;
-}
-
-
#define N_REF_PAD 10 // number of bases to avoid boundary effects
#define STRAND_REV 0
@@ -509,7 +498,6 @@ hap_t;
temporary list of all exons, CDS, UTRs
*/
KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
-KHASH_MAP_INIT_INT(int2int, int)
KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
typedef struct
{
@@ -522,25 +510,41 @@ typedef struct
uint32_t iseq:29;
}
ftr_t;
+/*
+ Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
+ to integer id. To keep the memory requirements low, the original version
+ relied on IDs in the form of a string prefix and a numerical id. However,
+ it turns out that this assumption is not valid for some ensembl GFFs, see
+ for example Zea_mays.AGPv4.36.gff3.gz
+ */
+typedef struct
+{
+ void *str2id; // khash_str2int
+ int nstr, mstr;
+ char **str; // numeric id to string
+}
+id_tbl_t;
typedef struct
{
// all exons, CDS, UTRs
ftr_t *ftr;
int nftr, mftr;
- // mapping from transcript ensembl id to gene id
+ // mapping from gene id to gf_gene_t
kh_int2gene_t *gid2gene;
// mapping from transcript id to tscript, for quick CDS anchoring
kh_int2tscript_t *id2tr;
// sequences
- void *seq2int;
+ void *seq2int; // str2int hash
char **seq;
int nseq, mseq;
// ignored biotypes
void *ignored_biotypes;
+
+ id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
}
aux_t;
@@ -590,6 +594,7 @@ typedef struct _args_t
int nrm_tr, mrm_tr;
csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
int ncsq_buf, mcsq_buf;
+ id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
faidx_t *fai;
kstring_t str, str2;
@@ -694,33 +699,38 @@ static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg,
if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
return se+1;
}
-static inline uint32_t gff_parse_id(const char *line, const char *needle, char *ss)
+static void gff_id_init(id_tbl_t *tbl)
{
- ss = strstr(ss,needle);
- if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
- ss += strlen(needle);
- while ( *ss && !isdigit(*ss) ) ss++;
- if ( !ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
- char *se;
- uint32_t id = strtol(ss, &se, 10);
- if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
- if ( *se && *se!=';' && *se!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- assert( id <= 0xffffff ); // see gf_gene_t.id. Ensembl IDs are never that big in practice
- return id;
+ memset(tbl, 0, sizeof(*tbl));
+ tbl->str2id = khash_str2int_init();
+}
+static void gff_id_destroy(id_tbl_t *tbl)
+{
+ khash_str2int_destroy_free(tbl->str2id);
+ free(tbl->str);
}
-static void gff_parse_ensid_fmt(const char *line, const char *needle, char *ss)
+static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char *needle, char *ss)
{
- ss = strstr(ss,needle);
+ ss = strstr(ss,needle); // e.g. "ID=transcript:"
if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
ss += strlen(needle);
+
char *se = ss;
- while ( *se && !isdigit(*se) ) se++;
- kstring_t str = {0,0,0};
- kputsn(ss,se-ss,&str);
- ss = se;
- while ( *se && isdigit(*se) ) se++;
- ksprintf(&str,"%%0%dd",(int)(se-ss));
- ENSID_FMT = str.s;
+ while ( *se && *se!=';' && !isspace(*se) ) se++;
+ char tmp = *se;
+ *se = 0;
+
+ int id;
+ if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 )
+ {
+ id = tbl->nstr++;
+ hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
+ tbl->str[id] = strdup(ss);
+ int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id);
+ }
+ *se = tmp;
+
+ return id;
}
static inline int gff_parse_type(char *line)
{
@@ -880,10 +890,8 @@ void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
}
// create a mapping from transcript_id to gene_id
- uint32_t trid = gff_parse_id(line, "ID=transcript:", ss);
- uint32_t gene_id = gff_parse_id(line, "Parent=gene:", ss);
-
- if ( !ENSID_FMT ) gff_parse_ensid_fmt(line, "ID=transcript:", ss); // id prefix different across species
+ uint32_t trid = gff_id_parse(&args->tscript_ids, line, "ID=transcript:", ss);
+ uint32_t gene_id = gff_id_parse(&args->init.gene_ids, line, "Parent=gene:", ss);
tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
tr->id = trid;
@@ -910,7 +918,7 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha
aux_t *aux = &args->init;
// substring search for "ID=gene:ENSG00000437963"
- uint32_t gene_id = gff_parse_id(line, "ID=gene:", ss);
+ uint32_t gene_id = gff_id_parse(&aux->gene_ids, line, "ID=gene:", ss);
gf_gene_t *gene = gene_init(aux, gene_id);
assert( !gene->name ); // the gene_id should be unique
@@ -918,13 +926,17 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha
// substring search for "Name=OR4F5"
ss = strstr(chr_end+2,"Name=");
- if ( !ss ) error("Could not parse the line, \"Name=\" not present: %s\n", line);
- ss += 5;
- char *se = ss;
- while ( *se && *se!=';' && !isspace(*se) ) se++;
- gene->name = (char*) malloc(se-ss+1);
- memcpy(gene->name,ss,se-ss);
- gene->name[se-ss] = 0;
+ if ( ss )
+ {
+ ss += 5;
+ char *se = ss;
+ while ( *se && *se!=';' && !isspace(*se) ) se++;
+ gene->name = (char*) malloc(se-ss+1);
+ memcpy(gene->name,ss,se-ss);
+ gene->name[se-ss] = 0;
+ }
+ else
+ gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
}
int gff_parse(args_t *args, char *line, ftr_t *ftr)
{
@@ -999,7 +1011,7 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr)
ss += 2;
// substring search for "Parent=transcript:ENST00000437963"
- ftr->trid = gff_parse_id(line, "Parent=transcript:", ss);
+ ftr->trid = gff_id_parse(&args->tscript_ids, line, "Parent=transcript:", ss);
ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
return 0;
}
@@ -1104,7 +1116,7 @@ void tscript_init_cds(args_t *args)
{
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
if ( phase!=len%3)
- error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
assert( phase == len%3 );
len += tr->cds[i]->len;
}
@@ -1132,7 +1144,7 @@ void tscript_init_cds(args_t *args)
{
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
if ( phase!=len%3)
- error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
len += tr->cds[i]->len;
}
}
@@ -1205,6 +1217,8 @@ void init_gff(args_t *args)
aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
aux->ignored_biotypes = khash_str2int_init();
+ gff_id_init(&aux->gene_ids);
+ gff_id_init(&args->tscript_ids);
// parse gff
kstring_t str = {0,0,0};
@@ -1252,7 +1266,7 @@ void init_gff(args_t *args)
else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
else
- error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,ENSID(ftr->trid),gf_type2gff_string(ftr->type));
+ error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
}
tscript_init_cds(args);
@@ -1270,6 +1284,7 @@ void init_gff(args_t *args)
// keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
kh_destroy(int2tscript,aux->id2tr);
free(aux->seq);
+ gff_id_destroy(&aux->gene_ids);
if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) )
{
@@ -1409,7 +1424,7 @@ void destroy_data(args_t *args)
free(args->gt_arr);
free(args->str.s);
free(args->str2.s);
- free(ENSID_FMT);
+ gff_id_destroy(&args->tscript_ids);
}
/*
@@ -2491,7 +2506,7 @@ exit_duplicate:
#define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen)
#define node2rpos(i) (hap->stack[i].node->rec->pos)
-void kput_vcsq(vcsq_t *csq, kstring_t *str)
+void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
{
// Remove start/stop from incomplete CDS, but only if there is another
// consequence as something must be reported
@@ -2520,7 +2535,7 @@ void kput_vcsq(vcsq_t *csq, kstring_t *str)
if ( csq->gene ) kputs(csq->gene , str);
kputc_('|', str);
- if ( csq->type & CSQ_PRN_TSCRIPT ) ksprintf(str, "%s",ENSID(csq->trid));
+ if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
kputc_('|', str);
kputs(gf_type2gff_string(csq->biotype), str);
@@ -2889,7 +2904,7 @@ static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
fprintf(args->out,"-");
args->str.l = 0;
- kput_vcsq(&csq->type, &args->str);
+ kput_vcsq(args, &csq->type, &args->str);
fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
}
static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
@@ -2913,7 +2928,7 @@ static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ih
fprintf(args->out,"-");
args->str.l = 0;
- kput_vcsq(&csq->type, &args->str);
+ kput_vcsq(args, &csq->type, &args->str);
fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
}
}
@@ -3057,11 +3072,11 @@ void vbuf_flush(args_t *args)
}
args->str.l = 0;
- kput_vcsq(&vrec->vcsq[0], &args->str);
+ kput_vcsq(args, &vrec->vcsq[0], &args->str);
for (j=1; j<vrec->nvcsq; j++)
{
kputc_(',', &args->str);
- kput_vcsq(&vrec->vcsq[j], &args->str);
+ kput_vcsq(args, &vrec->vcsq[j], &args->str);
}
bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s);
if ( args->hdr_nsmpl )
@@ -3665,7 +3680,7 @@ void process(args_t *args, bcf1_t **rec_ptr)
return;
}
-const char *usage(void)
+static const char *usage(void)
{
return
"\n"
diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c
index b79a030..4a7810c 100644
--- a/bcftools/csq.c.pysam.c
+++ b/bcftools/csq.c.pysam.c
@@ -166,17 +166,6 @@
#define N_SPLICE_REGION_EXON 3
#define N_SPLICE_REGION_INTRON 8
-// Ensembl ID format, e.g.
-// ENST00000423372 for human .. ENST%011d
-// ENSMUST00000120394 for mouse .. ENSMUST%011d
-char ENSID_BUF[32], *ENSID_FMT = NULL;
-static inline char *ENSID(uint32_t id)
-{
- sprintf(ENSID_BUF,ENSID_FMT,id);
- return ENSID_BUF;
-}
-
-
#define N_REF_PAD 10 // number of bases to avoid boundary effects
#define STRAND_REV 0
@@ -511,7 +500,6 @@ hap_t;
temporary list of all exons, CDS, UTRs
*/
KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
-KHASH_MAP_INIT_INT(int2int, int)
KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
typedef struct
{
@@ -524,25 +512,41 @@ typedef struct
uint32_t iseq:29;
}
ftr_t;
+/*
+ Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
+ to integer id. To keep the memory requirements low, the original version
+ relied on IDs in the form of a string prefix and a numerical id. However,
+ it turns out that this assumption is not valid for some ensembl GFFs, see
+ for example Zea_mays.AGPv4.36.gff3.gz
+ */
+typedef struct
+{
+ void *str2id; // khash_str2int
+ int nstr, mstr;
+ char **str; // numeric id to string
+}
+id_tbl_t;
typedef struct
{
// all exons, CDS, UTRs
ftr_t *ftr;
int nftr, mftr;
- // mapping from transcript ensembl id to gene id
+ // mapping from gene id to gf_gene_t
kh_int2gene_t *gid2gene;
// mapping from transcript id to tscript, for quick CDS anchoring
kh_int2tscript_t *id2tr;
// sequences
- void *seq2int;
+ void *seq2int; // str2int hash
char **seq;
int nseq, mseq;
// ignored biotypes
void *ignored_biotypes;
+
+ id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
}
aux_t;
@@ -592,6 +596,7 @@ typedef struct _args_t
int nrm_tr, mrm_tr;
csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
int ncsq_buf, mcsq_buf;
+ id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
faidx_t *fai;
kstring_t str, str2;
@@ -696,33 +701,38 @@ static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg,
if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
return se+1;
}
-static inline uint32_t gff_parse_id(const char *line, const char *needle, char *ss)
+static void gff_id_init(id_tbl_t *tbl)
{
- ss = strstr(ss,needle);
- if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
- ss += strlen(needle);
- while ( *ss && !isdigit(*ss) ) ss++;
- if ( !ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
- char *se;
- uint32_t id = strtol(ss, &se, 10);
- if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
- if ( *se && *se!=';' && *se!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- assert( id <= 0xffffff ); // see gf_gene_t.id. Ensembl IDs are never that big in practice
- return id;
+ memset(tbl, 0, sizeof(*tbl));
+ tbl->str2id = khash_str2int_init();
+}
+static void gff_id_destroy(id_tbl_t *tbl)
+{
+ khash_str2int_destroy_free(tbl->str2id);
+ free(tbl->str);
}
-static void gff_parse_ensid_fmt(const char *line, const char *needle, char *ss)
+static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char *needle, char *ss)
{
- ss = strstr(ss,needle);
+ ss = strstr(ss,needle); // e.g. "ID=transcript:"
if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
ss += strlen(needle);
+
char *se = ss;
- while ( *se && !isdigit(*se) ) se++;
- kstring_t str = {0,0,0};
- kputsn(ss,se-ss,&str);
- ss = se;
- while ( *se && isdigit(*se) ) se++;
- ksprintf(&str,"%%0%dd",(int)(se-ss));
- ENSID_FMT = str.s;
+ while ( *se && *se!=';' && !isspace(*se) ) se++;
+ char tmp = *se;
+ *se = 0;
+
+ int id;
+ if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 )
+ {
+ id = tbl->nstr++;
+ hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
+ tbl->str[id] = strdup(ss);
+ int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id);
+ }
+ *se = tmp;
+
+ return id;
}
static inline int gff_parse_type(char *line)
{
@@ -882,10 +892,8 @@ void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
}
// create a mapping from transcript_id to gene_id
- uint32_t trid = gff_parse_id(line, "ID=transcript:", ss);
- uint32_t gene_id = gff_parse_id(line, "Parent=gene:", ss);
-
- if ( !ENSID_FMT ) gff_parse_ensid_fmt(line, "ID=transcript:", ss); // id prefix different across species
+ uint32_t trid = gff_id_parse(&args->tscript_ids, line, "ID=transcript:", ss);
+ uint32_t gene_id = gff_id_parse(&args->init.gene_ids, line, "Parent=gene:", ss);
tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
tr->id = trid;
@@ -912,7 +920,7 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha
aux_t *aux = &args->init;
// substring search for "ID=gene:ENSG00000437963"
- uint32_t gene_id = gff_parse_id(line, "ID=gene:", ss);
+ uint32_t gene_id = gff_id_parse(&aux->gene_ids, line, "ID=gene:", ss);
gf_gene_t *gene = gene_init(aux, gene_id);
assert( !gene->name ); // the gene_id should be unique
@@ -920,13 +928,17 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha
// substring search for "Name=OR4F5"
ss = strstr(chr_end+2,"Name=");
- if ( !ss ) error("Could not parse the line, \"Name=\" not present: %s\n", line);
- ss += 5;
- char *se = ss;
- while ( *se && *se!=';' && !isspace(*se) ) se++;
- gene->name = (char*) malloc(se-ss+1);
- memcpy(gene->name,ss,se-ss);
- gene->name[se-ss] = 0;
+ if ( ss )
+ {
+ ss += 5;
+ char *se = ss;
+ while ( *se && *se!=';' && !isspace(*se) ) se++;
+ gene->name = (char*) malloc(se-ss+1);
+ memcpy(gene->name,ss,se-ss);
+ gene->name[se-ss] = 0;
+ }
+ else
+ gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
}
int gff_parse(args_t *args, char *line, ftr_t *ftr)
{
@@ -1001,7 +1013,7 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr)
ss += 2;
// substring search for "Parent=transcript:ENST00000437963"
- ftr->trid = gff_parse_id(line, "Parent=transcript:", ss);
+ ftr->trid = gff_id_parse(&args->tscript_ids, line, "Parent=transcript:", ss);
ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
return 0;
}
@@ -1106,7 +1118,7 @@ void tscript_init_cds(args_t *args)
{
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
if ( phase!=len%3)
- error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
assert( phase == len%3 );
len += tr->cds[i]->len;
}
@@ -1134,7 +1146,7 @@ void tscript_init_cds(args_t *args)
{
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
if ( phase!=len%3)
- error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
len += tr->cds[i]->len;
}
}
@@ -1207,6 +1219,8 @@ void init_gff(args_t *args)
aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
aux->ignored_biotypes = khash_str2int_init();
+ gff_id_init(&aux->gene_ids);
+ gff_id_init(&args->tscript_ids);
// parse gff
kstring_t str = {0,0,0};
@@ -1254,7 +1268,7 @@ void init_gff(args_t *args)
else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
else
- error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,ENSID(ftr->trid),gf_type2gff_string(ftr->type));
+ error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
}
tscript_init_cds(args);
@@ -1272,6 +1286,7 @@ void init_gff(args_t *args)
// keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
kh_destroy(int2tscript,aux->id2tr);
free(aux->seq);
+ gff_id_destroy(&aux->gene_ids);
if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) )
{
@@ -1411,7 +1426,7 @@ void destroy_data(args_t *args)
free(args->gt_arr);
free(args->str.s);
free(args->str2.s);
- free(ENSID_FMT);
+ gff_id_destroy(&args->tscript_ids);
}
/*
@@ -2493,7 +2508,7 @@ exit_duplicate:
#define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen)
#define node2rpos(i) (hap->stack[i].node->rec->pos)
-void kput_vcsq(vcsq_t *csq, kstring_t *str)
+void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
{
// Remove start/stop from incomplete CDS, but only if there is another
// consequence as something must be reported
@@ -2522,7 +2537,7 @@ void kput_vcsq(vcsq_t *csq, kstring_t *str)
if ( csq->gene ) kputs(csq->gene , str);
kputc_('|', str);
- if ( csq->type & CSQ_PRN_TSCRIPT ) ksprintf(str, "%s",ENSID(csq->trid));
+ if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
kputc_('|', str);
kputs(gf_type2gff_string(csq->biotype), str);
@@ -2891,7 +2906,7 @@ static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
fprintf(args->out,"-");
args->str.l = 0;
- kput_vcsq(&csq->type, &args->str);
+ kput_vcsq(args, &csq->type, &args->str);
fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
}
static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
@@ -2915,7 +2930,7 @@ static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ih
fprintf(args->out,"-");
args->str.l = 0;
- kput_vcsq(&csq->type, &args->str);
+ kput_vcsq(args, &csq->type, &args->str);
fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
}
}
@@ -3059,11 +3074,11 @@ void vbuf_flush(args_t *args)
}
args->str.l = 0;
- kput_vcsq(&vrec->vcsq[0], &args->str);
+ kput_vcsq(args, &vrec->vcsq[0], &args->str);
for (j=1; j<vrec->nvcsq; j++)
{
kputc_(',', &args->str);
- kput_vcsq(&vrec->vcsq[j], &args->str);
+ kput_vcsq(args, &vrec->vcsq[j], &args->str);
}
bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s);
if ( args->hdr_nsmpl )
@@ -3667,7 +3682,7 @@ void process(args_t *args, bcf1_t **rec_ptr)
return;
}
-const char *usage(void)
+static const char *usage(void)
{
return
"\n"
diff --git a/bcftools/filter.c b/bcftools/filter.c
index 78ff1f1..3dc91a7 100644
--- a/bcftools/filter.c
+++ b/bcftools/filter.c
@@ -67,21 +67,23 @@ typedef struct _token_t
char *tag; // for debugging and printout only, VCF tag name
double threshold; // filtering threshold
int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types
- int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*])
+ int idx; // 0-based index to VCF vectors, -1: not a vector,
+ // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
+ int *idxs, nidxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *);
void *hash; // test presence of str value in the hash via comparator
regex_t *regex; // precompiled regex for string comparison
// modified on filter evaluation at each VCF line
- double *values; // In case str_value is set, values[0] is one sample's string length
- char *str_value; // and values[0]*nsamples gives the total length;
+ double *values;
+ kstring_t str_value;
int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
int pass_site; // -1 not applicable, 0 fails, >0 pass
uint8_t *pass_samples; // status of individual samples
int nsamples; // number of samples
- int nvalues, mvalues; // number of used values, n=0 for missing values, n=1 for scalars
- // for strings, total length of str_value
+ int nvalues, mvalues; // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l
+ int nstr1; // per-sample string length, set only with str_value.l>0 && nsamples>1
}
token_t;
@@ -93,6 +95,7 @@ struct _filter_t
token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack
int32_t *tmpi;
float *tmpf;
+ kstring_t tmps;
int max_unpack, mtmpi, mtmpf, nsamples;
};
@@ -169,6 +172,7 @@ static int filters_next_token(char **str, int *len)
return TOK_VAL;
}
+ int square_brackets = 0;
while ( tmp[0] )
{
if ( tmp[0]=='"' ) break;
@@ -183,11 +187,12 @@ static int filters_next_token(char **str, int *len)
if ( tmp[0]=='(' ) break;
if ( tmp[0]==')' ) break;
if ( tmp[0]=='+' ) break;
- // hacky: so that [*] is not split, the tokenizer does not recognise square brackets []
- if ( tmp[0]=='*' && (tmp==*str || tmp[-1]!='[') ) break;
- if ( tmp[0]=='-' ) break;
+ if ( tmp[0]=='*' && !square_brackets ) break;
+ if ( tmp[0]=='-' && !square_brackets ) break;
if ( tmp[0]=='/' ) break;
if ( tmp[0]=='~' ) break;
+ if ( tmp[0]==']' ) { if (square_brackets) tmp++; break; }
+ if ( tmp[0]=='[' ) square_brackets++;
tmp++;
}
if ( tmp > *str )
@@ -270,12 +275,15 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
else if ( line->d.info[i].type==BCF_BT_CHAR )
{
int n = line->d.info[i].len;
- int m = (int)tok->values[0];
- hts_expand(char,n+1,m,tok->str_value);
- memcpy(tok->str_value,line->d.info[i].vptr,n);
- tok->str_value[n] = 0;
- tok->values[0] = m;
- tok->nvalues = n;
+ if ( n >= tok->str_value.m )
+ {
+ tok->str_value.m = n + 1;
+ tok->str_value.s = (char*) realloc(tok->str_value.s, tok->str_value.m);
+ if ( !tok->str_value.s ) error("Failed to alloc %d bytes\n", (int)tok->str_value.m);
+ }
+ memcpy(tok->str_value.s, line->d.info[i].vptr, n);
+ tok->str_value.s[n] = 0;
+ tok->nvalues = tok->str_value.l = n;
}
else if ( line->d.info[i].type==BCF_BT_FLOAT )
{
@@ -285,10 +293,11 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
tok->values[0] = line->d.info[i].v1.f;
tok->nvalues = 1;
}
- tok->str_value = NULL;
+ tok->str_value.l = 0;
}
else
{
+ tok->str_value.l = 0;
if ( line->d.info[i].type==BCF_BT_INT8 && line->d.info[i].v1.i==bcf_int8_missing ) tok->nvalues = 0;
else if ( line->d.info[i].type==BCF_BT_INT16 && line->d.info[i].v1.i==bcf_int16_missing ) tok->nvalues = 0;
else if ( line->d.info[i].type==BCF_BT_INT32 && line->d.info[i].v1.i==bcf_int32_missing ) tok->nvalues = 0;
@@ -297,7 +306,6 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
tok->values[0] = line->d.info[i].v1.i;
tok->nvalues = 1;
}
- tok->str_value = NULL;
}
}
static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
@@ -346,8 +354,8 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin
return ret ? 0 : 1;
}
- if ( op_type==TOK_EQ ) return strcmp(btok->str_value,line->d.id) ? 0 : 1;
- return strcmp(btok->str_value,line->d.id) ? 1 : 0;
+ if ( op_type==TOK_EQ ) return strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
+ return strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
}
/**
@@ -409,13 +417,16 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok)
{
if ( tok->idx==-2 )
{
- int i;
tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
if ( tok->nvalues<=0 ) tok->nvalues = 0;
else
{
hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- for (i=0; i<tok->nvalues; i++) tok->values[i] = flt->tmpi[i];
+ int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1;
+ if ( end >= tok->nvalues ) end = tok->nvalues - 1;
+ for (i=0; i<=end; i++)
+ if ( i>=tok->nidxs || tok->idxs[i] ) tok->values[j++] = flt->tmpi[i];
+ tok->nvalues = j;
}
}
else
@@ -435,15 +446,21 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
if ( tok->idx==-2 )
{
- int i;
tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf);
if ( tok->nvalues<=0 ) tok->nvalues = 0;
else
{
hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- for (i=0; i<tok->nvalues; i++)
- if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[i]);
- else tok->values[i] = flt->tmpf[i];
+ int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1;
+ if ( end >= tok->nvalues ) end = tok->nvalues - 1;
+ for (i=0; i<=end; i++)
+ if ( i>=tok->nidxs || tok->idxs[i] )
+ {
+ if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[j]);
+ else tok->values[j] = flt->tmpf[i];
+ j++;
+ }
+ tok->nvalues = j;
}
}
else
@@ -461,33 +478,64 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- int m = (int)tok->values[0];
- int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value,&m);
- if ( n<0 ) { tok->nvalues = 0; return; }
- tok->values[0] = m; // allocated length
+ int32_t m = tok->str_value.m;
+ int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value.s,&m);
+ tok->str_value.m = m;
+ if ( n<0 ) { tok->nvalues = tok->str_value.l = 0; return; }
if ( tok->idx>=0 )
{
// get ith field (i=tok->idx)
int i = 0;
- char *ss = tok->str_value, *se = tok->str_value + n;
+ char *ss = tok->str_value.s, *se = tok->str_value.s + n;
while ( ss<se && i<tok->idx )
{
if ( *ss==',' ) i++;
ss++;
}
- if ( ss==se || i!=tok->idx ) { tok->nvalues = 0; return; }
+ if ( ss==se || i!=tok->idx ) { tok->nvalues = tok->str_value.l = 0; return; }
se = ss;
- while ( se-tok->str_value<n && *se!=',' ) se++;
- if ( ss==tok->str_value ) *se = 0;
+ while ( se - tok->str_value.s < n && *se!=',' ) se++;
+ if ( ss==tok->str_value.s ) *se = 0;
else
{
- memmove(tok->str_value,ss,se-ss);
- tok->str_value[se-ss] = 0;
+ memmove(tok->str_value.s, ss, se-ss);
+ tok->str_value.s[se-ss] = 0;
}
- tok->nvalues = se-ss;
+ tok->str_value.l = se - ss;
}
- else if ( tok->idx==-2 ) tok->nvalues = n;
+ else if ( tok->idx==-2 && tok->idxs[0]==-1 ) // keep all values, TAG[*]
+ tok->str_value.l = n;
+ else if ( tok->idx==-2 )
+ {
+ flt->tmps.l = 0;
+ ks_resize(&flt->tmps, n);
+ int i, end = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
+ if ( end >= n ) end = n - 1;
+ char *beg = tok->str_value.s, *dst = flt->tmps.s;
+ for (i=0; i<=end; i++)
+ {
+ char *end = beg;
+ while ( *end && *end!=',' ) end++;
+
+ if ( i>=tok->nidxs || tok->idxs[i] )
+ {
+ memcpy(dst, beg, end - beg);
+ dst += end - beg;
+ dst[0] = ',';
+ dst++;
+ }
+
+ beg = end+1;
+ }
+ dst[0] = 0;
+ tok->str_value.l = dst - flt->tmps.s;
+
+ #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
+ SWAP(char *, flt->tmps.s, tok->str_value.s);
+ SWAP(size_t, flt->tmps.m, tok->str_value.m);
+ }
+ tok->nvalues = tok->str_value.l;
}
static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok)
@@ -503,127 +551,266 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
{
int i;
if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
- tok->nvalues = 0;
- else
{
+ tok->nvalues = tok->nsamples = 0;
+ return;
+ }
+ if ( tok->idx >= -1 ) // scalar or vector index
+ {
+ hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
+ int nvals = tok->nvalues / line->n_sample;
+ int idx = tok->idx >= 0 ? tok->idx : 0;
int is_missing = 1;
- hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- for (i=0; i<tok->nvalues; i++)
+ int32_t *ptr = flt->tmpi;
+ for (i=0; i<line->n_sample; i++)
{
- if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end )
+ if ( ptr[idx]==bcf_int32_missing || ptr[idx]==bcf_int32_vector_end )
bcf_double_set_missing(tok->values[i]);
else
{
- tok->values[i] = flt->tmpi[i];
+ tok->values[i] = ptr[idx];
is_missing = 0;
}
+ ptr += nvals;
}
if ( is_missing ) tok->nvalues = 0;
- else if ( tok->idx >= 0 )
+ else tok->nvalues = line->n_sample;
+ tok->nsamples = tok->nvalues;
+ return;
+ }
+ if ( tok->idx == -2 )
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ int nvals = tok->nvalues / line->n_sample;
+ int idx = tok->idx >= 0 ? tok->idx : 0;
+ int is_missing = 1;
+ int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
+ if ( end >= nvals ) end = nvals - 1;
+ int32_t *ptr = flt->tmpi;
+ for (i=0; i<line->n_sample; i++)
{
- int nsmpl = bcf_hdr_nsamples(flt->hdr);
- int nvals = tok->nvalues / nsmpl;
- if ( tok->idx >= nvals )
- tok->nvalues = 0; // the index is too big
- else
- {
- for (i=0; i<nsmpl; i++)
- tok->values[i] = tok->values[i*nvals+tok->idx];
- tok->nvalues = nsmpl;
- }
+ for (k=0; k<=end; k++)
+ if ( k>=tok->nidxs || tok->idxs[k] )
+ {
+ if ( ptr[k]==bcf_int32_missing || ptr[k]==bcf_int32_vector_end )
+ bcf_double_set_missing(tok->values[j]);
+ else
+ {
+ tok->values[j] = ptr[k];
+ is_missing = 0;
+ }
+ j++;
+ }
+ ptr += nvals;
+ }
+ if ( is_missing ) tok->nvalues = tok->nsamples = 0;
+ else
+ {
+ tok->nsamples = line->n_sample;
+ tok->nvalues = j;
}
+ return;
}
- tok->nsamples = tok->nvalues;
}
static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
int i;
- if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<=0 )
+ if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
{
- tok->nvalues = tok->nsamples = 0; // missing values
+ tok->nvalues = tok->nsamples = 0;
+ return;
}
- else
+ if ( tok->idx >= -1 ) // scalar or vector index
{
+ hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
+ int nvals = tok->nvalues / line->n_sample;
+ int idx = tok->idx >= 0 ? tok->idx : 0;
int is_missing = 1;
- hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- for (i=0; i<tok->nvalues; i++)
+ float *ptr = flt->tmpf;
+ for (i=0; i<line->n_sample; i++)
{
- if ( bcf_float_is_missing(flt->tmpf[i]) || bcf_float_is_vector_end(flt->tmpf[i]) )
+ if ( bcf_float_is_missing(ptr[idx]) || bcf_float_is_vector_end(ptr[idx]) )
bcf_double_set_missing(tok->values[i]);
else
{
- tok->values[i] = flt->tmpf[i];
+ tok->values[i] = ptr[idx];
is_missing = 0;
}
+ ptr += nvals;
}
if ( is_missing ) tok->nvalues = 0;
- else if ( tok->idx >= 0 )
+ else tok->nvalues = line->n_sample;
+ tok->nsamples = tok->nvalues;
+ return;
+ }
+ if ( tok->idx == -2 )
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ int nvals = tok->nvalues / line->n_sample;
+ int idx = tok->idx >= 0 ? tok->idx : 0;
+ int is_missing = 1;
+ int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
+ if ( end >= nvals ) end = nvals - 1;
+ float *ptr = flt->tmpf;
+ for (i=0; i<line->n_sample; i++)
{
- int nsmpl = bcf_hdr_nsamples(flt->hdr);
- int nvals = tok->nvalues / nsmpl;
- if ( tok->idx >= nvals )
- tok->nvalues = 0; // the index is too big
- else
- {
- for (i=0; i<nsmpl; i++)
- tok->values[i] = tok->values[i*nvals+tok->idx];
- tok->nvalues = nsmpl;
- }
+ for (k=0; k<=end; k++)
+ if ( k>=tok->nidxs || tok->idxs[k] )
+ {
+ if ( bcf_float_is_missing(ptr[k]) || bcf_float_is_vector_end(ptr[k]) )
+ bcf_double_set_missing(tok->values[j]);
+ else
+ {
+ tok->values[j] = ptr[k];
+ is_missing = 0;
+ }
+ j++;
+ }
+ ptr += nvals;
+ }
+ if ( is_missing ) tok->nvalues = tok->nsamples = 0;
+ else
+ {
+ tok->nsamples = line->n_sample;
+ tok->nvalues = j;
}
+ return;
}
- tok->nsamples = tok->nvalues;
}
static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- int ndim = tok->nsamples * (int)tok->values[0];
- int ret = bcf_get_format_char(flt->hdr,line,tok->tag,&tok->str_value,&ndim);
+ tok->str_value.l = tok->nvalues = 0;
+ if ( !line->n_sample ) return;
- int nsmpl = bcf_hdr_nsamples(flt->hdr);
- ndim /= nsmpl;
- tok->values[0] = ndim;
+ int ndim = tok->str_value.m;
+ int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim);
+ tok->str_value.m = ndim;
- if ( ret<=0 )
- {
- tok->nvalues = 0;
- return;
- }
+ if ( nstr<=0 ) return;
- if ( tok->idx < 0 ) // scalar
+ if ( tok->idx == -1 || (tok->idx==-2 && tok->idxs[0]==-1) ) // scalar or keep all values of a vector: TAG[*]
{
- tok->nvalues = tok->nsamples = nsmpl;
+ tok->nsamples = line->n_sample;
+ tok->nstr1 = ndim / line->n_sample;
+ tok->nvalues = tok->str_value.l = nstr;
return;
}
- // vector
+ int nstr1 = nstr / line->n_sample;
+
+ // vector, one or multiple indices
int i;
- for (i=0; i<nsmpl; i++)
+ for (i=0; i<line->n_sample; i++)
{
- char *ss = tok->str_value + i*ndim;
- int is = 0, ivec = 0;
- while ( ivec<tok->idx && is<ndim && ss[is] )
- {
- if ( ss[is]==',' ) ivec++;
- is++;
- }
- if ( ivec!=tok->idx || is==ndim || !ss[is] )
+ char *dst = tok->str_value.s + i*nstr1, *str = dst;
+ int nval = 0, ibeg = 0;
+ while ( ibeg < nstr1 )
{
- ss[0] = '.';
- ss[1] = 0;
- continue;
+ int iend = ibeg + 1;
+ while ( iend < nstr1 && str[iend] && str[iend]!=',' ) iend++;
+
+ int keep = 0;
+ if ( tok->idx >=0 )
+ keep = tok->idx==nval ? 1 : 0;
+ else if ( nval < tok->nidxs )
+ keep = tok->idxs[nval] ? 1 : 0;
+ else if ( tok->idxs[tok->nidxs-1] < 0 )
+ keep = 1;
+
+ if ( keep )
+ {
+ if ( ibeg>0 ) memmove(dst, str+ibeg, iend-ibeg+1);
+ dst += iend - ibeg + 1;
+ if ( tok->idx>=0 ) break;
+ }
+ if ( !str[iend] ) break;
+ ibeg = iend + 1;
+ nval++;
}
- int ie = is;
- while ( ie<ndim && ss[ie] && ss[ie]!=',' ) ie++;
- if ( is ) memmove(ss,&ss[is],ie-is);
- if ( ndim-(ie-is) ) memset(ss+ie-is,0,ndim-(ie-is));
+ if ( dst==str ) { dst[0] = '.'; dst+=2; }
+ if ( dst - str < nstr1 ) memset(dst-1, 0, nstr1 - (dst - str));
}
- if ( !ndim )
+ tok->nvalues = tok->str_value.l = nstr;
+ tok->nstr1 = nstr1;
+ tok->nsamples = line->n_sample;
+}
+static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type)
+{
+ bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
+ if ( !fmt )
{
- tok->nvalues = 0;
+ tok->nvalues = tok->str_value.l = 0;
return;
}
- tok->nvalues = ret;
+
+ int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals = type==2 ? 3 : 4;
+ if ( tok->str_value.m <= nvals*nsmpl )
+ {
+ tok->str_value.m = nvals*nsmpl + 1;
+ tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m);
+ }
+
+#define BRANCH_INT(type_t,vector_end) \
+ { \
+ for (i=0; i<line->n_sample; i++) \
+ { \
+ type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
+ int is_het = 0, has_ref = 0, missing = 0; \
+ for (j=0; j<fmt->n; j++) \
+ { \
+ if ( ptr[j]==vector_end ) break; /* smaller ploidy */ \
+ if ( bcf_gt_is_missing(ptr[j]) ) { missing=1; break; } /* missing allele */ \
+ int ial = ptr[j]; \
+ if ( bcf_gt_allele(ial)==0 ) has_ref = 1; \
+ if ( j>0 ) \
+ { \
+ int jal = ptr[j-1]; \
+ if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \
+ } \
+ } \
+ char *dst = &tok->str_value.s[nvals*i]; \
+ if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
+ else if ( type==3 ) \
+ { \
+ if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \
+ else if ( !is_het ) dst[0]='h', dst[1]='o', dst[2]='m', dst[3] = 0; /* hom */ \
+ else dst[0]='h', dst[1]='e', dst[2]='t', dst[3] = 0; /* het */ \
+ } \
+ else \
+ { \
+ if ( j==1 ) \
+ { \
+ if ( has_ref ) dst[0]='r', dst[1]=0; /* r, haploid */ \
+ else dst[0]='a', dst[1]=0; /* a, haploid */ \
+ } \
+ else if ( !is_het ) \
+ { \
+ if ( has_ref ) dst[0]='r', dst[1]='r', dst[2] = 0; /* rr */ \
+ else dst[0]='a', dst[1]='a', dst[2] = 0; /* aa */ \
+ } \
+ else \
+ { \
+ if ( has_ref ) dst[0]='r', dst[1]='a', dst[2] = 0; /* ra */ \
+ else dst[0]='a', dst[1]='A', dst[2] = 0; /* aA */ \
+ } \
+ } \
+ } \
+ }
+ switch (fmt->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
+ default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break;
+ }
+#undef BRANCH_INT
tok->nsamples = nsmpl;
+ tok->nvalues = tok->str_value.l = nvals*nsmpl;
+ tok->str_value.s[tok->str_value.l] = 0;
+ tok->nstr1 = nvals;
}
+static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); }
+static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); }
+
static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
@@ -636,67 +823,73 @@ static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *to
kstring_t str;
gt_length_too_big:
- str.s = tok->str_value; str.m = tok->values[0] * nsmpl; str.l = 0;
+ tok->str_value.l = 0;
for (i=0; i<nsmpl; i++)
{
- int plen = str.l;
+ int plen = tok->str_value.l;
- bcf_format_gt(fmt, i, &str);
- kputc_(0,&str);
- if ( str.l - plen > blen )
+ bcf_format_gt(fmt, i, &tok->str_value);
+ kputc_(0, &tok->str_value);
+ if ( tok->str_value.l - plen > blen )
{
// too many alternate alleles or ploidy is too large, the genotype does not fit
// three characters ("0/0" vs "10/10").
- tok->str_value = str.s;
blen *= 2;
goto gt_length_too_big;
}
- plen = str.l - plen;
- while ( plen<blen )
+ plen = tok->str_value.l - plen;
+ while ( plen < blen )
{
- kputc_(0, &str);
+ kputc_(0, &tok->str_value);
plen++;
}
}
- tok->nvalues = str.l;
tok->nsamples = nsmpl;
- tok->values[0] = blen;
- tok->str_value = str.s;
+ tok->nvalues = tok->str_value.l;
+ tok->nstr1 = blen;
}
static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0;
- kputs(line->d.allele[0], &str);
- tok->nvalues = str.l;
- tok->values[0] = str.m;
- tok->str_value = str.s;
+ tok->str_value.l = 0;
+ kputs(line->d.allele[0], &tok->str_value);
+ tok->nvalues = tok->str_value.l;
}
static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0;
+ tok->str_value.l = 0;
if ( tok->idx>=0 )
{
- if ( line->n_allele >= tok->idx )
- kputs(line->d.allele[tok->idx], &str);
+ if ( line->n_allele > tok->idx + 1 )
+ kputs(line->d.allele[tok->idx + 1], &tok->str_value);
else
- kputc('.', &str);
+ kputc('.', &tok->str_value);
+ tok->idx = 0;
+ }
+ else if ( tok->idx==-2 )
+ {
+ int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? line->n_allele - 1 : tok->nidxs - 1;
+ if ( end >= line->n_allele - 1 ) end = line->n_allele - 2;
+ for (i=0; i<=end; i++)
+ if ( i>=tok->nidxs || tok->idxs[i] )
+ {
+ if ( tok->str_value.l ) kputc(',', &tok->str_value);
+ kputs(line->d.allele[i+1], &tok->str_value);
+ }
}
else if ( line->n_allele>1 )
{
- kputs(line->d.allele[1], &str);
+ kputs(line->d.allele[1], &tok->str_value);
int i;
for (i=2; i<line->n_allele; i++)
{
- kputc(',', &str);
- kputs(line->d.allele[i], &str);
+ kputc(',', &tok->str_value);
+ kputs(line->d.allele[i], &tok->str_value);
}
}
else if ( line->n_allele==1 )
- kputc('.', &str);
- tok->nvalues = str.l;
- tok->values[0] = str.m;
- tok->str_value = str.s;
+ kputc('.', &tok->str_value);
+ tok->nvalues = tok->str_value.l;
}
static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok)
{
@@ -857,11 +1050,11 @@ static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok)
static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
{
tok->is_str = 0;
- if ( !tok->nvalues ) return;
+ if ( !tok->str_value.l ) return;
if ( tok->idx==-2 )
{
int i = 0;
- char *ss = tok->str_value;
+ char *ss = tok->str_value.s;
while ( *ss )
{
char *se = ss;
@@ -881,9 +1074,10 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
}
else
{
- tok->values[0] = strlen(tok->str_value);
+ tok->values[0] = strlen(tok->str_value.s);
tok->nvalues = 1;
}
+ tok->str_value.l = 0;
}
#define VECTOR_ARITHMETICS(atok,btok,AOP) \
{ \
@@ -1077,7 +1271,16 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
else \
{ \
- if ( (atok)->nsamples && (btok)->nsamples ) \
+ if ( (atok)->idx<=-2 || (btok)->idx<=-2 ) \
+ { \
+ /* any field can match: [*] */ \
+ for (i=0; i<(atok)->nvalues; i++) \
+ { \
+ for (j=0; j<(btok)->nvalues; j++) \
+ if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
+ } \
+ } \
+ else if ( (atok)->nsamples && (btok)->nsamples ) \
{ \
for (i=0; i<(atok)->nsamples; i++) \
{ \
@@ -1111,15 +1314,6 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
(atok)->nsamples = (btok)->nsamples; \
if ( !has_values ) (atok)->nvalues = 0; \
} \
- else if ( (atok)->idx==-2 || (btok)->idx==-2 ) \
- { \
- /* any field can match: [*] */ \
- for (i=0; i<(atok)->nvalues; i++) \
- { \
- for (j=0; j<(btok)->nvalues; j++) \
- if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
- } \
- } \
else \
{ \
if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \
@@ -1130,18 +1324,18 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
}
static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // logic: TOK_EQ or TOK_NE
{
- if ( !atok->nvalues ) { return 0; }
- if ( !btok->nvalues ) { atok->nvalues = 0; return 0; }
+ if ( !atok->str_value.l ) { return 0; }
+ if ( !btok->str_value.l ) { atok->str_value.l = 0; return 0; }
int i, pass_site = 0;
if ( atok->nsamples && atok->nsamples==btok->nsamples )
{
for (i=0; i<atok->nsamples; i++)
{
- char *astr = atok->str_value + i*(int)atok->values[0];
- char *bstr = btok->str_value + i*(int)btok->values[0];
- char *aend = astr + (int)atok->values[0], *a = astr;
+ char *astr = atok->str_value.s + i*atok->nstr1;
+ char *bstr = btok->str_value.s + i*btok->nstr1;
+ char *aend = astr + atok->str_value.l, *a = astr;
while ( a<aend && *a ) a++;
- char *bend = bstr + (int)btok->values[0], *b = bstr;
+ char *bend = bstr + btok->str_value.l, *b = bstr;
while ( b<bend && *b ) b++;
if ( a-astr != b-bstr ) atok->pass_samples[i] = 0;
else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0;
@@ -1161,8 +1355,8 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log
token_t *xtok, *ytok; // xtok is scalar, ytok array
if ( btok->idx==-2 ) { xtok = atok; ytok = btok; }
else { xtok = btok; ytok = atok; }
- char *xstr = xtok->str_value, *xend = xstr + xtok->nvalues;
- char *ystr = ytok->str_value, *yend = ystr + ytok->nvalues, *y = ystr;
+ char *xstr = xtok->str_value.s, *xend = xstr + xtok->str_value.l;
+ char *ystr = ytok->str_value.s, *yend = ystr + ytok->str_value.l, *y = ystr;
while ( y<=yend )
{
if ( y==yend || *y==',' )
@@ -1178,7 +1372,7 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log
}
}
else
- pass_site = strcmp(atok->str_value,btok->str_value) ? 0 : 1;
+ pass_site = strcmp(atok->str_value.s,btok->str_value.s) ? 0 : 1;
if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
}
else
@@ -1186,19 +1380,26 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log
token_t *xtok, *ytok;
if ( !atok->nsamples ) { xtok = atok; ytok = btok; }
else { xtok = btok; ytok = atok; }
- char *xstr = xtok->str_value;
- char *xend = xstr + (int)xtok->values[0], *x = xstr;
+ char *xstr = xtok->str_value.s;
+ char *xend = xstr + xtok->str_value.l, *x = xstr;
while ( x<xend && *x ) x++;
for (i=0; i<ytok->nsamples; i++)
{
- char *ystr = ytok->str_value + i*(int)ytok->values[0];
- char *yend = ystr + (int)ytok->values[0], *y = ystr;
- while ( y<yend && *y ) y++;
- if ( x-xstr != y-ystr ) atok->pass_samples[i] = 0;
- else atok->pass_samples[i] = strncmp(xstr,ystr,x-xstr)==0 ? 1 : 0;
- if ( logic!=TOK_EQ )
- atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
- pass_site |= atok->pass_samples[i];
+ char *ystr = ytok->str_value.s + i*ytok->nstr1;
+ char *ybeg = ystr, *yend = ystr + ytok->nstr1;
+ int pass = 0;
+ while ( ybeg < yend )
+ {
+ char *y = ybeg;
+ while ( y<yend && *y && *y!=',' ) y++;
+ if ( y - ybeg != x - xstr ) pass = 0;
+ else pass = strncmp(xstr,ybeg,x-xstr)==0 ? 1 : 0;
+ if ( logic!=TOK_EQ ) pass = pass ? 0 : 1;
+ if ( pass || !*y ) break;
+ ybeg = y+1;
+ }
+ atok->pass_samples[i] = pass;
+ pass_site |= pass;
}
if ( !atok->nsamples )
atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set
@@ -1212,18 +1413,70 @@ static int regex_vector_strings(token_t *atok, token_t *btok, int negate)
{
for (i=0; i<atok->nsamples; i++)
{
- char *ptr = atok->str_value + i*(int)atok->values[0];
+ char *ptr = atok->str_value.s + i*atok->nstr1;
atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1;
if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
pass_site |= atok->pass_samples[i];
}
return pass_site;
}
- pass_site = regexec(btok->regex, atok->str_value, 0,NULL,0) ? 0 : 1;
+ pass_site = regexec(btok->regex, atok->str_value.s, 0,NULL,0) ? 0 : 1;
if ( negate ) pass_site = pass_site ? 0 : 1;
return pass_site;
}
+static void parse_tag_idx(char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG["
+{
+ // TAG[*] .. any field
+ if ( !strncmp("*]", tag_idx, 3) )
+ {
+ tok->idxs = (int*) malloc(sizeof(int));
+ tok->idxs[0] = -1;
+ tok->nidxs = 1;
+ tok->idx = -2;
+ return;
+ }
+
+ // TAG[integer] .. one field
+ char *end, *beg = tag_idx;
+ tok->idx = strtol(tag_idx, &end, 10);
+ if ( tok->idx >= 0 && *end==']' ) return;
+
+
+ // TAG[0,1] or TAG[0-2] or [1-] etc
+ int i, ibeg = -1;
+ while ( *beg && *beg!=']' )
+ {
+ int idx = strtol(beg, &end, 10);
+ if ( end[0]==',' ) beg = end + 1;
+ else if ( end[0]==']' ) beg = end;
+ else if ( end[0]=='-' ) { beg = end + 1; ibeg = idx; continue; }
+ else error("Could not parse the index: %s[%s\n", tag, tag_idx+1);
+ if ( idx >= tok->nidxs )
+ {
+ tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(idx+1));
+ memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(idx - tok->nidxs + 1));
+ tok->nidxs = idx + 1;
+ }
+ if ( ibeg>=0 )
+ {
+ for (i=ibeg; i<=idx; i++) tok->idxs[i] = 1;
+ ibeg = -1;
+ }
+ tok->idxs[idx] = 1;
+ }
+ if ( ibeg >=0 )
+ {
+ if ( ibeg >= tok->nidxs )
+ {
+ tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(ibeg+1));
+ memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(ibeg - tok->nidxs + 1));
+ tok->nidxs = ibeg + 1;
+ }
+ tok->idxs[ibeg] = -1;
+ }
+ tok->idx = -2;
+}
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
{
tok->tok_type = TOK_VAL;
@@ -1361,17 +1614,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
int i;
for (i=0; i<tmp.l; i++)
if ( tmp.s[i]=='[' ) { tmp.s[i] = 0; is_array = i+1; break; }
- if ( is_array )
- {
- if ( tmp.s[is_array]=='*' )
- tok->idx = -2; // tag[*] .. any field
- else
- {
- char *end;
- tok->idx = strtol(tmp.s+is_array, &end, 10);
- if ( *end!=']' ) error("Could not parse the index: %s[%s\n", tmp.s,tmp.s+is_array);
- }
- }
+ if ( is_array )
+ parse_tag_idx(tmp.s, tmp.s+is_array, tok);
}
tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s);
if ( is_fmt==-1 )
@@ -1425,7 +1669,13 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
case BCF_HT_STR: tok->setter = &filters_set_info_string; tok->is_str = 1; break;
default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
}
- if(!is_array) tok->idx = -2;
+ if (!is_array)
+ {
+ tok->idx = -2;
+ tok->idxs = (int*) malloc(sizeof(int));
+ tok->idxs[0] = -1;
+ tok->nidxs = 1;
+ }
}
}
filter->max_unpack |= BCF_UN_INFO;
@@ -1518,6 +1768,11 @@ static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks)
}
}
+static void str_to_lower(char *str)
+{
+ while ( *str ) { *str = tolower(*str); str++; }
+}
+
// Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
@@ -1538,8 +1793,8 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
ret = filters_next_token(&tmp, &len);
if ( ret==-1 ) error("Missing quotes in: %s\n", str);
- //fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
- //int i; for (i=0; i<nops; i++) fprintf(stderr," .%c.", TOKEN_STRING[ops[i]]); fprintf(stderr,"\n");
+ // fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
+ // int i; for (i=0; i<nops; i++) fprintf(stderr," .%c.", TOKEN_STRING[ops[i]]); fprintf(stderr,"\n");
if ( ret==TOK_LFT ) // left bracket
{
@@ -1670,6 +1925,28 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
i = itok;
continue;
}
+ if ( !strcmp(out[i].tag,"GT") )
+ {
+ if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
+ int ival;
+ if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1;
+ else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = i + 1;
+ else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1;
+ else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+
+ // assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc
+ if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; } // ra
+ else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; } // aA
+ else if ( !strcasecmp(out[ival].key,"aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='a'; } // aa
+ else if ( !strcasecmp(out[ival].key,"a") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]=0; } // a
+ else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; } // r
+ continue;
+ }
if ( !strcmp(out[i].tag,"FILTER") )
{
if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
@@ -1728,9 +2005,10 @@ void filter_destroy(filter_t *filter)
int i;
for (i=0; i<filter->nfilters; i++)
{
- //if ( filter->filters[i].key ) free(filter->filters[i].key);
- free(filter->filters[i].str_value);
+ if ( filter->filters[i].key ) free(filter->filters[i].key);
+ free(filter->filters[i].str_value.s);
free(filter->filters[i].tag);
+ free(filter->filters[i].idxs);
free(filter->filters[i].values);
free(filter->filters[i].pass_samples);
if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash);
@@ -1745,6 +2023,7 @@ void filter_destroy(filter_t *filter)
free(filter->str);
free(filter->tmpi);
free(filter->tmpf);
+ free(filter->tmps.s);
free(filter);
}
@@ -1765,16 +2044,15 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
filter->filters[i].setter(filter, line, &filter->filters[i]);
else if ( filter->filters[i].key ) // string constant
{
- filter->filters[i].str_value = filter->filters[i].key;
- filter->filters[i].values[0] = filter->filters[i].values[0];
- filter->filters[i].nvalues = strlen(filter->filters[i].key);
+ filter->filters[i].str_value.l = 0;
+ kputs(filter->filters[i].key, &filter->filters[i].str_value);
+ filter->filters[i].nvalues = filter->filters[i].str_value.l;
}
else // numeric constant
{
filter->filters[i].values[0] = filter->filters[i].threshold;
filter->filters[i].nvalues = 1;
}
-
filter->flt_stack[nstack++] = &filter->filters[i];
continue;
}
diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c
index 25200c4..81f8734 100644
--- a/bcftools/filter.c.pysam.c
+++ b/bcftools/filter.c.pysam.c
@@ -69,21 +69,23 @@ typedef struct _token_t
char *tag; // for debugging and printout only, VCF tag name
double threshold; // filtering threshold
int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types
- int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*])
+ int idx; // 0-based index to VCF vectors, -1: not a vector,
+ // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
+ int *idxs, nidxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *);
void *hash; // test presence of str value in the hash via comparator
regex_t *regex; // precompiled regex for string comparison
// modified on filter evaluation at each VCF line
- double *values; // In case str_value is set, values[0] is one sample's string length
- char *str_value; // and values[0]*nsamples gives the total length;
+ double *values;
+ kstring_t str_value;
int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
int pass_site; // -1 not applicable, 0 fails, >0 pass
uint8_t *pass_samples; // status of individual samples
int nsamples; // number of samples
- int nvalues, mvalues; // number of used values, n=0 for missing values, n=1 for scalars
- // for strings, total length of str_value
+ int nvalues, mvalues; // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l
+ int nstr1; // per-sample string length, set only with str_value.l>0 && nsamples>1
}
token_t;
@@ -95,6 +97,7 @@ struct _filter_t
token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack
int32_t *tmpi;
float *tmpf;
+ kstring_t tmps;
int max_unpack, mtmpi, mtmpf, nsamples;
};
@@ -171,6 +174,7 @@ static int filters_next_token(char **str, int *len)
return TOK_VAL;
}
+ int square_brackets = 0;
while ( tmp[0] )
{
if ( tmp[0]=='"' ) break;
@@ -185,11 +189,12 @@ static int filters_next_token(char **str, int *len)
if ( tmp[0]=='(' ) break;
if ( tmp[0]==')' ) break;
if ( tmp[0]=='+' ) break;
- // hacky: so that [*] is not split, the tokenizer does not recognise square brackets []
- if ( tmp[0]=='*' && (tmp==*str || tmp[-1]!='[') ) break;
- if ( tmp[0]=='-' ) break;
+ if ( tmp[0]=='*' && !square_brackets ) break;
+ if ( tmp[0]=='-' && !square_brackets ) break;
if ( tmp[0]=='/' ) break;
if ( tmp[0]=='~' ) break;
+ if ( tmp[0]==']' ) { if (square_brackets) tmp++; break; }
+ if ( tmp[0]=='[' ) square_brackets++;
tmp++;
}
if ( tmp > *str )
@@ -272,12 +277,15 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
else if ( line->d.info[i].type==BCF_BT_CHAR )
{
int n = line->d.info[i].len;
- int m = (int)tok->values[0];
- hts_expand(char,n+1,m,tok->str_value);
- memcpy(tok->str_value,line->d.info[i].vptr,n);
- tok->str_value[n] = 0;
- tok->values[0] = m;
- tok->nvalues = n;
+ if ( n >= tok->str_value.m )
+ {
+ tok->str_value.m = n + 1;
+ tok->str_value.s = (char*) realloc(tok->str_value.s, tok->str_value.m);
+ if ( !tok->str_value.s ) error("Failed to alloc %d bytes\n", (int)tok->str_value.m);
+ }
+ memcpy(tok->str_value.s, line->d.info[i].vptr, n);
+ tok->str_value.s[n] = 0;
+ tok->nvalues = tok->str_value.l = n;
}
else if ( line->d.info[i].type==BCF_BT_FLOAT )
{
@@ -287,10 +295,11 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
tok->values[0] = line->d.info[i].v1.f;
tok->nvalues = 1;
}
- tok->str_value = NULL;
+ tok->str_value.l = 0;
}
else
{
+ tok->str_value.l = 0;
if ( line->d.info[i].type==BCF_BT_INT8 && line->d.info[i].v1.i==bcf_int8_missing ) tok->nvalues = 0;
else if ( line->d.info[i].type==BCF_BT_INT16 && line->d.info[i].v1.i==bcf_int16_missing ) tok->nvalues = 0;
else if ( line->d.info[i].type==BCF_BT_INT32 && line->d.info[i].v1.i==bcf_int32_missing ) tok->nvalues = 0;
@@ -299,7 +308,6 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
tok->values[0] = line->d.info[i].v1.i;
tok->nvalues = 1;
}
- tok->str_value = NULL;
}
}
static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
@@ -348,8 +356,8 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin
return ret ? 0 : 1;
}
- if ( op_type==TOK_EQ ) return strcmp(btok->str_value,line->d.id) ? 0 : 1;
- return strcmp(btok->str_value,line->d.id) ? 1 : 0;
+ if ( op_type==TOK_EQ ) return strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
+ return strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
}
/**
@@ -411,13 +419,16 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok)
{
if ( tok->idx==-2 )
{
- int i;
tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
if ( tok->nvalues<=0 ) tok->nvalues = 0;
else
{
hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- for (i=0; i<tok->nvalues; i++) tok->values[i] = flt->tmpi[i];
+ int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1;
+ if ( end >= tok->nvalues ) end = tok->nvalues - 1;
+ for (i=0; i<=end; i++)
+ if ( i>=tok->nidxs || tok->idxs[i] ) tok->values[j++] = flt->tmpi[i];
+ tok->nvalues = j;
}
}
else
@@ -437,15 +448,21 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
if ( tok->idx==-2 )
{
- int i;
tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf);
if ( tok->nvalues<=0 ) tok->nvalues = 0;
else
{
hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- for (i=0; i<tok->nvalues; i++)
- if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[i]);
- else tok->values[i] = flt->tmpf[i];
+ int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1;
+ if ( end >= tok->nvalues ) end = tok->nvalues - 1;
+ for (i=0; i<=end; i++)
+ if ( i>=tok->nidxs || tok->idxs[i] )
+ {
+ if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[j]);
+ else tok->values[j] = flt->tmpf[i];
+ j++;
+ }
+ tok->nvalues = j;
}
}
else
@@ -463,33 +480,64 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- int m = (int)tok->values[0];
- int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value,&m);
- if ( n<0 ) { tok->nvalues = 0; return; }
- tok->values[0] = m; // allocated length
+ int32_t m = tok->str_value.m;
+ int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value.s,&m);
+ tok->str_value.m = m;
+ if ( n<0 ) { tok->nvalues = tok->str_value.l = 0; return; }
if ( tok->idx>=0 )
{
// get ith field (i=tok->idx)
int i = 0;
- char *ss = tok->str_value, *se = tok->str_value + n;
+ char *ss = tok->str_value.s, *se = tok->str_value.s + n;
while ( ss<se && i<tok->idx )
{
if ( *ss==',' ) i++;
ss++;
}
- if ( ss==se || i!=tok->idx ) { tok->nvalues = 0; return; }
+ if ( ss==se || i!=tok->idx ) { tok->nvalues = tok->str_value.l = 0; return; }
se = ss;
- while ( se-tok->str_value<n && *se!=',' ) se++;
- if ( ss==tok->str_value ) *se = 0;
+ while ( se - tok->str_value.s < n && *se!=',' ) se++;
+ if ( ss==tok->str_value.s ) *se = 0;
else
{
- memmove(tok->str_value,ss,se-ss);
- tok->str_value[se-ss] = 0;
+ memmove(tok->str_value.s, ss, se-ss);
+ tok->str_value.s[se-ss] = 0;
}
- tok->nvalues = se-ss;
+ tok->str_value.l = se - ss;
}
- else if ( tok->idx==-2 ) tok->nvalues = n;
+ else if ( tok->idx==-2 && tok->idxs[0]==-1 ) // keep all values, TAG[*]
+ tok->str_value.l = n;
+ else if ( tok->idx==-2 )
+ {
+ flt->tmps.l = 0;
+ ks_resize(&flt->tmps, n);
+ int i, end = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
+ if ( end >= n ) end = n - 1;
+ char *beg = tok->str_value.s, *dst = flt->tmps.s;
+ for (i=0; i<=end; i++)
+ {
+ char *end = beg;
+ while ( *end && *end!=',' ) end++;
+
+ if ( i>=tok->nidxs || tok->idxs[i] )
+ {
+ memcpy(dst, beg, end - beg);
+ dst += end - beg;
+ dst[0] = ',';
+ dst++;
+ }
+
+ beg = end+1;
+ }
+ dst[0] = 0;
+ tok->str_value.l = dst - flt->tmps.s;
+
+ #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
+ SWAP(char *, flt->tmps.s, tok->str_value.s);
+ SWAP(size_t, flt->tmps.m, tok->str_value.m);
+ }
+ tok->nvalues = tok->str_value.l;
}
static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok)
@@ -505,127 +553,266 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
{
int i;
if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
- tok->nvalues = 0;
- else
{
+ tok->nvalues = tok->nsamples = 0;
+ return;
+ }
+ if ( tok->idx >= -1 ) // scalar or vector index
+ {
+ hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
+ int nvals = tok->nvalues / line->n_sample;
+ int idx = tok->idx >= 0 ? tok->idx : 0;
int is_missing = 1;
- hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- for (i=0; i<tok->nvalues; i++)
+ int32_t *ptr = flt->tmpi;
+ for (i=0; i<line->n_sample; i++)
{
- if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end )
+ if ( ptr[idx]==bcf_int32_missing || ptr[idx]==bcf_int32_vector_end )
bcf_double_set_missing(tok->values[i]);
else
{
- tok->values[i] = flt->tmpi[i];
+ tok->values[i] = ptr[idx];
is_missing = 0;
}
+ ptr += nvals;
}
if ( is_missing ) tok->nvalues = 0;
- else if ( tok->idx >= 0 )
+ else tok->nvalues = line->n_sample;
+ tok->nsamples = tok->nvalues;
+ return;
+ }
+ if ( tok->idx == -2 )
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ int nvals = tok->nvalues / line->n_sample;
+ int idx = tok->idx >= 0 ? tok->idx : 0;
+ int is_missing = 1;
+ int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
+ if ( end >= nvals ) end = nvals - 1;
+ int32_t *ptr = flt->tmpi;
+ for (i=0; i<line->n_sample; i++)
{
- int nsmpl = bcf_hdr_nsamples(flt->hdr);
- int nvals = tok->nvalues / nsmpl;
- if ( tok->idx >= nvals )
- tok->nvalues = 0; // the index is too big
- else
- {
- for (i=0; i<nsmpl; i++)
- tok->values[i] = tok->values[i*nvals+tok->idx];
- tok->nvalues = nsmpl;
- }
+ for (k=0; k<=end; k++)
+ if ( k>=tok->nidxs || tok->idxs[k] )
+ {
+ if ( ptr[k]==bcf_int32_missing || ptr[k]==bcf_int32_vector_end )
+ bcf_double_set_missing(tok->values[j]);
+ else
+ {
+ tok->values[j] = ptr[k];
+ is_missing = 0;
+ }
+ j++;
+ }
+ ptr += nvals;
+ }
+ if ( is_missing ) tok->nvalues = tok->nsamples = 0;
+ else
+ {
+ tok->nsamples = line->n_sample;
+ tok->nvalues = j;
}
+ return;
}
- tok->nsamples = tok->nvalues;
}
static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
int i;
- if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<=0 )
+ if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
{
- tok->nvalues = tok->nsamples = 0; // missing values
+ tok->nvalues = tok->nsamples = 0;
+ return;
}
- else
+ if ( tok->idx >= -1 ) // scalar or vector index
{
+ hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
+ int nvals = tok->nvalues / line->n_sample;
+ int idx = tok->idx >= 0 ? tok->idx : 0;
int is_missing = 1;
- hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- for (i=0; i<tok->nvalues; i++)
+ float *ptr = flt->tmpf;
+ for (i=0; i<line->n_sample; i++)
{
- if ( bcf_float_is_missing(flt->tmpf[i]) || bcf_float_is_vector_end(flt->tmpf[i]) )
+ if ( bcf_float_is_missing(ptr[idx]) || bcf_float_is_vector_end(ptr[idx]) )
bcf_double_set_missing(tok->values[i]);
else
{
- tok->values[i] = flt->tmpf[i];
+ tok->values[i] = ptr[idx];
is_missing = 0;
}
+ ptr += nvals;
}
if ( is_missing ) tok->nvalues = 0;
- else if ( tok->idx >= 0 )
+ else tok->nvalues = line->n_sample;
+ tok->nsamples = tok->nvalues;
+ return;
+ }
+ if ( tok->idx == -2 )
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ int nvals = tok->nvalues / line->n_sample;
+ int idx = tok->idx >= 0 ? tok->idx : 0;
+ int is_missing = 1;
+ int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
+ if ( end >= nvals ) end = nvals - 1;
+ float *ptr = flt->tmpf;
+ for (i=0; i<line->n_sample; i++)
{
- int nsmpl = bcf_hdr_nsamples(flt->hdr);
- int nvals = tok->nvalues / nsmpl;
- if ( tok->idx >= nvals )
- tok->nvalues = 0; // the index is too big
- else
- {
- for (i=0; i<nsmpl; i++)
- tok->values[i] = tok->values[i*nvals+tok->idx];
- tok->nvalues = nsmpl;
- }
+ for (k=0; k<=end; k++)
+ if ( k>=tok->nidxs || tok->idxs[k] )
+ {
+ if ( bcf_float_is_missing(ptr[k]) || bcf_float_is_vector_end(ptr[k]) )
+ bcf_double_set_missing(tok->values[j]);
+ else
+ {
+ tok->values[j] = ptr[k];
+ is_missing = 0;
+ }
+ j++;
+ }
+ ptr += nvals;
+ }
+ if ( is_missing ) tok->nvalues = tok->nsamples = 0;
+ else
+ {
+ tok->nsamples = line->n_sample;
+ tok->nvalues = j;
}
+ return;
}
- tok->nsamples = tok->nvalues;
}
static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- int ndim = tok->nsamples * (int)tok->values[0];
- int ret = bcf_get_format_char(flt->hdr,line,tok->tag,&tok->str_value,&ndim);
+ tok->str_value.l = tok->nvalues = 0;
+ if ( !line->n_sample ) return;
- int nsmpl = bcf_hdr_nsamples(flt->hdr);
- ndim /= nsmpl;
- tok->values[0] = ndim;
+ int ndim = tok->str_value.m;
+ int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim);
+ tok->str_value.m = ndim;
- if ( ret<=0 )
- {
- tok->nvalues = 0;
- return;
- }
+ if ( nstr<=0 ) return;
- if ( tok->idx < 0 ) // scalar
+ if ( tok->idx == -1 || (tok->idx==-2 && tok->idxs[0]==-1) ) // scalar or keep all values of a vector: TAG[*]
{
- tok->nvalues = tok->nsamples = nsmpl;
+ tok->nsamples = line->n_sample;
+ tok->nstr1 = ndim / line->n_sample;
+ tok->nvalues = tok->str_value.l = nstr;
return;
}
- // vector
+ int nstr1 = nstr / line->n_sample;
+
+ // vector, one or multiple indices
int i;
- for (i=0; i<nsmpl; i++)
+ for (i=0; i<line->n_sample; i++)
{
- char *ss = tok->str_value + i*ndim;
- int is = 0, ivec = 0;
- while ( ivec<tok->idx && is<ndim && ss[is] )
- {
- if ( ss[is]==',' ) ivec++;
- is++;
- }
- if ( ivec!=tok->idx || is==ndim || !ss[is] )
+ char *dst = tok->str_value.s + i*nstr1, *str = dst;
+ int nval = 0, ibeg = 0;
+ while ( ibeg < nstr1 )
{
- ss[0] = '.';
- ss[1] = 0;
- continue;
+ int iend = ibeg + 1;
+ while ( iend < nstr1 && str[iend] && str[iend]!=',' ) iend++;
+
+ int keep = 0;
+ if ( tok->idx >=0 )
+ keep = tok->idx==nval ? 1 : 0;
+ else if ( nval < tok->nidxs )
+ keep = tok->idxs[nval] ? 1 : 0;
+ else if ( tok->idxs[tok->nidxs-1] < 0 )
+ keep = 1;
+
+ if ( keep )
+ {
+ if ( ibeg>0 ) memmove(dst, str+ibeg, iend-ibeg+1);
+ dst += iend - ibeg + 1;
+ if ( tok->idx>=0 ) break;
+ }
+ if ( !str[iend] ) break;
+ ibeg = iend + 1;
+ nval++;
}
- int ie = is;
- while ( ie<ndim && ss[ie] && ss[ie]!=',' ) ie++;
- if ( is ) memmove(ss,&ss[is],ie-is);
- if ( ndim-(ie-is) ) memset(ss+ie-is,0,ndim-(ie-is));
+ if ( dst==str ) { dst[0] = '.'; dst+=2; }
+ if ( dst - str < nstr1 ) memset(dst-1, 0, nstr1 - (dst - str));
}
- if ( !ndim )
+ tok->nvalues = tok->str_value.l = nstr;
+ tok->nstr1 = nstr1;
+ tok->nsamples = line->n_sample;
+}
+static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type)
+{
+ bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
+ if ( !fmt )
{
- tok->nvalues = 0;
+ tok->nvalues = tok->str_value.l = 0;
return;
}
- tok->nvalues = ret;
+
+ int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals = type==2 ? 3 : 4;
+ if ( tok->str_value.m <= nvals*nsmpl )
+ {
+ tok->str_value.m = nvals*nsmpl + 1;
+ tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m);
+ }
+
+#define BRANCH_INT(type_t,vector_end) \
+ { \
+ for (i=0; i<line->n_sample; i++) \
+ { \
+ type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
+ int is_het = 0, has_ref = 0, missing = 0; \
+ for (j=0; j<fmt->n; j++) \
+ { \
+ if ( ptr[j]==vector_end ) break; /* smaller ploidy */ \
+ if ( bcf_gt_is_missing(ptr[j]) ) { missing=1; break; } /* missing allele */ \
+ int ial = ptr[j]; \
+ if ( bcf_gt_allele(ial)==0 ) has_ref = 1; \
+ if ( j>0 ) \
+ { \
+ int jal = ptr[j-1]; \
+ if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \
+ } \
+ } \
+ char *dst = &tok->str_value.s[nvals*i]; \
+ if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
+ else if ( type==3 ) \
+ { \
+ if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \
+ else if ( !is_het ) dst[0]='h', dst[1]='o', dst[2]='m', dst[3] = 0; /* hom */ \
+ else dst[0]='h', dst[1]='e', dst[2]='t', dst[3] = 0; /* het */ \
+ } \
+ else \
+ { \
+ if ( j==1 ) \
+ { \
+ if ( has_ref ) dst[0]='r', dst[1]=0; /* r, haploid */ \
+ else dst[0]='a', dst[1]=0; /* a, haploid */ \
+ } \
+ else if ( !is_het ) \
+ { \
+ if ( has_ref ) dst[0]='r', dst[1]='r', dst[2] = 0; /* rr */ \
+ else dst[0]='a', dst[1]='a', dst[2] = 0; /* aa */ \
+ } \
+ else \
+ { \
+ if ( has_ref ) dst[0]='r', dst[1]='a', dst[2] = 0; /* ra */ \
+ else dst[0]='a', dst[1]='A', dst[2] = 0; /* aA */ \
+ } \
+ } \
+ } \
+ }
+ switch (fmt->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
+ default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break;
+ }
+#undef BRANCH_INT
tok->nsamples = nsmpl;
+ tok->nvalues = tok->str_value.l = nvals*nsmpl;
+ tok->str_value.s[tok->str_value.l] = 0;
+ tok->nstr1 = nvals;
}
+static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); }
+static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); }
+
static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
@@ -638,67 +825,73 @@ static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *to
kstring_t str;
gt_length_too_big:
- str.s = tok->str_value; str.m = tok->values[0] * nsmpl; str.l = 0;
+ tok->str_value.l = 0;
for (i=0; i<nsmpl; i++)
{
- int plen = str.l;
+ int plen = tok->str_value.l;
- bcf_format_gt(fmt, i, &str);
- kputc_(0,&str);
- if ( str.l - plen > blen )
+ bcf_format_gt(fmt, i, &tok->str_value);
+ kputc_(0, &tok->str_value);
+ if ( tok->str_value.l - plen > blen )
{
// too many alternate alleles or ploidy is too large, the genotype does not fit
// three characters ("0/0" vs "10/10").
- tok->str_value = str.s;
blen *= 2;
goto gt_length_too_big;
}
- plen = str.l - plen;
- while ( plen<blen )
+ plen = tok->str_value.l - plen;
+ while ( plen < blen )
{
- kputc_(0, &str);
+ kputc_(0, &tok->str_value);
plen++;
}
}
- tok->nvalues = str.l;
tok->nsamples = nsmpl;
- tok->values[0] = blen;
- tok->str_value = str.s;
+ tok->nvalues = tok->str_value.l;
+ tok->nstr1 = blen;
}
static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0;
- kputs(line->d.allele[0], &str);
- tok->nvalues = str.l;
- tok->values[0] = str.m;
- tok->str_value = str.s;
+ tok->str_value.l = 0;
+ kputs(line->d.allele[0], &tok->str_value);
+ tok->nvalues = tok->str_value.l;
}
static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0;
+ tok->str_value.l = 0;
if ( tok->idx>=0 )
{
- if ( line->n_allele >= tok->idx )
- kputs(line->d.allele[tok->idx], &str);
+ if ( line->n_allele > tok->idx + 1 )
+ kputs(line->d.allele[tok->idx + 1], &tok->str_value);
else
- kputc('.', &str);
+ kputc('.', &tok->str_value);
+ tok->idx = 0;
+ }
+ else if ( tok->idx==-2 )
+ {
+ int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? line->n_allele - 1 : tok->nidxs - 1;
+ if ( end >= line->n_allele - 1 ) end = line->n_allele - 2;
+ for (i=0; i<=end; i++)
+ if ( i>=tok->nidxs || tok->idxs[i] )
+ {
+ if ( tok->str_value.l ) kputc(',', &tok->str_value);
+ kputs(line->d.allele[i+1], &tok->str_value);
+ }
}
else if ( line->n_allele>1 )
{
- kputs(line->d.allele[1], &str);
+ kputs(line->d.allele[1], &tok->str_value);
int i;
for (i=2; i<line->n_allele; i++)
{
- kputc(',', &str);
- kputs(line->d.allele[i], &str);
+ kputc(',', &tok->str_value);
+ kputs(line->d.allele[i], &tok->str_value);
}
}
else if ( line->n_allele==1 )
- kputc('.', &str);
- tok->nvalues = str.l;
- tok->values[0] = str.m;
- tok->str_value = str.s;
+ kputc('.', &tok->str_value);
+ tok->nvalues = tok->str_value.l;
}
static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok)
{
@@ -859,11 +1052,11 @@ static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok)
static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
{
tok->is_str = 0;
- if ( !tok->nvalues ) return;
+ if ( !tok->str_value.l ) return;
if ( tok->idx==-2 )
{
int i = 0;
- char *ss = tok->str_value;
+ char *ss = tok->str_value.s;
while ( *ss )
{
char *se = ss;
@@ -883,9 +1076,10 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
}
else
{
- tok->values[0] = strlen(tok->str_value);
+ tok->values[0] = strlen(tok->str_value.s);
tok->nvalues = 1;
}
+ tok->str_value.l = 0;
}
#define VECTOR_ARITHMETICS(atok,btok,AOP) \
{ \
@@ -1079,7 +1273,16 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
else \
{ \
- if ( (atok)->nsamples && (btok)->nsamples ) \
+ if ( (atok)->idx<=-2 || (btok)->idx<=-2 ) \
+ { \
+ /* any field can match: [*] */ \
+ for (i=0; i<(atok)->nvalues; i++) \
+ { \
+ for (j=0; j<(btok)->nvalues; j++) \
+ if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
+ } \
+ } \
+ else if ( (atok)->nsamples && (btok)->nsamples ) \
{ \
for (i=0; i<(atok)->nsamples; i++) \
{ \
@@ -1113,15 +1316,6 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
(atok)->nsamples = (btok)->nsamples; \
if ( !has_values ) (atok)->nvalues = 0; \
} \
- else if ( (atok)->idx==-2 || (btok)->idx==-2 ) \
- { \
- /* any field can match: [*] */ \
- for (i=0; i<(atok)->nvalues; i++) \
- { \
- for (j=0; j<(btok)->nvalues; j++) \
- if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
- } \
- } \
else \
{ \
if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \
@@ -1132,18 +1326,18 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
}
static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // logic: TOK_EQ or TOK_NE
{
- if ( !atok->nvalues ) { return 0; }
- if ( !btok->nvalues ) { atok->nvalues = 0; return 0; }
+ if ( !atok->str_value.l ) { return 0; }
+ if ( !btok->str_value.l ) { atok->str_value.l = 0; return 0; }
int i, pass_site = 0;
if ( atok->nsamples && atok->nsamples==btok->nsamples )
{
for (i=0; i<atok->nsamples; i++)
{
- char *astr = atok->str_value + i*(int)atok->values[0];
- char *bstr = btok->str_value + i*(int)btok->values[0];
- char *aend = astr + (int)atok->values[0], *a = astr;
+ char *astr = atok->str_value.s + i*atok->nstr1;
+ char *bstr = btok->str_value.s + i*btok->nstr1;
+ char *aend = astr + atok->str_value.l, *a = astr;
while ( a<aend && *a ) a++;
- char *bend = bstr + (int)btok->values[0], *b = bstr;
+ char *bend = bstr + btok->str_value.l, *b = bstr;
while ( b<bend && *b ) b++;
if ( a-astr != b-bstr ) atok->pass_samples[i] = 0;
else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0;
@@ -1163,8 +1357,8 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log
token_t *xtok, *ytok; // xtok is scalar, ytok array
if ( btok->idx==-2 ) { xtok = atok; ytok = btok; }
else { xtok = btok; ytok = atok; }
- char *xstr = xtok->str_value, *xend = xstr + xtok->nvalues;
- char *ystr = ytok->str_value, *yend = ystr + ytok->nvalues, *y = ystr;
+ char *xstr = xtok->str_value.s, *xend = xstr + xtok->str_value.l;
+ char *ystr = ytok->str_value.s, *yend = ystr + ytok->str_value.l, *y = ystr;
while ( y<=yend )
{
if ( y==yend || *y==',' )
@@ -1180,7 +1374,7 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log
}
}
else
- pass_site = strcmp(atok->str_value,btok->str_value) ? 0 : 1;
+ pass_site = strcmp(atok->str_value.s,btok->str_value.s) ? 0 : 1;
if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
}
else
@@ -1188,19 +1382,26 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log
token_t *xtok, *ytok;
if ( !atok->nsamples ) { xtok = atok; ytok = btok; }
else { xtok = btok; ytok = atok; }
- char *xstr = xtok->str_value;
- char *xend = xstr + (int)xtok->values[0], *x = xstr;
+ char *xstr = xtok->str_value.s;
+ char *xend = xstr + xtok->str_value.l, *x = xstr;
while ( x<xend && *x ) x++;
for (i=0; i<ytok->nsamples; i++)
{
- char *ystr = ytok->str_value + i*(int)ytok->values[0];
- char *yend = ystr + (int)ytok->values[0], *y = ystr;
- while ( y<yend && *y ) y++;
- if ( x-xstr != y-ystr ) atok->pass_samples[i] = 0;
- else atok->pass_samples[i] = strncmp(xstr,ystr,x-xstr)==0 ? 1 : 0;
- if ( logic!=TOK_EQ )
- atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
- pass_site |= atok->pass_samples[i];
+ char *ystr = ytok->str_value.s + i*ytok->nstr1;
+ char *ybeg = ystr, *yend = ystr + ytok->nstr1;
+ int pass = 0;
+ while ( ybeg < yend )
+ {
+ char *y = ybeg;
+ while ( y<yend && *y && *y!=',' ) y++;
+ if ( y - ybeg != x - xstr ) pass = 0;
+ else pass = strncmp(xstr,ybeg,x-xstr)==0 ? 1 : 0;
+ if ( logic!=TOK_EQ ) pass = pass ? 0 : 1;
+ if ( pass || !*y ) break;
+ ybeg = y+1;
+ }
+ atok->pass_samples[i] = pass;
+ pass_site |= pass;
}
if ( !atok->nsamples )
atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set
@@ -1214,18 +1415,70 @@ static int regex_vector_strings(token_t *atok, token_t *btok, int negate)
{
for (i=0; i<atok->nsamples; i++)
{
- char *ptr = atok->str_value + i*(int)atok->values[0];
+ char *ptr = atok->str_value.s + i*atok->nstr1;
atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1;
if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
pass_site |= atok->pass_samples[i];
}
return pass_site;
}
- pass_site = regexec(btok->regex, atok->str_value, 0,NULL,0) ? 0 : 1;
+ pass_site = regexec(btok->regex, atok->str_value.s, 0,NULL,0) ? 0 : 1;
if ( negate ) pass_site = pass_site ? 0 : 1;
return pass_site;
}
+static void parse_tag_idx(char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG["
+{
+ // TAG[*] .. any field
+ if ( !strncmp("*]", tag_idx, 3) )
+ {
+ tok->idxs = (int*) malloc(sizeof(int));
+ tok->idxs[0] = -1;
+ tok->nidxs = 1;
+ tok->idx = -2;
+ return;
+ }
+
+ // TAG[integer] .. one field
+ char *end, *beg = tag_idx;
+ tok->idx = strtol(tag_idx, &end, 10);
+ if ( tok->idx >= 0 && *end==']' ) return;
+
+
+ // TAG[0,1] or TAG[0-2] or [1-] etc
+ int i, ibeg = -1;
+ while ( *beg && *beg!=']' )
+ {
+ int idx = strtol(beg, &end, 10);
+ if ( end[0]==',' ) beg = end + 1;
+ else if ( end[0]==']' ) beg = end;
+ else if ( end[0]=='-' ) { beg = end + 1; ibeg = idx; continue; }
+ else error("Could not parse the index: %s[%s\n", tag, tag_idx+1);
+ if ( idx >= tok->nidxs )
+ {
+ tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(idx+1));
+ memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(idx - tok->nidxs + 1));
+ tok->nidxs = idx + 1;
+ }
+ if ( ibeg>=0 )
+ {
+ for (i=ibeg; i<=idx; i++) tok->idxs[i] = 1;
+ ibeg = -1;
+ }
+ tok->idxs[idx] = 1;
+ }
+ if ( ibeg >=0 )
+ {
+ if ( ibeg >= tok->nidxs )
+ {
+ tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(ibeg+1));
+ memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(ibeg - tok->nidxs + 1));
+ tok->nidxs = ibeg + 1;
+ }
+ tok->idxs[ibeg] = -1;
+ }
+ tok->idx = -2;
+}
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
{
tok->tok_type = TOK_VAL;
@@ -1363,17 +1616,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
int i;
for (i=0; i<tmp.l; i++)
if ( tmp.s[i]=='[' ) { tmp.s[i] = 0; is_array = i+1; break; }
- if ( is_array )
- {
- if ( tmp.s[is_array]=='*' )
- tok->idx = -2; // tag[*] .. any field
- else
- {
- char *end;
- tok->idx = strtol(tmp.s+is_array, &end, 10);
- if ( *end!=']' ) error("Could not parse the index: %s[%s\n", tmp.s,tmp.s+is_array);
- }
- }
+ if ( is_array )
+ parse_tag_idx(tmp.s, tmp.s+is_array, tok);
}
tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s);
if ( is_fmt==-1 )
@@ -1427,7 +1671,13 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
case BCF_HT_STR: tok->setter = &filters_set_info_string; tok->is_str = 1; break;
default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
}
- if(!is_array) tok->idx = -2;
+ if (!is_array)
+ {
+ tok->idx = -2;
+ tok->idxs = (int*) malloc(sizeof(int));
+ tok->idxs[0] = -1;
+ tok->nidxs = 1;
+ }
}
}
filter->max_unpack |= BCF_UN_INFO;
@@ -1520,6 +1770,11 @@ static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks)
}
}
+static void str_to_lower(char *str)
+{
+ while ( *str ) { *str = tolower(*str); str++; }
+}
+
// Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
@@ -1540,8 +1795,8 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
ret = filters_next_token(&tmp, &len);
if ( ret==-1 ) error("Missing quotes in: %s\n", str);
- //fprintf(pysam_stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
- //int i; for (i=0; i<nops; i++) fprintf(pysam_stderr," .%c.", TOKEN_STRING[ops[i]]); fprintf(pysam_stderr,"\n");
+ // fprintf(pysam_stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
+ // int i; for (i=0; i<nops; i++) fprintf(pysam_stderr," .%c.", TOKEN_STRING[ops[i]]); fprintf(pysam_stderr,"\n");
if ( ret==TOK_LFT ) // left bracket
{
@@ -1672,6 +1927,28 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
i = itok;
continue;
}
+ if ( !strcmp(out[i].tag,"GT") )
+ {
+ if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
+ int ival;
+ if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1;
+ else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = i + 1;
+ else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1;
+ else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+
+ // assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc
+ if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; } // ra
+ else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; } // aA
+ else if ( !strcasecmp(out[ival].key,"aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='a'; } // aa
+ else if ( !strcasecmp(out[ival].key,"a") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]=0; } // a
+ else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; } // r
+ continue;
+ }
if ( !strcmp(out[i].tag,"FILTER") )
{
if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
@@ -1730,9 +2007,10 @@ void filter_destroy(filter_t *filter)
int i;
for (i=0; i<filter->nfilters; i++)
{
- //if ( filter->filters[i].key ) free(filter->filters[i].key);
- free(filter->filters[i].str_value);
+ if ( filter->filters[i].key ) free(filter->filters[i].key);
+ free(filter->filters[i].str_value.s);
free(filter->filters[i].tag);
+ free(filter->filters[i].idxs);
free(filter->filters[i].values);
free(filter->filters[i].pass_samples);
if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash);
@@ -1747,6 +2025,7 @@ void filter_destroy(filter_t *filter)
free(filter->str);
free(filter->tmpi);
free(filter->tmpf);
+ free(filter->tmps.s);
free(filter);
}
@@ -1767,16 +2046,15 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
filter->filters[i].setter(filter, line, &filter->filters[i]);
else if ( filter->filters[i].key ) // string constant
{
- filter->filters[i].str_value = filter->filters[i].key;
- filter->filters[i].values[0] = filter->filters[i].values[0];
- filter->filters[i].nvalues = strlen(filter->filters[i].key);
+ filter->filters[i].str_value.l = 0;
+ kputs(filter->filters[i].key, &filter->filters[i].str_value);
+ filter->filters[i].nvalues = filter->filters[i].str_value.l;
}
else // numeric constant
{
filter->filters[i].values[0] = filter->filters[i].threshold;
filter->filters[i].nvalues = 1;
}
-
filter->flt_stack[nstack++] = &filter->filters[i];
continue;
}
diff --git a/bcftools/kheap.h b/bcftools/kheap.h
index ac2f9f9..cb5dda4 100644
--- a/bcftools/kheap.h
+++ b/bcftools/kheap.h
@@ -57,6 +57,8 @@
// "data_t".
heap_t *heap = khp_init(mh);
+ // When inserting a new element, the heap stores a copy of the memory
+ // area pointed to by the third argument.
for (int i=0; i<3; i++)
khp_insert(mh, heap, &data[i]);
@@ -130,7 +132,8 @@
{ \
heap->mdat = heap->ndat; \
kroundup32(heap->mdat); \
- heap->dat = (kheap_t*)realloc(heap->dat, heap->mdat*sizeof(kheap_t)); \
+ heap->dat = (kheap_t*)realloc(heap->dat, heap->mdat*sizeof(kheap_t)); \
+ memset(heap->dat + heap->ndat, 0, (heap->mdat - heap->ndat)*sizeof(kheap_t)); \
} \
int i = heap->ndat - 1; \
while ( i && __cmp(dat,&heap->dat[khp_parent(i)]) ) \
diff --git a/bcftools/main.c b/bcftools/main.c
index 4e3e0e5..03fa6a7 100644
--- a/bcftools/main.c
+++ b/bcftools/main.c
@@ -57,6 +57,7 @@ int main_plugin(int argc, char *argv[]);
int main_consensus(int argc, char *argv[]);
int main_csq(int argc, char *argv[]);
int bam_mpileup(int argc, char *argv[]);
+int main_sort(int argc, char *argv[]);
typedef struct
{
@@ -126,6 +127,10 @@ static cmd_t cmds[] =
.alias = "reheader",
.help = "modify VCF/BCF header, change sample names"
},
+ { .func = main_sort,
+ .alias = "sort",
+ .help = "sort VCF/BCF file"
+ },
{ .func = main_vcfview,
.alias = "view",
.help = "VCF/BCF conversion, view, subset and filter VCF/BCF files"
diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c
index f148252..9d81ba1 100644
--- a/bcftools/main.c.pysam.c
+++ b/bcftools/main.c.pysam.c
@@ -59,6 +59,7 @@ int main_plugin(int argc, char *argv[]);
int main_consensus(int argc, char *argv[]);
int main_csq(int argc, char *argv[]);
int bam_mpileup(int argc, char *argv[]);
+int main_sort(int argc, char *argv[]);
typedef struct
{
@@ -128,6 +129,10 @@ static cmd_t cmds[] =
.alias = "reheader",
.help = "modify VCF/BCF header, change sample names"
},
+ { .func = main_sort,
+ .alias = "sort",
+ .help = "sort VCF/BCF file"
+ },
{ .func = main_vcfview,
.alias = "view",
.help = "VCF/BCF conversion, view, subset and filter VCF/BCF files"
diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c
index ac37dd4..9b6c6eb 100644
--- a/bcftools/mpileup.c
+++ b/bcftools/mpileup.c
@@ -909,7 +909,7 @@ int bam_mpileup(int argc, char *argv[])
{"ignore-RG", no_argument, NULL, 5},
{"ignore-rg", no_argument, NULL, 5},
{"gvcf", required_argument, NULL, 'g'},
- {"non-reference", no_argument, NULL, 7},
+ {"no-reference", no_argument, NULL, 7},
{"no-version", no_argument, NULL, 8},
{"threads",required_argument,NULL,9},
{"illumina1.3+", no_argument, NULL, '6'},
@@ -1099,11 +1099,8 @@ int bam_mpileup(int argc, char *argv[])
free(mplp.files);
free(mplp.reg_fname); free(mplp.pl_list);
if (mplp.fai) fai_destroy(mplp.fai);
- if (mplp.bed)
- {
- regidx_destroy(mplp.bed);
- regitr_destroy(mplp.bed_itr);
- }
+ if (mplp.bed) regidx_destroy(mplp.bed);
+ if (mplp.bed_itr) regitr_destroy(mplp.bed_itr);
if (mplp.reg) regidx_destroy(mplp.reg);
bam_smpl_destroy(mplp.bsmpl);
return ret;
diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c
index 6ef6838..94286e9 100644
--- a/bcftools/mpileup.c.pysam.c
+++ b/bcftools/mpileup.c.pysam.c
@@ -911,7 +911,7 @@ int bam_mpileup(int argc, char *argv[])
{"ignore-RG", no_argument, NULL, 5},
{"ignore-rg", no_argument, NULL, 5},
{"gvcf", required_argument, NULL, 'g'},
- {"non-reference", no_argument, NULL, 7},
+ {"no-reference", no_argument, NULL, 7},
{"no-version", no_argument, NULL, 8},
{"threads",required_argument,NULL,9},
{"illumina1.3+", no_argument, NULL, '6'},
@@ -1101,11 +1101,8 @@ int bam_mpileup(int argc, char *argv[])
free(mplp.files);
free(mplp.reg_fname); free(mplp.pl_list);
if (mplp.fai) fai_destroy(mplp.fai);
- if (mplp.bed)
- {
- regidx_destroy(mplp.bed);
- regitr_destroy(mplp.bed_itr);
- }
+ if (mplp.bed) regidx_destroy(mplp.bed);
+ if (mplp.bed_itr) regitr_destroy(mplp.bed_itr);
if (mplp.reg) regidx_destroy(mplp.reg);
bam_smpl_destroy(mplp.bsmpl);
return ret;
diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c
index ffe71c4..11c55bd 100644
--- a/bcftools/vcfcnv.c
+++ b/bcftools/vcfcnv.c
@@ -212,8 +212,14 @@ static double *init_iprobs(int ndim, double same_prob)
static void init_sample_files(sample_t *smpl, char *dir)
{
smpl->dat_fh = open_file(&smpl->dat_fname,"w","%s/dat.%s.tab",dir,smpl->name);
+ if ( !smpl->dat_fh ) error("Error opening file: %s/dat.%s.tab\n",dir,smpl->name);
+
smpl->cn_fh = open_file(&smpl->cn_fname,"w","%s/cn.%s.tab",dir,smpl->name);
+ if ( !smpl->cn_fh ) error("Error opening file: %s/cn.%s.tab\n",dir,smpl->name);
+
smpl->summary_fh = open_file(&smpl->summary_fname,"w","%s/summary.%s.tab",dir,smpl->name);
+ if ( !smpl->summary_fh ) error("Error opening file: %s/summary.%s.tab\n",dir,smpl->name);
+
fprintf(smpl->dat_fh,"# [1]Chromosome\t[2]Position\t[3]BAF\t[4]LRR\n");
fprintf(smpl->cn_fh,"# [1]Chromosome\t[2]Position\t[3]CN\t[4]P(CN0)\t[5]P(CN1)\t[6]P(CN2)\t[7]P(CN3)\n");
fprintf(smpl->summary_fh,"# RG, Regions [2]Chromosome\t[3]Start\t[4]End\t[5]Copy Number state\t[6]Quality\t[7]nSites\t[8]nHETs\n");
diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c
index 1075ef1..86ba48f 100644
--- a/bcftools/vcfcnv.c.pysam.c
+++ b/bcftools/vcfcnv.c.pysam.c
@@ -214,8 +214,14 @@ static double *init_iprobs(int ndim, double same_prob)
static void init_sample_files(sample_t *smpl, char *dir)
{
smpl->dat_fh = open_file(&smpl->dat_fname,"w","%s/dat.%s.tab",dir,smpl->name);
+ if ( !smpl->dat_fh ) error("Error opening file: %s/dat.%s.tab\n",dir,smpl->name);
+
smpl->cn_fh = open_file(&smpl->cn_fname,"w","%s/cn.%s.tab",dir,smpl->name);
+ if ( !smpl->cn_fh ) error("Error opening file: %s/cn.%s.tab\n",dir,smpl->name);
+
smpl->summary_fh = open_file(&smpl->summary_fname,"w","%s/summary.%s.tab",dir,smpl->name);
+ if ( !smpl->summary_fh ) error("Error opening file: %s/summary.%s.tab\n",dir,smpl->name);
+
fprintf(smpl->dat_fh,"# [1]Chromosome\t[2]Position\t[3]BAF\t[4]LRR\n");
fprintf(smpl->cn_fh,"# [1]Chromosome\t[2]Position\t[3]CN\t[4]P(CN0)\t[5]P(CN1)\t[6]P(CN2)\t[7]P(CN3)\n");
fprintf(smpl->summary_fh,"# RG, Regions [2]Chromosome\t[3]Start\t[4]End\t[5]Copy Number state\t[6]Quality\t[7]nSites\t[8]nHETs\n");
diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c
index 8f596d4..1e28ad8 100644
--- a/bcftools/vcfconvert.c
+++ b/bcftools/vcfconvert.c
@@ -1316,12 +1316,13 @@ static void gvcf_to_vcf(args_t *args)
}
// check if alleles compatible with being a gVCF record
+ // ALT must be one of ., <*>, <X>, <NON_REF>
+ // check for INFO/END is below
int i, gallele = -1;
if (line->n_allele==1)
gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present)
- else
+ else if ( line->d.allele[1][0]=='<' )
{
- if ( line->d.allele[1][0]!='<' ) continue;
for (i=1; i<line->n_allele; i++)
{
if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF
diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c
index 53df3d9..d1b15ba 100644
--- a/bcftools/vcfconvert.c.pysam.c
+++ b/bcftools/vcfconvert.c.pysam.c
@@ -1318,12 +1318,13 @@ static void gvcf_to_vcf(args_t *args)
}
// check if alleles compatible with being a gVCF record
+ // ALT must be one of ., <*>, <X>, <NON_REF>
+ // check for INFO/END is below
int i, gallele = -1;
if (line->n_allele==1)
gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present)
- else
+ else if ( line->d.allele[1][0]=='<' )
{
- if ( line->d.allele[1][0]!='<' ) continue;
for (i=1; i<line->n_allele; i++)
{
if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF
diff --git a/bcftools/vcfindex.c b/bcftools/vcfindex.c
index aa60fb2..807fedd 100644
--- a/bcftools/vcfindex.c
+++ b/bcftools/vcfindex.c
@@ -32,6 +32,7 @@ DEALINGS IN THE SOFTWARE. */
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <htslib/kstring.h>
+#include <htslib/bgzf.h>
#include "bcftools.h"
#define BCF_LIDX_SHIFT 14
@@ -208,6 +209,12 @@ int main_vcfindex(int argc, char *argv[])
return 1;
}
}
+
+ // check for truncated files, allow only with -f
+ BGZF *fp = bgzf_open(fname, "r");
+ if ( !fp ) error("index: failed to open %s\n", fname);
+ if ( bgzf_check_EOF(fp)!=1 ) error("index: the input is probably truncated, use -f to index anyway: %s\n", fname);
+ if ( bgzf_close(fp)!=0 ) error("index: close failed: %s\n", fname);
}
int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads);
diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c
index ff960b9..157fc8e 100644
--- a/bcftools/vcfindex.c.pysam.c
+++ b/bcftools/vcfindex.c.pysam.c
@@ -34,6 +34,7 @@ DEALINGS IN THE SOFTWARE. */
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <htslib/kstring.h>
+#include <htslib/bgzf.h>
#include "bcftools.h"
#define BCF_LIDX_SHIFT 14
@@ -210,6 +211,12 @@ int main_vcfindex(int argc, char *argv[])
return 1;
}
}
+
+ // check for truncated files, allow only with -f
+ BGZF *fp = bgzf_open(fname, "r");
+ if ( !fp ) error("index: failed to open %s\n", fname);
+ if ( bgzf_check_EOF(fp)!=1 ) error("index: the input is probably truncated, use -f to index anyway: %s\n", fname);
+ if ( bgzf_close(fp)!=0 ) error("index: close failed: %s\n", fname);
}
int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads);
diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c
index 9eb3a7c..3e0e1e5 100644
--- a/bcftools/vcfisec.c
+++ b/bcftools/vcfisec.c
@@ -82,13 +82,13 @@ void mkdir_p(const char *fmt, ...)
while (*p)
{
while (*p && *p!='/') p++;
- if ( *p )
- {
- *p = 0;
- mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
- *p = '/';
- p++;
- }
+ if ( !*p ) break;
+ char ctmp = *p;
+ *p = 0;
+ int ret = mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+ if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", path,strerror(errno));
+ *p = ctmp;
+ while ( *p && *p=='/' ) p++;
}
free(tmp);
free(path);
diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c
index e3890d5..15ef22d 100644
--- a/bcftools/vcfisec.c.pysam.c
+++ b/bcftools/vcfisec.c.pysam.c
@@ -84,13 +84,13 @@ void mkdir_p(const char *fmt, ...)
while (*p)
{
while (*p && *p!='/') p++;
- if ( *p )
- {
- *p = 0;
- mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
- *p = '/';
- p++;
- }
+ if ( !*p ) break;
+ char ctmp = *p;
+ *p = 0;
+ int ret = mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+ if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", path,strerror(errno));
+ *p = ctmp;
+ while ( *p && *p=='/' ) p++;
}
free(tmp);
free(path);
diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c
index e9ed5ad..31f5dad 100644
--- a/bcftools/vcfmerge.c
+++ b/bcftools/vcfmerge.c
@@ -662,7 +662,6 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb)
}
// new allele
map[i] = *nb;
- if ( b[*nb] ) free(b[*nb]);
b[*nb] = const_ai ? strdup(ai) : ai;
(*nb)++;
}
@@ -1668,6 +1667,11 @@ void gvcf_set_alleles(args_t *args)
bcf_srs_t *files = args->files;
maux_t *maux = args->maux;
gvcf_aux_t *gaux = maux->gvcf;
+ for (i=0; i<maux->nals; i++)
+ {
+ free(maux->als[i]);
+ maux->als[i] = NULL;
+ }
maux->nals = 0;
for (i=0; i<files->nreaders; i++)
@@ -2025,9 +2029,15 @@ int can_merge(args_t *args)
maux_t *maux = args->maux;
gvcf_aux_t *gaux = maux->gvcf;
char *id = NULL, ref = 'N';
+ int i,j,k, ntodo = 0;
+
+ for (i=0; i<maux->nals; i++)
+ {
+ free(maux->als[i]);
+ maux->als[i] = NULL;
+ }
maux->var_types = maux->nals = 0;
- int i,j,k, ntodo = 0;
for (i=0; i<files->nreaders; i++)
{
buffer_t *buf = &maux->buf[i];
diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c
index a162905..f12e0a6 100644
--- a/bcftools/vcfmerge.c.pysam.c
+++ b/bcftools/vcfmerge.c.pysam.c
@@ -664,7 +664,6 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb)
}
// new allele
map[i] = *nb;
- if ( b[*nb] ) free(b[*nb]);
b[*nb] = const_ai ? strdup(ai) : ai;
(*nb)++;
}
@@ -1670,6 +1669,11 @@ void gvcf_set_alleles(args_t *args)
bcf_srs_t *files = args->files;
maux_t *maux = args->maux;
gvcf_aux_t *gaux = maux->gvcf;
+ for (i=0; i<maux->nals; i++)
+ {
+ free(maux->als[i]);
+ maux->als[i] = NULL;
+ }
maux->nals = 0;
for (i=0; i<files->nreaders; i++)
@@ -2027,9 +2031,15 @@ int can_merge(args_t *args)
maux_t *maux = args->maux;
gvcf_aux_t *gaux = maux->gvcf;
char *id = NULL, ref = 'N';
+ int i,j,k, ntodo = 0;
+
+ for (i=0; i<maux->nals; i++)
+ {
+ free(maux->als[i]);
+ maux->als[i] = NULL;
+ }
maux->var_types = maux->nals = 0;
- int i,j,k, ntodo = 0;
for (i=0; i<files->nreaders; i++)
{
buffer_t *buf = &maux->buf[i];
diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c
index 86c20ab..bc51018 100644
--- a/bcftools/vcfnorm.c
+++ b/bcftools/vcfnorm.c
@@ -1514,6 +1514,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
{
bcf1_t *line;
int i, k;
+ int prev_rid = -1, prev_pos = -1, prev_type = 0;
for (i=0; i<n; i++)
{
k = rbuf_shift(&args->rbuf);
@@ -1534,6 +1535,23 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
continue;
}
}
+ else if ( args->rmdup )
+ {
+ int line_type = bcf_get_variant_types(args->lines[k]);
+ if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos )
+ {
+ if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
+ if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+ if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+ }
+ else
+ {
+ prev_rid = args->lines[k]->rid;
+ prev_pos = args->lines[k]->pos;
+ prev_type = 0;
+ }
+ prev_type |= line_type;
+ }
bcf_write1(file, args->hdr, args->lines[k]);
}
if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n )
diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c
index a54180d..9308e6b 100644
--- a/bcftools/vcfnorm.c.pysam.c
+++ b/bcftools/vcfnorm.c.pysam.c
@@ -1516,6 +1516,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
{
bcf1_t *line;
int i, k;
+ int prev_rid = -1, prev_pos = -1, prev_type = 0;
for (i=0; i<n; i++)
{
k = rbuf_shift(&args->rbuf);
@@ -1536,6 +1537,23 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
continue;
}
}
+ else if ( args->rmdup )
+ {
+ int line_type = bcf_get_variant_types(args->lines[k]);
+ if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos )
+ {
+ if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
+ if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+ if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+ }
+ else
+ {
+ prev_rid = args->lines[k]->rid;
+ prev_pos = args->lines[k]->pos;
+ prev_type = 0;
+ }
+ prev_type |= line_type;
+ }
bcf_write1(file, args->hdr, args->lines[k]);
}
if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n )
diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c
index ab4c100..04554f8 100644
--- a/bcftools/vcfquery.c
+++ b/bcftools/vcfquery.c
@@ -32,6 +32,7 @@ THE SOFTWARE. */
#include <sys/types.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
+#include <htslib/khash_str2int.h>
#include <htslib/vcfutils.h>
#include "bcftools.h"
#include "filter.h"
@@ -151,10 +152,26 @@ static void query_vcf(args_t *args)
static void list_columns(args_t *args)
{
+ void *has_sample = NULL;
+ if ( args->sample_list )
+ {
+ has_sample = khash_str2int_init();
+ int i, nsmpl;
+ char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
+ for (i=0; i<nsmpl; i++) khash_str2int_inc(has_sample, smpl[i]);
+ free(smpl);
+ }
+
int i;
bcf_sr_t *reader = &args->files->readers[0];
for (i=0; i<bcf_hdr_nsamples(reader->header); i++)
+ {
+ if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue;
printf("%s\n", reader->header->samples[i]);
+ }
+
+ if ( has_sample )
+ khash_str2int_destroy_free(has_sample);
}
static char **copy_header(bcf_hdr_t *hdr, char **src, int nsrc)
diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c
index 10f56f1..8fd7cf0 100644
--- a/bcftools/vcfquery.c.pysam.c
+++ b/bcftools/vcfquery.c.pysam.c
@@ -34,6 +34,7 @@ THE SOFTWARE. */
#include <sys/types.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
+#include <htslib/khash_str2int.h>
#include <htslib/vcfutils.h>
#include "bcftools.h"
#include "filter.h"
@@ -153,10 +154,26 @@ static void query_vcf(args_t *args)
static void list_columns(args_t *args)
{
+ void *has_sample = NULL;
+ if ( args->sample_list )
+ {
+ has_sample = khash_str2int_init();
+ int i, nsmpl;
+ char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
+ for (i=0; i<nsmpl; i++) khash_str2int_inc(has_sample, smpl[i]);
+ free(smpl);
+ }
+
int i;
bcf_sr_t *reader = &args->files->readers[0];
for (i=0; i<bcf_hdr_nsamples(reader->header); i++)
+ {
+ if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue;
fprintf(pysam_stdout, "%s\n", reader->header->samples[i]);
+ }
+
+ if ( has_sample )
+ khash_str2int_destroy_free(has_sample);
}
static char **copy_header(bcf_hdr_t *hdr, char **src, int nsrc)
diff --git a/bcftools/vcfsort.c b/bcftools/vcfsort.c
new file mode 100644
index 0000000..e41b628
--- /dev/null
+++ b/bcftools/vcfsort.c
@@ -0,0 +1,306 @@
+/* vcfsort.c -- sort subcommand
+
+ Copyright (C) 2017 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/kstring.h>
+#include "kheap.h"
+#include "bcftools.h"
+
+typedef struct
+{
+ char *fname;
+ htsFile *fh;
+ bcf1_t *rec;
+}
+blk_t;
+
+typedef struct _args_t
+{
+ bcf_hdr_t *hdr;
+ char **argv, *fname, *output_fname, *tmp_dir;
+ int argc, output_type;
+ size_t max_mem, mem;
+ bcf1_t **buf;
+ size_t nbuf, mbuf, nblk;
+ blk_t *blk;
+}
+args_t;
+
+int cmp_bcf_pos(const void *aptr, const void *bptr)
+{
+ bcf1_t *a = *((bcf1_t**)aptr);
+ bcf1_t *b = *((bcf1_t**)bptr);
+ if ( a->rid < b->rid ) return -1;
+ if ( a->rid > b->rid ) return 1;
+ if ( a->pos < b->pos ) return -1;
+ if ( a->pos > b->pos ) return 1;
+ return 0;
+}
+
+void buf_flush(args_t *args)
+{
+ if ( !args->nbuf ) return;
+
+ qsort(args->buf, args->nbuf, sizeof(*args->buf), cmp_bcf_pos);
+
+ args->nblk++;
+ args->blk = (blk_t*) realloc(args->blk, sizeof(blk_t)*args->nblk);
+ blk_t *blk = args->blk + args->nblk - 1;
+
+ kstring_t str = {0,0,0};
+ ksprintf(&str, "%s/%05d.bcf", args->tmp_dir, (int)args->nblk);
+ blk->fname = str.s;
+
+ htsFile *fh = hts_open(blk->fname, "wbu");
+ if ( fh == NULL ) error("Cannot write %s: %s\n", blk->fname, strerror(errno));
+ bcf_hdr_write(fh, args->hdr);
+
+ int i;
+ for (i=0; i<args->nbuf; i++)
+ {
+ bcf_write(fh, args->hdr, args->buf[i]);
+ bcf_destroy(args->buf[i]);
+ }
+ hts_close(fh);
+
+ args->nbuf = 0;
+ args->mem = 0;
+}
+
+void buf_push(args_t *args, bcf1_t *rec)
+{
+ int delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + sizeof(bcf1_t*);
+ if ( args->mem + delta > args->max_mem ) buf_flush(args);
+ args->nbuf++;
+ args->mem += delta;
+ hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf);
+ args->buf[args->nbuf-1] = rec;
+}
+
+void sort_blocks(args_t *args)
+{
+ htsFile *in = hts_open(args->fname, "r");
+ if ( !in ) error("Could not read %s\n", args->fname);
+ args->hdr = bcf_hdr_read(in);
+
+ while ( 1 )
+ {
+ bcf1_t *rec = bcf_init();
+ int ret = bcf_read1(in, args->hdr, rec);
+ if ( ret < -1 ) error("Error encountered while parsing the input\n");
+ if ( ret == -1 )
+ {
+ bcf_destroy(rec);
+ break;
+ }
+ buf_push(args, rec);
+ }
+ buf_flush(args);
+ free(args->buf);
+
+ if ( hts_close(in)!=0 ) error("Close failed: %s\n", args->fname);
+}
+
+static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr)
+{
+ blk_t *a = *aptr;
+ blk_t *b = *bptr;
+ if ( a->rec->rid < b->rec->rid ) return 1;
+ if ( a->rec->rid > b->rec->rid ) return 0;
+ if ( a->rec->pos < b->rec->pos ) return 1;
+ return 0;
+}
+KHEAP_INIT(blk, blk_t*, blk_is_smaller)
+
+void blk_read(khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk)
+{
+ if ( !blk->fh ) return;
+ int ret = bcf_read(blk->fh, hdr, blk->rec);
+ if ( ret < -1 ) error("Error reading %s\n", blk->fname);
+ if ( ret == -1 )
+ {
+ if ( hts_close(blk->fh)!=0 ) error("Close failed: %s\n", blk->fname);
+ blk->fh = 0;
+ return;
+ }
+ khp_insert(blk, bhp, &blk);
+}
+
+void merge_blocks(args_t *args)
+{
+ fprintf(stderr,"Merging %d temporary files\n", (int)args->nblk);
+
+ khp_blk_t *bhp = khp_init(blk);
+
+ int i;
+ for (i=0; i<args->nblk; i++)
+ {
+ blk_t *blk = args->blk + i;
+ blk->fh = hts_open(blk->fname, "r");
+ if ( !blk->fh ) error("Could not read %s: %s\n", blk->fname, strerror(errno));
+ bcf_hdr_t *hdr = bcf_hdr_read(blk->fh);
+ bcf_hdr_destroy(hdr);
+ blk->rec = bcf_init();
+ blk_read(bhp, args->hdr, blk);
+ }
+
+ htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ bcf_hdr_write(out, args->hdr);
+ while ( bhp->ndat )
+ {
+ blk_t *blk = bhp->dat[0];
+ bcf_write(out, args->hdr, blk->rec);
+ khp_delete(blk, bhp);
+ blk_read(bhp, args->hdr, blk);
+ }
+ if ( hts_close(out)!=0 ) error("Close failed: %s\n", args->output_fname);
+
+ fprintf(stderr,"Cleaning\n");
+ for (i=0; i<args->nblk; i++)
+ {
+ blk_t *blk = args->blk + i;
+ unlink(blk->fname);
+ free(blk->fname);
+ bcf_destroy(blk->rec);
+ }
+ rmdir(args->tmp_dir);
+ free(args->blk);
+ khp_destroy(blk, bhp);
+ fprintf(stderr,"Done\n");
+}
+
+static void usage(args_t *args)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Sort VCF/BCF file.\n");
+ fprintf(stderr, "Usage: bcftools sort [OPTIONS] <FILE.vcf>\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -m, --max-mem <float>[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6
+ fprintf(stderr, " -o, --output-file <file> output file name [stdout]\n");
+ fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(stderr, " -T, --temp-dir <dir> temporary files [/tmp/bcftools-sort.XXXXXX/]\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+size_t parse_mem_string(char *str)
+{
+ char *tmp;
+ double mem = strtod(str, &tmp);
+ if ( tmp==str ) error("Could not parse: --max-mem %s\n", str);
+ if ( !strcasecmp("k",tmp) ) mem *= 1000;
+ else if ( !strcasecmp("m",tmp) ) mem *= 1000*1000;
+ else if ( !strcasecmp("g",tmp) ) mem *= 1000*1000*1000;
+ return mem;
+}
+
+void mkdir_p(const char *fmt, ...);
+void init(args_t *args)
+{
+ if ( !args->tmp_dir )
+ {
+ args->tmp_dir = strdup("/tmp/bcftools-sort.XXXXXX");
+ char *tmp_dir = mkdtemp(args->tmp_dir);
+ if ( !tmp_dir ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno));
+ }
+ else
+ {
+ args->tmp_dir = strdup(args->tmp_dir);
+ mkdir_p(args->tmp_dir);
+ }
+ fprintf(stderr,"Writing to %s\n", args->tmp_dir);
+}
+void destroy(args_t *args)
+{
+ bcf_hdr_destroy(args->hdr);
+ free(args->tmp_dir);
+ free(args);
+}
+
+int main_sort(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->max_mem = 768*1000*1000;
+ args->output_fname = "-";
+
+ static struct option loptions[] =
+ {
+ {"max-mem",required_argument,NULL,'m'},
+ {"temp-dir",required_argument,NULL,'T'},
+ {"output-type",required_argument,NULL,'O'},
+ {"output-file",required_argument,NULL,'o'},
+ {"help",no_argument,NULL,'h'},
+ {0,0,0,0}
+ };
+ while ((c = getopt_long(argc, argv, "m:T:O:o:h?",loptions,NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 'm': args->max_mem = parse_mem_string(optarg); break;
+ case 'T': args->tmp_dir = optarg; break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ };
+ break;
+ case 'h': usage(args);
+ case '?': usage(args);
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin
+ else usage(args);
+ }
+ else args->fname = argv[optind];
+
+ init(args);
+ sort_blocks(args);
+ merge_blocks(args);
+ destroy(args);
+
+ return 0;
+}
diff --git a/bcftools/vcfsort.c.pysam.c b/bcftools/vcfsort.c.pysam.c
new file mode 100644
index 0000000..a07cd92
--- /dev/null
+++ b/bcftools/vcfsort.c.pysam.c
@@ -0,0 +1,308 @@
+#include "pysam.h"
+
+/* vcfsort.c -- sort subcommand
+
+ Copyright (C) 2017 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/kstring.h>
+#include "kheap.h"
+#include "bcftools.h"
+
+typedef struct
+{
+ char *fname;
+ htsFile *fh;
+ bcf1_t *rec;
+}
+blk_t;
+
+typedef struct _args_t
+{
+ bcf_hdr_t *hdr;
+ char **argv, *fname, *output_fname, *tmp_dir;
+ int argc, output_type;
+ size_t max_mem, mem;
+ bcf1_t **buf;
+ size_t nbuf, mbuf, nblk;
+ blk_t *blk;
+}
+args_t;
+
+int cmp_bcf_pos(const void *aptr, const void *bptr)
+{
+ bcf1_t *a = *((bcf1_t**)aptr);
+ bcf1_t *b = *((bcf1_t**)bptr);
+ if ( a->rid < b->rid ) return -1;
+ if ( a->rid > b->rid ) return 1;
+ if ( a->pos < b->pos ) return -1;
+ if ( a->pos > b->pos ) return 1;
+ return 0;
+}
+
+void buf_flush(args_t *args)
+{
+ if ( !args->nbuf ) return;
+
+ qsort(args->buf, args->nbuf, sizeof(*args->buf), cmp_bcf_pos);
+
+ args->nblk++;
+ args->blk = (blk_t*) realloc(args->blk, sizeof(blk_t)*args->nblk);
+ blk_t *blk = args->blk + args->nblk - 1;
+
+ kstring_t str = {0,0,0};
+ ksprintf(&str, "%s/%05d.bcf", args->tmp_dir, (int)args->nblk);
+ blk->fname = str.s;
+
+ htsFile *fh = hts_open(blk->fname, "wbu");
+ if ( fh == NULL ) error("Cannot write %s: %s\n", blk->fname, strerror(errno));
+ bcf_hdr_write(fh, args->hdr);
+
+ int i;
+ for (i=0; i<args->nbuf; i++)
+ {
+ bcf_write(fh, args->hdr, args->buf[i]);
+ bcf_destroy(args->buf[i]);
+ }
+ hts_close(fh);
+
+ args->nbuf = 0;
+ args->mem = 0;
+}
+
+void buf_push(args_t *args, bcf1_t *rec)
+{
+ int delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + sizeof(bcf1_t*);
+ if ( args->mem + delta > args->max_mem ) buf_flush(args);
+ args->nbuf++;
+ args->mem += delta;
+ hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf);
+ args->buf[args->nbuf-1] = rec;
+}
+
+void sort_blocks(args_t *args)
+{
+ htsFile *in = hts_open(args->fname, "r");
+ if ( !in ) error("Could not read %s\n", args->fname);
+ args->hdr = bcf_hdr_read(in);
+
+ while ( 1 )
+ {
+ bcf1_t *rec = bcf_init();
+ int ret = bcf_read1(in, args->hdr, rec);
+ if ( ret < -1 ) error("Error encountered while parsing the input\n");
+ if ( ret == -1 )
+ {
+ bcf_destroy(rec);
+ break;
+ }
+ buf_push(args, rec);
+ }
+ buf_flush(args);
+ free(args->buf);
+
+ if ( hts_close(in)!=0 ) error("Close failed: %s\n", args->fname);
+}
+
+static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr)
+{
+ blk_t *a = *aptr;
+ blk_t *b = *bptr;
+ if ( a->rec->rid < b->rec->rid ) return 1;
+ if ( a->rec->rid > b->rec->rid ) return 0;
+ if ( a->rec->pos < b->rec->pos ) return 1;
+ return 0;
+}
+KHEAP_INIT(blk, blk_t*, blk_is_smaller)
+
+void blk_read(khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk)
+{
+ if ( !blk->fh ) return;
+ int ret = bcf_read(blk->fh, hdr, blk->rec);
+ if ( ret < -1 ) error("Error reading %s\n", blk->fname);
+ if ( ret == -1 )
+ {
+ if ( hts_close(blk->fh)!=0 ) error("Close failed: %s\n", blk->fname);
+ blk->fh = 0;
+ return;
+ }
+ khp_insert(blk, bhp, &blk);
+}
+
+void merge_blocks(args_t *args)
+{
+ fprintf(pysam_stderr,"Merging %d temporary files\n", (int)args->nblk);
+
+ khp_blk_t *bhp = khp_init(blk);
+
+ int i;
+ for (i=0; i<args->nblk; i++)
+ {
+ blk_t *blk = args->blk + i;
+ blk->fh = hts_open(blk->fname, "r");
+ if ( !blk->fh ) error("Could not read %s: %s\n", blk->fname, strerror(errno));
+ bcf_hdr_t *hdr = bcf_hdr_read(blk->fh);
+ bcf_hdr_destroy(hdr);
+ blk->rec = bcf_init();
+ blk_read(bhp, args->hdr, blk);
+ }
+
+ htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ bcf_hdr_write(out, args->hdr);
+ while ( bhp->ndat )
+ {
+ blk_t *blk = bhp->dat[0];
+ bcf_write(out, args->hdr, blk->rec);
+ khp_delete(blk, bhp);
+ blk_read(bhp, args->hdr, blk);
+ }
+ if ( hts_close(out)!=0 ) error("Close failed: %s\n", args->output_fname);
+
+ fprintf(pysam_stderr,"Cleaning\n");
+ for (i=0; i<args->nblk; i++)
+ {
+ blk_t *blk = args->blk + i;
+ unlink(blk->fname);
+ free(blk->fname);
+ bcf_destroy(blk->rec);
+ }
+ rmdir(args->tmp_dir);
+ free(args->blk);
+ khp_destroy(blk, bhp);
+ fprintf(pysam_stderr,"Done\n");
+}
+
+static void usage(args_t *args)
+{
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: Sort VCF/BCF file.\n");
+ fprintf(pysam_stderr, "Usage: bcftools sort [OPTIONS] <FILE.vcf>\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -m, --max-mem <float>[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6
+ fprintf(pysam_stderr, " -o, --output-file <file> output file name [pysam_stdout]\n");
+ fprintf(pysam_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(pysam_stderr, " -T, --temp-dir <dir> temporary files [/tmp/bcftools-sort.XXXXXX/]\n");
+ fprintf(pysam_stderr, "\n");
+ exit(1);
+}
+
+size_t parse_mem_string(char *str)
+{
+ char *tmp;
+ double mem = strtod(str, &tmp);
+ if ( tmp==str ) error("Could not parse: --max-mem %s\n", str);
+ if ( !strcasecmp("k",tmp) ) mem *= 1000;
+ else if ( !strcasecmp("m",tmp) ) mem *= 1000*1000;
+ else if ( !strcasecmp("g",tmp) ) mem *= 1000*1000*1000;
+ return mem;
+}
+
+void mkdir_p(const char *fmt, ...);
+void init(args_t *args)
+{
+ if ( !args->tmp_dir )
+ {
+ args->tmp_dir = strdup("/tmp/bcftools-sort.XXXXXX");
+ char *tmp_dir = mkdtemp(args->tmp_dir);
+ if ( !tmp_dir ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno));
+ }
+ else
+ {
+ args->tmp_dir = strdup(args->tmp_dir);
+ mkdir_p(args->tmp_dir);
+ }
+ fprintf(pysam_stderr,"Writing to %s\n", args->tmp_dir);
+}
+void destroy(args_t *args)
+{
+ bcf_hdr_destroy(args->hdr);
+ free(args->tmp_dir);
+ free(args);
+}
+
+int main_sort(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->max_mem = 768*1000*1000;
+ args->output_fname = "-";
+
+ static struct option loptions[] =
+ {
+ {"max-mem",required_argument,NULL,'m'},
+ {"temp-dir",required_argument,NULL,'T'},
+ {"output-type",required_argument,NULL,'O'},
+ {"output-file",required_argument,NULL,'o'},
+ {"help",no_argument,NULL,'h'},
+ {0,0,0,0}
+ };
+ while ((c = getopt_long(argc, argv, "m:T:O:o:h?",loptions,NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 'm': args->max_mem = parse_mem_string(optarg); break;
+ case 'T': args->tmp_dir = optarg; break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ };
+ break;
+ case 'h': usage(args);
+ case '?': usage(args);
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin
+ else usage(args);
+ }
+ else args->fname = argv[optind];
+
+ init(args);
+ sort_blocks(args);
+ merge_blocks(args);
+ destroy(args);
+
+ return 0;
+}
diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c
index 4041a5a..3b73173 100644
--- a/bcftools/vcfstats.c
+++ b/bcftools/vcfstats.c
@@ -87,6 +87,7 @@ typedef struct
int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
int subst[15];
int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
+ int *smpl_hapRef, *smpl_hapAlt;
int *smpl_indel_hets, *smpl_indel_homs;
int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
unsigned long int *smpl_dp;
@@ -472,6 +473,8 @@ static void init_stats(args_t *args)
stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_hapRef = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_hapAlt = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_ts = (int *) calloc(args->files->n_smpl,sizeof(int));
@@ -548,17 +551,19 @@ static void destroy_stats(args_t *args)
#endif
free(stats->insertions);
free(stats->deletions);
- if (stats->smpl_hets) free(stats->smpl_hets);
- if (stats->smpl_homAA) free(stats->smpl_homAA);
- if (stats->smpl_homRR) free(stats->smpl_homRR);
- if (stats->smpl_indel_homs) free(stats->smpl_indel_homs);
- if (stats->smpl_indel_hets) free(stats->smpl_indel_hets);
- if (stats->smpl_ts) free(stats->smpl_ts);
- if (stats->smpl_tv) free(stats->smpl_tv);
- if (stats->smpl_indels) free(stats->smpl_indels);
- if (stats->smpl_dp) free(stats->smpl_dp);
- if (stats->smpl_ndp) free(stats->smpl_ndp);
- if (stats->smpl_sngl) free(stats->smpl_sngl);
+ free(stats->smpl_hets);
+ free(stats->smpl_homAA);
+ free(stats->smpl_homRR);
+ free(stats->smpl_hapRef);
+ free(stats->smpl_hapAlt);
+ free(stats->smpl_indel_homs);
+ free(stats->smpl_indel_hets);
+ free(stats->smpl_ts);
+ free(stats->smpl_tv);
+ free(stats->smpl_indels);
+ free(stats->smpl_dp);
+ free(stats->smpl_ndp);
+ free(stats->smpl_sngl);
idist_destroy(&stats->dp);
idist_destroy(&stats->dp_sites);
for (j=0; j<stats->nusr; j++)
@@ -861,6 +866,8 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
assert( ial<line->n_allele );
stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
}
+ if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++;
+ if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++;
continue;
}
if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; }
@@ -873,7 +880,10 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
case GT_HOM_AA: nalt_tot++; break;
}
#endif
- if ( line_type&VCF_SNP || line_type==VCF_REF ) // count ALT=. as SNP
+ int var_type = 0;
+ if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
+ if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
+ if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP
{
if ( gt == GT_HET_RA ) stats->smpl_hets[is]++;
else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++;
@@ -889,7 +899,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
stats->smpl_tv[is]++;
}
}
- if ( line_type&VCF_INDEL )
+ if ( var_type&VCF_INDEL )
{
if ( gt != GT_HOM_RR )
{
@@ -1068,7 +1078,7 @@ static void do_vcf_stats(args_t *args)
if ( line->n_allele>2 )
{
stats->n_mals++;
- if ( line_type == VCF_SNP ) stats->n_snp_mals++;
+ if ( line_type == VCF_SNP ) stats->n_snp_mals++; // note: this will be fooled by C>C,T
}
if ( files->n_smpl )
@@ -1125,7 +1135,22 @@ static void print_header(args_t *args)
static void print_stats(args_t *args)
{
int i, j,k, id;
- printf("# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n");
+ printf("# SN, Summary numbers:\n");
+ printf("# number of records .. number of data rows in the VCF\n");
+ printf("# number of no-ALTs .. reference-only sites, ALT is either \".\" or identical to REF\n");
+ printf("# number of SNPs .. number of rows with a SNP\n");
+ printf("# number of MNPs .. number of rows with a MNP, such as CC>TT\n");
+ printf("# number of indels .. number of rows with an indel\n");
+ printf("# number of others .. number of rows with other type, for example a symbolic allele or\n");
+ printf("# a complex substitution, such as ACT>TCGA\n");
+ printf("# number of multiallelic sites .. number of rows with multiple alternate alleles\n");
+ printf("# number of multiallelic SNP sites .. number of rows with multiple alternate alleles, all SNPs\n");
+ printf("# \n");
+ printf("# Note that rows containing multiple types will be counted multiple times, in each\n");
+ printf("# counter. For example, a row with a SNP and an indel increments both the SNP and\n");
+ printf("# the indel counter.\n");
+ printf("# \n");
+ printf("# SN\t[2]id\t[3]key\t[4]value\n");
for (id=0; id<args->files->nreaders; id++)
printf("SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header));
for (id=0; id<args->nstats; id++)
@@ -1470,16 +1495,18 @@ static void print_stats(args_t *args)
if ( args->files->n_smpl )
{
- printf("# PSC, Per-sample counts\n# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons\n");
+ printf("# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n");
+ printf("# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons"
+ "\t[12]nHapRef\t[13]nHapAlt\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
for (i=0; i<args->files->n_smpl; i++)
{
float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0;
- printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\n", id,args->files->samples[i],
+ printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i],
stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i],
- stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i]);
+ stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i]);
}
}
diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c
index a5e5a9f..57adbc0 100644
--- a/bcftools/vcfstats.c.pysam.c
+++ b/bcftools/vcfstats.c.pysam.c
@@ -89,6 +89,7 @@ typedef struct
int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
int subst[15];
int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
+ int *smpl_hapRef, *smpl_hapAlt;
int *smpl_indel_hets, *smpl_indel_homs;
int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
unsigned long int *smpl_dp;
@@ -474,6 +475,8 @@ static void init_stats(args_t *args)
stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_hapRef = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_hapAlt = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_ts = (int *) calloc(args->files->n_smpl,sizeof(int));
@@ -550,17 +553,19 @@ static void destroy_stats(args_t *args)
#endif
free(stats->insertions);
free(stats->deletions);
- if (stats->smpl_hets) free(stats->smpl_hets);
- if (stats->smpl_homAA) free(stats->smpl_homAA);
- if (stats->smpl_homRR) free(stats->smpl_homRR);
- if (stats->smpl_indel_homs) free(stats->smpl_indel_homs);
- if (stats->smpl_indel_hets) free(stats->smpl_indel_hets);
- if (stats->smpl_ts) free(stats->smpl_ts);
- if (stats->smpl_tv) free(stats->smpl_tv);
- if (stats->smpl_indels) free(stats->smpl_indels);
- if (stats->smpl_dp) free(stats->smpl_dp);
- if (stats->smpl_ndp) free(stats->smpl_ndp);
- if (stats->smpl_sngl) free(stats->smpl_sngl);
+ free(stats->smpl_hets);
+ free(stats->smpl_homAA);
+ free(stats->smpl_homRR);
+ free(stats->smpl_hapRef);
+ free(stats->smpl_hapAlt);
+ free(stats->smpl_indel_homs);
+ free(stats->smpl_indel_hets);
+ free(stats->smpl_ts);
+ free(stats->smpl_tv);
+ free(stats->smpl_indels);
+ free(stats->smpl_dp);
+ free(stats->smpl_ndp);
+ free(stats->smpl_sngl);
idist_destroy(&stats->dp);
idist_destroy(&stats->dp_sites);
for (j=0; j<stats->nusr; j++)
@@ -863,6 +868,8 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
assert( ial<line->n_allele );
stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
}
+ if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++;
+ if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++;
continue;
}
if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; }
@@ -875,7 +882,10 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
case GT_HOM_AA: nalt_tot++; break;
}
#endif
- if ( line_type&VCF_SNP || line_type==VCF_REF ) // count ALT=. as SNP
+ int var_type = 0;
+ if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
+ if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
+ if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP
{
if ( gt == GT_HET_RA ) stats->smpl_hets[is]++;
else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++;
@@ -891,7 +901,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
stats->smpl_tv[is]++;
}
}
- if ( line_type&VCF_INDEL )
+ if ( var_type&VCF_INDEL )
{
if ( gt != GT_HOM_RR )
{
@@ -1070,7 +1080,7 @@ static void do_vcf_stats(args_t *args)
if ( line->n_allele>2 )
{
stats->n_mals++;
- if ( line_type == VCF_SNP ) stats->n_snp_mals++;
+ if ( line_type == VCF_SNP ) stats->n_snp_mals++; // note: this will be fooled by C>C,T
}
if ( files->n_smpl )
@@ -1127,7 +1137,22 @@ static void print_header(args_t *args)
static void print_stats(args_t *args)
{
int i, j,k, id;
- fprintf(pysam_stdout, "# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n");
+ fprintf(pysam_stdout, "# SN, Summary numbers:\n");
+ fprintf(pysam_stdout, "# number of records .. number of data rows in the VCF\n");
+ fprintf(pysam_stdout, "# number of no-ALTs .. reference-only sites, ALT is either \".\" or identical to REF\n");
+ fprintf(pysam_stdout, "# number of SNPs .. number of rows with a SNP\n");
+ fprintf(pysam_stdout, "# number of MNPs .. number of rows with a MNP, such as CC>TT\n");
+ fprintf(pysam_stdout, "# number of indels .. number of rows with an indel\n");
+ fprintf(pysam_stdout, "# number of others .. number of rows with other type, for example a symbolic allele or\n");
+ fprintf(pysam_stdout, "# a complex substitution, such as ACT>TCGA\n");
+ fprintf(pysam_stdout, "# number of multiallelic sites .. number of rows with multiple alternate alleles\n");
+ fprintf(pysam_stdout, "# number of multiallelic SNP sites .. number of rows with multiple alternate alleles, all SNPs\n");
+ fprintf(pysam_stdout, "# \n");
+ fprintf(pysam_stdout, "# Note that rows containing multiple types will be counted multiple times, in each\n");
+ fprintf(pysam_stdout, "# counter. For example, a row with a SNP and an indel increments both the SNP and\n");
+ fprintf(pysam_stdout, "# the indel counter.\n");
+ fprintf(pysam_stdout, "# \n");
+ fprintf(pysam_stdout, "# SN\t[2]id\t[3]key\t[4]value\n");
for (id=0; id<args->files->nreaders; id++)
fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header));
for (id=0; id<args->nstats; id++)
@@ -1472,16 +1497,18 @@ static void print_stats(args_t *args)
if ( args->files->n_smpl )
{
- fprintf(pysam_stdout, "# PSC, Per-sample counts\n# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons\n");
+ fprintf(pysam_stdout, "# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n");
+ fprintf(pysam_stdout, "# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons"
+ "\t[12]nHapRef\t[13]nHapAlt\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
for (i=0; i<args->files->n_smpl; i++)
{
float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0;
- fprintf(pysam_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\n", id,args->files->samples[i],
+ fprintf(pysam_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i],
stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i],
- stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i]);
+ stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i]);
}
}
diff --git a/bcftools/version.h b/bcftools/version.h
index 11ee02d..eb2074c 100644
--- a/bcftools/version.h
+++ b/bcftools/version.h
@@ -1 +1 @@
-#define BCFTOOLS_VERSION "1.5"
+#define BCFTOOLS_VERSION "1.6"
diff --git a/doc/installation.rst b/doc/installation.rst
index e404701..535f4bc 100644
--- a/doc/installation.rst
+++ b/doc/installation.rst
@@ -69,6 +69,9 @@ Note that the location of the file :file:`libhts.so` needs to be known
to the linker once you run pysam, for example by setting the
environment-varirable `LD_LIBRARY_PATH`.
+Note that generally the pysam and htslib version need to be
+compatible. See the release notes for more information.
+
Installation from repository
============================
@@ -83,3 +86,23 @@ To install from repository, type::
python setup.py install
For compilation options, see the section on Pypi installation above.
+
+Requirements
+============
+
+Depending on the installation method, requirements for building pysam differ.
+
+When installing through conda_, dependencies will be resolved by the
+package manager. The pip_ installation and installation from source
+require a C compiler and its standard libraries as well as all
+requirements for building htslib. Htslib requirements are listed in
+the htslib/INSTALL file.
+
+Installing from the repository will require cython_ to be installed.
+
+
+
+
+
+
+
diff --git a/doc/release.rst b/doc/release.rst
index 18af4ad..81cd274 100644
--- a/doc/release.rst
+++ b/doc/release.rst
@@ -2,6 +2,17 @@
Release notes
=============
+Release 0.13.0
+===============
+
+This release wraps htslib/samtools/bcftools versions 1.6.0 and
+contains a series of bugfixes.
+
+* [#544] reading header from remote TabixFiles now works.
+* [#531] add missing tag types H and A. A python float will now be
+ added as 'f' type instead of 'd' type.
+
+
Release 0.12.0.1
================
diff --git a/import.py b/import.py
index c50f623..80e6d4b 100644
--- a/import.py
+++ b/import.py
@@ -12,6 +12,7 @@
# For samtools, type:
# rm -rf samtools
# python import.py samtools download/samtools
+# git checkout -- samtools/version.h
#
# Manually, then:
# modify config.h to set compatibility flags
@@ -19,6 +20,7 @@
# For bcftools, type:
# rm -rf bedtools
# python import.py bedtools download/bedtools
+# git checkout -- bcftools/version.h
# rm -rf bedtools/test bedtools/plugins
import fnmatch
diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx
index 67967c4..4b3b4dd 100644
--- a/pysam/libcalignedsegment.pyx
+++ b/pysam/libcalignedsegment.pyx
@@ -64,6 +64,9 @@ from cpython.version cimport PY_MAJOR_VERSION
from cpython cimport PyBytes_FromStringAndSize
from libc.string cimport strchr
from cpython cimport array as c_array
+from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \
+ INT8_MAX, INT16_MAX, INT32_MAX, \
+ UINT8_MAX, UINT16_MAX, UINT32_MAX
from pysam.libcutils cimport force_bytes, force_str, \
charptr_to_str, charptr_to_bytes
@@ -74,13 +77,15 @@ from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array,
cdef char * htslib_types = 'cCsSiIf'
cdef char * parray_types = 'bBhHiIf'
+cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
+
# translation tables
# cigar code to character and vice versa
cdef char* CODE2CIGAR= "MIDNSHP=XB"
cdef int NCIGAR_CODES = 10
-if PY_MAJOR_VERSION >= 3:
+if IS_PYTHON3:
CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
else:
CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR))
@@ -142,18 +147,33 @@ cdef convert_binary_tag(uint8_t * tag):
return byte_size, nvalues, c_values
-cdef inline uint8_t get_value_code(value, value_type=None):
- '''guess type code for a *value*. If *value_type* is None,
- the type code will be inferred based on the Python type of
- *value*'''
- cdef uint8_t typecode
- cdef char * _char_type
+cdef inline uint8_t get_tag_typecode(value, value_type=None):
+ """guess type code for a *value*. If *value_type* is None, the type
+ code will be inferred based on the Python type of *value*
+ """
+ # 0 is unknown typecode
+ cdef char typecode = 0
+
if value_type is None:
if isinstance(value, int):
- typecode = 'i'
+ if value < 0:
+ if value >= INT8_MIN:
+ typecode = 'c'
+ elif value >= INT16_MIN:
+ typecode = 's'
+ elif value >= INT32_MIN:
+ typecode = 'i'
+ # unsigned ints
+ else:
+ if value <= UINT8_MAX:
+ typecode = 'C'
+ elif value <= UINT16_MAX:
+ typecode = 'S'
+ elif value <= UINT32_MAX:
+ typecode = 'I'
elif isinstance(value, float):
- typecode = 'd'
+ typecode = 'f'
elif isinstance(value, str):
typecode = 'Z'
elif isinstance(value, bytes):
@@ -162,93 +182,98 @@ cdef inline uint8_t get_value_code(value, value_type=None):
isinstance(value, list) or \
isinstance(value, tuple):
typecode = 'B'
- else:
- return 0
else:
- if value_type not in 'Zidf':
- return 0
- value_type = force_bytes(value_type)
- _char_type = value_type
- typecode = (<uint8_t*>_char_type)[0]
+ if value_type in 'aAsSIcCZidfH':
+ typecode = force_bytes(value_type)[0]
return typecode
-cdef inline bytes getTypecode(value, maximum_value=None):
+cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None):
'''returns the value typecode of a value.
- If max is specified, the approprite type is
- returned for a range where value is the minimum.
+ If max is specified, the appropriate type is returned for a range
+ where value is the minimum.
+
+ Note that this method returns types from the extended BAM alphabet
+ of types that includes tags that are not part of the SAM
+ specification.
'''
- if maximum_value is None:
- maximum_value = value
- cdef bytes valuetype
+ cdef uint8_t typecode
t = type(value)
if t is float:
- valuetype = b'f'
+ typecode = 'f'
elif t is int:
+ if max_value is None:
+ max_value = value
+ if min_value is None:
+ min_value = value
# signed ints
- if value < 0:
- if value >= -128 and maximum_value < 128:
- valuetype = b'c'
- elif value >= -32768 and maximum_value < 32768:
- valuetype = b's'
- elif value < -2147483648 or maximum_value >= 2147483648:
+ if min_value < 0:
+ if min_value >= INT8_MIN and max_value <= INT8_MAX:
+ typecode = 'c'
+ elif min_value >= INT16_MIN and max_value <= INT16_MAX:
+ typecode = 's'
+ elif min_value >= INT32_MIN or max_value <= INT32_MAX:
+ typecode = 'i'
+ else:
raise ValueError(
"at least one signed integer out of range of "
"BAM/SAM specification")
- else:
- valuetype = b'i'
# unsigned ints
else:
- if maximum_value < 256:
- valuetype = b'C'
- elif maximum_value < 65536:
- valuetype = b'S'
- elif maximum_value >= 4294967296:
+ if max_value <= UINT8_MAX:
+ typecode = 'C'
+ elif max_value <= UINT16_MAX:
+ typecode = 'S'
+ elif max_value <= UINT32_MAX:
+ typecode = 'I'
+ else:
raise ValueError(
"at least one integer out of range of BAM/SAM specification")
- else:
- valuetype = b'I'
else:
# Note: hex strings (H) are not supported yet
if t is not bytes:
value = value.encode('ascii')
if len(value) == 1:
- valuetype = b'A'
+ typecode = 'A'
else:
- valuetype = b'Z'
+ typecode = 'Z'
+
+ return typecode
+
- return valuetype
+# mapping python array.array and htslib typecodes to struct typecodes
+DATATYPE2FORMAT = {
+ ord('c'): ('b', 1),
+ ord('C'): ('B', 1),
+ ord('s'): ('h', 2),
+ ord('S'): ('H', 2),
+ ord('i'): ('i', 4),
+ ord('I'): ('I', 4),
+ ord('f'): ('f', 4),
+ ord('d'): ('d', 8),
+ ord('A'): ('c', 1),
+ ord('a'): ('c', 1)}
-cdef inline packTags(tags):
+cdef inline pack_tags(tags):
"""pack a list of tags. Each tag is a tuple of (tag, tuple).
Values are packed into the most space efficient data structure
possible unless the tag contains a third field with the typecode.
- Returns a format string and the associated list of arguments
- to be used in a call to struct.pack_into.
+ Returns a format string and the associated list of arguments to be
+ used in a call to struct.pack_into.
"""
fmts, args = ["<"], []
- cdef char array_typecode
-
- datatype2format = {
- b'c': ('b', 1),
- b'C': ('B', 1),
- b's': ('h', 2),
- b'S': ('H', 2),
- b'i': ('i', 4),
- b'I': ('I', 4),
- b'f': ('f', 4),
- b'A': ('c', 1)}
-
+ # htslib typecode
+ cdef uint8_t typecode
for tag in tags:
if len(tag) == 2:
@@ -259,68 +284,76 @@ cdef inline packTags(tags):
else:
raise ValueError("malformatted tag: %s" % str(tag))
+ if valuetype is None:
+ typecode = 0
+ else:
+ # only first character in valuecode matters
+ if IS_PYTHON3:
+ typecode = force_bytes(valuetype)[0]
+ else:
+ typecode = ord(valuetype[0])
+
pytag = force_bytes(pytag)
- valuetype = force_bytes(valuetype)
- t = type(value)
+ pytype = type(value)
- if t is tuple or t is list:
+ if pytype is tuple or pytype is list:
# binary tags from tuples or lists
- if valuetype is None:
+ if not typecode:
# automatically determine value type - first value
# determines type. If there is a mix of types, the
# result is undefined.
- valuetype = getTypecode(min(value), max(value))
+ typecode = get_btag_typecode(min(value),
+ min_value=min(value),
+ max_value=max(value))
- if valuetype not in datatype2format:
- raise ValueError("invalid value type '%s'" % valuetype)
+ if typecode not in DATATYPE2FORMAT:
+ raise ValueError("invalid value type '{}'".format(chr(typecode)))
- datafmt = "2sccI%i%s" % (len(value), datatype2format[valuetype][0])
+ datafmt = "2sBBI%i%s" % (len(value), DATATYPE2FORMAT[typecode][0])
args.extend([pytag[:2],
- b"B",
- valuetype,
+ ord("B"),
+ typecode,
len(value)] + list(value))
elif isinstance(value, array.array):
- valuetype = value.typecode
- if valuetype not in datatype2format:
- valuetype = None
# binary tags from arrays
- if valuetype is None:
- array_typecode = map_typecode_python_to_htslib(ord(value.typecode))
-
- if array_typecode == 0:
- raise ValueError("unsupported type code '{}'"
- .format(value.typecode))
+ if typecode == 0:
+ typecode = map_typecode_python_to_htslib(ord(value.typecode))
- valuetype = force_bytes(chr(array_typecode))
-
- if valuetype not in datatype2format:
- raise ValueError("invalid value type '%s' (%s)" %
- (valuetype, type(valuetype)))
+ if typecode == 0:
+ raise ValueError("unsupported type code '{}'".format(value.typecode))
+ if typecode not in DATATYPE2FORMAT:
+ raise ValueError("invalid value type '{}' ({})".format(chr(typecode), array.typecode))
+
# use array.tostring() to retrieve byte representation and
# save as bytes
- datafmt = "2sccI%is" % (len(value) * datatype2format[valuetype][1])
+ datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1])
args.extend([pytag[:2],
- b"B",
- valuetype,
+ ord("B"),
+ typecode,
len(value),
force_bytes(value.tostring())])
else:
- if valuetype is None:
- valuetype = getTypecode(value)
-
- if valuetype in b"AZ":
+ if typecode == 0:
+ typecode = get_tag_typecode(value)
+ if typecode == 0:
+ raise ValueError("could not deduce typecode for value {}".format(value))
+
+ if typecode == 'a' or typecode == 'A' or typecode == 'Z' or typecode == 'H':
value = force_bytes(value)
- if valuetype == b"Z":
- datafmt = "2sc%is" % (len(value)+1)
- else:
- datafmt = "2sc%s" % datatype2format[valuetype][0]
+ if typecode == "a":
+ typecode = 'A'
+ if typecode == 'Z' or typecode == 'H':
+ datafmt = "2sB%is" % (len(value)+1)
+ else:
+ datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0]
+
args.extend([pytag[:2],
- valuetype,
+ typecode,
value])
fmts.append(datafmt)
@@ -545,6 +578,31 @@ cdef inline uint32_t get_alignment_length(bam1_t * src):
l += cigar_p[k] >> BAM_CIGAR_SHIFT
return l
+cdef inline uint32_t get_md_reference_length(char * md_tag):
+ cdef int l = 0
+ cdef int md_idx = 0
+ cdef int nmatches = 0
+
+ while md_tag[md_idx] != 0:
+ if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
+ nmatches *= 10
+ nmatches += md_tag[md_idx] - 48
+ md_idx += 1
+ continue
+ else:
+ l += nmatches
+ nmatches = 0
+ if md_tag[md_idx] == '^':
+ md_idx += 1
+ while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
+ md_idx += 1
+ l += 1
+ else:
+ md_idx += 1
+ l += 1
+
+ l += nmatches
+ return l
# TODO: avoid string copying for getSequenceInRange, reconstituneSequenceFromMD, ...
cdef inline bytes build_alignment_sequence(bam1_t * src):
@@ -634,6 +692,21 @@ cdef inline bytes build_alignment_sequence(bam1_t * src):
cdef int md_idx = 0
s_idx = 0
+ # Check if MD tag is valid by matching CIGAR length to MD tag defined length
+ # Insertions would be in addition to what is described by MD, so we calculate
+ # the number of insertions seperately.
+ insertions = 0
+
+ while s[s_idx] != 0:
+ if s[s_idx] >= 'a':
+ insertions += 1
+ s_idx += 1
+ s_idx = 0
+
+ cdef uint32_t md_len = get_md_reference_length(md_tag)
+ if md_len + insertions > max_len:
+ raise AssertionError("Invalid MD tag: MD length {} mismatch with CIGAR length {}".format(md_len, max_len))
+
while md_tag[md_idx] != 0:
# c is numerical
if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
@@ -1918,24 +1991,56 @@ cdef class AlignedSegment:
section.
*value_type* describes the type of *value* that is to entered
- into the alignment record.. It can be set explicitly to one
- of the valid one-letter type codes. If unset, an appropriate
- type will be chosen automatically.
+ into the alignment record. It can be set explicitly to one of
+ the valid one-letter type codes. If unset, an appropriate type
+ will be chosen automatically based on the python type of
+ *value*.
An existing value of the same *tag* will be overwritten unless
- replace is set to False. This is usually not recommened as a
+ *replace* is set to False. This is usually not recommened as a
tag may only appear once in the optional alignment section.
If *value* is None, the tag will be deleted.
+
+ This method accepts valid SAM specification value types, which
+ are::
+
+ A: printable char
+ i: signed int
+ f: float
+ Z: printable string
+ H: Byte array in hex format
+ B: Integer or numeric array
+
+ Additionally, it will accept the integer BAM types ('cCsSI')
+
+ For htslib compatibility, 'a' is synonymous with 'A' and the
+ method accepts a 'd' type code for a double precision float.
+
+ When deducing the type code by the python type of *value*, the
+ following mapping is applied::
+
+ i: python int
+ f: python float
+ Z: python str or bytes
+ B: python array.array, list or tuple
+
+ Note that a single character string will be output as 'Z' and
+ not 'A' as the former is the more general type.
"""
cdef int value_size
+ cdef uint8_t tc
cdef uint8_t * value_ptr
cdef uint8_t *existing_ptr
- cdef uint8_t typecode
cdef float float_value
cdef double double_value
- cdef int32_t int_value
+ cdef int32_t int32_t_value
+ cdef uint32_t uint32_t_value
+ cdef int16_t int16_t_value
+ cdef uint16_t uint16_t_value
+ cdef int8_t int8_t_value
+ cdef uint8_t uint8_t_value
cdef bam1_t * src = self._delegate
cdef char * _value_type
cdef c_array.array array_value
@@ -1954,19 +2059,51 @@ cdef class AlignedSegment:
if value is None:
return
- typecode = get_value_code(value, value_type)
+ cdef uint8_t typecode = get_tag_typecode(value, value_type)
if typecode == 0:
- raise ValueError("can't guess type or invalid type code specified")
+ raise ValueError("can't guess type or invalid type code specified: {} {}".format(
+ value, value_type))
- # Not Endian-safe, but then again neither is samtools!
+ # sam_format1 for typecasting
if typecode == 'Z':
value = force_bytes(value)
value_ptr = <uint8_t*><char*>value
value_size = len(value)+1
+ elif typecode == 'H':
+ # Note that hex tags are stored the very same
+ # way as Z string.s
+ value = force_bytes(value)
+ value_ptr = <uint8_t*><char*>value
+ value_size = len(value)+1
+ elif typecode == 'A' or typecode == 'a':
+ value = force_bytes(value)
+ value_ptr = <uint8_t*><char*>value
+ value_size = sizeof(char)
+ typecode = 'A'
elif typecode == 'i':
- int_value = value
- value_ptr = <uint8_t*>&int_value
+ int32_t_value = value
+ value_ptr = <uint8_t*>&int32_t_value
value_size = sizeof(int32_t)
+ elif typecode == 'I':
+ uint32_t_value = value
+ value_ptr = <uint8_t*>&uint32_t_value
+ value_size = sizeof(uint32_t)
+ elif typecode == 's':
+ int16_t_value = value
+ value_ptr = <uint8_t*>&int16_t_value
+ value_size = sizeof(int16_t)
+ elif typecode == 'S':
+ uint16_t_value = value
+ value_ptr = <uint8_t*>&uint16_t_value
+ value_size = sizeof(uint16_t)
+ elif typecode == 'c':
+ int8_t_value = value
+ value_ptr = <uint8_t*>&int8_t_value
+ value_size = sizeof(int8_t)
+ elif typecode == 'C':
+ uint8_t_value = value
+ value_ptr = <uint8_t*>&uint8_t_value
+ value_size = sizeof(uint8_t)
elif typecode == 'd':
double_value = value
value_ptr = <uint8_t*>&double_value
@@ -1978,13 +2115,10 @@ cdef class AlignedSegment:
elif typecode == 'B':
# the following goes through python, needs to be cleaned up
# pack array using struct
- if value_type is None:
- fmt, args = packTags([(tag, value)])
- else:
- fmt, args = packTags([(tag, value, value_type)])
+ fmt, args = pack_tags([(tag, value, value_type)])
# remove tag and type code as set by bam_aux_append
- # first four chars of format (<2sc)
+ # first four chars of format (<2sB)
fmt = '<' + fmt[4:]
# first two values to pack
args = args[2:]
@@ -2000,7 +2134,7 @@ cdef class AlignedSegment:
<uint8_t*>buffer.raw)
return
else:
- raise ValueError('unsupported value_type in set_option')
+ raise ValueError('unsupported value_type {} in set_option'.format(typecode))
bam_aux_append(src,
tag,
@@ -2027,6 +2161,10 @@ cdef class AlignedSegment:
This method is the fastest way to access the optional
alignment section if only few tags need to be retrieved.
+ Possible value types are "AcCsSiIfZHB" (see BAM format
+ specification) as well as additional value type 'd' as
+ implemented in htslib.
+
Parameters
----------
@@ -2061,19 +2199,20 @@ cdef class AlignedSegment:
else:
auxtype = chr(v[0])
- if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S':
- value = <int>bam_aux2i(v)
- elif auxtype == 'i' or auxtype == 'I':
- value = <int32_t>bam_aux2i(v)
+ if auxtype in "iIcCsS":
+ value = bam_aux2i(v)
elif auxtype == 'f' or auxtype == 'F':
- value = <float>bam_aux2f(v)
+ value = bam_aux2f(v)
elif auxtype == 'd' or auxtype == 'D':
- value = <double>bam_aux2f(v)
- elif auxtype == 'A':
+ value = bam_aux2f(v)
+ elif auxtype == 'A' or auxtype == 'a':
+ # force A to a
+ v[0] = 'A'
# there might a more efficient way
# to convert a char into a string
value = '%c' % <char>bam_aux2A(v)
- elif auxtype == 'Z':
+ elif auxtype == 'Z' or auxtype == 'H':
+ # Z and H are treated equally as strings in htslib
value = charptr_to_str(<char*>bam_aux2Z(v))
elif auxtype[0] == 'B':
bytesize, nvalues, values = convert_binary_tag(v + 1)
@@ -2141,7 +2280,7 @@ cdef class AlignedSegment:
elif auxtype == 'd':
value = <double>bam_aux2f(s)
s += 8
- elif auxtype == 'A':
+ elif auxtype in ('A', 'a'):
value = "%c" % <char>bam_aux2A(s)
s += 1
elif auxtype in ('Z', 'H'):
@@ -2166,7 +2305,7 @@ cdef class AlignedSegment:
return result
def set_tags(self, tags):
- """sets the fields in the optional alignmest section with
+ """sets the fields in the optional alignment section with
a list of (tag, value) tuples.
The :term:`value type` of the values is determined from the
@@ -2188,7 +2327,7 @@ cdef class AlignedSegment:
# convert and pack the data
if tags is not None and len(tags) > 0:
- fmt, args = packTags(tags)
+ fmt, args = pack_tags(tags)
new_size = struct.calcsize(fmt)
buffer = ctypes.create_string_buffer(new_size)
struct.pack_into(fmt,
@@ -2196,6 +2335,7 @@ cdef class AlignedSegment:
0,
*args)
+
# delete the old data and allocate new space.
# If total_size == 0, the aux field will be
# empty
diff --git a/pysam/libcalignmentfile.pxd b/pysam/libcalignmentfile.pxd
index d59e704..fb2bd0c 100644
--- a/pysam/libcalignmentfile.pxd
+++ b/pysam/libcalignmentfile.pxd
@@ -4,7 +4,7 @@ from libc.stdlib cimport malloc, calloc, realloc, free
from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
from libc.stdio cimport FILE, printf
-from pysam.libcfaidx cimport faidx_t, Fastafile
+from pysam.libcfaidx cimport faidx_t, FastaFile
from pysam.libcalignedsegment cimport AlignedSegment
from pysam.libchtslib cimport *
@@ -121,7 +121,7 @@ cdef class IteratorColumn:
cdef bam_plp_t pileup_iter
cdef __iterdata iterdata
cdef AlignmentFile samfile
- cdef Fastafile fastafile
+ cdef FastaFile fastafile
cdef stepper
cdef int max_depth
diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx
index cea312c..1599dfa 100644
--- a/pysam/libcalignmentfile.pyx
+++ b/pysam/libcalignmentfile.pyx
@@ -2149,7 +2149,7 @@ cdef class IteratorColumn:
def __get__(self):
return self.iterdata.seq_len
- def addReference(self, Fastafile fastafile):
+ def addReference(self, FastaFile fastafile):
'''
add reference sequences in `fastafile` to iterator.'''
self.fastafile = fastafile
diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx
index f8b0e38..23e5832 100644
--- a/pysam/libctabix.pyx
+++ b/pysam/libctabix.pyx
@@ -53,6 +53,7 @@
# DEALINGS IN THE SOFTWARE.
#
###############################################################################
+import binascii
import os
import sys
@@ -71,9 +72,9 @@ cimport pysam.libctabixproxies as ctabixproxies
from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
- tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \
+ tbx_index_build2, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \
tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
- tbx_destroy, hisremote, region_list, \
+ tbx_destroy, hisremote, region_list, hts_getline, \
TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC
from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
@@ -472,9 +473,9 @@ cdef class TabixFile:
# without region or reference - iterate from start
with nogil:
itr = tbx_itr_queryi(fileobj.index,
- HTS_IDX_START,
- 0,
- 0)
+ HTS_IDX_START,
+ 0,
+ 0)
else:
s = force_bytes(region, encoding=fileobj.encoding)
cstr = s
@@ -528,18 +529,42 @@ cdef class TabixFile:
.. note::
The header is returned as an iterator presenting lines
without the newline character.
-
- .. note::
- The header is only available for local files. For remote
- files an Attribute Error is raised.
-
'''
def __get__(self):
- if self.is_remote:
- raise AttributeError(
- "the header is not available for remote files")
- return GZIteratorHead(self.filename)
+
+ cdef char *cfilename = self.filename
+
+ cdef kstring_t buffer
+ buffer.l = buffer.m = 0
+ buffer.s = NULL
+
+ cdef htsFile * fp = NULL
+ cdef int KS_SEP_LINE = 2
+ cdef tbx_t * tbx = NULL
+ lines = []
+ with nogil:
+ fp = hts_open(cfilename, 'r')
+
+ if fp == NULL:
+ raise OSError("could not open {} for reading header".format(self.filename))
+
+ with nogil:
+ tbx = tbx_index_load(cfilename)
+
+ if tbx == NULL:
+ raise OSError("could not load .tbi/.csi index of {}".format(self.filename))
+
+ while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0:
+ if not buffer.l or buffer.s[0] != tbx.conf.meta_char:
+ break
+ lines.append(force_str(buffer.s, self.encoding))
+
+ with nogil:
+ hts_close(fp)
+ free(buffer.s)
+
+ return lines
property contigs:
'''list of chromosome names'''
@@ -843,16 +868,25 @@ def tabix_compress(filename_in,
raise IOError("error %i when closing file %s" % (r, filename_in))
-def tabix_index(filename,
+def is_gzip_file(filename):
+ gzip_magic_hex = b'1f8b'
+ fd = os.open(filename, os.O_RDONLY)
+ header = os.read(fd, 2)
+ return header == binascii.a2b_hex(gzip_magic_hex)
+
+
+def tabix_index(filename,
force=False,
- seq_col=None,
- start_col=None,
+ seq_col=None,
+ start_col=None,
end_col=None,
preset=None,
meta_char="#",
int line_skip=0,
zerobased=False,
int min_shift=-1,
+ index=None,
+ keep_original=False,
):
'''index tab-separated *filename* using tabix.
@@ -876,20 +910,22 @@ def tabix_index(filename,
Lines beginning with *meta_char* and the first *line_skip* lines
will be skipped.
-
- If *filename* does not end in ".gz", it will be automatically
- compressed. The original file will be removed and only the
- compressed file will be retained.
- If *filename* ends in *gz*, the file is assumed to be already
- compressed with bgzf.
+ If *filename* is not detected as a gzip file it will be automatically
+ compressed. The original file will be removed and only the compressed
+ file will be retained.
*min-shift* sets the minimal interval size to 1<<INT; 0 for the
old tabix index. The default of -1 is changed inside htslib to
the old tabix default of 0.
- returns the filename of the compressed data
+ *index* controls the filename which should be used for creating the index.
+ If not set, the default is to append ``.tbi`` to *filename*.
+
+ When automatically compressing files, if *keep_original* is set the
+ uncompressed file will not be deleted.
+ returns the filename of the compressed data
'''
if not os.path.exists(filename):
@@ -900,14 +936,17 @@ def tabix_index(filename,
raise ValueError(
"neither preset nor seq_col,start_col and end_col given")
- if not filename.endswith(".gz"):
+ if not is_gzip_file(filename):
tabix_compress(filename, filename + ".gz", force=force)
- os.unlink( filename )
+ if not keep_original:
+ os.unlink( filename )
filename += ".gz"
- if not force and os.path.exists(filename + ".tbi"):
+ index = index or filename + ".tbi"
+
+ if not force and os.path.exists(index):
raise IOError(
- "Filename '%s.tbi' already exists, use *force* to overwrite")
+ "Filename '%s' already exists, use *force* to overwrite" % index)
# columns (1-based):
# preset-code, contig, start, end, metachar for
@@ -949,9 +988,11 @@ def tabix_index(filename,
fn = encode_filename(filename)
+ fn_index = encode_filename(index)
cdef char *cfn = fn
+ cdef char *fnidx = fn_index
with nogil:
- tbx_index_build(cfn, min_shift, &conf)
+ tbx_index_build2(cfn, fnidx, min_shift, &conf)
return filename
diff --git a/pysam/libctabixproxies.pyx b/pysam/libctabixproxies.pyx
index 5ff0948..e2d7ef4 100644
--- a/pysam/libctabixproxies.pyx
+++ b/pysam/libctabixproxies.pyx
@@ -516,6 +516,7 @@ cdef class GTFProxy(NamedTupleProxy):
self.attribute_dict = self.attribute_string2dict(
self.attributes)
self.attribute_dict[name] = value
+ self.is_modified = True
def attribute_string2dict(self, s):
return collections.OrderedDict(
diff --git a/pysam/version.py b/pysam/version.py
index 2a416f1..ab9aeaf 100644
--- a/pysam/version.py
+++ b/pysam/version.py
@@ -1,10 +1,10 @@
# pysam versioning information
-__version__ = "0.12.0.1"
+__version__ = "0.13"
# TODO: upgrade number
-__samtools_version__ = "1.5"
+__samtools_version__ = "1.6"
# TODO: upgrade code and number
-__bcftools_version__ = "1.5"
+__bcftools_version__ = "1.6"
-__htslib_version__ = "1.5"
+__htslib_version__ = "1.6"
diff --git a/run_tests_travis.sh b/run_tests_travis.sh
index fa44857..2378fcd 100755
--- a/run_tests_travis.sh
+++ b/run_tests_travis.sh
@@ -26,23 +26,24 @@ bash Miniconda3.sh -b
# Create a new conda environment with the target python version
~/miniconda3/bin/conda install conda-build -y
-~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy pytest psutil pip
+~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy pytest psutil pip
# activate testenv environment
source ~/miniconda3/bin/activate testenv
-conda config --add channels conda-forge
-conda config --add channels defaults
conda config --add channels r
+conda config --add channels defaults
+conda config --add channels conda-forge
conda config --add channels bioconda
# pin versions, so that tests do not fail when pysam/htslib out of step
-conda install -y "samtools=1.5" "bcftools=1.5" "htslib=1.5"
+# add htslib dependencies
+conda install -y "samtools=1.6" "bcftools=1.6" "htslib=1.6" xz curl bzip2
# Need to make C compiler and linker use the anaconda includes and libraries:
export PREFIX=~/miniconda3/
export CFLAGS="-I${PREFIX}/include -L${PREFIX}/lib"
-export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
+export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl --disable-lzma"
samtools --version
htslib --version
diff --git a/samtools/bam.h b/samtools/bam.h
index 48388b7..2120875 100644
--- a/samtools/bam.h
+++ b/samtools/bam.h
@@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE. */
@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "1.5"
+#define BAM_VERSION "1.6"
#include <stdint.h>
#include <stdlib.h>
diff --git a/samtools/bam_lpileup.c b/samtools/bam_lpileup.c
index e20cc92..cc7a75b 100644
--- a/samtools/bam_lpileup.c
+++ b/samtools/bam_lpileup.c
@@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */
#include <assert.h>
#include "bam_plbuf.h"
#include "bam_lpileup.h"
+#include "samtools.h"
#include <htslib/ksort.h>
#define TV_GAP 2
diff --git a/samtools/bam_lpileup.c.pysam.c b/samtools/bam_lpileup.c.pysam.c
index 9f7f063..93fde4f 100644
--- a/samtools/bam_lpileup.c.pysam.c
+++ b/samtools/bam_lpileup.c.pysam.c
@@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */
#include <assert.h>
#include "bam_plbuf.h"
#include "bam_lpileup.h"
+#include "samtools.h"
#include <htslib/ksort.h>
#define TV_GAP 2
diff --git a/samtools/bam_markdup.c b/samtools/bam_markdup.c
new file mode 100644
index 0000000..cf6a82a
--- /dev/null
+++ b/samtools/bam_markdup.c
@@ -0,0 +1,844 @@
+/* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
+ through fixmates with the mate scoring option on.
+
+ Copyright (C) 2017 Genome Research Ltd.
+
+ Author: Andrew Whitwham <aw7 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+*/
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+#include <unistd.h>
+#include <ctype.h>
+#include "htslib/thread_pool.h"
+#include "htslib/sam.h"
+#include "sam_opts.h"
+#include "samtools.h"
+#include "htslib/khash.h"
+#include "htslib/klist.h"
+
+typedef struct {
+ int32_t single;
+ int32_t this_ref;
+ int32_t this_coord;
+ int32_t other_ref;
+ int32_t other_coord;
+ int32_t leftmost;
+ int32_t orientation;
+} key_data_t;
+
+typedef struct {
+ bam1_t *p;
+} in_hash_t;
+
+typedef struct {
+ bam1_t *b;
+ int32_t pos;
+ key_data_t pair_key;
+ key_data_t single_key;
+} read_queue_t;
+
+
+
+static khint32_t do_hash(unsigned char *key, khint32_t len);
+
+static khint_t hash_key(key_data_t key) {
+ int i = 0;
+ khint_t hash;
+
+ if (key.single) {
+ unsigned char sig[12];
+
+ memcpy(sig + i, &key.this_ref, 4); i += 4;
+ memcpy(sig + i, &key.this_coord, 4); i += 4;
+ memcpy(sig + i, &key.orientation, 4); i += 4;
+
+ hash = do_hash(sig, i);
+ } else {
+ unsigned char sig[24];
+
+ memcpy(sig + i, &key.this_ref, 4); i += 4;
+ memcpy(sig + i, &key.this_coord, 4); i += 4;
+ memcpy(sig + i, &key.other_ref, 4); i += 4;
+ memcpy(sig + i, &key.other_coord, 4); i += 4;
+ memcpy(sig + i, &key.leftmost, 4); i += 4;
+ memcpy(sig + i, &key.orientation, 4); i += 4;
+
+ hash = do_hash(sig, i);
+ }
+
+ return hash;
+}
+
+
+static int key_equal(key_data_t a, key_data_t b) {
+ int match = 1;
+
+ if (a.this_coord != b.this_coord)
+ match = 0;
+ else if (a.orientation != b.orientation)
+ match = 0;
+ else if (a.this_ref != b.this_ref)
+ match = 0;
+ else if (a.single != b.single)
+ match = 0;
+
+ if (!a.single) {
+ if (a.other_coord != b.other_coord)
+ match = 0;
+ else if (a.leftmost != b.leftmost)
+ match = 0;
+ else if (a.other_ref != b.other_ref)
+ match = 0;
+ }
+
+ return match;
+}
+
+
+#define __free_queue_element(p)
+#define O_FF 2
+#define O_RR 3
+#define O_FR 5
+#define O_RF 7
+
+KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash
+KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer
+
+
+/* Calculate the mate's unclipped start based on position and cigar string from MC tag. */
+
+static int32_t unclipped_other_start(int32_t op, char *cigar) {
+ char *c = cigar;
+ int32_t clipped = 0;
+
+ while (*c && *c != '*') {
+ long num = 0;
+
+ if (isdigit((int)*c)) {
+ num = strtol(c, &c, 10);
+ } else {
+ num = 1;
+ }
+
+ if (*c == 'S' || *c == 'H') { // clips
+ clipped += num;
+ } else {
+ break;
+ }
+
+ c++;
+ }
+
+ return op - clipped + 1;
+}
+
+
+/* Calculate the current read's start based on the stored cigar string. */
+
+static int32_t unclipped_start(bam1_t *b) {
+ uint32_t *cigar = bam_get_cigar(b);
+ int32_t clipped = 0;
+ uint32_t i;
+
+ for (i = 0; i < b->core.n_cigar; i++) {
+ char c = bam_cigar_opchr(cigar[i]);
+
+ if (c == 'S' || c == 'H') { // clips
+ clipped += bam_cigar_oplen(cigar[i]);
+ } else {
+ break;
+ }
+ }
+
+ return b->core.pos - clipped + 1;
+}
+
+
+/* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/
+
+static int32_t unclipped_other_end(int32_t op, char *cigar) {
+ char *c = cigar;
+ int32_t refpos = 0;
+ int skip = 1;
+
+ while (*c && *c != '*') {
+ long num = 0;
+
+ if (isdigit((int)*c)) {
+ num = strtol(c, &c, 10);
+ } else {
+ num = 1;
+ }
+
+ switch (*c) {
+ case 'M':
+ case 'D':
+ case 'N':
+ case '=':
+ case 'X':
+ refpos += num;
+ skip = 0; // ignore initial clips
+ break;
+
+ case 'S':
+ case 'H':
+ if (!skip) {
+ refpos += num;
+ }
+ break;
+ }
+
+ c++;
+ }
+
+ return op + refpos;
+}
+
+
+/* Calculate the current read's end based on the stored cigar string. */
+
+static int32_t unclipped_end(bam1_t *b) {
+ uint32_t *cigar = bam_get_cigar(b);
+ int32_t end_pos, clipped = 0;
+ int32_t i;
+
+ end_pos = bam_endpos(b);
+
+ // now get the clipped end bases (if any)
+ // if we get to the beginning of the cigar string
+ // without hitting a non-clip then the results are meaningless
+ for (i = b->core.n_cigar - 1; i >= 0; i--) {
+ char c = bam_cigar_opchr(cigar[i]);
+
+ if (c == 'S' || c == 'H') { // clips
+ clipped += bam_cigar_oplen(cigar[i]);
+ } else {
+ break;
+ }
+ }
+
+ return end_pos + clipped;
+}
+
+
+/* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */
+
+static khint32_t do_hash(unsigned char *key, khint32_t len) {
+ khint32_t hash, i;
+
+ for (hash = 0, i = 0; i < len; ++i) {
+ hash += key[i];
+ hash += (hash << 10);
+ hash ^= (hash >> 6);
+ }
+
+ hash += (hash << 3);
+ hash ^= (hash >> 11);
+ hash += (hash << 15);
+
+ return hash;
+}
+
+
+/* Get mate score from tag. */
+
+static int64_t get_mate_score(bam1_t *b) {
+ uint8_t *data;
+ int64_t score;
+
+ if ((data = bam_aux_get(b, "ms"))) {
+ score = bam_aux2i(data);
+ } else {
+ fprintf(stderr, "[markdup] error: no ms score tag.\n");
+ return -1;
+ }
+
+ return score;
+}
+
+
+/* Calc current score from quality. */
+
+static int64_t calc_score(bam1_t *b)
+{
+ int64_t score = 0;
+ uint8_t *qual = bam_get_qual(b);
+ int i;
+
+ for (i = 0; i < b->core.l_qseq; i++) {
+ if (qual[i] >= 15) score += qual[i];
+ }
+
+ return score;
+}
+
+
+/* Create a signature hash of the current read and its pair.
+ Uses the unclipped start (or end depending on orientation),
+ the reference id, orientation and whether the current
+ read is leftmost of the pair. */
+
+static int make_pair_key(key_data_t *key, bam1_t *bam) {
+ int32_t this_ref, this_coord, this_end;
+ int32_t other_ref, other_coord, other_end;
+ int32_t orientation, leftmost;
+ uint8_t *data;
+ char *cig;
+
+ this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
+ other_ref = bam->core.mtid + 1;
+
+ this_coord = unclipped_start(bam);
+ this_end = unclipped_end(bam);
+
+ if ((data = bam_aux_get(bam, "MC"))) {
+ cig = bam_aux2Z(data);
+ other_end = unclipped_other_end(bam->core.mpos, cig);
+ other_coord = unclipped_other_start(bam->core.mpos, cig);
+ } else {
+ fprintf(stderr, "[markdup] error: no MC tag.\n");
+ return 1;
+ }
+
+ // work out orientations
+ if (this_ref != other_ref) {
+ leftmost = this_ref < other_ref;
+ } else {
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ if (!bam_is_rev(bam)) {
+ leftmost = this_coord <= other_coord;
+ } else {
+ leftmost = this_end <= other_end;
+ }
+ } else {
+ if (bam_is_rev(bam)) {
+ leftmost = this_end <= other_coord;
+ } else {
+ leftmost = this_coord <= other_end;
+ }
+ }
+ }
+
+ // pair orientation
+ if (leftmost) {
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ other_coord = other_end;
+
+ if (!bam_is_rev(bam)) {
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_FF;
+ } else {
+ orientation = O_RR;
+ }
+ } else {
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_RR;
+ } else {
+ orientation = O_FF;
+ }
+ }
+ } else {
+ if (!bam_is_rev(bam)) {
+ orientation = O_FR;
+ other_coord = other_end;
+ } else {
+ orientation = O_RF;
+ this_coord = this_end;
+ }
+ }
+ } else {
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ this_coord = this_end;
+
+ if (!bam_is_rev(bam)) {
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_RR;
+ } else {
+ orientation = O_FF;
+ }
+ } else {
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_FF;
+ } else {
+ orientation = O_RR;
+ }
+ }
+ } else {
+ if (!bam_is_rev(bam)) {
+ orientation = O_RF;
+ other_coord = other_end;
+ } else {
+ orientation = O_FR;
+ this_coord = this_end;
+ }
+ }
+ }
+
+ if (!leftmost)
+ leftmost = 13;
+ else
+ leftmost = 11;
+
+ key->single = 0;
+ key->this_ref = this_ref;
+ key->this_coord = this_coord;
+ key->other_ref = other_ref;
+ key->other_coord = other_coord;
+ key->leftmost = leftmost;
+ key->orientation = orientation;
+
+ return 0;
+}
+
+
+/* Create a signature hash of single read (or read with an unmatched pair).
+ Uses unclipped start (or end depending on orientation), reference id,
+ and orientation. */
+
+static void make_single_key(key_data_t *key, bam1_t *bam) {
+ int32_t this_ref, this_coord;
+ int32_t orientation;
+
+ this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
+
+ if (bam_is_rev(bam)) {
+ this_coord = unclipped_end(bam);
+ orientation = O_RR;
+ } else {
+ this_coord = unclipped_start(bam);
+ orientation = O_FF;
+ }
+
+ key->single = 1;
+ key->this_ref = this_ref;
+ key->this_coord = this_coord;
+ key->orientation = orientation;
+}
+
+
+/* Compare the reads near each other (coordinate sorted) and try to spot the duplicates.
+ Generally the highest quality scoring is chosen as the original and all others the duplicates.
+ The score is based on the sum of the quality values (<= 15) of the read and its mate (if any).
+ While single reads are compared to only one read of a pair, the pair will chosen as the original.
+ The comparison is done on position and orientation, see above for details. */
+
+static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32_t max_length, int do_stats) {
+ bam_hdr_t *header;
+ khiter_t k;
+ khash_t(reads) *pair_hash = kh_init(reads);
+ khash_t(reads) *single_hash = kh_init(reads);
+ klist_t(read_queue) *read_buffer = kl_init(read_queue);
+ kliter_t(read_queue) *rq;
+ int32_t prev_tid, prev_coord;
+ read_queue_t *in_read;
+ int ret;
+ int reading, writing, excluded, duplicate, single, pair, single_dup, examined;
+
+ if ((header = sam_hdr_read(in)) == NULL) {
+ fprintf(stderr, "[markdup] error reading header\n");
+ return 1;
+ }
+
+ // accept unknown, unsorted or coordinate sort order, but error on queryname sorted.
+ // only really works on coordinate sorted files.
+ if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) {
+ char *p, *q;
+
+ p = strstr(header->text, "\tSO:queryname");
+ q = strchr(header->text, '\n');
+
+ // looking for SO:queryname within @HD only
+ // (e.g. must ignore in a @CO comment line later in header)
+ if ((p != 0) && (p < q)) {
+ fprintf(stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n");
+ return 1;
+ }
+ }
+
+ if (sam_hdr_write(out, header) < 0) {
+ fprintf(stderr, "[markdup] error writing header.\n");
+ return 1;
+ }
+
+ // used for coordinate order checks
+ prev_tid = prev_coord = 0;
+
+ // get the buffer going
+ in_read = kl_pushp(read_queue, read_buffer);
+
+
+ if ((in_read->b = bam_init1()) == NULL) {
+ fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n");
+ return 1;
+ }
+
+ reading = writing = excluded = single_dup = duplicate = examined = pair = single = 0;
+
+ while ((ret = sam_read1(in, header, in_read->b)) >= 0) {
+
+ // do some basic coordinate order checks
+ if (in_read->b->core.tid >= 0) { // -1 for unmapped reads
+ if (in_read->b->core.tid < prev_tid ||
+ ((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) {
+ fprintf(stderr, "[markdup] error: bad coordinate order.\n");
+ return 1;
+ }
+ }
+
+ prev_coord = in_read->pos = in_read->b->core.pos;
+ prev_tid = in_read->b->core.tid;
+ in_read->pair_key.single = 1;
+ in_read->single_key.single = 0;
+
+ reading++;
+
+ // read must not be secondary, supplementary, unmapped or failed QC
+ if (!(in_read->b->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL))) {
+ examined++;
+
+ // look at the pairs first
+ if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) {
+ int ret, mate_tmp;
+ key_data_t pair_key;
+ key_data_t single_key;
+ in_hash_t *bp;
+
+ if (make_pair_key(&pair_key, in_read->b)) {
+ fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n");
+ return 1;
+ }
+
+ make_single_key(&single_key, in_read->b);
+
+ pair++;
+ in_read->pos = single_key.this_coord; // cigar/orientation modified pos
+
+ // put in singles hash for checking against non paired reads
+ k = kh_put(reads, single_hash, single_key, &ret);
+
+ if (ret > 0) { // new
+ // add to single duplicate hash
+ bp = &kh_val(single_hash, k);
+ bp->p = in_read->b;
+ in_read->single_key = single_key;
+ } else if (ret == 0) { // exists
+ // look at singles only for duplication marking
+ bp = &kh_val(single_hash, k);
+
+ if (!(bp->p->core.flag & BAM_FPAIRED) || (bp->p->core.flag & BAM_FMUNMAP)) {
+ bam1_t *dup = bp->p;
+
+ // singleton will always be marked duplicate even if
+ // scores more than one read of the pair
+
+ bp->p = in_read->b;
+ dup->core.flag |= BAM_FDUP;
+ single_dup++;
+ }
+ } else {
+ fprintf(stderr, "[markdup] error: single hashing failure.\n");
+ return 1;
+ }
+
+ // now do the pair
+ k = kh_put(reads, pair_hash, pair_key, &ret);
+
+ if (ret > 0) { // new
+ // add to the pair hash
+ bp = &kh_val(pair_hash, k);
+ bp->p = in_read->b;
+ in_read->pair_key = pair_key;
+ } else if (ret == 0) {
+ int64_t old_score, new_score, tie_add = 0;
+ bam1_t *dup;
+
+ bp = &kh_val(pair_hash, k);
+
+ if ((mate_tmp = get_mate_score(bp->p)) == -1) {
+ fprintf(stderr, "[markdup] error: no ms score tag.\n");
+ return 1;
+ } else {
+ old_score = calc_score(bp->p) + mate_tmp;
+ }
+
+ if ((mate_tmp = get_mate_score(in_read->b)) == -1) {
+ fprintf(stderr, "[markdup] error: no ms score tag.\n");
+ return 1;
+ } else {
+ new_score = calc_score(in_read->b) + mate_tmp;
+ }
+
+ // choose the highest score as the original
+ // and add it to the pair hash, mark the other as duplicate
+
+ if (new_score == old_score) {
+ if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p)) < 0) {
+ tie_add = 1;
+ } else {
+ tie_add = -1;
+ }
+ }
+
+ if (new_score + tie_add > old_score) { // swap reads
+ dup = bp->p;
+ bp->p = in_read->b;
+ } else {
+ dup = in_read->b;
+ }
+
+ dup->core.flag |= BAM_FDUP;
+
+ duplicate++;
+ } else {
+ fprintf(stderr, "[markdup] error: pair hashing failure.\n");
+ return 1;
+ }
+ } else { // do the single (or effectively single) reads
+ int ret;
+ key_data_t single_key;
+ in_hash_t *bp;
+
+ make_single_key(&single_key, in_read->b);
+
+ single++;
+ in_read->pos = single_key.this_coord; // cigar/orientation modified pos
+
+ k = kh_put(reads, single_hash, single_key, &ret);
+
+ if (ret > 0) { // new
+ bp = &kh_val(single_hash, k);
+ bp->p = in_read->b;
+ in_read->single_key = single_key;
+ } else if (ret == 0) { // exists
+ bp = &kh_val(single_hash, k);
+
+ if ((bp->p->core.flag & BAM_FPAIRED) && !(bp->p->core.flag & BAM_FMUNMAP)) {
+ // if matched against one of a pair just mark as duplicate
+ in_read->b->core.flag |= BAM_FDUP;
+ } else {
+ int64_t old_score, new_score;
+ bam1_t *dup;
+
+ old_score = calc_score(bp->p);
+ new_score = calc_score(in_read->b);
+
+ // choose the highest score as the original, add it
+ // to the single hash and mark the other as duplicate
+ if (new_score > old_score) { // swap reads
+ dup = bp->p;
+ bp->p = in_read->b;
+ } else {
+ dup = in_read->b;
+ }
+
+ dup->core.flag |= BAM_FDUP;
+ }
+
+ single_dup++;
+ } else {
+ fprintf(stderr, "[markdup] error: single hashing failure.\n");
+ return 1;
+ }
+ }
+ } else {
+ excluded++;
+ }
+
+ // loop through the stored reads and write out those we
+ // no longer need
+ rq = kl_begin(read_buffer);
+ while (rq != kl_end(read_buffer)) {
+ in_read = &kl_val(rq);
+
+ /* keep a moving window of reads based on coordinates and max read length. Any unaligned reads
+ should just be written as they cannot be matched as duplicates. */
+ if (in_read->pos + max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) {
+ break;
+ }
+
+ if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
+ if (sam_write1(out, header, in_read->b) < 0) {
+ fprintf(stderr, "[markdup] error: writing output failed.\n");
+ return 1;
+ }
+
+ writing++;
+ }
+
+ // remove from hash
+ if (in_read->pair_key.single == 0) {
+ k = kh_get(reads, pair_hash, in_read->pair_key);
+ kh_del(reads, pair_hash, k);
+ }
+
+ if (in_read->single_key.single == 1) {
+ k = kh_get(reads, single_hash, in_read->single_key);
+ kh_del(reads, single_hash, k);
+ }
+
+ kl_shift(read_queue, read_buffer, NULL);
+ bam_destroy1(in_read->b);
+ rq = kl_begin(read_buffer);
+ }
+
+ // set the next one up for reading
+ in_read = kl_pushp(read_queue, read_buffer);
+
+ if ((in_read->b = bam_init1()) == NULL) {
+ fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n");
+ return 1;
+ }
+ }
+
+ if (ret < -1) {
+ fprintf(stderr, "[markdup] error: truncated input file.\n");
+ return 1;
+ }
+
+ // write out the end of the list
+ rq = kl_begin(read_buffer);
+ while (rq != kl_end(read_buffer)) {
+ in_read = &kl_val(rq);
+
+ if (bam_get_qname(in_read->b)) { // last entry will be blank
+ if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
+ if (sam_write1(out, header, in_read->b) < 0) {
+ fprintf(stderr, "[markdup] error: writing final output failed.\n");
+ return 1;
+ }
+
+ writing++;
+ }
+ }
+
+ kl_shift(read_queue, read_buffer, NULL);
+ bam_destroy1(in_read->b);
+ rq = kl_begin(read_buffer);
+ }
+
+ if (do_stats) {
+ fprintf(stderr, "READ %d WRITTEN %d \n"
+ "EXCLUDED %d EXAMINED %d\n"
+ "PAIRED %d SINGLE %d\n"
+ "DULPICATE PAIR %d DUPLICATE SINGLE %d\n"
+ "DUPLICATE TOTAL %d\n", reading, writing, excluded, examined, pair, single,
+ duplicate, single_dup, single_dup + duplicate);
+ }
+
+ kh_destroy(reads, pair_hash);
+ kh_destroy(reads, single_hash);
+ kl_destroy(read_queue, read_buffer);
+ bam_hdr_destroy(header);
+
+ return 0;
+}
+
+
+static int markdup_usage(void) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: samtools markdup <input.bam> <output.bam>\n\n");
+ fprintf(stderr, "Option: \n");
+ fprintf(stderr, " -r Remove duplicate reads\n");
+ fprintf(stderr, " -l Max read length (default 300 bases)\n");
+ fprintf(stderr, " -s Report stats.\n");
+
+ sam_global_opt_help(stderr, "-.O..@");
+
+ fprintf(stderr, "\nThe input file must be coordinate sorted and must have gone"
+ " through fixmates with the mate scoring option on.\n");
+
+ return 1;
+}
+
+
+int bam_markdup(int argc, char **argv) {
+ int c, ret, remove_dups = 0, report_stats = 0;
+ int32_t max_length = 300;
+ samFile *in = NULL, *out = NULL;
+ char wmode[3] = {'w', 'b', 0};
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ htsThreadPool p = {NULL, 0};
+
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
+ {NULL, 0, NULL, 0}
+ };
+
+ while ((c = getopt_long(argc, argv, "rsl:O:@:", lopts, NULL)) >= 0) {
+ switch (c) {
+ case 'r': remove_dups = 1; break;
+ case 'l': max_length = atoi(optarg); break;
+ case 's': report_stats = 1; break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': return markdup_usage();
+ }
+ }
+
+ if (optind + 2 > argc)
+ return markdup_usage();
+
+ in = sam_open_format(argv[optind], "r", &ga.in);
+
+ if (!in) {
+ print_error_errno("markdup", "failed to open \"%s\" for input", argv[optind]);
+ return 1;
+ }
+
+ sam_open_mode(wmode + 1, argv[optind + 1], NULL);
+ out = sam_open_format(argv[optind + 1], wmode, &ga.out);
+
+ if (!out) {
+ print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]);
+ return 1;
+ }
+
+ if (ga.nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(stderr, "[markdup] error creating thread pool\n");
+ return 1;
+ }
+
+ hts_set_opt(in, HTS_OPT_THREAD_POOL, &p);
+ hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+ }
+
+ // actual stuff happens here
+ ret = bam_mark_duplicates(in, out, remove_dups, max_length, report_stats);
+
+ sam_close(in);
+
+ if (sam_close(out) < 0) {
+ fprintf(stderr, "[markdup] error closing output file\n");
+ ret = 1;
+ }
+
+ if (p.pool) hts_tpool_destroy(p.pool);
+
+ sam_global_args_free(&ga);
+
+ return ret;
+}
diff --git a/samtools/bam_markdup.c.pysam.c b/samtools/bam_markdup.c.pysam.c
new file mode 100644
index 0000000..11b298c
--- /dev/null
+++ b/samtools/bam_markdup.c.pysam.c
@@ -0,0 +1,846 @@
+#include "pysam.h"
+
+/* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
+ through fixmates with the mate scoring option on.
+
+ Copyright (C) 2017 Genome Research Ltd.
+
+ Author: Andrew Whitwham <aw7 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+*/
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+#include <unistd.h>
+#include <ctype.h>
+#include "htslib/thread_pool.h"
+#include "htslib/sam.h"
+#include "sam_opts.h"
+#include "samtools.h"
+#include "htslib/khash.h"
+#include "htslib/klist.h"
+
+typedef struct {
+ int32_t single;
+ int32_t this_ref;
+ int32_t this_coord;
+ int32_t other_ref;
+ int32_t other_coord;
+ int32_t leftmost;
+ int32_t orientation;
+} key_data_t;
+
+typedef struct {
+ bam1_t *p;
+} in_hash_t;
+
+typedef struct {
+ bam1_t *b;
+ int32_t pos;
+ key_data_t pair_key;
+ key_data_t single_key;
+} read_queue_t;
+
+
+
+static khint32_t do_hash(unsigned char *key, khint32_t len);
+
+static khint_t hash_key(key_data_t key) {
+ int i = 0;
+ khint_t hash;
+
+ if (key.single) {
+ unsigned char sig[12];
+
+ memcpy(sig + i, &key.this_ref, 4); i += 4;
+ memcpy(sig + i, &key.this_coord, 4); i += 4;
+ memcpy(sig + i, &key.orientation, 4); i += 4;
+
+ hash = do_hash(sig, i);
+ } else {
+ unsigned char sig[24];
+
+ memcpy(sig + i, &key.this_ref, 4); i += 4;
+ memcpy(sig + i, &key.this_coord, 4); i += 4;
+ memcpy(sig + i, &key.other_ref, 4); i += 4;
+ memcpy(sig + i, &key.other_coord, 4); i += 4;
+ memcpy(sig + i, &key.leftmost, 4); i += 4;
+ memcpy(sig + i, &key.orientation, 4); i += 4;
+
+ hash = do_hash(sig, i);
+ }
+
+ return hash;
+}
+
+
+static int key_equal(key_data_t a, key_data_t b) {
+ int match = 1;
+
+ if (a.this_coord != b.this_coord)
+ match = 0;
+ else if (a.orientation != b.orientation)
+ match = 0;
+ else if (a.this_ref != b.this_ref)
+ match = 0;
+ else if (a.single != b.single)
+ match = 0;
+
+ if (!a.single) {
+ if (a.other_coord != b.other_coord)
+ match = 0;
+ else if (a.leftmost != b.leftmost)
+ match = 0;
+ else if (a.other_ref != b.other_ref)
+ match = 0;
+ }
+
+ return match;
+}
+
+
+#define __free_queue_element(p)
+#define O_FF 2
+#define O_RR 3
+#define O_FR 5
+#define O_RF 7
+
+KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash
+KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer
+
+
+/* Calculate the mate's unclipped start based on position and cigar string from MC tag. */
+
+static int32_t unclipped_other_start(int32_t op, char *cigar) {
+ char *c = cigar;
+ int32_t clipped = 0;
+
+ while (*c && *c != '*') {
+ long num = 0;
+
+ if (isdigit((int)*c)) {
+ num = strtol(c, &c, 10);
+ } else {
+ num = 1;
+ }
+
+ if (*c == 'S' || *c == 'H') { // clips
+ clipped += num;
+ } else {
+ break;
+ }
+
+ c++;
+ }
+
+ return op - clipped + 1;
+}
+
+
+/* Calculate the current read's start based on the stored cigar string. */
+
+static int32_t unclipped_start(bam1_t *b) {
+ uint32_t *cigar = bam_get_cigar(b);
+ int32_t clipped = 0;
+ uint32_t i;
+
+ for (i = 0; i < b->core.n_cigar; i++) {
+ char c = bam_cigar_opchr(cigar[i]);
+
+ if (c == 'S' || c == 'H') { // clips
+ clipped += bam_cigar_oplen(cigar[i]);
+ } else {
+ break;
+ }
+ }
+
+ return b->core.pos - clipped + 1;
+}
+
+
+/* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/
+
+static int32_t unclipped_other_end(int32_t op, char *cigar) {
+ char *c = cigar;
+ int32_t refpos = 0;
+ int skip = 1;
+
+ while (*c && *c != '*') {
+ long num = 0;
+
+ if (isdigit((int)*c)) {
+ num = strtol(c, &c, 10);
+ } else {
+ num = 1;
+ }
+
+ switch (*c) {
+ case 'M':
+ case 'D':
+ case 'N':
+ case '=':
+ case 'X':
+ refpos += num;
+ skip = 0; // ignore initial clips
+ break;
+
+ case 'S':
+ case 'H':
+ if (!skip) {
+ refpos += num;
+ }
+ break;
+ }
+
+ c++;
+ }
+
+ return op + refpos;
+}
+
+
+/* Calculate the current read's end based on the stored cigar string. */
+
+static int32_t unclipped_end(bam1_t *b) {
+ uint32_t *cigar = bam_get_cigar(b);
+ int32_t end_pos, clipped = 0;
+ int32_t i;
+
+ end_pos = bam_endpos(b);
+
+ // now get the clipped end bases (if any)
+ // if we get to the beginning of the cigar string
+ // without hitting a non-clip then the results are meaningless
+ for (i = b->core.n_cigar - 1; i >= 0; i--) {
+ char c = bam_cigar_opchr(cigar[i]);
+
+ if (c == 'S' || c == 'H') { // clips
+ clipped += bam_cigar_oplen(cigar[i]);
+ } else {
+ break;
+ }
+ }
+
+ return end_pos + clipped;
+}
+
+
+/* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */
+
+static khint32_t do_hash(unsigned char *key, khint32_t len) {
+ khint32_t hash, i;
+
+ for (hash = 0, i = 0; i < len; ++i) {
+ hash += key[i];
+ hash += (hash << 10);
+ hash ^= (hash >> 6);
+ }
+
+ hash += (hash << 3);
+ hash ^= (hash >> 11);
+ hash += (hash << 15);
+
+ return hash;
+}
+
+
+/* Get mate score from tag. */
+
+static int64_t get_mate_score(bam1_t *b) {
+ uint8_t *data;
+ int64_t score;
+
+ if ((data = bam_aux_get(b, "ms"))) {
+ score = bam_aux2i(data);
+ } else {
+ fprintf(pysam_stderr, "[markdup] error: no ms score tag.\n");
+ return -1;
+ }
+
+ return score;
+}
+
+
+/* Calc current score from quality. */
+
+static int64_t calc_score(bam1_t *b)
+{
+ int64_t score = 0;
+ uint8_t *qual = bam_get_qual(b);
+ int i;
+
+ for (i = 0; i < b->core.l_qseq; i++) {
+ if (qual[i] >= 15) score += qual[i];
+ }
+
+ return score;
+}
+
+
+/* Create a signature hash of the current read and its pair.
+ Uses the unclipped start (or end depending on orientation),
+ the reference id, orientation and whether the current
+ read is leftmost of the pair. */
+
+static int make_pair_key(key_data_t *key, bam1_t *bam) {
+ int32_t this_ref, this_coord, this_end;
+ int32_t other_ref, other_coord, other_end;
+ int32_t orientation, leftmost;
+ uint8_t *data;
+ char *cig;
+
+ this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
+ other_ref = bam->core.mtid + 1;
+
+ this_coord = unclipped_start(bam);
+ this_end = unclipped_end(bam);
+
+ if ((data = bam_aux_get(bam, "MC"))) {
+ cig = bam_aux2Z(data);
+ other_end = unclipped_other_end(bam->core.mpos, cig);
+ other_coord = unclipped_other_start(bam->core.mpos, cig);
+ } else {
+ fprintf(pysam_stderr, "[markdup] error: no MC tag.\n");
+ return 1;
+ }
+
+ // work out orientations
+ if (this_ref != other_ref) {
+ leftmost = this_ref < other_ref;
+ } else {
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ if (!bam_is_rev(bam)) {
+ leftmost = this_coord <= other_coord;
+ } else {
+ leftmost = this_end <= other_end;
+ }
+ } else {
+ if (bam_is_rev(bam)) {
+ leftmost = this_end <= other_coord;
+ } else {
+ leftmost = this_coord <= other_end;
+ }
+ }
+ }
+
+ // pair orientation
+ if (leftmost) {
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ other_coord = other_end;
+
+ if (!bam_is_rev(bam)) {
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_FF;
+ } else {
+ orientation = O_RR;
+ }
+ } else {
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_RR;
+ } else {
+ orientation = O_FF;
+ }
+ }
+ } else {
+ if (!bam_is_rev(bam)) {
+ orientation = O_FR;
+ other_coord = other_end;
+ } else {
+ orientation = O_RF;
+ this_coord = this_end;
+ }
+ }
+ } else {
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ this_coord = this_end;
+
+ if (!bam_is_rev(bam)) {
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_RR;
+ } else {
+ orientation = O_FF;
+ }
+ } else {
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_FF;
+ } else {
+ orientation = O_RR;
+ }
+ }
+ } else {
+ if (!bam_is_rev(bam)) {
+ orientation = O_RF;
+ other_coord = other_end;
+ } else {
+ orientation = O_FR;
+ this_coord = this_end;
+ }
+ }
+ }
+
+ if (!leftmost)
+ leftmost = 13;
+ else
+ leftmost = 11;
+
+ key->single = 0;
+ key->this_ref = this_ref;
+ key->this_coord = this_coord;
+ key->other_ref = other_ref;
+ key->other_coord = other_coord;
+ key->leftmost = leftmost;
+ key->orientation = orientation;
+
+ return 0;
+}
+
+
+/* Create a signature hash of single read (or read with an unmatched pair).
+ Uses unclipped start (or end depending on orientation), reference id,
+ and orientation. */
+
+static void make_single_key(key_data_t *key, bam1_t *bam) {
+ int32_t this_ref, this_coord;
+ int32_t orientation;
+
+ this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
+
+ if (bam_is_rev(bam)) {
+ this_coord = unclipped_end(bam);
+ orientation = O_RR;
+ } else {
+ this_coord = unclipped_start(bam);
+ orientation = O_FF;
+ }
+
+ key->single = 1;
+ key->this_ref = this_ref;
+ key->this_coord = this_coord;
+ key->orientation = orientation;
+}
+
+
+/* Compare the reads near each other (coordinate sorted) and try to spot the duplicates.
+ Generally the highest quality scoring is chosen as the original and all others the duplicates.
+ The score is based on the sum of the quality values (<= 15) of the read and its mate (if any).
+ While single reads are compared to only one read of a pair, the pair will chosen as the original.
+ The comparison is done on position and orientation, see above for details. */
+
+static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32_t max_length, int do_stats) {
+ bam_hdr_t *header;
+ khiter_t k;
+ khash_t(reads) *pair_hash = kh_init(reads);
+ khash_t(reads) *single_hash = kh_init(reads);
+ klist_t(read_queue) *read_buffer = kl_init(read_queue);
+ kliter_t(read_queue) *rq;
+ int32_t prev_tid, prev_coord;
+ read_queue_t *in_read;
+ int ret;
+ int reading, writing, excluded, duplicate, single, pair, single_dup, examined;
+
+ if ((header = sam_hdr_read(in)) == NULL) {
+ fprintf(pysam_stderr, "[markdup] error reading header\n");
+ return 1;
+ }
+
+ // accept unknown, unsorted or coordinate sort order, but error on queryname sorted.
+ // only really works on coordinate sorted files.
+ if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) {
+ char *p, *q;
+
+ p = strstr(header->text, "\tSO:queryname");
+ q = strchr(header->text, '\n');
+
+ // looking for SO:queryname within @HD only
+ // (e.g. must ignore in a @CO comment line later in header)
+ if ((p != 0) && (p < q)) {
+ fprintf(pysam_stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n");
+ return 1;
+ }
+ }
+
+ if (sam_hdr_write(out, header) < 0) {
+ fprintf(pysam_stderr, "[markdup] error writing header.\n");
+ return 1;
+ }
+
+ // used for coordinate order checks
+ prev_tid = prev_coord = 0;
+
+ // get the buffer going
+ in_read = kl_pushp(read_queue, read_buffer);
+
+
+ if ((in_read->b = bam_init1()) == NULL) {
+ fprintf(pysam_stderr, "[markdup] error: unable to allocate memory for alignment.\n");
+ return 1;
+ }
+
+ reading = writing = excluded = single_dup = duplicate = examined = pair = single = 0;
+
+ while ((ret = sam_read1(in, header, in_read->b)) >= 0) {
+
+ // do some basic coordinate order checks
+ if (in_read->b->core.tid >= 0) { // -1 for unmapped reads
+ if (in_read->b->core.tid < prev_tid ||
+ ((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) {
+ fprintf(pysam_stderr, "[markdup] error: bad coordinate order.\n");
+ return 1;
+ }
+ }
+
+ prev_coord = in_read->pos = in_read->b->core.pos;
+ prev_tid = in_read->b->core.tid;
+ in_read->pair_key.single = 1;
+ in_read->single_key.single = 0;
+
+ reading++;
+
+ // read must not be secondary, supplementary, unmapped or failed QC
+ if (!(in_read->b->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL))) {
+ examined++;
+
+ // look at the pairs first
+ if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) {
+ int ret, mate_tmp;
+ key_data_t pair_key;
+ key_data_t single_key;
+ in_hash_t *bp;
+
+ if (make_pair_key(&pair_key, in_read->b)) {
+ fprintf(pysam_stderr, "[markdup] error: unable to assign pair hash key.\n");
+ return 1;
+ }
+
+ make_single_key(&single_key, in_read->b);
+
+ pair++;
+ in_read->pos = single_key.this_coord; // cigar/orientation modified pos
+
+ // put in singles hash for checking against non paired reads
+ k = kh_put(reads, single_hash, single_key, &ret);
+
+ if (ret > 0) { // new
+ // add to single duplicate hash
+ bp = &kh_val(single_hash, k);
+ bp->p = in_read->b;
+ in_read->single_key = single_key;
+ } else if (ret == 0) { // exists
+ // look at singles only for duplication marking
+ bp = &kh_val(single_hash, k);
+
+ if (!(bp->p->core.flag & BAM_FPAIRED) || (bp->p->core.flag & BAM_FMUNMAP)) {
+ bam1_t *dup = bp->p;
+
+ // singleton will always be marked duplicate even if
+ // scores more than one read of the pair
+
+ bp->p = in_read->b;
+ dup->core.flag |= BAM_FDUP;
+ single_dup++;
+ }
+ } else {
+ fprintf(pysam_stderr, "[markdup] error: single hashing failure.\n");
+ return 1;
+ }
+
+ // now do the pair
+ k = kh_put(reads, pair_hash, pair_key, &ret);
+
+ if (ret > 0) { // new
+ // add to the pair hash
+ bp = &kh_val(pair_hash, k);
+ bp->p = in_read->b;
+ in_read->pair_key = pair_key;
+ } else if (ret == 0) {
+ int64_t old_score, new_score, tie_add = 0;
+ bam1_t *dup;
+
+ bp = &kh_val(pair_hash, k);
+
+ if ((mate_tmp = get_mate_score(bp->p)) == -1) {
+ fprintf(pysam_stderr, "[markdup] error: no ms score tag.\n");
+ return 1;
+ } else {
+ old_score = calc_score(bp->p) + mate_tmp;
+ }
+
+ if ((mate_tmp = get_mate_score(in_read->b)) == -1) {
+ fprintf(pysam_stderr, "[markdup] error: no ms score tag.\n");
+ return 1;
+ } else {
+ new_score = calc_score(in_read->b) + mate_tmp;
+ }
+
+ // choose the highest score as the original
+ // and add it to the pair hash, mark the other as duplicate
+
+ if (new_score == old_score) {
+ if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p)) < 0) {
+ tie_add = 1;
+ } else {
+ tie_add = -1;
+ }
+ }
+
+ if (new_score + tie_add > old_score) { // swap reads
+ dup = bp->p;
+ bp->p = in_read->b;
+ } else {
+ dup = in_read->b;
+ }
+
+ dup->core.flag |= BAM_FDUP;
+
+ duplicate++;
+ } else {
+ fprintf(pysam_stderr, "[markdup] error: pair hashing failure.\n");
+ return 1;
+ }
+ } else { // do the single (or effectively single) reads
+ int ret;
+ key_data_t single_key;
+ in_hash_t *bp;
+
+ make_single_key(&single_key, in_read->b);
+
+ single++;
+ in_read->pos = single_key.this_coord; // cigar/orientation modified pos
+
+ k = kh_put(reads, single_hash, single_key, &ret);
+
+ if (ret > 0) { // new
+ bp = &kh_val(single_hash, k);
+ bp->p = in_read->b;
+ in_read->single_key = single_key;
+ } else if (ret == 0) { // exists
+ bp = &kh_val(single_hash, k);
+
+ if ((bp->p->core.flag & BAM_FPAIRED) && !(bp->p->core.flag & BAM_FMUNMAP)) {
+ // if matched against one of a pair just mark as duplicate
+ in_read->b->core.flag |= BAM_FDUP;
+ } else {
+ int64_t old_score, new_score;
+ bam1_t *dup;
+
+ old_score = calc_score(bp->p);
+ new_score = calc_score(in_read->b);
+
+ // choose the highest score as the original, add it
+ // to the single hash and mark the other as duplicate
+ if (new_score > old_score) { // swap reads
+ dup = bp->p;
+ bp->p = in_read->b;
+ } else {
+ dup = in_read->b;
+ }
+
+ dup->core.flag |= BAM_FDUP;
+ }
+
+ single_dup++;
+ } else {
+ fprintf(pysam_stderr, "[markdup] error: single hashing failure.\n");
+ return 1;
+ }
+ }
+ } else {
+ excluded++;
+ }
+
+ // loop through the stored reads and write out those we
+ // no longer need
+ rq = kl_begin(read_buffer);
+ while (rq != kl_end(read_buffer)) {
+ in_read = &kl_val(rq);
+
+ /* keep a moving window of reads based on coordinates and max read length. Any unaligned reads
+ should just be written as they cannot be matched as duplicates. */
+ if (in_read->pos + max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) {
+ break;
+ }
+
+ if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
+ if (sam_write1(out, header, in_read->b) < 0) {
+ fprintf(pysam_stderr, "[markdup] error: writing output failed.\n");
+ return 1;
+ }
+
+ writing++;
+ }
+
+ // remove from hash
+ if (in_read->pair_key.single == 0) {
+ k = kh_get(reads, pair_hash, in_read->pair_key);
+ kh_del(reads, pair_hash, k);
+ }
+
+ if (in_read->single_key.single == 1) {
+ k = kh_get(reads, single_hash, in_read->single_key);
+ kh_del(reads, single_hash, k);
+ }
+
+ kl_shift(read_queue, read_buffer, NULL);
+ bam_destroy1(in_read->b);
+ rq = kl_begin(read_buffer);
+ }
+
+ // set the next one up for reading
+ in_read = kl_pushp(read_queue, read_buffer);
+
+ if ((in_read->b = bam_init1()) == NULL) {
+ fprintf(pysam_stderr, "[markdup] error: unable to allocate memory for alignment.\n");
+ return 1;
+ }
+ }
+
+ if (ret < -1) {
+ fprintf(pysam_stderr, "[markdup] error: truncated input file.\n");
+ return 1;
+ }
+
+ // write out the end of the list
+ rq = kl_begin(read_buffer);
+ while (rq != kl_end(read_buffer)) {
+ in_read = &kl_val(rq);
+
+ if (bam_get_qname(in_read->b)) { // last entry will be blank
+ if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
+ if (sam_write1(out, header, in_read->b) < 0) {
+ fprintf(pysam_stderr, "[markdup] error: writing final output failed.\n");
+ return 1;
+ }
+
+ writing++;
+ }
+ }
+
+ kl_shift(read_queue, read_buffer, NULL);
+ bam_destroy1(in_read->b);
+ rq = kl_begin(read_buffer);
+ }
+
+ if (do_stats) {
+ fprintf(pysam_stderr, "READ %d WRITTEN %d \n"
+ "EXCLUDED %d EXAMINED %d\n"
+ "PAIRED %d SINGLE %d\n"
+ "DULPICATE PAIR %d DUPLICATE SINGLE %d\n"
+ "DUPLICATE TOTAL %d\n", reading, writing, excluded, examined, pair, single,
+ duplicate, single_dup, single_dup + duplicate);
+ }
+
+ kh_destroy(reads, pair_hash);
+ kh_destroy(reads, single_hash);
+ kl_destroy(read_queue, read_buffer);
+ bam_hdr_destroy(header);
+
+ return 0;
+}
+
+
+static int markdup_usage(void) {
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Usage: samtools markdup <input.bam> <output.bam>\n\n");
+ fprintf(pysam_stderr, "Option: \n");
+ fprintf(pysam_stderr, " -r Remove duplicate reads\n");
+ fprintf(pysam_stderr, " -l Max read length (default 300 bases)\n");
+ fprintf(pysam_stderr, " -s Report stats.\n");
+
+ sam_global_opt_help(pysam_stderr, "-.O..@");
+
+ fprintf(pysam_stderr, "\nThe input file must be coordinate sorted and must have gone"
+ " through fixmates with the mate scoring option on.\n");
+
+ return 1;
+}
+
+
+int bam_markdup(int argc, char **argv) {
+ int c, ret, remove_dups = 0, report_stats = 0;
+ int32_t max_length = 300;
+ samFile *in = NULL, *out = NULL;
+ char wmode[3] = {'w', 'b', 0};
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ htsThreadPool p = {NULL, 0};
+
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
+ {NULL, 0, NULL, 0}
+ };
+
+ while ((c = getopt_long(argc, argv, "rsl:O:@:", lopts, NULL)) >= 0) {
+ switch (c) {
+ case 'r': remove_dups = 1; break;
+ case 'l': max_length = atoi(optarg); break;
+ case 's': report_stats = 1; break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': return markdup_usage();
+ }
+ }
+
+ if (optind + 2 > argc)
+ return markdup_usage();
+
+ in = sam_open_format(argv[optind], "r", &ga.in);
+
+ if (!in) {
+ print_error_errno("markdup", "failed to open \"%s\" for input", argv[optind]);
+ return 1;
+ }
+
+ sam_open_mode(wmode + 1, argv[optind + 1], NULL);
+ out = sam_open_format(argv[optind + 1], wmode, &ga.out);
+
+ if (!out) {
+ print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]);
+ return 1;
+ }
+
+ if (ga.nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(pysam_stderr, "[markdup] error creating thread pool\n");
+ return 1;
+ }
+
+ hts_set_opt(in, HTS_OPT_THREAD_POOL, &p);
+ hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+ }
+
+ // actual stuff happens here
+ ret = bam_mark_duplicates(in, out, remove_dups, max_length, report_stats);
+
+ sam_close(in);
+
+ if (sam_close(out) < 0) {
+ fprintf(pysam_stderr, "[markdup] error closing output file\n");
+ ret = 1;
+ }
+
+ if (p.pool) hts_tpool_destroy(p.pool);
+
+ sam_global_args_free(&ga);
+
+ return ret;
+}
diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c
index 75c2f51..1d6c55f 100644
--- a/samtools/bam_mate.c
+++ b/samtools/bam_mate.c
@@ -218,8 +218,39 @@ static int sync_mate(bam1_t* a, bam1_t* b)
return 0;
}
+
+static uint32_t calc_mate_score(bam1_t *b)
+{
+ uint32_t score = 0;
+ uint8_t *qual = bam_get_qual(b);
+ int i;
+
+ for (i = 0; i < b->core.l_qseq; i++) {
+ if (qual[i] >= 15) score += qual[i];
+ }
+
+ return score;
+}
+
+
+static int add_mate_score(bam1_t *src, bam1_t *dest)
+{
+ uint8_t *data_ms;
+ uint32_t mate_score = calc_mate_score(src);
+
+ if ((data_ms = bam_aux_get(dest, "ms")) != NULL) {
+ bam_aux_del(dest, data_ms);
+ }
+
+ if (bam_aux_append(dest, "ms", 'i', sizeof(uint32_t), (uint8_t*)&mate_score) == -1) {
+ return -1;
+ }
+
+ return 0;
+}
+
// currently, this function ONLY works if each read has one hit
-static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct)
+static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring)
{
bam_hdr_t *header;
bam1_t *b[2] = { NULL, NULL };
@@ -295,6 +326,13 @@ static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int prop
cur->core.flag &= ~BAM_FPROPER_PAIR;
}
+ if (do_mate_scoring) {
+ if ((add_mate_score(pre, cur) == -1) || (add_mate_score(cur, pre) == -1)) {
+ fprintf(stderr, "[bam_mating_core] ERROR: unable to add mate score.\n");
+ goto fail;
+ }
+ }
+
// Write out result
if ( !remove_reads ) {
if (sam_write1(out, header, pre) < 0) goto write_fail;
@@ -361,7 +399,8 @@ void usage(FILE* where)
"Options:\n"
" -r Remove unmapped reads and secondary alignments\n"
" -p Disable FR proper pair check\n"
-" -c Add template cigar ct tag\n");
+" -c Add template cigar ct tag\n"
+" -m Add mate score tag\n");
sam_global_opt_help(where, "-.O..@");
@@ -376,7 +415,7 @@ int bam_mating(int argc, char *argv[])
{
htsThreadPool p = {NULL, 0};
samFile *in = NULL, *out = NULL;
- int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1;
+ int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
char wmode[3] = {'w', 'b', 0};
static const struct option lopts[] = {
@@ -386,11 +425,12 @@ int bam_mating(int argc, char *argv[])
// parse args
if (argc == 1) { usage(stdout); return 0; }
- while ((c = getopt_long(argc, argv, "rpcO:@:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "rpcmO:@:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': remove_reads = 1; break;
case 'p': proper_pair_check = 0; break;
case 'c': add_ct = 1; break;
+ case 'm': mate_score = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': usage(stderr); goto fail;
@@ -419,7 +459,7 @@ int bam_mating(int argc, char *argv[])
}
// run
- res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct);
+ res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score);
// cleanup
sam_close(in);
diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c
index a03de96..8857aeb 100644
--- a/samtools/bam_mate.c.pysam.c
+++ b/samtools/bam_mate.c.pysam.c
@@ -220,8 +220,39 @@ static int sync_mate(bam1_t* a, bam1_t* b)
return 0;
}
+
+static uint32_t calc_mate_score(bam1_t *b)
+{
+ uint32_t score = 0;
+ uint8_t *qual = bam_get_qual(b);
+ int i;
+
+ for (i = 0; i < b->core.l_qseq; i++) {
+ if (qual[i] >= 15) score += qual[i];
+ }
+
+ return score;
+}
+
+
+static int add_mate_score(bam1_t *src, bam1_t *dest)
+{
+ uint8_t *data_ms;
+ uint32_t mate_score = calc_mate_score(src);
+
+ if ((data_ms = bam_aux_get(dest, "ms")) != NULL) {
+ bam_aux_del(dest, data_ms);
+ }
+
+ if (bam_aux_append(dest, "ms", 'i', sizeof(uint32_t), (uint8_t*)&mate_score) == -1) {
+ return -1;
+ }
+
+ return 0;
+}
+
// currently, this function ONLY works if each read has one hit
-static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct)
+static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring)
{
bam_hdr_t *header;
bam1_t *b[2] = { NULL, NULL };
@@ -297,6 +328,13 @@ static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int prop
cur->core.flag &= ~BAM_FPROPER_PAIR;
}
+ if (do_mate_scoring) {
+ if ((add_mate_score(pre, cur) == -1) || (add_mate_score(cur, pre) == -1)) {
+ fprintf(pysam_stderr, "[bam_mating_core] ERROR: unable to add mate score.\n");
+ goto fail;
+ }
+ }
+
// Write out result
if ( !remove_reads ) {
if (sam_write1(out, header, pre) < 0) goto write_fail;
@@ -363,7 +401,8 @@ void usage(FILE* where)
"Options:\n"
" -r Remove unmapped reads and secondary alignments\n"
" -p Disable FR proper pair check\n"
-" -c Add template cigar ct tag\n");
+" -c Add template cigar ct tag\n"
+" -m Add mate score tag\n");
sam_global_opt_help(where, "-.O..@");
@@ -378,7 +417,7 @@ int bam_mating(int argc, char *argv[])
{
htsThreadPool p = {NULL, 0};
samFile *in = NULL, *out = NULL;
- int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1;
+ int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
char wmode[3] = {'w', 'b', 0};
static const struct option lopts[] = {
@@ -388,11 +427,12 @@ int bam_mating(int argc, char *argv[])
// parse args
if (argc == 1) { usage(pysam_stdout); return 0; }
- while ((c = getopt_long(argc, argv, "rpcO:@:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "rpcmO:@:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': remove_reads = 1; break;
case 'p': proper_pair_check = 0; break;
case 'c': add_ct = 1; break;
+ case 'm': mate_score = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': usage(pysam_stderr); goto fail;
@@ -421,7 +461,7 @@ int bam_mating(int argc, char *argv[])
}
// run
- res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct);
+ res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score);
// cleanup
sam_close(in);
diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c
index d17e9d6..d451ffd 100644
--- a/samtools/bam_plcmd.c
+++ b/samtools/bam_plcmd.c
@@ -113,6 +113,7 @@ static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref
#define MPLP_PRINT_MAPQ (1<<10)
#define MPLP_PER_SAMPLE (1<<11)
#define MPLP_SMART_OVERLAPS (1<<12)
+#define MPLP_PRINT_QNAME (1<<13)
void *bed_read(const char *fn);
void bed_destroy(void *_h);
@@ -220,6 +221,7 @@ print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname,
fputs("\t0\t*\t*", fp);
if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp);
if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp);
+ if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", fp);
}
putc('\n', fp);
}
@@ -642,6 +644,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
fputs("*\t*", pileup_fp);
if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp);
if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp);
+ if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", pileup_fp);
} else {
int n = 0;
for (j = 0; j < n_plp[i]; ++j) {
@@ -698,6 +701,21 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
}
if (!n) putc('*', pileup_fp);
}
+
+ if (conf->flag & MPLP_PRINT_QNAME) {
+ n = 0;
+ putc('\t', pileup_fp);
+ for (j = 0; j < n_plp[i]; ++j) {
+ const bam_pileup1_t *p = &plp[i][j];
+ int c = bam_get_qual(p->b)[p->qpos];
+ if ( c < conf->min_baseQ ) continue;
+
+ if (n > 0) putc(',', pileup_fp);
+ fputs(bam_get_qname(p->b), pileup_fp);
+ n++;
+ }
+ if (!n) putc('*', pileup_fp);
+ }
}
}
putc('\n', pileup_fp);
@@ -898,6 +916,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
"Output options for mpileup format (without -g/-v):\n"
" -O, --output-BP output base positions on reads\n"
" -s, --output-MQ output mapping quality\n"
+" --output-QNAME output read names\n"
" -a output all positions (including zero depth)\n"
" -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"
"\n"
@@ -960,6 +979,8 @@ int bam_mpileup(int argc, char *argv[])
{"excl-flags", required_argument, NULL, 2},
{"output", required_argument, NULL, 3},
{"open-prob", required_argument, NULL, 4},
+ {"output-QNAME", no_argument, NULL, 5},
+ {"output-qname", no_argument, NULL, 5},
{"illumina1.3+", no_argument, NULL, '6'},
{"count-orphans", no_argument, NULL, 'A'},
{"bam-list", required_argument, NULL, 'b'},
@@ -1016,6 +1037,7 @@ int bam_mpileup(int argc, char *argv[])
break;
case 3 : mplp.output_fname = optarg; break;
case 4 : mplp.openQ = atoi(optarg); break;
+ case 5 : mplp.flag |= MPLP_PRINT_QNAME; break;
case 'f':
mplp.fai = fai_load(optarg);
if (mplp.fai == NULL) return 1;
diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c
index 03e5f8a..7fd5bea 100644
--- a/samtools/bam_plcmd.c.pysam.c
+++ b/samtools/bam_plcmd.c.pysam.c
@@ -115,6 +115,7 @@ static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref
#define MPLP_PRINT_MAPQ (1<<10)
#define MPLP_PER_SAMPLE (1<<11)
#define MPLP_SMART_OVERLAPS (1<<12)
+#define MPLP_PRINT_QNAME (1<<13)
void *bed_read(const char *fn);
void bed_destroy(void *_h);
@@ -222,6 +223,7 @@ print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname,
fputs("\t0\t*\t*", fp);
if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp);
if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp);
+ if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", fp);
}
putc('\n', fp);
}
@@ -644,6 +646,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
fputs("*\t*", pileup_fp);
if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp);
if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp);
+ if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", pileup_fp);
} else {
int n = 0;
for (j = 0; j < n_plp[i]; ++j) {
@@ -700,6 +703,21 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
}
if (!n) putc('*', pileup_fp);
}
+
+ if (conf->flag & MPLP_PRINT_QNAME) {
+ n = 0;
+ putc('\t', pileup_fp);
+ for (j = 0; j < n_plp[i]; ++j) {
+ const bam_pileup1_t *p = &plp[i][j];
+ int c = bam_get_qual(p->b)[p->qpos];
+ if ( c < conf->min_baseQ ) continue;
+
+ if (n > 0) putc(',', pileup_fp);
+ fputs(bam_get_qname(p->b), pileup_fp);
+ n++;
+ }
+ if (!n) putc('*', pileup_fp);
+ }
}
}
putc('\n', pileup_fp);
@@ -900,6 +918,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
"Output options for mpileup format (without -g/-v):\n"
" -O, --output-BP output base positions on reads\n"
" -s, --output-MQ output mapping quality\n"
+" --output-QNAME output read names\n"
" -a output all positions (including zero depth)\n"
" -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"
"\n"
@@ -962,6 +981,8 @@ int bam_mpileup(int argc, char *argv[])
{"excl-flags", required_argument, NULL, 2},
{"output", required_argument, NULL, 3},
{"open-prob", required_argument, NULL, 4},
+ {"output-QNAME", no_argument, NULL, 5},
+ {"output-qname", no_argument, NULL, 5},
{"illumina1.3+", no_argument, NULL, '6'},
{"count-orphans", no_argument, NULL, 'A'},
{"bam-list", required_argument, NULL, 'b'},
@@ -1018,6 +1039,7 @@ int bam_mpileup(int argc, char *argv[])
break;
case 3 : mplp.output_fname = optarg; break;
case 4 : mplp.openQ = atoi(optarg); break;
+ case 5 : mplp.flag |= MPLP_PRINT_QNAME; break;
case 'f':
mplp.fai = fai_load(optarg);
if (mplp.fai == NULL) return 1;
diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c
index f82686d..562c8e4 100644
--- a/samtools/bam_reheader.c.pysam.c
+++ b/samtools/bam_reheader.c.pysam.c
@@ -475,7 +475,7 @@ int main_reheader(int argc, char *argv[])
if (argc - optind != 2)
usage(pysam_stderr, 1);
-
+
{ // read the header
samFile *fph = sam_open(argv[optind], "r");
if (fph == 0) {
diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c
index d32a241..b1d5898 100644
--- a/samtools/bam_sort.c
+++ b/samtools/bam_sort.c
@@ -38,7 +38,9 @@ DEALINGS IN THE SOFTWARE. */
#include <getopt.h>
#include <assert.h>
#include <pthread.h>
+#include "htslib/bgzf.h"
#include "htslib/ksort.h"
+#include "htslib/hts_os.h"
#include "htslib/khash.h"
#include "htslib/klist.h"
#include "htslib/kstring.h"
@@ -49,10 +51,10 @@ DEALINGS IN THE SOFTWARE. */
// Struct which contains the a record, and the pointer to the sort tag (if any)
// Used to speed up sort-by-tag.
-typedef struct bam1_p {
- bam1_t *b;
+typedef struct bam1_tag {
+ bam1_t *bam_record;
const uint8_t *tag;
-} bam1_p;
+} bam1_tag;
/* Minimum memory required in megabytes before sort will attempt to run. This
is to prevent accidents where failing to use the -m option correctly results
@@ -122,29 +124,36 @@ static int strnum_cmp(const char *_a, const char *_b)
typedef struct {
int i;
uint64_t pos, idx;
- bam1_p b;
+ bam1_tag entry;
} heap1_t;
-#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx))))
-
-static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b);
+static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b);
// Function to compare reads in the heap and determine which one is < the other
static inline int heap_lt(const heap1_t a, const heap1_t b)
{
+ if (!a.entry.bam_record)
+ return 1;
+ if (!b.entry.bam_record)
+ return 0;
+
if (g_is_by_tag) {
int t;
- if (a.b.b == NULL || b.b.b == NULL) return a.b.b == NULL? 1 : 0;
- t = bam1_lt_by_tag(b.b,a.b);
- return t;
+ t = bam1_cmp_by_tag(a.entry, b.entry);
+ if (t != 0) return t > 0;
} else if (g_is_by_qname) {
- int t;
- if (a.b.b == NULL || b.b.b == NULL) return a.b.b == NULL? 1 : 0;
- t = strnum_cmp(bam_get_qname(a.b.b), bam_get_qname(b.b.b));
- return (t > 0 || (t == 0 && (a.b.b->core.flag&0xc0) > (b.b.b->core.flag&0xc0)));
+ int t, fa, fb;
+ t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record));
+ if (t != 0) return t > 0;
+ fa = a.entry.bam_record->core.flag & 0xc0;
+ fb = b.entry.bam_record->core.flag & 0xc0;
+ if (fa != fb) return fa > fb;
} else {
- return __pos_cmp(a, b);
+ if (a.pos != b.pos) return a.pos > b.pos;
}
+ // This compares by position in the input file(s)
+ if (a.i != b.i) return a.i > b.i;
+ return a.idx > b.idx;
}
KSORT_INIT(heap, heap1_t, heap_lt)
@@ -1351,25 +1360,25 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
heap1_t *h = heap + i;
int res;
h->i = i;
- h->b.b = bam_init1();
- h->b.tag = NULL;
- if (!h->b.b) goto mem_fail;
- res = iter[i] ? sam_itr_next(fp[i], iter[i], h->b.b) : sam_read1(fp[i], hdr[i], h->b.b);
+ h->entry.bam_record = bam_init1();
+ h->entry.tag = NULL;
+ if (!h->entry.bam_record) goto mem_fail;
+ res = iter[i] ? sam_itr_next(fp[i], iter[i], h->entry.bam_record) : sam_read1(fp[i], hdr[i], h->entry.bam_record);
if (res >= 0) {
- bam_translate(h->b.b, translation_tbl + i);
- h->pos = ((uint64_t)h->b.b->core.tid<<32) | (uint32_t)((int32_t)h->b.b->core.pos+1)<<1 | bam_is_rev(h->b.b);
+ bam_translate(h->entry.bam_record, translation_tbl + i);
+ h->pos = ((uint64_t)h->entry.bam_record->core.tid<<32) | (uint32_t)((int32_t)h->entry.bam_record->core.pos+1)<<1 | bam_is_rev(h->entry.bam_record);
h->idx = idx++;
if (g_is_by_tag) {
- h->b.tag = bam_aux_get(h->b.b, g_sort_tag);
+ h->entry.tag = bam_aux_get(h->entry.bam_record, g_sort_tag);
} else {
- h->b.tag = NULL;
+ h->entry.tag = NULL;
}
}
else if (res == -1 && (!iter[i] || iter[i]->finished)) {
h->pos = HEAP_EMPTY;
- bam_destroy1(h->b.b);
- h->b.b = NULL;
- h->b.tag = NULL;
+ bam_destroy1(h->entry.bam_record);
+ h->entry.bam_record = NULL;
+ h->entry.tag = NULL;
} else {
print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
goto fail;
@@ -1391,7 +1400,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
// Begin the actual merge
ks_heapmake(heap, n, heap);
while (heap->pos != HEAP_EMPTY) {
- bam1_t *b = heap->b.b;
+ bam1_t *b = heap->entry.bam_record;
if (flag & MERGE_RG) {
uint8_t *rg = bam_aux_get(b, "RG");
if (rg) bam_aux_del(b, rg);
@@ -1407,15 +1416,15 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b);
heap->idx = idx++;
if (g_is_by_tag) {
- heap->b.tag = bam_aux_get(heap->b.b, g_sort_tag);
+ heap->entry.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
} else {
- heap->b.tag = NULL;
+ heap->entry.tag = NULL;
}
} else if (j == -1 && (!iter[heap->i] || iter[heap->i]->finished)) {
heap->pos = HEAP_EMPTY;
- bam_destroy1(heap->b.b);
- heap->b.b = NULL;
- heap->b.tag = NULL;
+ bam_destroy1(heap->entry.bam_record);
+ heap->entry.bam_record = NULL;
+ heap->entry.tag = NULL;
} else {
print_error(cmd, "\"%s\" is truncated", fn[heap->i]);
goto fail;
@@ -1459,7 +1468,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
if (iter && iter[i]) hts_itr_destroy(iter[i]);
if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]);
if (fp && fp[i]) sam_close(fp[i]);
- if (heap && heap[i].b.b) bam_destroy1(heap[i].b.b);
+ if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record);
}
if (hout) bam_hdr_destroy(hout);
free(RG);
@@ -1615,6 +1624,169 @@ end:
* BAM sorting *
***************/
+typedef struct {
+ size_t from;
+ size_t to;
+} buf_region;
+
+/* Simplified version of bam_merge_core2() for merging part-sorted
+ temporary files. No need for header merging or translation,
+ it just needs to read data into the heap and push it out again. */
+
+static inline int heap_add_read(heap1_t *heap, int nfiles, samFile **fp,
+ int num_in_mem, buf_region *in_mem,
+ bam1_tag *buf, uint64_t *idx, bam_hdr_t *hout) {
+ int i = heap->i, res;
+ if (i < nfiles) { // read from file
+ res = sam_read1(fp[i], hout, heap->entry.bam_record);
+ } else { // read from memory
+ if (in_mem[i - nfiles].from < in_mem[i - nfiles].to) {
+ heap->entry.bam_record = buf[in_mem[i - nfiles].from++].bam_record;
+ res = 0;
+ } else {
+ res = -1;
+ }
+ }
+ if (res >= 0) {
+ heap->pos = (((uint64_t)heap->entry.bam_record->core.tid<<32)
+ | (uint32_t)((int32_t)heap->entry.bam_record->core.pos+1)<<1
+ | bam_is_rev(heap->entry.bam_record));
+ heap->idx = (*idx)++;
+ if (g_is_by_tag) {
+ heap->entry.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
+ } else {
+ heap->entry.tag = NULL;
+ }
+ } else if (res == -1) {
+ heap->pos = HEAP_EMPTY;
+ if (i < nfiles) bam_destroy1(heap->entry.bam_record);
+ heap->entry.bam_record = NULL;
+ heap->entry.tag = NULL;
+ } else {
+ return -1;
+ }
+ return 0;
+}
+
+static int bam_merge_simple(int by_qname, char *sort_tag, const char *out,
+ const char *mode, bam_hdr_t *hout,
+ int n, char * const *fn, int num_in_mem,
+ buf_region *in_mem, bam1_tag *buf, int n_threads,
+ const char *cmd, const htsFormat *in_fmt,
+ const htsFormat *out_fmt) {
+ samFile *fpout = NULL, **fp = NULL;
+ heap1_t *heap = NULL;
+ uint64_t idx = 0;
+ int i, heap_size = n + num_in_mem;
+
+ g_is_by_qname = by_qname;
+ if (sort_tag) {
+ g_is_by_tag = 1;
+ g_sort_tag[0] = sort_tag[0];
+ g_sort_tag[1] = sort_tag[1];
+ }
+ if (n > 0) {
+ fp = (samFile**)calloc(n, sizeof(samFile*));
+ if (!fp) goto mem_fail;
+ }
+ heap = (heap1_t*)calloc(heap_size, sizeof(heap1_t));
+ if (!heap) goto mem_fail;
+
+ // Open each file, read the header and put the first read into the heap
+ for (i = 0; i < heap_size; i++) {
+ bam_hdr_t *hin;
+ heap1_t *h = &heap[i];
+
+ if (i < n) {
+ fp[i] = sam_open_format(fn[i], "r", in_fmt);
+ if (fp[i] == NULL) {
+ print_error_errno(cmd, "fail to open \"%s\"", fn[i]);
+ goto fail;
+ }
+
+ // Read header ...
+ hin = sam_hdr_read(fp[i]);
+ if (hin == NULL) {
+ print_error(cmd, "failed to read header from \"%s\"", fn[i]);
+ goto fail;
+ }
+ // ... and throw it away as we don't really need it
+ bam_hdr_destroy(hin);
+ }
+
+ // Get a read into the heap
+ h->i = i;
+ h->entry.tag = NULL;
+ if (i < n) {
+ h->entry.bam_record = bam_init1();
+ if (!h->entry.bam_record) goto mem_fail;
+ }
+ if (heap_add_read(h, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) {
+ assert(i < n);
+ print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
+ goto fail;
+ }
+ }
+
+ // Open output file and write header
+ if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) {
+ print_error_errno(cmd, "failed to create \"%s\"", out);
+ return -1;
+ }
+
+ hts_set_threads(fpout, n_threads);
+
+ if (sam_hdr_write(fpout, hout) != 0) {
+ print_error_errno(cmd, "failed to write header to \"%s\"", out);
+ sam_close(fpout);
+ return -1;
+ }
+
+ // Now do the merge
+ ks_heapmake(heap, heap_size, heap);
+ while (heap->pos != HEAP_EMPTY) {
+ bam1_t *b = heap->entry.bam_record;
+ if (sam_write1(fpout, hout, b) < 0) {
+ print_error_errno(cmd, "failed writing to \"%s\"", out);
+ sam_close(fpout);
+ return -1;
+ }
+ if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) {
+ assert(heap->i < n);
+ print_error(cmd, "Error reading \"%s\" : %s",
+ fn[heap->i], strerror(errno));
+ goto fail;
+ }
+ ks_heapadjust(heap, 0, heap_size, heap);
+ }
+ // Clean up and close
+ for (i = 0; i < n; i++) {
+ if (sam_close(fp[i]) != 0) {
+ print_error(cmd, "Error on closing \"%s\" : %s",
+ fn[i], strerror(errno));
+ }
+ }
+ free(fp);
+ free(heap);
+ if (sam_close(fpout) < 0) {
+ print_error(cmd, "error closing output file");
+ return -1;
+ }
+ return 0;
+ mem_fail:
+ print_error(cmd, "Out of memory");
+
+ fail:
+ for (i = 0; i < n; i++) {
+ if (fp && fp[i]) sam_close(fp[i]);
+ if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record);
+ }
+ free(fp);
+ free(heap);
+ if (fpout) sam_close(fpout);
+ return -1;
+}
+
static int change_SO(bam_hdr_t *h, const char *so)
{
char *p, *q, *beg = NULL, *end = NULL, *newtext;
@@ -1635,29 +1807,41 @@ static int change_SO(bam_hdr_t *h, const char *so)
if (beg == NULL) { // no @HD
h->l_text += strlen(so) + 15;
newtext = (char*)malloc(h->l_text + 1);
- sprintf(newtext, "@HD\tVN:1.3\tSO:%s\n", so);
- strcat(newtext, h->text);
+ if (!newtext) return -1;
+ snprintf(newtext, h->l_text + 1,
+ "@HD\tVN:1.3\tSO:%s\n%s", so, h->text);
} else { // has @HD but different or no SO
h->l_text = (beg - h->text) + (4 + strlen(so)) + (h->text + h->l_text - end);
newtext = (char*)malloc(h->l_text + 1);
- strncpy(newtext, h->text, beg - h->text);
- sprintf(newtext + (beg - h->text), "\tSO:%s", so);
- strcat(newtext, end);
+ if (!newtext) return -1;
+ snprintf(newtext, h->l_text + 1, "%.*s\tSO:%s%s",
+ (int) (beg - h->text), h->text, so, end);
}
free(h->text);
h->text = newtext;
return 0;
}
-// Function to compare reads and determine which one is < the other
+// Function to compare reads and determine which one is < or > the other
// Handle sort-by-pos and sort-by-name. Used as the secondary sort in bam1_lt_by_tag, if reads are equivalent by tag.
-static inline int bam1_lt_core(const bam1_p a, const bam1_p b)
+// Returns a value less than, equal to or greater than zero if a is less than,
+// equal to or greater than b, respectively.
+static inline int bam1_cmp_core(const bam1_tag a, const bam1_tag b)
{
+ uint64_t pa, pb;
+ if (!a.bam_record)
+ return 1;
+ if (!b.bam_record)
+ return 0;
+
if (g_is_by_qname) {
- int t = strnum_cmp(bam_get_qname(a.b), bam_get_qname(b.b));
- return (t < 0 || (t == 0 && (a.b->core.flag&0xc0) < (b.b->core.flag&0xc0)));
+ int t = strnum_cmp(bam_get_qname(a.bam_record), bam_get_qname(b.bam_record));
+ if (t != 0) return t;
+ return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0);
} else {
- return (((uint64_t)a.b->core.tid<<32|(a.b->core.pos+1)<<1|bam_is_rev(a.b)) < ((uint64_t)b.b->core.tid<<32|(b.b->core.pos+1)<<1|bam_is_rev(b.b)));
+ pa = (uint64_t)a.bam_record->core.tid<<32|(a.bam_record->core.pos+1)<<1|bam_is_rev(a.bam_record);
+ pb = (uint64_t)b.bam_record->core.tid<<32|(b.bam_record->core.pos+1)<<1|bam_is_rev(b.bam_record);
+ return pa < pb ? -1 : (pa > pb ? 1 : 0);
}
}
@@ -1675,17 +1859,19 @@ uint8_t normalize_type(const uint8_t* aux) {
// Sort record by tag, using pos or read name as a secondary key if tags are identical. Reads not carrying the tag sort first.
// Tags are first sorted by the type character (in case the types differ), or by the appropriate comparator for that type if they agree.
-static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b)
+// Returns a value less than, equal to or greater than zero if a is less than,
+// equal to or greater than b, respectively.
+static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b)
{
const uint8_t* aux_a = a.tag;
const uint8_t* aux_b = b.tag;
if (aux_a == NULL && aux_b != NULL) {
- return 1;
+ return -1;
} else if (aux_a != NULL && aux_b == NULL) {
- return 0;
+ return 1;
} else if (aux_a == NULL && aux_b == NULL) {
- return bam1_lt_core(a,b);
+ return bam1_cmp_core(a,b);
}
// 'Normalize' the letters of the datatypes to a canonical letter,
@@ -1702,57 +1888,62 @@ static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b)
b_type = 'f';
} else {
// Unfixable mismatched types
- return a_type < b_type ? 1 : 0;
+ return a_type < b_type ? -1 : 1;
}
}
if (a_type == 'c') {
int64_t va = bam_aux2i(aux_a);
int64_t vb = bam_aux2i(aux_b);
- return (va < vb || (va == vb && bam1_lt_core(a, b)));
+ if (va != vb) return va < vb ? -1 : 1;
+ return bam1_cmp_core(a, b);
} else if (a_type == 'f') {
double va = bam_aux2f(aux_a);
double vb = bam_aux2f(aux_b);
- return (va < vb || (va == vb && bam1_lt_core(a,b)));
+ if (va != vb) return va < vb ? -1 : 1;
+ return bam1_cmp_core(a, b);
} else if (a_type == 'A') {
- char va = bam_aux2A(aux_a);
- char vb = bam_aux2A(aux_b);
- return (va < vb || (va == vb && bam1_lt_core(a,b)));
+ unsigned char va = bam_aux2A(aux_a);
+ unsigned char vb = bam_aux2A(aux_b);
+ if (va != vb) return va < vb ? -1 : 1;
+ return bam1_cmp_core(a, b);
} else if (a_type == 'H') {
int t = strcmp(bam_aux2Z(aux_a), bam_aux2Z(aux_b));
- return (t < 0 || (t == 0 && bam1_lt_core(a,b)));
+ if (t) return t;
+ return bam1_cmp_core(a, b);
} else {
- return bam1_lt_core(a,b);
+ return bam1_cmp_core(a,b);
}
}
// Function to compare reads and determine which one is < the other
// Handle sort-by-pos, sort-by-name, or sort-by-tag
-static inline int bam1_lt(const bam1_p a, const bam1_p b)
+static inline int bam1_lt(const bam1_tag a, const bam1_tag b)
{
if (g_is_by_tag) {
- return bam1_lt_by_tag(a, b);
+ return bam1_cmp_by_tag(a, b) < 0;
} else {
- return bam1_lt_core(a,b);
+ return bam1_cmp_core(a,b) < 0;
}
}
-KSORT_INIT(sort, bam1_p, bam1_lt)
+KSORT_INIT(sort, bam1_tag, bam1_lt)
typedef struct {
size_t buf_len;
const char *prefix;
- bam1_p *buf;
+ bam1_tag *buf;
const bam_hdr_t *h;
int index;
int error;
+ int no_save;
} worker_t;
// Returns 0 for success
// -1 for failure
-static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt)
+static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt)
{
size_t i;
samFile* fp;
@@ -1761,7 +1952,7 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf,
if (sam_hdr_write(fp, h) != 0) goto fail;
if (n_threads > 1) hts_set_threads(fp, n_threads);
for (i = 0; i < l; ++i) {
- if (sam_write1(fp, h, buf[i].b) < 0) goto fail;
+ if (sam_write1(fp, h, buf[i].bam_record) < 0) goto fail;
}
if (sam_close(fp) < 0) return -1;
return 0;
@@ -1776,6 +1967,10 @@ static void *worker(void *data)
char *name;
w->error = 0;
ks_mergesort(sort, w->buf_len, w->buf, 0);
+
+ if (w->no_save)
+ return 0;
+
name = (char*)calloc(strlen(w->prefix) + 20, 1);
if (!name) { w->error = errno; return 0; }
sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
@@ -1783,7 +1978,7 @@ static void *worker(void *data)
uint32_t max_ncigar = 0;
int i;
for (i = 0; i < w->buf_len; i++) {
- uint32_t nc = w->buf[i].b->core.n_cigar;
+ uint32_t nc = w->buf[i].bam_record->core.n_cigar;
if (max_ncigar < nc)
max_ncigar = nc;
}
@@ -1808,11 +2003,11 @@ static void *worker(void *data)
return 0;
}
-static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, const bam_hdr_t *h, int n_threads)
+static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix,
+ const bam_hdr_t *h, int n_threads, buf_region *in_mem)
{
int i;
- size_t rest;
- bam1_p *b;
+ size_t pos, rest;
pthread_t *tid;
pthread_attr_t attr;
worker_t *w;
@@ -1823,15 +2018,24 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
w = (worker_t*)calloc(n_threads, sizeof(worker_t));
+ if (!w) return -1;
tid = (pthread_t*)calloc(n_threads, sizeof(pthread_t));
- b = buf; rest = k;
+ if (!tid) { free(w); return -1; }
+ pos = 0; rest = k;
for (i = 0; i < n_threads; ++i) {
w[i].buf_len = rest / (n_threads - i);
- w[i].buf = b;
+ w[i].buf = &buf[pos];
w[i].prefix = prefix;
w[i].h = h;
w[i].index = n_files + i;
- b += w[i].buf_len; rest -= w[i].buf_len;
+ if (in_mem) {
+ w[i].no_save = 1;
+ in_mem[i].from = pos;
+ in_mem[i].to = pos + w[i].buf_len;
+ } else {
+ w[i].no_save = 0;
+ }
+ pos += w[i].buf_len; rest -= w[i].buf_len;
pthread_create(&tid[i], &attr, worker, &w[i]);
}
for (i = 0; i < n_threads; ++i) {
@@ -1843,7 +2047,9 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
}
}
free(tid); free(w);
- return (n_failed == 0)? n_files + n_threads : -1;
+ if (n_failed) return -1;
+ if (in_mem) return n_threads;
+ return n_files + n_threads;
}
/*!
@@ -1862,7 +2068,7 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
@return 0 for successful sorting, negative on errors
@discussion It may create multiple temporary subalignment files
- and then merge them by calling bam_merge_core2(). This function is
+ and then merge them by calling bam_merge_simple(). This function is
NOT thread safe.
*/
int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix,
@@ -1870,12 +2076,22 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
size_t _max_mem, int n_threads,
const htsFormat *in_fmt, const htsFormat *out_fmt)
{
- int ret = -1, i, n_files = 0;
- size_t mem, max_k, k, max_mem;
+ int ret = -1, res, i, n_files = 0;
+ size_t max_k, k, max_mem, bam_mem_offset;
bam_hdr_t *header = NULL;
samFile *fp;
- bam1_p *buf;
- bam1_t *b;
+ bam1_tag *buf = NULL;
+ bam1_t *b = bam_init1();
+ uint8_t *bam_mem = NULL;
+ char **fns = NULL;
+ const char *new_so;
+ buf_region *in_mem = NULL;
+ int num_in_mem = 0;
+
+ if (!b) {
+ print_error("sort", "couldn't allocate memory for bam record");
+ return -1;
+ }
if (n_threads < 2) n_threads = 1;
g_is_by_qname = is_by_qname;
@@ -1884,13 +2100,12 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
strncpy(g_sort_tag, sort_by_tag, 2);
}
- max_k = k = 0; mem = 0;
max_mem = _max_mem * n_threads;
buf = NULL;
fp = sam_open_format(fn, "r", in_fmt);
if (fp == NULL) {
print_error_errno("sort", "can't open \"%s\"", fn);
- return -2;
+ goto err;
}
header = sam_hdr_read(fp);
if (header == NULL) {
@@ -1899,11 +2114,17 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
}
if (sort_by_tag != NULL)
- change_SO(header, "unknown");
+ new_so = "unknown";
else if (is_by_qname)
- change_SO(header, "queryname");
+ new_so = "queryname";
else
- change_SO(header, "coordinate");
+ new_so = "coordinate";
+
+ if (change_SO(header, new_so) != 0) {
+ print_error("sort",
+ "failed to change sort order header to '%s'\n", new_so);
+ goto err;
+ }
// No gain to using the thread pool here as the flow of this code
// is such that we are *either* reading *or* sorting. Hence a shared
@@ -1911,93 +2132,121 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
if (n_threads > 1)
hts_set_threads(fp, n_threads);
+ if ((bam_mem = malloc(max_mem)) == NULL) {
+ print_error("sort", "couldn't allocate memory for bam_mem");
+ goto err;
+ }
+
// write sub files
- for (;;) {
+ k = max_k = bam_mem_offset = 0;
+ while ((res = sam_read1(fp, header, b)) >= 0) {
+ int mem_full = 0;
+
if (k == max_k) {
- size_t kk, old_max = max_k;
+ bam1_tag *new_buf;
max_k = max_k? max_k<<1 : 0x10000;
- buf = (bam1_p*)realloc(buf, max_k * sizeof(bam1_p));
- for (kk = old_max; kk < max_k; ++kk) {
- buf[kk].b = NULL;
- buf[kk].tag = NULL;
+ if ((new_buf = realloc(buf, max_k * sizeof(bam1_tag))) == NULL) {
+ print_error("sort", "couldn't allocate memory for buf");
+ goto err;
}
+ buf = new_buf;
}
- if (buf[k].b == NULL) buf[k].b = bam_init1();
- b = buf[k].b;
- if ((ret = sam_read1(fp, header, b)) < 0) break;
- if (b->l_data < b->m_data>>2) { // shrink
- b->m_data = b->l_data;
- kroundup32(b->m_data);
- b->data = (uint8_t*)realloc(b->data, b->m_data);
+
+ // Check if the BAM record will fit in the memory limit
+ if (bam_mem_offset + sizeof(*b) + b->l_data < max_mem) {
+ // Copy record into the memory block
+ buf[k].bam_record = (bam1_t *)(bam_mem + bam_mem_offset);
+ *buf[k].bam_record = *b;
+ buf[k].bam_record->data = (uint8_t *)((char *)buf[k].bam_record + sizeof(bam1_t));
+ memcpy(buf[k].bam_record->data, b->data, b->l_data);
+ // store next BAM record in next 8-byte-aligned address after
+ // current one
+ bam_mem_offset = (bam_mem_offset + sizeof(*b) + b->l_data + 8 - 1) & ~((size_t)(8 - 1));
+ } else {
+ // Add a pointer to the remaining record
+ buf[k].bam_record = b;
+ mem_full = 1;
}
// Pull out the pointer to the sort tag if applicable
if (g_is_by_tag) {
- buf[k].tag = bam_aux_get(b, g_sort_tag);
+ buf[k].tag = bam_aux_get(buf[k].bam_record, g_sort_tag);
} else {
buf[k].tag = NULL;
}
-
- mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays
++k;
- if (mem >= max_mem) {
- n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
+
+ if (mem_full) {
+ n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads,
+ NULL);
if (n_files < 0) {
- ret = -1;
goto err;
}
- mem = k = 0;
+ k = 0;
+ bam_mem_offset = 0;
}
}
- if (ret != -1) {
+ if (res != -1) {
print_error("sort", "truncated file. Aborting");
- ret = -1;
goto err;
}
+ // Sort last records
+ if (k > 0) {
+ in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0]));
+ if (!in_mem) goto err;
+ num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads,
+ in_mem);
+ if (num_in_mem < 0) goto err;
+ } else {
+ num_in_mem = 0;
+ }
+
// write the final output
- if (n_files == 0) { // a single block
+ if (n_files == 0 && num_in_mem < 2) { // a single block
ks_mergesort(sort, k, buf, 0);
if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) {
print_error_errno("sort", "failed to create \"%s\"", fnout);
- ret = -1;
goto err;
}
} else { // then merge
- char **fns;
- n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
- if (n_files == -1) {
- ret = -1;
- goto err;
- }
- fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files);
+ fprintf(stderr,
+ "[bam_sort_core] merging from %d files and %d in-memory blocks...\n",
+ n_files, num_in_mem);
fns = (char**)calloc(n_files, sizeof(char*));
+ if (!fns) goto err;
for (i = 0; i < n_files; ++i) {
fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
+ if (!fns[i]) goto err;
sprintf(fns[i], "%s.%.4d.bam", prefix, i);
}
- if (bam_merge_core2(is_by_qname, sort_by_tag, fnout, modeout, NULL, n_files, fns,
- MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO,
- NULL, n_threads, "sort", in_fmt, out_fmt) < 0) {
- // Propagate bam_merge_core2() failure; it has already emitted a
+ if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header,
+ n_files, fns, num_in_mem, in_mem, buf,
+ n_threads, "sort", in_fmt, out_fmt) < 0) {
+ // Propagate bam_merge_simple() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
goto err;
}
- for (i = 0; i < n_files; ++i) {
- unlink(fns[i]);
- free(fns[i]);
- }
- free(fns);
}
ret = 0;
err:
// free
- for (k = 0; k < max_k; ++k) bam_destroy1(buf[k].b);
+ if (fns) {
+ for (i = 0; i < n_files; ++i) {
+ if (fns[i]) {
+ unlink(fns[i]);
+ free(fns[i]);
+ }
+ }
+ free(fns);
+ }
+ bam_destroy1(b);
free(buf);
+ free(bam_mem);
bam_hdr_destroy(header);
- sam_close(fp);
+ if (fp) sam_close(fp);
return ret;
}
@@ -2006,6 +2255,7 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma
{
int ret;
char *fnout = calloc(strlen(prefix) + 4 + 1, 1);
+ if (!fnout) return -1;
sprintf(fnout, "%s.bam", prefix);
ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL);
free(fnout);
diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c
index 524f724..8989fc5 100644
--- a/samtools/bam_sort.c.pysam.c
+++ b/samtools/bam_sort.c.pysam.c
@@ -40,7 +40,9 @@ DEALINGS IN THE SOFTWARE. */
#include <getopt.h>
#include <assert.h>
#include <pthread.h>
+#include "htslib/bgzf.h"
#include "htslib/ksort.h"
+#include "htslib/hts_os.h"
#include "htslib/khash.h"
#include "htslib/klist.h"
#include "htslib/kstring.h"
@@ -51,10 +53,10 @@ DEALINGS IN THE SOFTWARE. */
// Struct which contains the a record, and the pointer to the sort tag (if any)
// Used to speed up sort-by-tag.
-typedef struct bam1_p {
- bam1_t *b;
+typedef struct bam1_tag {
+ bam1_t *bam_record;
const uint8_t *tag;
-} bam1_p;
+} bam1_tag;
/* Minimum memory required in megabytes before sort will attempt to run. This
is to prevent accidents where failing to use the -m option correctly results
@@ -124,29 +126,36 @@ static int strnum_cmp(const char *_a, const char *_b)
typedef struct {
int i;
uint64_t pos, idx;
- bam1_p b;
+ bam1_tag entry;
} heap1_t;
-#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx))))
-
-static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b);
+static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b);
// Function to compare reads in the heap and determine which one is < the other
static inline int heap_lt(const heap1_t a, const heap1_t b)
{
+ if (!a.entry.bam_record)
+ return 1;
+ if (!b.entry.bam_record)
+ return 0;
+
if (g_is_by_tag) {
int t;
- if (a.b.b == NULL || b.b.b == NULL) return a.b.b == NULL? 1 : 0;
- t = bam1_lt_by_tag(b.b,a.b);
- return t;
+ t = bam1_cmp_by_tag(a.entry, b.entry);
+ if (t != 0) return t > 0;
} else if (g_is_by_qname) {
- int t;
- if (a.b.b == NULL || b.b.b == NULL) return a.b.b == NULL? 1 : 0;
- t = strnum_cmp(bam_get_qname(a.b.b), bam_get_qname(b.b.b));
- return (t > 0 || (t == 0 && (a.b.b->core.flag&0xc0) > (b.b.b->core.flag&0xc0)));
+ int t, fa, fb;
+ t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record));
+ if (t != 0) return t > 0;
+ fa = a.entry.bam_record->core.flag & 0xc0;
+ fb = b.entry.bam_record->core.flag & 0xc0;
+ if (fa != fb) return fa > fb;
} else {
- return __pos_cmp(a, b);
+ if (a.pos != b.pos) return a.pos > b.pos;
}
+ // This compares by position in the input file(s)
+ if (a.i != b.i) return a.i > b.i;
+ return a.idx > b.idx;
}
KSORT_INIT(heap, heap1_t, heap_lt)
@@ -1353,25 +1362,25 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
heap1_t *h = heap + i;
int res;
h->i = i;
- h->b.b = bam_init1();
- h->b.tag = NULL;
- if (!h->b.b) goto mem_fail;
- res = iter[i] ? sam_itr_next(fp[i], iter[i], h->b.b) : sam_read1(fp[i], hdr[i], h->b.b);
+ h->entry.bam_record = bam_init1();
+ h->entry.tag = NULL;
+ if (!h->entry.bam_record) goto mem_fail;
+ res = iter[i] ? sam_itr_next(fp[i], iter[i], h->entry.bam_record) : sam_read1(fp[i], hdr[i], h->entry.bam_record);
if (res >= 0) {
- bam_translate(h->b.b, translation_tbl + i);
- h->pos = ((uint64_t)h->b.b->core.tid<<32) | (uint32_t)((int32_t)h->b.b->core.pos+1)<<1 | bam_is_rev(h->b.b);
+ bam_translate(h->entry.bam_record, translation_tbl + i);
+ h->pos = ((uint64_t)h->entry.bam_record->core.tid<<32) | (uint32_t)((int32_t)h->entry.bam_record->core.pos+1)<<1 | bam_is_rev(h->entry.bam_record);
h->idx = idx++;
if (g_is_by_tag) {
- h->b.tag = bam_aux_get(h->b.b, g_sort_tag);
+ h->entry.tag = bam_aux_get(h->entry.bam_record, g_sort_tag);
} else {
- h->b.tag = NULL;
+ h->entry.tag = NULL;
}
}
else if (res == -1 && (!iter[i] || iter[i]->finished)) {
h->pos = HEAP_EMPTY;
- bam_destroy1(h->b.b);
- h->b.b = NULL;
- h->b.tag = NULL;
+ bam_destroy1(h->entry.bam_record);
+ h->entry.bam_record = NULL;
+ h->entry.tag = NULL;
} else {
print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
goto fail;
@@ -1393,7 +1402,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
// Begin the actual merge
ks_heapmake(heap, n, heap);
while (heap->pos != HEAP_EMPTY) {
- bam1_t *b = heap->b.b;
+ bam1_t *b = heap->entry.bam_record;
if (flag & MERGE_RG) {
uint8_t *rg = bam_aux_get(b, "RG");
if (rg) bam_aux_del(b, rg);
@@ -1409,15 +1418,15 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b);
heap->idx = idx++;
if (g_is_by_tag) {
- heap->b.tag = bam_aux_get(heap->b.b, g_sort_tag);
+ heap->entry.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
} else {
- heap->b.tag = NULL;
+ heap->entry.tag = NULL;
}
} else if (j == -1 && (!iter[heap->i] || iter[heap->i]->finished)) {
heap->pos = HEAP_EMPTY;
- bam_destroy1(heap->b.b);
- heap->b.b = NULL;
- heap->b.tag = NULL;
+ bam_destroy1(heap->entry.bam_record);
+ heap->entry.bam_record = NULL;
+ heap->entry.tag = NULL;
} else {
print_error(cmd, "\"%s\" is truncated", fn[heap->i]);
goto fail;
@@ -1461,7 +1470,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
if (iter && iter[i]) hts_itr_destroy(iter[i]);
if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]);
if (fp && fp[i]) sam_close(fp[i]);
- if (heap && heap[i].b.b) bam_destroy1(heap[i].b.b);
+ if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record);
}
if (hout) bam_hdr_destroy(hout);
free(RG);
@@ -1617,6 +1626,169 @@ end:
* BAM sorting *
***************/
+typedef struct {
+ size_t from;
+ size_t to;
+} buf_region;
+
+/* Simplified version of bam_merge_core2() for merging part-sorted
+ temporary files. No need for header merging or translation,
+ it just needs to read data into the heap and push it out again. */
+
+static inline int heap_add_read(heap1_t *heap, int nfiles, samFile **fp,
+ int num_in_mem, buf_region *in_mem,
+ bam1_tag *buf, uint64_t *idx, bam_hdr_t *hout) {
+ int i = heap->i, res;
+ if (i < nfiles) { // read from file
+ res = sam_read1(fp[i], hout, heap->entry.bam_record);
+ } else { // read from memory
+ if (in_mem[i - nfiles].from < in_mem[i - nfiles].to) {
+ heap->entry.bam_record = buf[in_mem[i - nfiles].from++].bam_record;
+ res = 0;
+ } else {
+ res = -1;
+ }
+ }
+ if (res >= 0) {
+ heap->pos = (((uint64_t)heap->entry.bam_record->core.tid<<32)
+ | (uint32_t)((int32_t)heap->entry.bam_record->core.pos+1)<<1
+ | bam_is_rev(heap->entry.bam_record));
+ heap->idx = (*idx)++;
+ if (g_is_by_tag) {
+ heap->entry.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
+ } else {
+ heap->entry.tag = NULL;
+ }
+ } else if (res == -1) {
+ heap->pos = HEAP_EMPTY;
+ if (i < nfiles) bam_destroy1(heap->entry.bam_record);
+ heap->entry.bam_record = NULL;
+ heap->entry.tag = NULL;
+ } else {
+ return -1;
+ }
+ return 0;
+}
+
+static int bam_merge_simple(int by_qname, char *sort_tag, const char *out,
+ const char *mode, bam_hdr_t *hout,
+ int n, char * const *fn, int num_in_mem,
+ buf_region *in_mem, bam1_tag *buf, int n_threads,
+ const char *cmd, const htsFormat *in_fmt,
+ const htsFormat *out_fmt) {
+ samFile *fpout = NULL, **fp = NULL;
+ heap1_t *heap = NULL;
+ uint64_t idx = 0;
+ int i, heap_size = n + num_in_mem;
+
+ g_is_by_qname = by_qname;
+ if (sort_tag) {
+ g_is_by_tag = 1;
+ g_sort_tag[0] = sort_tag[0];
+ g_sort_tag[1] = sort_tag[1];
+ }
+ if (n > 0) {
+ fp = (samFile**)calloc(n, sizeof(samFile*));
+ if (!fp) goto mem_fail;
+ }
+ heap = (heap1_t*)calloc(heap_size, sizeof(heap1_t));
+ if (!heap) goto mem_fail;
+
+ // Open each file, read the header and put the first read into the heap
+ for (i = 0; i < heap_size; i++) {
+ bam_hdr_t *hin;
+ heap1_t *h = &heap[i];
+
+ if (i < n) {
+ fp[i] = sam_open_format(fn[i], "r", in_fmt);
+ if (fp[i] == NULL) {
+ print_error_errno(cmd, "fail to open \"%s\"", fn[i]);
+ goto fail;
+ }
+
+ // Read header ...
+ hin = sam_hdr_read(fp[i]);
+ if (hin == NULL) {
+ print_error(cmd, "failed to read header from \"%s\"", fn[i]);
+ goto fail;
+ }
+ // ... and throw it away as we don't really need it
+ bam_hdr_destroy(hin);
+ }
+
+ // Get a read into the heap
+ h->i = i;
+ h->entry.tag = NULL;
+ if (i < n) {
+ h->entry.bam_record = bam_init1();
+ if (!h->entry.bam_record) goto mem_fail;
+ }
+ if (heap_add_read(h, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) {
+ assert(i < n);
+ print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
+ goto fail;
+ }
+ }
+
+ // Open output file and write header
+ if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) {
+ print_error_errno(cmd, "failed to create \"%s\"", out);
+ return -1;
+ }
+
+ hts_set_threads(fpout, n_threads);
+
+ if (sam_hdr_write(fpout, hout) != 0) {
+ print_error_errno(cmd, "failed to write header to \"%s\"", out);
+ sam_close(fpout);
+ return -1;
+ }
+
+ // Now do the merge
+ ks_heapmake(heap, heap_size, heap);
+ while (heap->pos != HEAP_EMPTY) {
+ bam1_t *b = heap->entry.bam_record;
+ if (sam_write1(fpout, hout, b) < 0) {
+ print_error_errno(cmd, "failed writing to \"%s\"", out);
+ sam_close(fpout);
+ return -1;
+ }
+ if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) {
+ assert(heap->i < n);
+ print_error(cmd, "Error reading \"%s\" : %s",
+ fn[heap->i], strerror(errno));
+ goto fail;
+ }
+ ks_heapadjust(heap, 0, heap_size, heap);
+ }
+ // Clean up and close
+ for (i = 0; i < n; i++) {
+ if (sam_close(fp[i]) != 0) {
+ print_error(cmd, "Error on closing \"%s\" : %s",
+ fn[i], strerror(errno));
+ }
+ }
+ free(fp);
+ free(heap);
+ if (sam_close(fpout) < 0) {
+ print_error(cmd, "error closing output file");
+ return -1;
+ }
+ return 0;
+ mem_fail:
+ print_error(cmd, "Out of memory");
+
+ fail:
+ for (i = 0; i < n; i++) {
+ if (fp && fp[i]) sam_close(fp[i]);
+ if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record);
+ }
+ free(fp);
+ free(heap);
+ if (fpout) sam_close(fpout);
+ return -1;
+}
+
static int change_SO(bam_hdr_t *h, const char *so)
{
char *p, *q, *beg = NULL, *end = NULL, *newtext;
@@ -1637,29 +1809,41 @@ static int change_SO(bam_hdr_t *h, const char *so)
if (beg == NULL) { // no @HD
h->l_text += strlen(so) + 15;
newtext = (char*)malloc(h->l_text + 1);
- sprintf(newtext, "@HD\tVN:1.3\tSO:%s\n", so);
- strcat(newtext, h->text);
+ if (!newtext) return -1;
+ snprintf(newtext, h->l_text + 1,
+ "@HD\tVN:1.3\tSO:%s\n%s", so, h->text);
} else { // has @HD but different or no SO
h->l_text = (beg - h->text) + (4 + strlen(so)) + (h->text + h->l_text - end);
newtext = (char*)malloc(h->l_text + 1);
- strncpy(newtext, h->text, beg - h->text);
- sprintf(newtext + (beg - h->text), "\tSO:%s", so);
- strcat(newtext, end);
+ if (!newtext) return -1;
+ snprintf(newtext, h->l_text + 1, "%.*s\tSO:%s%s",
+ (int) (beg - h->text), h->text, so, end);
}
free(h->text);
h->text = newtext;
return 0;
}
-// Function to compare reads and determine which one is < the other
+// Function to compare reads and determine which one is < or > the other
// Handle sort-by-pos and sort-by-name. Used as the secondary sort in bam1_lt_by_tag, if reads are equivalent by tag.
-static inline int bam1_lt_core(const bam1_p a, const bam1_p b)
+// Returns a value less than, equal to or greater than zero if a is less than,
+// equal to or greater than b, respectively.
+static inline int bam1_cmp_core(const bam1_tag a, const bam1_tag b)
{
+ uint64_t pa, pb;
+ if (!a.bam_record)
+ return 1;
+ if (!b.bam_record)
+ return 0;
+
if (g_is_by_qname) {
- int t = strnum_cmp(bam_get_qname(a.b), bam_get_qname(b.b));
- return (t < 0 || (t == 0 && (a.b->core.flag&0xc0) < (b.b->core.flag&0xc0)));
+ int t = strnum_cmp(bam_get_qname(a.bam_record), bam_get_qname(b.bam_record));
+ if (t != 0) return t;
+ return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0);
} else {
- return (((uint64_t)a.b->core.tid<<32|(a.b->core.pos+1)<<1|bam_is_rev(a.b)) < ((uint64_t)b.b->core.tid<<32|(b.b->core.pos+1)<<1|bam_is_rev(b.b)));
+ pa = (uint64_t)a.bam_record->core.tid<<32|(a.bam_record->core.pos+1)<<1|bam_is_rev(a.bam_record);
+ pb = (uint64_t)b.bam_record->core.tid<<32|(b.bam_record->core.pos+1)<<1|bam_is_rev(b.bam_record);
+ return pa < pb ? -1 : (pa > pb ? 1 : 0);
}
}
@@ -1677,17 +1861,19 @@ uint8_t normalize_type(const uint8_t* aux) {
// Sort record by tag, using pos or read name as a secondary key if tags are identical. Reads not carrying the tag sort first.
// Tags are first sorted by the type character (in case the types differ), or by the appropriate comparator for that type if they agree.
-static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b)
+// Returns a value less than, equal to or greater than zero if a is less than,
+// equal to or greater than b, respectively.
+static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b)
{
const uint8_t* aux_a = a.tag;
const uint8_t* aux_b = b.tag;
if (aux_a == NULL && aux_b != NULL) {
- return 1;
+ return -1;
} else if (aux_a != NULL && aux_b == NULL) {
- return 0;
+ return 1;
} else if (aux_a == NULL && aux_b == NULL) {
- return bam1_lt_core(a,b);
+ return bam1_cmp_core(a,b);
}
// 'Normalize' the letters of the datatypes to a canonical letter,
@@ -1704,57 +1890,62 @@ static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b)
b_type = 'f';
} else {
// Unfixable mismatched types
- return a_type < b_type ? 1 : 0;
+ return a_type < b_type ? -1 : 1;
}
}
if (a_type == 'c') {
int64_t va = bam_aux2i(aux_a);
int64_t vb = bam_aux2i(aux_b);
- return (va < vb || (va == vb && bam1_lt_core(a, b)));
+ if (va != vb) return va < vb ? -1 : 1;
+ return bam1_cmp_core(a, b);
} else if (a_type == 'f') {
double va = bam_aux2f(aux_a);
double vb = bam_aux2f(aux_b);
- return (va < vb || (va == vb && bam1_lt_core(a,b)));
+ if (va != vb) return va < vb ? -1 : 1;
+ return bam1_cmp_core(a, b);
} else if (a_type == 'A') {
- char va = bam_aux2A(aux_a);
- char vb = bam_aux2A(aux_b);
- return (va < vb || (va == vb && bam1_lt_core(a,b)));
+ unsigned char va = bam_aux2A(aux_a);
+ unsigned char vb = bam_aux2A(aux_b);
+ if (va != vb) return va < vb ? -1 : 1;
+ return bam1_cmp_core(a, b);
} else if (a_type == 'H') {
int t = strcmp(bam_aux2Z(aux_a), bam_aux2Z(aux_b));
- return (t < 0 || (t == 0 && bam1_lt_core(a,b)));
+ if (t) return t;
+ return bam1_cmp_core(a, b);
} else {
- return bam1_lt_core(a,b);
+ return bam1_cmp_core(a,b);
}
}
// Function to compare reads and determine which one is < the other
// Handle sort-by-pos, sort-by-name, or sort-by-tag
-static inline int bam1_lt(const bam1_p a, const bam1_p b)
+static inline int bam1_lt(const bam1_tag a, const bam1_tag b)
{
if (g_is_by_tag) {
- return bam1_lt_by_tag(a, b);
+ return bam1_cmp_by_tag(a, b) < 0;
} else {
- return bam1_lt_core(a,b);
+ return bam1_cmp_core(a,b) < 0;
}
}
-KSORT_INIT(sort, bam1_p, bam1_lt)
+KSORT_INIT(sort, bam1_tag, bam1_lt)
typedef struct {
size_t buf_len;
const char *prefix;
- bam1_p *buf;
+ bam1_tag *buf;
const bam_hdr_t *h;
int index;
int error;
+ int no_save;
} worker_t;
// Returns 0 for success
// -1 for failure
-static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt)
+static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt)
{
size_t i;
samFile* fp;
@@ -1763,7 +1954,7 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf,
if (sam_hdr_write(fp, h) != 0) goto fail;
if (n_threads > 1) hts_set_threads(fp, n_threads);
for (i = 0; i < l; ++i) {
- if (sam_write1(fp, h, buf[i].b) < 0) goto fail;
+ if (sam_write1(fp, h, buf[i].bam_record) < 0) goto fail;
}
if (sam_close(fp) < 0) return -1;
return 0;
@@ -1778,6 +1969,10 @@ static void *worker(void *data)
char *name;
w->error = 0;
ks_mergesort(sort, w->buf_len, w->buf, 0);
+
+ if (w->no_save)
+ return 0;
+
name = (char*)calloc(strlen(w->prefix) + 20, 1);
if (!name) { w->error = errno; return 0; }
sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
@@ -1785,7 +1980,7 @@ static void *worker(void *data)
uint32_t max_ncigar = 0;
int i;
for (i = 0; i < w->buf_len; i++) {
- uint32_t nc = w->buf[i].b->core.n_cigar;
+ uint32_t nc = w->buf[i].bam_record->core.n_cigar;
if (max_ncigar < nc)
max_ncigar = nc;
}
@@ -1810,11 +2005,11 @@ static void *worker(void *data)
return 0;
}
-static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, const bam_hdr_t *h, int n_threads)
+static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix,
+ const bam_hdr_t *h, int n_threads, buf_region *in_mem)
{
int i;
- size_t rest;
- bam1_p *b;
+ size_t pos, rest;
pthread_t *tid;
pthread_attr_t attr;
worker_t *w;
@@ -1825,15 +2020,24 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
w = (worker_t*)calloc(n_threads, sizeof(worker_t));
+ if (!w) return -1;
tid = (pthread_t*)calloc(n_threads, sizeof(pthread_t));
- b = buf; rest = k;
+ if (!tid) { free(w); return -1; }
+ pos = 0; rest = k;
for (i = 0; i < n_threads; ++i) {
w[i].buf_len = rest / (n_threads - i);
- w[i].buf = b;
+ w[i].buf = &buf[pos];
w[i].prefix = prefix;
w[i].h = h;
w[i].index = n_files + i;
- b += w[i].buf_len; rest -= w[i].buf_len;
+ if (in_mem) {
+ w[i].no_save = 1;
+ in_mem[i].from = pos;
+ in_mem[i].to = pos + w[i].buf_len;
+ } else {
+ w[i].no_save = 0;
+ }
+ pos += w[i].buf_len; rest -= w[i].buf_len;
pthread_create(&tid[i], &attr, worker, &w[i]);
}
for (i = 0; i < n_threads; ++i) {
@@ -1845,7 +2049,9 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
}
}
free(tid); free(w);
- return (n_failed == 0)? n_files + n_threads : -1;
+ if (n_failed) return -1;
+ if (in_mem) return n_threads;
+ return n_files + n_threads;
}
/*!
@@ -1864,7 +2070,7 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
@return 0 for successful sorting, negative on errors
@discussion It may create multiple temporary subalignment files
- and then merge them by calling bam_merge_core2(). This function is
+ and then merge them by calling bam_merge_simple(). This function is
NOT thread safe.
*/
int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix,
@@ -1872,12 +2078,22 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
size_t _max_mem, int n_threads,
const htsFormat *in_fmt, const htsFormat *out_fmt)
{
- int ret = -1, i, n_files = 0;
- size_t mem, max_k, k, max_mem;
+ int ret = -1, res, i, n_files = 0;
+ size_t max_k, k, max_mem, bam_mem_offset;
bam_hdr_t *header = NULL;
samFile *fp;
- bam1_p *buf;
- bam1_t *b;
+ bam1_tag *buf = NULL;
+ bam1_t *b = bam_init1();
+ uint8_t *bam_mem = NULL;
+ char **fns = NULL;
+ const char *new_so;
+ buf_region *in_mem = NULL;
+ int num_in_mem = 0;
+
+ if (!b) {
+ print_error("sort", "couldn't allocate memory for bam record");
+ return -1;
+ }
if (n_threads < 2) n_threads = 1;
g_is_by_qname = is_by_qname;
@@ -1886,13 +2102,12 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
strncpy(g_sort_tag, sort_by_tag, 2);
}
- max_k = k = 0; mem = 0;
max_mem = _max_mem * n_threads;
buf = NULL;
fp = sam_open_format(fn, "r", in_fmt);
if (fp == NULL) {
print_error_errno("sort", "can't open \"%s\"", fn);
- return -2;
+ goto err;
}
header = sam_hdr_read(fp);
if (header == NULL) {
@@ -1901,11 +2116,17 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
}
if (sort_by_tag != NULL)
- change_SO(header, "unknown");
+ new_so = "unknown";
else if (is_by_qname)
- change_SO(header, "queryname");
+ new_so = "queryname";
else
- change_SO(header, "coordinate");
+ new_so = "coordinate";
+
+ if (change_SO(header, new_so) != 0) {
+ print_error("sort",
+ "failed to change sort order header to '%s'\n", new_so);
+ goto err;
+ }
// No gain to using the thread pool here as the flow of this code
// is such that we are *either* reading *or* sorting. Hence a shared
@@ -1913,93 +2134,121 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
if (n_threads > 1)
hts_set_threads(fp, n_threads);
+ if ((bam_mem = malloc(max_mem)) == NULL) {
+ print_error("sort", "couldn't allocate memory for bam_mem");
+ goto err;
+ }
+
// write sub files
- for (;;) {
+ k = max_k = bam_mem_offset = 0;
+ while ((res = sam_read1(fp, header, b)) >= 0) {
+ int mem_full = 0;
+
if (k == max_k) {
- size_t kk, old_max = max_k;
+ bam1_tag *new_buf;
max_k = max_k? max_k<<1 : 0x10000;
- buf = (bam1_p*)realloc(buf, max_k * sizeof(bam1_p));
- for (kk = old_max; kk < max_k; ++kk) {
- buf[kk].b = NULL;
- buf[kk].tag = NULL;
+ if ((new_buf = realloc(buf, max_k * sizeof(bam1_tag))) == NULL) {
+ print_error("sort", "couldn't allocate memory for buf");
+ goto err;
}
+ buf = new_buf;
}
- if (buf[k].b == NULL) buf[k].b = bam_init1();
- b = buf[k].b;
- if ((ret = sam_read1(fp, header, b)) < 0) break;
- if (b->l_data < b->m_data>>2) { // shrink
- b->m_data = b->l_data;
- kroundup32(b->m_data);
- b->data = (uint8_t*)realloc(b->data, b->m_data);
+
+ // Check if the BAM record will fit in the memory limit
+ if (bam_mem_offset + sizeof(*b) + b->l_data < max_mem) {
+ // Copy record into the memory block
+ buf[k].bam_record = (bam1_t *)(bam_mem + bam_mem_offset);
+ *buf[k].bam_record = *b;
+ buf[k].bam_record->data = (uint8_t *)((char *)buf[k].bam_record + sizeof(bam1_t));
+ memcpy(buf[k].bam_record->data, b->data, b->l_data);
+ // store next BAM record in next 8-byte-aligned address after
+ // current one
+ bam_mem_offset = (bam_mem_offset + sizeof(*b) + b->l_data + 8 - 1) & ~((size_t)(8 - 1));
+ } else {
+ // Add a pointer to the remaining record
+ buf[k].bam_record = b;
+ mem_full = 1;
}
// Pull out the pointer to the sort tag if applicable
if (g_is_by_tag) {
- buf[k].tag = bam_aux_get(b, g_sort_tag);
+ buf[k].tag = bam_aux_get(buf[k].bam_record, g_sort_tag);
} else {
buf[k].tag = NULL;
}
-
- mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays
++k;
- if (mem >= max_mem) {
- n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
+
+ if (mem_full) {
+ n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads,
+ NULL);
if (n_files < 0) {
- ret = -1;
goto err;
}
- mem = k = 0;
+ k = 0;
+ bam_mem_offset = 0;
}
}
- if (ret != -1) {
+ if (res != -1) {
print_error("sort", "truncated file. Aborting");
- ret = -1;
goto err;
}
+ // Sort last records
+ if (k > 0) {
+ in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0]));
+ if (!in_mem) goto err;
+ num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads,
+ in_mem);
+ if (num_in_mem < 0) goto err;
+ } else {
+ num_in_mem = 0;
+ }
+
// write the final output
- if (n_files == 0) { // a single block
+ if (n_files == 0 && num_in_mem < 2) { // a single block
ks_mergesort(sort, k, buf, 0);
if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) {
print_error_errno("sort", "failed to create \"%s\"", fnout);
- ret = -1;
goto err;
}
} else { // then merge
- char **fns;
- n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
- if (n_files == -1) {
- ret = -1;
- goto err;
- }
- fprintf(pysam_stderr, "[bam_sort_core] merging from %d files...\n", n_files);
+ fprintf(pysam_stderr,
+ "[bam_sort_core] merging from %d files and %d in-memory blocks...\n",
+ n_files, num_in_mem);
fns = (char**)calloc(n_files, sizeof(char*));
+ if (!fns) goto err;
for (i = 0; i < n_files; ++i) {
fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
+ if (!fns[i]) goto err;
sprintf(fns[i], "%s.%.4d.bam", prefix, i);
}
- if (bam_merge_core2(is_by_qname, sort_by_tag, fnout, modeout, NULL, n_files, fns,
- MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO,
- NULL, n_threads, "sort", in_fmt, out_fmt) < 0) {
- // Propagate bam_merge_core2() failure; it has already emitted a
+ if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header,
+ n_files, fns, num_in_mem, in_mem, buf,
+ n_threads, "sort", in_fmt, out_fmt) < 0) {
+ // Propagate bam_merge_simple() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
goto err;
}
- for (i = 0; i < n_files; ++i) {
- unlink(fns[i]);
- free(fns[i]);
- }
- free(fns);
}
ret = 0;
err:
// free
- for (k = 0; k < max_k; ++k) bam_destroy1(buf[k].b);
+ if (fns) {
+ for (i = 0; i < n_files; ++i) {
+ if (fns[i]) {
+ unlink(fns[i]);
+ free(fns[i]);
+ }
+ }
+ free(fns);
+ }
+ bam_destroy1(b);
free(buf);
+ free(bam_mem);
bam_hdr_destroy(header);
- sam_close(fp);
+ if (fp) sam_close(fp);
return ret;
}
@@ -2008,6 +2257,7 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma
{
int ret;
char *fnout = calloc(strlen(prefix) + 4 + 1, 1);
+ if (!fnout) return -1;
sprintf(fnout, "%s.bam", prefix);
ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL);
free(fnout);
diff --git a/samtools/bamtk.c b/samtools/bamtk.c
index bd520b6..9316386 100644
--- a/samtools/bamtk.c
+++ b/samtools/bamtk.c
@@ -44,6 +44,7 @@ int bam_rmdup(int argc, char *argv[]);
int bam_flagstat(int argc, char *argv[]);
int bam_fillmd(int argc, char *argv[]);
int bam_idxstats(int argc, char *argv[]);
+int bam_markdup(int argc, char *argv[]);
int main_samview(int argc, char *argv[]);
int main_import(int argc, char *argv[]);
int main_reheader(int argc, char *argv[]);
@@ -92,6 +93,7 @@ static void usage(FILE *fp)
" rmdup remove PCR duplicates\n"
" targetcut cut fosmid regions (for fosmid pool only)\n"
" addreplacerg adds or replaces RG tags\n"
+" markdup mark duplicates\n"
"\n"
" -- File operations\n"
" collate shuffle and group alignments by name\n"
@@ -126,6 +128,18 @@ static void usage(FILE *fp)
#endif
}
+// This is a tricky one, but on Windows the filename wildcard expansion is done by
+// the application and not by the shell, as traditionally it never had a "shell".
+// Even now, DOS and Powershell do not do this expansion (but bash does).
+//
+// This means that Mingw/Msys implements code before main() that takes e.g. "*" and
+// expands it up to a list of matching filenames. This in turn breaks things like
+// specifying "*" as a region (all the unmapped reads). We take a hard line here -
+// filename expansion is the task of the shell, not our application!
+#ifdef _WIN32
+int _CRT_glob = 0;
+#endif
+
int main(int argc, char *argv[])
{
#ifdef _WIN32
@@ -156,6 +170,7 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "dict") == 0) ret = dict_main(argc-1, argv+1);
else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1);
else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1);
+ else if (strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1);
else if (strcmp(argv[1], "flagstat") == 0) ret = bam_flagstat(argc-1, argv+1);
else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1);
else if (strcmp(argv[1], "fillmd") == 0) ret = bam_fillmd(argc-1, argv+1);
diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c
index 248bc81..67c09c8 100644
--- a/samtools/bamtk.c.pysam.c
+++ b/samtools/bamtk.c.pysam.c
@@ -40,12 +40,14 @@ int bam_mpileup(int argc, char *argv[]);
int bam_merge(int argc, char *argv[]);
int bam_index(int argc, char *argv[]);
int bam_sort(int argc, char *argv[]);
-/* AH: int bam_tview_main(int argc, char *argv[]); */
+/* AH: removed */
+/* int bam_tview_main(int argc, char *argv[]); */
int bam_mating(int argc, char *argv[]);
int bam_rmdup(int argc, char *argv[]);
int bam_flagstat(int argc, char *argv[]);
int bam_fillmd(int argc, char *argv[]);
int bam_idxstats(int argc, char *argv[]);
+int bam_markdup(int argc, char *argv[]);
int main_samview(int argc, char *argv[]);
int main_import(int argc, char *argv[]);
int main_reheader(int argc, char *argv[]);
@@ -94,6 +96,7 @@ static void usage(FILE *fp)
" rmdup remove PCR duplicates\n"
" targetcut cut fosmid regions (for fosmid pool only)\n"
" addreplacerg adds or replaces RG tags\n"
+" markdup mark duplicates\n"
"\n"
" -- File operations\n"
" collate shuffle and group alignments by name\n"
@@ -128,6 +131,18 @@ static void usage(FILE *fp)
#endif
}
+// This is a tricky one, but on Windows the filename wildcard expansion is done by
+// the application and not by the shell, as traditionally it never had a "shell".
+// Even now, DOS and Powershell do not do this expansion (but bash does).
+//
+// This means that Mingw/Msys implements code before main() that takes e.g. "*" and
+// expands it up to a list of matching filenames. This in turn breaks things like
+// specifying "*" as a region (all the unmapped reads). We take a hard line here -
+// filename expansion is the task of the shell, not our application!
+#ifdef _WIN32
+int _CRT_glob = 0;
+#endif
+
int samtools_main(int argc, char *argv[])
{
#ifdef _WIN32
@@ -158,6 +173,7 @@ int samtools_main(int argc, char *argv[])
else if (strcmp(argv[1], "dict") == 0) ret = dict_main(argc-1, argv+1);
else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1);
else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1);
+ else if (strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1);
else if (strcmp(argv[1], "flagstat") == 0) ret = bam_flagstat(argc-1, argv+1);
else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1);
else if (strcmp(argv[1], "fillmd") == 0) ret = bam_fillmd(argc-1, argv+1);
@@ -183,7 +199,9 @@ int samtools_main(int argc, char *argv[])
fprintf(pysam_stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
return 1;
}
- /* else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); */
+/* AH:
+ else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1);
+*/
else if (strcmp(argv[1], "--version") == 0) {
fprintf(pysam_stdout,
"samtools %s\n"
diff --git a/samtools/bedidx.c b/samtools/bedidx.c
index c1954ad..86d2338 100644
--- a/samtools/bedidx.c
+++ b/samtools/bedidx.c
@@ -32,10 +32,6 @@ DEALINGS IN THE SOFTWARE. */
#include <errno.h>
#include <zlib.h>
-#ifdef _WIN32
-#define drand48() ((double)rand() / RAND_MAX)
-#endif
-
#include "htslib/ksort.h"
KSORT_INIT_GENERIC(uint64_t)
diff --git a/samtools/bedidx.c.pysam.c b/samtools/bedidx.c.pysam.c
index 5b7df0c..1998435 100644
--- a/samtools/bedidx.c.pysam.c
+++ b/samtools/bedidx.c.pysam.c
@@ -34,10 +34,6 @@ DEALINGS IN THE SOFTWARE. */
#include <errno.h>
#include <zlib.h>
-#ifdef _WIN32
-#define drand48() ((double)rand() / RAND_MAX)
-#endif
-
#include "htslib/ksort.h"
KSORT_INIT_GENERIC(uint64_t)
diff --git a/samtools/dict.c b/samtools/dict.c
index fa64a16..cb5622e 100644
--- a/samtools/dict.c
+++ b/samtools/dict.c
@@ -82,7 +82,11 @@ static void write_dict(const char *fn, args_t *args)
if (args->uri)
fprintf(out, "\tUR:%s", args->uri);
else if (strcmp(fn, "-") != 0) {
+#ifdef _WIN32
+ char *real_path = _fullpath(NULL, fn, PATH_MAX);
+#else
char *real_path = realpath(fn, NULL);
+#endif
fprintf(out, "\tUR:file://%s", real_path);
free(real_path);
}
diff --git a/samtools/dict.c.pysam.c b/samtools/dict.c.pysam.c
index 5368851..c4e4045 100644
--- a/samtools/dict.c.pysam.c
+++ b/samtools/dict.c.pysam.c
@@ -84,7 +84,11 @@ static void write_dict(const char *fn, args_t *args)
if (args->uri)
fprintf(out, "\tUR:%s", args->uri);
else if (strcmp(fn, "-") != 0) {
+#ifdef _WIN32
+ char *real_path = _fullpath(NULL, fn, PATH_MAX);
+#else
char *real_path = realpath(fn, NULL);
+#endif
fprintf(out, "\tUR:file://%s", real_path);
free(real_path);
}
diff --git a/samtools/padding.c b/samtools/padding.c
index 2f10e86..650aff8 100644
--- a/samtools/padding.c
+++ b/samtools/padding.c
@@ -382,6 +382,7 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai)
{
int i = 0, unpadded_len = 0;
bam_hdr_t *header = 0 ;
+ unsigned short ln_found;
header = bam_hdr_dup(old);
for (i = 0; i < old->n_targets; ++i) {
@@ -418,27 +419,45 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai)
name += 4;
for (name_end = name; name_end != end && *name_end != '\t'; name_end++);
strcat(newtext, "@SQ");
+ ln_found = 0;
/* Parse the @SQ lines */
while (cp != end) {
- if (end-cp >= 2 && strncmp(cp, "LN", 2) == 0) {
+ if (!ln_found && end-cp >= 2 && strncmp(cp, "LN", 2) == 0) {
// Rewrite the length
char len_buf[100];
int tid;
+ unsigned int old_length, new_length;
+ const char *old_cp = cp;
+
+ ln_found = 1;
+
+ while (cp != end && *cp++ != '\t');
+ old_length = (int)(cp - old_cp);
+
for (tid = 0; tid < header->n_targets; tid++) {
// may want to hash this, but new header API incoming.
if (strncmp(name, header->target_name[tid], name_end - name) == 0) {
- sprintf(len_buf, "LN:%d", header->target_len[tid]);
- strcat(newtext, len_buf);
+ new_length = sprintf(len_buf, "LN:%d", header->target_len[tid]);
+ if (new_length <= old_length) {
+ strcat(newtext, len_buf);
+ }
+ else {
+ fprintf(stderr, "LN value of the reference is larger than the original!\n");
+ exit(1);
+ }
break;
}
}
- while (cp != end && *cp++ != '\t');
+
if (cp != end)
strcat(newtext, "\t");
} else if (end-cp >= 2 &&
- (strncmp(cp, "M5", 2) == 0 ||
- strncmp(cp, "UR", 2) == 0)) {
+ ((ln_found && strncmp(cp, "LN", 2) == 0) ||
+ strncmp(cp, "M5", 2) == 0 ||
+ strncmp(cp, "UR", 2) == 0))
+ {
+ // skip secondary LNs
// MD5 changed during depadding; ditch it.
// URLs are also invalid.
while (cp != end && *cp++ != '\t');
diff --git a/samtools/padding.c.pysam.c b/samtools/padding.c.pysam.c
index a3461e4..901f027 100644
--- a/samtools/padding.c.pysam.c
+++ b/samtools/padding.c.pysam.c
@@ -384,6 +384,7 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai)
{
int i = 0, unpadded_len = 0;
bam_hdr_t *header = 0 ;
+ unsigned short ln_found;
header = bam_hdr_dup(old);
for (i = 0; i < old->n_targets; ++i) {
@@ -420,27 +421,45 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai)
name += 4;
for (name_end = name; name_end != end && *name_end != '\t'; name_end++);
strcat(newtext, "@SQ");
+ ln_found = 0;
/* Parse the @SQ lines */
while (cp != end) {
- if (end-cp >= 2 && strncmp(cp, "LN", 2) == 0) {
+ if (!ln_found && end-cp >= 2 && strncmp(cp, "LN", 2) == 0) {
// Rewrite the length
char len_buf[100];
int tid;
+ unsigned int old_length, new_length;
+ const char *old_cp = cp;
+
+ ln_found = 1;
+
+ while (cp != end && *cp++ != '\t');
+ old_length = (int)(cp - old_cp);
+
for (tid = 0; tid < header->n_targets; tid++) {
// may want to hash this, but new header API incoming.
if (strncmp(name, header->target_name[tid], name_end - name) == 0) {
- sprintf(len_buf, "LN:%d", header->target_len[tid]);
- strcat(newtext, len_buf);
+ new_length = sprintf(len_buf, "LN:%d", header->target_len[tid]);
+ if (new_length <= old_length) {
+ strcat(newtext, len_buf);
+ }
+ else {
+ fprintf(pysam_stderr, "LN value of the reference is larger than the original!\n");
+ exit(1);
+ }
break;
}
}
- while (cp != end && *cp++ != '\t');
+
if (cp != end)
strcat(newtext, "\t");
} else if (end-cp >= 2 &&
- (strncmp(cp, "M5", 2) == 0 ||
- strncmp(cp, "UR", 2) == 0)) {
+ ((ln_found && strncmp(cp, "LN", 2) == 0) ||
+ strncmp(cp, "M5", 2) == 0 ||
+ strncmp(cp, "UR", 2) == 0))
+ {
+ // skip secondary LNs
// MD5 changed during depadding; ditch it.
// URLs are also invalid.
while (cp != end && *cp++ != '\t');
diff --git a/samtools/phase.c b/samtools/phase.c
index 584334d..0e00d9b 100644
--- a/samtools/phase.c
+++ b/samtools/phase.c
@@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/kstring.h"
#include "sam_opts.h"
#include "samtools.h"
+#include "htslib/hts_os.h"
#include "htslib/kseq.h"
KSTREAM_INIT(gzFile, gzread, 16384)
diff --git a/samtools/phase.c.pysam.c b/samtools/phase.c.pysam.c
index 4226c03..2cfb3ae 100644
--- a/samtools/phase.c.pysam.c
+++ b/samtools/phase.c.pysam.c
@@ -38,6 +38,7 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/kstring.h"
#include "sam_opts.h"
#include "samtools.h"
+#include "htslib/hts_os.h"
#include "htslib/kseq.h"
KSTREAM_INIT(gzFile, gzread, 16384)
diff --git a/samtools/sam_view.c b/samtools/sam_view.c
index ee65fcd..ceb1080 100644
--- a/samtools/sam_view.c
+++ b/samtools/sam_view.c
@@ -969,7 +969,7 @@ static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *li
}
/*
- * Create FASTQ lines from the barcode tag using the index-format
+ * Create FASTQ lines from the barcode tag using the index-format
*/
static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts)
{
@@ -1072,7 +1072,7 @@ static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t
if (state->use_oq) {
oq = bam_aux_get(b, "OQ");
if (oq) {
- oq++;
+ oq++;
qual = strdup(bam_aux2Z(oq));
if (!qual) goto fail;
if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
@@ -1208,6 +1208,13 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
return false;
}
+ if (nIndex==0 && opts->index_file[0]) {
+ fprintf(stderr, "index_format not specified, but index file given\n");
+ bam2fq_usage(stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
if (opts->def_qual < 0 || 93 < opts->def_qual) {
fprintf(stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual);
bam2fq_usage(stderr, argv[0]);
@@ -1375,7 +1382,7 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int*
}
}
for (i = 0; i < 2; i++) {
- if (state->fpi[i] && bgzf_close(state->fpi[i])) {
+ if (state->fpi[i] && bgzf_close(state->fpi[i])) {
print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
valid = false;
}
@@ -1435,14 +1442,22 @@ static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
// print linebuf[1] to fpr[1], linebuf[2] to fpr[2]
if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
- } else if ((score[1] > 0 || score[2] > 0) && state->fpse) {
- // print whichever one exists to fpse
- if (score[1] > 0) {
- if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
+ } else if (score[1] > 0 || score[2] > 0) {
+ if (state->fpse) {
+ // print whichever one exists to fpse
+ if (score[1] > 0) {
+ if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
+ } else {
+ if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
+ }
+ ++n_singletons;
} else {
- if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
+ if (score[1] > 0) {
+ if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
+ } else {
+ if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
+ }
}
- ++n_singletons;
}
if (score[0]) { // TODO: check this
// print linebuf[0] to fpr[0]
diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c
index f46cc9f..5113339 100644
--- a/samtools/sam_view.c.pysam.c
+++ b/samtools/sam_view.c.pysam.c
@@ -971,7 +971,7 @@ static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *li
}
/*
- * Create FASTQ lines from the barcode tag using the index-format
+ * Create FASTQ lines from the barcode tag using the index-format
*/
static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts)
{
@@ -1074,7 +1074,7 @@ static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t
if (state->use_oq) {
oq = bam_aux_get(b, "OQ");
if (oq) {
- oq++;
+ oq++;
qual = strdup(bam_aux2Z(oq));
if (!qual) goto fail;
if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
@@ -1210,6 +1210,13 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
return false;
}
+ if (nIndex==0 && opts->index_file[0]) {
+ fprintf(pysam_stderr, "index_format not specified, but index file given\n");
+ bam2fq_usage(pysam_stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
if (opts->def_qual < 0 || 93 < opts->def_qual) {
fprintf(pysam_stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual);
bam2fq_usage(pysam_stderr, argv[0]);
@@ -1377,7 +1384,7 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int*
}
}
for (i = 0; i < 2; i++) {
- if (state->fpi[i] && bgzf_close(state->fpi[i])) {
+ if (state->fpi[i] && bgzf_close(state->fpi[i])) {
print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
valid = false;
}
@@ -1437,14 +1444,22 @@ static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
// print linebuf[1] to fpr[1], linebuf[2] to fpr[2]
if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
- } else if ((score[1] > 0 || score[2] > 0) && state->fpse) {
- // print whichever one exists to fpse
- if (score[1] > 0) {
- if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
+ } else if (score[1] > 0 || score[2] > 0) {
+ if (state->fpse) {
+ // print whichever one exists to fpse
+ if (score[1] > 0) {
+ if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
+ } else {
+ if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
+ }
+ ++n_singletons;
} else {
- if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
+ if (score[1] > 0) {
+ if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
+ } else {
+ if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
+ }
}
- ++n_singletons;
}
if (score[0]) { // TODO: check this
// print linebuf[0] to fpr[0]
diff --git a/samtools/version.h b/samtools/version.h
index 1f3fa45..e74ad87 100644
--- a/samtools/version.h
+++ b/samtools/version.h
@@ -1 +1 @@
-#define SAMTOOLS_VERSION "1.5"
+#define SAMTOOLS_VERSION "1.6"
diff --git a/setup.py b/setup.py
index 66783ae..608badb 100644
--- a/setup.py
+++ b/setup.py
@@ -364,6 +364,8 @@ else:
define_macros = []
+samtools_include_dirs = [os.path.abspath("samtools")]
+
chtslib = Extension(
"pysam.libchtslib",
[source_pattern % "htslib",
@@ -390,7 +392,7 @@ csamfile = Extension(
htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
- include_dirs=["pysam", "samtools", "."] + include_os + htslib_include_dirs,
+ include_dirs=["pysam", "."] + samtools_include_dirs + include_os + htslib_include_dirs,
libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
extra_compile_args=extra_compile_args,
@@ -409,7 +411,7 @@ calignmentfile = Extension(
htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
- include_dirs=["pysam", "samtools"] + include_os + htslib_include_dirs,
+ include_dirs=["pysam"] + samtools_include_dirs + include_os + htslib_include_dirs,
libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
extra_compile_args=extra_compile_args,
@@ -428,7 +430,7 @@ calignedsegment = Extension(
htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
- include_dirs=["pysam", "samtools", "."] + include_os + htslib_include_dirs,
+ include_dirs=["pysam", "."] + samtools_include_dirs + include_os + htslib_include_dirs,
libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
extra_compile_args=extra_compile_args,
@@ -472,7 +474,7 @@ csamtools = Extension(
htslib_sources +
os_c_files,
library_dirs=["pysam"] + htslib_library_dirs,
- include_dirs=["samtools", "pysam", "."] +
+ include_dirs=["pysam", "."] + samtools_include_dirs +
include_os + htslib_include_dirs,
libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
@@ -487,7 +489,7 @@ cbcftools = Extension(
htslib_sources +
os_c_files,
library_dirs=["pysam"] + htslib_library_dirs,
- include_dirs=["bcftools", "pysam", "."] +
+ include_dirs=["bcftools", "pysam", "."] + samtools_include_dirs +
include_os + htslib_include_dirs,
libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py
index aafa826..920ddbc 100644
--- a/tests/AlignedSegment_test.py
+++ b/tests/AlignedSegment_test.py
@@ -11,7 +11,7 @@ from TestUtils import checkFieldEqual, BAM_DATADIR, WORKDIR
class ReadTest(unittest.TestCase):
- def buildRead(self):
+ def build_read(self):
'''build an example read.'''
a = pysam.AlignedSegment()
@@ -26,7 +26,6 @@ class ReadTest(unittest.TestCase):
a.next_reference_start = 200
a.template_length = 167
a.query_qualities = pysam.qualitystring_to_array("1234") * 10
- # todo: create tags
return a
@@ -66,8 +65,8 @@ class TestAlignedSegment(ReadTest):
def testCompare(self):
'''check comparison functions.'''
- a = self.buildRead()
- b = self.buildRead()
+ a = self.build_read()
+ b = self.build_read()
self.assertEqual(0, a.compare(b))
self.assertEqual(0, b.compare(a))
@@ -83,8 +82,8 @@ class TestAlignedSegment(ReadTest):
self.assertTrue(b != a)
def testHashing(self):
- a = self.buildRead()
- b = self.buildRead()
+ a = self.build_read()
+ b = self.build_read()
self.assertEqual(hash(a), hash(b))
b.tid = 2
self.assertNotEqual(hash(a), hash(b))
@@ -92,8 +91,8 @@ class TestAlignedSegment(ReadTest):
def testUpdate(self):
'''check if updating fields affects other variable length data
'''
- a = self.buildRead()
- b = self.buildRead()
+ a = self.build_read()
+ b = self.build_read()
# check qname
b.query_name = "read_123"
@@ -124,7 +123,7 @@ class TestAlignedSegment(ReadTest):
checkFieldEqual(self, a, b, ("query_qualities",))
# reset qual
- b = self.buildRead()
+ b = self.build_read()
# check flags:
for x in (
@@ -147,11 +146,11 @@ class TestAlignedSegment(ReadTest):
This does not work as setting the sequence will erase
the quality scores.
'''
- a = self.buildRead()
+ a = self.build_read()
a.query_sequence = a.query_sequence[5:10]
self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), None)
- a = self.buildRead()
+ a = self.build_read()
s = pysam.qualities_to_qualitystring(a.query_qualities)
a.query_sequence = a.query_sequence[5:10]
a.query_qualities = pysam.qualitystring_to_array(s[5:10])
@@ -178,14 +177,14 @@ class TestAlignedSegment(ReadTest):
def testUpdateTlen(self):
'''check if updating tlen works'''
- a = self.buildRead()
+ a = self.build_read()
oldlen = a.template_length
oldlen *= 2
a.template_length = oldlen
self.assertEqual(a.template_length, oldlen)
def testPositions(self):
- a = self.buildRead()
+ a = self.build_read()
self.assertEqual(a.get_reference_positions(),
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
31, 32, 33, 34, 35, 36, 37, 38, 39,
@@ -216,20 +215,20 @@ class TestAlignedSegment(ReadTest):
def testFullReferencePositions(self):
'''see issue 26'''
- a = self.buildRead()
+ a = self.build_read()
a.cigar = [(4, 30), (0, 20), (1, 3), (0, 47)]
self.assertEqual(100,
len(a.get_reference_positions(full_length=True)))
def testBlocks(self):
- a = self.buildRead()
+ a = self.build_read()
self.assertEqual(a.get_blocks(),
[(20, 30), (31, 40), (40, 60)])
def test_infer_query_length(self):
'''Test infer_query_length on M|=|X|I|D|H|S cigar ops'''
- a = self.buildRead()
+ a = self.build_read()
a.cigarstring = '40M'
self.assertEqual(a.infer_query_length(), 40)
a.cigarstring = '40='
@@ -253,7 +252,7 @@ class TestAlignedSegment(ReadTest):
def test_infer_read_length(self):
'''Test infer_read_length on M|=|X|I|D|H|S cigar ops'''
- a = self.buildRead()
+ a = self.build_read()
a.cigarstring = '40M'
self.assertEqual(a.infer_read_length(), 40)
a.cigarstring = '40='
@@ -276,7 +275,7 @@ class TestAlignedSegment(ReadTest):
self.assertEqual(a.infer_read_length(), None)
def test_get_aligned_pairs_soft_clipping(self):
- a = self.buildRead()
+ a = self.build_read()
a.cigartuples = ((4, 2), (0, 35), (4, 3))
self.assertEqual(a.get_aligned_pairs(),
[(0, None), (1, None)] +
@@ -292,7 +291,7 @@ class TestAlignedSegment(ReadTest):
)
def test_get_aligned_pairs_hard_clipping(self):
- a = self.buildRead()
+ a = self.build_read()
a.cigartuples = ((5, 2), (0, 35), (5, 3))
self.assertEqual(a.get_aligned_pairs(),
# No seq, no seq pos
@@ -303,7 +302,7 @@ class TestAlignedSegment(ReadTest):
range(0, 0 + 35), range(20, 20 + 35))])
def test_get_aligned_pairs_skip(self):
- a = self.buildRead()
+ a = self.build_read()
a.cigarstring = "2M100D38M"
self.assertEqual(a.get_aligned_pairs(),
[(0, 20), (1, 21)] +
@@ -319,7 +318,7 @@ class TestAlignedSegment(ReadTest):
range(20 + 2 + 100, 20 + 2 + 100 + 38))])
def test_get_aligned_pairs_match_mismatch(self):
- a = self.buildRead()
+ a = self.build_read()
a.cigartuples = ((7, 20), (8, 20))
self.assertEqual(a.get_aligned_pairs(),
[(qpos, refpos) for (qpos, refpos) in zip(
@@ -329,7 +328,7 @@ class TestAlignedSegment(ReadTest):
range(0, 0 + 40), range(20, 20 + 40))])
def test_get_aligned_pairs_padding(self):
- a = self.buildRead()
+ a = self.build_read()
a.cigartuples = ((7, 20), (6, 1), (8, 19))
def inner():
@@ -338,7 +337,7 @@ class TestAlignedSegment(ReadTest):
self.assertRaises(NotImplementedError, inner)
def test_get_aligned_pairs(self):
- a = self.buildRead()
+ a = self.build_read()
a.query_sequence = "A" * 9
a.cigarstring = "9M"
a.set_tag("MD", "9")
@@ -377,7 +376,7 @@ class TestAlignedSegment(ReadTest):
)
def test_get_aligned_pairs_skip_reference(self):
- a = self.buildRead()
+ a = self.build_read()
a.query_sequence = "A" * 10
a.cigarstring = "5M1N5M"
a.set_tag("MD", "10")
@@ -408,7 +407,7 @@ class TestAlignedSegment(ReadTest):
'''issue 176: retrieving length without query sequence
with soft-clipping.
'''
- a = self.buildRead()
+ a = self.build_read()
a.query_sequence = None
a.cigarstring = "20M"
self.assertEqual(a.query_alignment_length, 20)
@@ -427,7 +426,7 @@ class TestAlignedSegment(ReadTest):
def test_query_length_is_limited(self):
- a = self.buildRead()
+ a = self.build_read()
a.query_name = "A" * 1
a.query_name = "A" * 251
self.assertRaises(
@@ -438,11 +437,30 @@ class TestAlignedSegment(ReadTest):
"A" * 252)
+class TestCigar(ReadTest):
+
+ def testCigarString(self):
+ r = self.build_read()
+ self.assertEqual(r.cigarstring, "10M1D9M1I20M")
+ r.cigarstring = "20M10D20M"
+ self.assertEqual(r.cigartuples, [(0, 20), (2, 10), (0, 20)])
+ # unsetting cigar string
+ r.cigarstring = None
+ self.assertEqual(r.cigarstring, None)
+
+ def testCigar(self):
+ r = self.build_read()
+ self.assertEqual(r.cigartuples, [(0, 10), (2, 1), (0, 9), (1, 1), (0, 20)])
+ # unsetting cigar string
+ r.cigartuples = None
+ self.assertEqual(r.cigartuples, None)
+
+
class TestCigarStats(ReadTest):
def testStats(self):
- a = self.buildRead()
+ a = self.build_read()
a.cigarstring = None
self.assertEqual(
@@ -508,15 +526,15 @@ class TestAlignedPairs(unittest.TestCase):
class TestTags(ReadTest):
def testMissingTag(self):
- a = self.buildRead()
+ a = self.build_read()
self.assertRaises(KeyError, a.get_tag, "XP")
def testEmptyTag(self):
- a = self.buildRead()
+ a = self.build_read()
self.assertRaises(KeyError, a.get_tag, "XT")
def testSetTag(self):
- a = self.buildRead()
+ a = self.build_read()
self.assertEqual(False, a.has_tag("NM"))
a.set_tag("NM", 2)
self.assertEqual(True, a.has_tag("NM"))
@@ -530,7 +548,7 @@ class TestTags(ReadTest):
a.set_tag("NM", None)
def testArrayTags(self):
- read = self.buildRead()
+ read = self.build_read()
supported_dtypes = "bhBHf"
unsupported_dtypes = "lLd"
@@ -547,7 +565,7 @@ class TestTags(ReadTest):
array.array(dtype, range(10)))
def testAddTagsType(self):
- a = self.buildRead()
+ a = self.build_read()
a.tags = None
self.assertEqual(a.tags, [])
@@ -579,10 +597,10 @@ class TestTags(ReadTest):
('X5', 5)]))
# test setting invalid type code
- self.assertRaises(ValueError, a.setTag, 'X6', 5.2, 'g')
+ self.assertRaises(ValueError, a.set_tag, 'X6', 5.2, 'g')
def testTagsUpdatingFloat(self):
- a = self.buildRead()
+ a = self.build_read()
a.tags = [('NM', 1), ('RG', 'L1'),
('PG', 'P1'), ('XT', 'U')]
@@ -595,7 +613,7 @@ class TestTags(ReadTest):
('PG', 'P1'), ('XT', 'U'), ('XC', 5.0)])
def testAddTags(self):
- a = self.buildRead()
+ a = self.build_read()
a.tags = [('NM', 1), ('RG', 'L1'),
('PG', 'P1'), ('XT', 'U')]
@@ -643,7 +661,7 @@ class TestTags(ReadTest):
self.assertEqual(after, before)
def testMDTagMatchOnly(self):
- a = self.buildRead()
+ a = self.build_read()
# Substitutions only
a.cigarstring = "21M"
@@ -668,7 +686,7 @@ class TestTags(ReadTest):
a.get_reference_sequence())
def testMDTagInsertions(self):
- a = self.buildRead()
+ a = self.build_read()
# insertions are silent in the reference sequence
a.cigarstring = "5M1I5M"
@@ -691,7 +709,7 @@ class TestTags(ReadTest):
"A" * 10)
def testMDTagDeletions(self):
- a = self.buildRead()
+ a = self.build_read()
a.cigarstring = "5M1D5M"
a.query_sequence = "A" * 10
@@ -708,7 +726,7 @@ class TestTags(ReadTest):
a.get_reference_sequence())
def testMDTagRefSkipping(self):
- a = self.buildRead()
+ a = self.build_read()
a.cigarstring = "5M1N5M"
a.query_sequence = "A" * 10
@@ -725,7 +743,7 @@ class TestTags(ReadTest):
a.get_reference_sequence())
def testMDTagSoftClipping(self):
- a = self.buildRead()
+ a = self.build_read()
# softclipping
a.cigarstring = "5S5M1D5M5S"
@@ -744,7 +762,7 @@ class TestTags(ReadTest):
a.get_reference_sequence())
def testMDTagComplex(self):
- a = self.buildRead()
+ a = self.build_read()
a.cigarstring = "5S5M1I2D5M5S"
a.query_sequence = "G" * 5 + "A" * 11 + "G" * 5
@@ -777,11 +795,81 @@ class TestTags(ReadTest):
"AAAAcTTAA",
a.get_reference_sequence())
+ def testArrayTags(self):
+
+ r = self.build_read()
+
+ def c(r, l):
+ r.tags = [('ZM', l)]
+ self.assertEqual(list(r.opt("ZM")), list(l))
+
+ # signed integers
+ c(r, (-1, 1))
+ c(r, (-1, 100))
+ c(r, (-1, 200))
+ c(r, (-1, 1000))
+ c(r, (-1, 30000))
+ c(r, (-1, 50000))
+ c(r, (1, -1))
+ c(r, (1, -100))
+ c(r, (1, -200))
+ c(r, (1, -1000))
+ c(r, (1, -30000))
+ c(r, (1, -50000))
+
+ # unsigned integers
+ c(r, (1, 100))
+ c(r, (1, 1000))
+ c(r, (1, 10000))
+ c(r, (1, 100000))
+
+ # floats
+ c(r, (1.0, 100.0))
+
+ def testLongTags(self):
+ '''see issue 115'''
+
+ r = self.build_read()
+ rg = 'HS2000-899_199.L3'
+ tags = [('XC', 85), ('XT', 'M'), ('NM', 5),
+ ('SM', 29), ('AM', 29), ('XM', 1),
+ ('XO', 1), ('XG', 4), ('MD', '37^ACCC29T18'),
+ ('XA', '5,+11707,36M1I48M,2;21,-48119779,46M1I38M,2;hs37d5,-10060835,40M1D45M,3;5,+11508,36M1I48M,3;hs37d5,+6743812,36M1I48M,3;19,-59118894,46M1I38M,3;4,-191044002,6M1I78M,3;')]
+
+ r.tags = tags
+ r.tags += [("RG", rg)] * 100
+ tags += [("RG", rg)] * 100
+
+ self.assertEqual(tags, r.tags)
+
+ def testNegativeIntegers(self):
+ x = -2
+ aligned_read = self.build_read()
+ aligned_read.tags = [("XD", int(x))]
+ self.assertEqual(aligned_read.opt('XD'), x)
+ # print (aligned_read.tags)
+
+ def testNegativeIntegersWrittenToFile(self):
+ r = self.build_read()
+ x = -2
+ r.tags = [("XD", x)]
+ with pysam.AlignmentFile(
+ "tests/test.bam",
+ "wb",
+ referencenames=("chr1",),
+ referencelengths = (1000,)) as outf:
+ outf.write(r)
+ with pysam.AlignmentFile("tests/test.bam") as inf:
+ r = next(inf)
+
+ self.assertEqual(r.tags, [("XD", x)])
+ os.unlink("tests/test.bam")
+
class TestCopy(ReadTest):
def testCopy(self):
- a = self.buildRead()
+ a = self.build_read()
b = copy.copy(a)
# check if a and be are the same
self.assertEqual(a, b)
@@ -793,7 +881,7 @@ class TestCopy(ReadTest):
self.assertEqual(b.query_name, 'ReadB')
def testDeepCopy(self):
- a = self.buildRead()
+ a = self.build_read()
b = copy.deepcopy(a)
# check if a and be are the same
self.assertEqual(a, b)
@@ -805,6 +893,93 @@ class TestCopy(ReadTest):
self.assertEqual(b.query_name, 'ReadB')
+class TestSetTagGetTag(ReadTest):
+
+ def check_tag(self, tag, value, value_type, alt_value_type=None):
+ a = self.build_read()
+ a.set_tag(tag, value, value_type=value_type)
+ v, t = a.get_tag(tag, with_value_type=True)
+ self.assertEqual(v, value)
+
+ if alt_value_type:
+ self.assertEqual(t, alt_value_type)
+ else:
+ self.assertEqual(t, value_type)
+
+ def test_set_tag_with_A(self):
+ self.check_tag('TT', "x", value_type="A")
+
+ def test_set_tag_with_a(self):
+ self.check_tag('TT', "x", value_type="a", alt_value_type="A")
+
+ def test_set_tag_with_C(self):
+ self.check_tag('TT', 12, value_type="C")
+
+ def test_set_tag_with_c(self):
+ self.check_tag('TT', 12, value_type="c")
+
+ def test_set_tag_with_S(self):
+ self.check_tag('TT', 12, value_type="S")
+
+ def test_set_tag_with_s(self):
+ self.check_tag('TT', 12, value_type="s")
+
+ def test_set_tag_with_I(self):
+ self.check_tag('TT', 12, value_type="I")
+
+ def test_set_tag_with_i(self):
+ self.check_tag('TT', 12, value_type="i")
+
+ def test_set_tag_with_f(self):
+ self.check_tag('TT', 2.5, value_type="f")
+
+ def test_set_tag_with_d(self):
+ self.check_tag('TT', 2.5, value_type="d")
+
+ def test_set_tag_with_H(self):
+ self.check_tag('TT', "AE12", value_type="H")
+
+ def test_set_tag_with_automated_type_detection(self):
+ self.check_tag('TT', -(1 << 7), value_type=None, alt_value_type="c")
+ self.check_tag('TT', -(1 << 7) - 1, value_type=None, alt_value_type="s")
+ self.check_tag('TT', -(1 << 15), value_type=None, alt_value_type="s")
+ self.check_tag('TT', -(1 << 15) - 1, value_type=None, alt_value_type="i")
+ self.check_tag('TT', -(1 << 31), value_type=None, alt_value_type="i")
+ self.assertRaises(
+ ValueError,
+ self.check_tag,
+ 'TT',
+ -(1 << 31) - 1,
+ value_type=None,
+ alt_value_type="i")
+
+ self.check_tag('TT', (1 << 8) - 1, value_type=None, alt_value_type="C")
+ self.check_tag('TT', (1 << 8), value_type=None, alt_value_type="S")
+ self.check_tag('TT', (1 << 16) - 1, value_type=None, alt_value_type="S")
+ self.check_tag('TT', (1 << 16), value_type=None, alt_value_type="I")
+ self.check_tag('TT', (1 << 32) - 1, value_type=None, alt_value_type="I")
+ self.assertRaises(
+ ValueError,
+ self.check_tag,
+ 'TT',
+ (1 << 32),
+ value_type=None,
+ alt_value_type="I")
+
+
+class TestSetTagsGetTag(TestSetTagGetTag):
+
+ def check_tag(self, tag, value, value_type, alt_value_type=None):
+ a = self.build_read()
+ a.set_tags([(tag, value, value_type)])
+ v, t = a.get_tag(tag, with_value_type=True)
+ if alt_value_type:
+ self.assertEqual(t, alt_value_type)
+ else:
+ self.assertEqual(t, value_type)
+ self.assertEqual(v, value)
+
+
class TestAsString(unittest.TestCase):
def testAsString(self):
diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py
index f81d752..e6f9bdb 100644
--- a/tests/AlignmentFile_test.py
+++ b/tests/AlignmentFile_test.py
@@ -1169,116 +1169,7 @@ class TestLargeFieldBug(unittest.TestCase):
new_read.tags = read.tags
self.assertEqual(new_read.tags, read.tags)
-
-class TestTagParsing(unittest.TestCase):
-
- '''tests checking the accuracy of tag setting and retrieval.'''
-
- def makeRead(self):
- a = pysam.AlignedSegment()
- a.query_name = "read_12345"
- a.reference_id = 0
- a.query_sequence = "ACGT" * 3
- a.flag = 0
- a.reference_id = 0
- a.reference_start = 1
- a.mapping_quality = 20
- a.cigartuples = ((0, 10), (2, 1), (0, 25))
- a.next_reference_id = 0
- a.next_reference_start = 200
- a.template_length = 0
- a.query_qualities = pysam.qualitystring_to_array("1234") * 3
- # todo: create tags
- return a
-
- def testNegativeIntegers(self):
- x = -2
- aligned_read = self.makeRead()
- aligned_read.tags = [("XD", int(x))]
- self.assertEqual(aligned_read.opt('XD'), x)
- # print (aligned_read.tags)
-
- def testNegativeIntegers2(self):
- x = -2
- r = self.makeRead()
- r.tags = [("XD", x)]
- outfile = pysam.AlignmentFile(
- "tests/test.bam",
- "wb",
- referencenames=("chr1",),
- referencelengths = (1000,))
- outfile.write(r)
- outfile.close()
- infile = pysam.AlignmentFile("tests/test.bam")
- r = next(infile)
- self.assertEqual(r.tags, [("XD", x)])
- infile.close()
- os.unlink("tests/test.bam")
-
- def testCigarString(self):
- r = self.makeRead()
- self.assertEqual(r.cigarstring, "10M1D25M")
- r.cigarstring = "20M10D20M"
- self.assertEqual(r.cigartuples, [(0, 20), (2, 10), (0, 20)])
- # unsetting cigar string
- r.cigarstring = None
- self.assertEqual(r.cigarstring, None)
-
- def testCigar(self):
- r = self.makeRead()
- self.assertEqual(r.cigartuples, [(0, 10), (2, 1), (0, 25)])
- # unsetting cigar string
- r.cigartuples = None
- self.assertEqual(r.cigartuples, None)
-
- def testLongTags(self):
- '''see issue 115'''
-
- r = self.makeRead()
- rg = 'HS2000-899_199.L3'
- tags = [('XC', 85), ('XT', 'M'), ('NM', 5),
- ('SM', 29), ('AM', 29), ('XM', 1),
- ('XO', 1), ('XG', 4), ('MD', '37^ACCC29T18'),
- ('XA', '5,+11707,36M1I48M,2;21,-48119779,46M1I38M,2;hs37d5,-10060835,40M1D45M,3;5,+11508,36M1I48M,3;hs37d5,+6743812,36M1I48M,3;19,-59118894,46M1I38M,3;4,-191044002,6M1I78M,3;')]
-
- r.tags = tags
- r.tags += [("RG", rg)] * 100
- tags += [("RG", rg)] * 100
-
- self.assertEqual(tags, r.tags)
-
- def testArrayTags(self):
-
- r = self.makeRead()
-
- def c(r, l):
- r.tags = [('ZM', l)]
- self.assertEqual(list(r.opt("ZM")), list(l))
-
- # signed integers
- c(r, (-1, 1))
- c(r, (-1, 100))
- c(r, (-1, 200))
- c(r, (-1, 1000))
- c(r, (-1, 30000))
- c(r, (-1, 50000))
- c(r, (1, -1))
- c(r, (1, -100))
- c(r, (1, -200))
- c(r, (1, -1000))
- c(r, (1, -30000))
- c(r, (1, -50000))
-
- # unsigned integers
- c(r, (1, 100))
- c(r, (1, 1000))
- c(r, (1, 10000))
- c(r, (1, 100000))
-
- # floats
- c(r, (1.0, 100.0))
-
-
+
class TestClipping(unittest.TestCase):
def testClipping(self):
@@ -1773,10 +1664,12 @@ class TestDeNovoConstruction(unittest.TestCase):
'''check if individual reads are binary equal.'''
infile = pysam.AlignmentFile(self.bamfile, "rb")
- others = list(infile)
- for denovo, other in zip(others, self.reads):
- checkFieldEqual(self, other, denovo)
- self.assertEqual(other.compare(denovo), 0)
+ references = list(infile)
+ for denovo, reference in zip(references, self.reads):
+ checkFieldEqual(self, reference, denovo)
+ print("reference", str(reference), reference.get_tags(with_value_type=True))
+ print("denovo", str(denovo), denovo.get_tags(with_value_type=True))
+ self.assertEqual(reference.compare(denovo), 0)
# TODO
# def testSAMPerRead(self):
@@ -2132,7 +2025,7 @@ class TestPileup(unittest.TestCase):
def setUp(self):
self.samfile = pysam.AlignmentFile(self.samfilename)
- self.fastafile = pysam.Fastafile(self.fastafilename)
+ self.fastafile = pysam.FastaFile(self.fastafilename)
def tearDown(self):
self.samfile.close()
@@ -2173,7 +2066,8 @@ class TestPileup(unittest.TestCase):
self.checkEqual(refs, iterator)
-class TestCountCoverage(unittest.TestCase):
+class TestPileupFastafile(TestPileup):
+ '''test pileup functionality - backwards compatibility'''
samfilename = os.path.join(BAM_DATADIR, "ex1.bam")
fastafilename = os.path.join(BAM_DATADIR, "ex1.fa")
@@ -2183,6 +2077,17 @@ class TestCountCoverage(unittest.TestCase):
self.samfile = pysam.AlignmentFile(self.samfilename)
self.fastafile = pysam.Fastafile(self.fastafilename)
+
+class TestCountCoverage(unittest.TestCase):
+
+ samfilename = os.path.join(BAM_DATADIR, "ex1.bam")
+ fastafilename = os.path.join(BAM_DATADIR, "ex1.fa")
+
+ def setUp(self):
+
+ self.samfile = pysam.AlignmentFile(self.samfilename)
+ self.fastafile = pysam.FastaFile(self.fastafilename)
+
samfile = pysam.AlignmentFile(
"tests/test_count_coverage_read_all.bam", 'wb',
template=self.samfile)
diff --git a/tests/TestUtils.py b/tests/TestUtils.py
index dc95e09..c5572d3 100644
--- a/tests/TestUtils.py
+++ b/tests/TestUtils.py
@@ -21,8 +21,12 @@ CBCF_DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
LINKDIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "linker_tests"))
+TESTS_TEMPDIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "tmp"))
+
+
IS_PYTHON3 = sys.version_info[0] >= 3
+
if IS_PYTHON3:
from itertools import zip_longest
from urllib.request import urlopen
@@ -192,11 +196,15 @@ def check_lines_equal(cls, a, b, sort=False, filter_f=None, msg=None):
def get_temp_filename(suffix=""):
caller_name = inspect.getouterframes(inspect.currentframe(), 2)[1][3]
+ try:
+ os.makedirs(TESTS_TEMPDIR)
+ except OSError:
+ pass
f = tempfile.NamedTemporaryFile(
prefix="tmp_{}_".format(caller_name),
suffix=suffix,
delete=False,
- dir="tests")
+ dir=TESTS_TEMPDIR)
f.close()
return f.name
diff --git a/tests/linking_test.py b/tests/linking_test.py
index 623c3a2..25b9b04 100644
--- a/tests/linking_test.py
+++ b/tests/linking_test.py
@@ -20,7 +20,7 @@ def check_import(statement):
raise
-def check_tests_pass(statement):
+def check_pass(statement):
try:
output = subprocess.check_output(
statement, stderr=subprocess.STDOUT, shell=True)
@@ -31,6 +31,9 @@ def check_tests_pass(statement):
return True
+ at unittest.skipUnless(
+ os.environ.get("PYSAM_LINKING_TESTS", None),
+ "enable linking tests by setting PYSAM_LINKING_TESTS environment variable")
class TestLinking(unittest.TestCase):
package_name = "link_with_rpath"
@@ -43,15 +46,22 @@ class TestLinking(unittest.TestCase):
"cd {} && rm -rf build && python setup.py install".format(self.workdir),
shell=True)
+
+ at unittest.skipUnless(
+ os.environ.get("PYSAM_LINKING_TESTS", None),
+ "enable linking tests by setting PYSAM_LINKING_TESTS environment variable")
class TestLinkWithRpath(TestLinking):
package_name = "link_with_rpath"
def test_package_tests_pass(self):
- self.assertTrue(check_tests_pass(
+ self.assertTrue(check_pass(
"cd {} && python test_module.py".format(os.path.join(self.workdir, "tests"))))
+ at unittest.skipUnless(
+ os.environ.get("PYSAM_LINKING_TESTS", None),
+ "enable linking tests by setting PYSAM_LINKING_TESTS environment variable")
class TestLinkWithoutRpath(TestLinking):
package_name = "link_without_rpath"
@@ -69,7 +79,7 @@ class TestLinkWithoutRpath(TestLinking):
pysam_libdirs, pysam_libs = zip(*[os.path.split(x) for x in pysam_libraries])
pysam_libdir = pysam_libdirs[0]
- self.assertTrue(check_tests_pass(
+ self.assertTrue(check_pass(
"export LD_LIBRARY_PATH={}:$PATH && cd {} && python test_module.py".format(
pysam_libdir,
os.path.join(self.workdir, "tests"))))
diff --git a/tests/samtools_test.py b/tests/samtools_test.py
index 5494e1b..a926f5c 100644
--- a/tests/samtools_test.py
+++ b/tests/samtools_test.py
@@ -5,9 +5,7 @@ Execute in the :file:`tests` directory as it requires the Makefile
and data files located there.
'''
-import pysam
-import pysam.samtools
-import pysam.bcftools
+import warnings
import unittest
import os
import re
@@ -15,6 +13,9 @@ import glob
import sys
import subprocess
import shutil
+import pysam
+import pysam.samtools
+import pysam.bcftools
from TestUtils import checkBinaryEqual, check_lines_equal, \
check_samtools_view_equal, get_temp_filename, force_bytes, WORKDIR, \
BAM_DATADIR
@@ -130,7 +131,7 @@ class SamtoolsTest(unittest.TestCase):
return re.sub("[^0-9.]", "", s)
if _r(samtools_version) != _r(pysam.__samtools_version__):
- raise ValueError(
+ warnings.warn(
"versions of pysam.%s and %s differ: %s != %s" %
(self.executable,
self.executable,
@@ -222,7 +223,7 @@ class SamtoolsTest(unittest.TestCase):
error_msg = "%s failed: files %s and %s are not the same" % (command, s, p)
if binary_equal:
continue
- if s.endswith(".bam"):
+ elif s.endswith(".bam"):
self.assertTrue(
check_samtools_view_equal(
s, p, without_header=True),
@@ -236,7 +237,9 @@ class SamtoolsTest(unittest.TestCase):
def testStatements(self):
for statement in self.statements:
command = self.get_command(statement, map_to_internal=False)
- if command in ("bedcov", "stats", "dict"):
+ # bam2fq differs between version 1.5 and 1.6 - reenable if
+ # bioconda samtools will be available.
+ if command in ("bedcov", "stats", "dict", "bam2fq"):
continue
if (command == "calmd" and
@@ -268,6 +271,7 @@ class SamtoolsTest(unittest.TestCase):
self.assertTrue(re.search(expected, usage_msg) is not None)
def tearDown(self):
+ return
if os.path.exists(WORKDIR):
shutil.rmtree(WORKDIR)
os.chdir(self.savedir)
diff --git a/tests/tabix_test.py b/tests/tabix_test.py
index 1b6d450..890130d 100644
--- a/tests/tabix_test.py
+++ b/tests/tabix_test.py
@@ -78,6 +78,15 @@ class TestIndexing(unittest.TestCase):
pysam.tabix_index(self.tmpfilename, preset="gff")
self.assertTrue(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx))
+ def test_indexing_to_custom_location_works(self):
+ '''test indexing a file with a non-default location.'''
+
+ index_path = get_temp_filename(suffix='custom.tbi')
+ pysam.tabix_index(self.tmpfilename, preset="gff", index=index_path, force=True)
+ self.assertTrue(checkBinaryEqual(index_path, self.filename_idx))
+ os.unlink(index_path)
+
+
def test_indexing_with_explict_columns_works(self):
'''test indexing via preset.'''
@@ -101,7 +110,8 @@ class TestIndexing(unittest.TestCase):
def tearDown(self):
os.unlink(self.tmpfilename)
- os.unlink(self.tmpfilename + ".tbi")
+ if os.path.exists(self.tmpfilename + ".tbi"):
+ os.unlink(self.tmpfilename + ".tbi")
class TestCompression(unittest.TestCase):
@@ -362,7 +372,7 @@ class TestIterationWithoutComments(IterationTest):
x = x.decode("ascii")
if not x.startswith("#"):
break
- ref.append(x[:-1].encode('ascii'))
+ ref.append(x[:-1])
header = list(self.tabix.header)
self.assertEqual(ref, header)
@@ -592,7 +602,9 @@ if IS_PYTHON3:
self.vcf = pysam.VCF()
self.assertRaises(
UnicodeDecodeError,
- self.vcf.connect, self.tmpfilename + ".gz", "ascii")
+ self.vcf.connect,
+ self.tmpfilename + ".gz",
+ "ascii")
self.vcf.connect(self.tmpfilename + ".gz", encoding="utf-8")
v = self.vcf.getsamples()[0]
@@ -1023,16 +1035,16 @@ for vcf_file in vcf_files:
class TestRemoteFileHTTP(unittest.TestCase):
- url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/example_htslib.gtf.gz"
+ url = "http://www.cgat.org/downloads/public/pysam/test/example.gtf.gz"
region = "chr1:1-1000"
local = os.path.join(TABIX_DATADIR, "example.gtf.gz")
def setUp(self):
- if not checkURL(self.url):
+ if not pysam.config.HAVE_LIBCURL or not checkURL(self.url):
self.remote_file = None
- return
-
- self.remote_file = pysam.TabixFile(self.url, "r")
+ else:
+ self.remote_file = pysam.TabixFile(self.url, "r")
+
self.local_file = pysam.TabixFile(self.local, "r")
def tearDown(self):
@@ -1058,12 +1070,29 @@ class TestRemoteFileHTTP(unittest.TestCase):
return
self.assertEqual(list(self.local_file.header), [])
- self.assertRaises(AttributeError,
- getattr,
- self.remote_file,
- "header")
+class TestRemoteFileHTTPWithHeader(TestRemoteFileHTTP):
+
+ url = "http://www.cgat.org/downloads/public/pysam/test/example_comments.gtf.gz"
+ region = "chr1:1-1000"
+ local = os.path.join(TABIX_DATADIR, "example_comments.gtf.gz")
+
+ def setUp(self):
+ if not pysam.config.HAVE_LIBCURL or not checkURL(self.url):
+ self.remote_file = None
+ else:
+ self.remote_file = pysam.TabixFile(self.url, "r")
+ self.local_file = pysam.TabixFile(self.local, "r")
+
+ def testHeader(self):
+ if self.remote_file is None:
+ return
+
+ self.assertEqual(list(self.local_file.header), ["# comment at start"])
+ self.assertEqual(list(self.local_file.header), self.remote_file.header)
+
+
class TestIndexArgument(unittest.TestCase):
filename_src = os.path.join(TABIX_DATADIR, "example.vcf.gz")
diff --git a/tests/tabixproxies_test.py b/tests/tabixproxies_test.py
index ff68c81..35ad8fc 100644
--- a/tests/tabixproxies_test.py
+++ b/tests/tabixproxies_test.py
@@ -145,7 +145,7 @@ class TestGTF(TestParser):
self.assertEqual("\t".join(map(str, c)),
str(r))
- def testSetting(self):
+ def test_setting_fields(self):
r = self.tabix.fetch(parser=self.parser()).next()
@@ -166,6 +166,14 @@ class TestGTF(TestParser):
self.assertTrue("gene_id \"0001\"" in sr)
self.assertTrue("transcript_id \"0002\"" in sr)
+ def test_setAttribute_makes_changes(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.setAttribute("transcript_id", "abcd")
+ sr = str(r)
+ self.assertEqual(r.transcript_id, "abcd")
+ self.assertTrue("transcript_id \"abcd\"" in sr)
+
def test_added_attribute_is_output(self):
r = self.tabix.fetch(parser=self.parser()).next()
@@ -311,7 +319,7 @@ class TestGFF3(TestGTF):
str(r))
self.assertTrue(r.ID.startswith("MI00"))
- def testSetting(self):
+ def test_setting_fields(self):
for r in self.tabix.fetch(parser=self.parser()):
r.contig = r.contig + "_test_contig"
@@ -328,7 +336,15 @@ class TestGFF3(TestGTF):
self.assertTrue("test_source" in sr)
self.assertTrue("test_feature" in sr)
self.assertTrue("ID=test" in sr)
-
+
+ def test_setAttribute_makes_changes(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.setAttribute("transcript_id", "abcd")
+ sr = str(r)
+ self.assertEqual(r.transcript_id, "abcd")
+ self.assertTrue("transcript_id=abcd" in sr)
+
def test_added_attribute_is_output(self):
r = self.tabix.fetch(parser=self.parser()).next()
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pysam.git
More information about the debian-med-commit
mailing list