[med-svn] [python-pysam] 01/08: Imported Upstream version 0.8.3
Afif Elghraoui
afif-guest at moszumanska.debian.org
Mon Jun 8 08:23:05 UTC 2015
This is an automated email from the git hooks/post-receive script.
afif-guest pushed a commit to branch master
in repository python-pysam.
commit 91c30fda77e187b3a1b240064771a90168ab09e6
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Sun Jun 7 22:30:02 2015 -0700
Imported Upstream version 0.8.3
---
.gitignore | 2 +-
AUTHORS | 10 +-
INSTALL | 2 +-
MANIFEST.in | 2 +-
doc/api.rst | 13 +-
doc/conf.py | 2 +-
doc/developer.rst | 17 +-
doc/faq.rst | 85 +-
doc/index.rst | 2 +-
doc/release.rst | 61 +-
doc/usage.rst | 14 +-
htslib/INSTALL | 88 +-
htslib/Makefile | 93 +-
htslib/NEWS | 50 +
htslib/bgzf.c | 58 +-
htslib/bgzip.c | 2 +-
htslib/config.mk | 72 +
htslib/config.mk.in | 72 +
htslib/configure | 4012 +++++++++++++++++++++
htslib/configure.ac | 93 +
htslib/cram/cram.h | 9 +-
htslib/cram/cram_codecs.c | 230 +-
htslib/cram/cram_codecs.h | 22 +-
htslib/cram/cram_decode.c | 1339 +++++--
htslib/cram/cram_encode.c | 1994 ++++++----
htslib/cram/cram_index.c | 82 +-
htslib/cram/cram_io.c | 1382 ++++---
htslib/cram/cram_io.h | 98 +-
htslib/cram/cram_samtools.c | 5 +-
htslib/cram/cram_stats.c | 123 +-
htslib/cram/cram_structs.h | 329 +-
htslib/cram/os.h | 2 +
htslib/cram/rANS_byte.h | 336 ++
htslib/cram/rANS_static.c | 841 +++++
htslib/cram/rANS_static.h | 44 +
htslib/cram/sam_header.c | 8 +-
htslib/cram/sam_header.h | 8 -
htslib/cram/thread_pool.c | 178 +-
htslib/cram/thread_pool.h | 15 +-
htslib/cram/vlen.c | 2 +-
htslib/faidx.c | 46 +-
htslib/hfile.c | 23 +-
htslib/hfile_internal.h | 3 +-
htslib/hfile_irods.c | 243 ++
htslib/hts.c | 531 ++-
htslib/htsfile.1 | 71 +
htslib/htsfile.c | 168 +
htslib/htslib.mk | 8 +-
htslib/htslib/bgzf.h | 7 +-
htslib/htslib/faidx.h | 2 +-
htslib/htslib/hfile.h | 10 +-
htslib/htslib/hts.h | 146 +-
htslib/htslib/khash.h | 8 +-
htslib/htslib/khash_str2int.h | 9 +
htslib/htslib/kseq.h | 10 +-
htslib/htslib/regidx.h | 147 +
htslib/htslib/sam.h | 2 +-
htslib/htslib/synced_bcf_reader.h | 15 +-
htslib/htslib/vcf.h | 15 +-
htslib/htslib_vars.mk | 1 +
htslib/knetfile.c | 11 +-
htslib/regidx.c | 338 ++
htslib/sam.c | 134 +-
htslib/synced_bcf_reader.c | 88 +-
htslib/tabix.1 | 121 +-
htslib/tabix.c | 268 +-
htslib/tbx.c | 5 +
htslib/test/{aux#aux.sam => auxf#values.sam} | 0
htslib/test/{aux.fa => auxf.fa} | 0
htslib/test/{aux.fa.fai => auxf.fa.fai} | 0
htslib/test/hfile.c | 4 +-
htslib/test/sam.c | 52 +-
htslib/test/test-regidx.c | 116 +
htslib/test/test-vcf-api.c | 57 +-
htslib/test/test_view.c | 92 +-
htslib/vcf.c | 207 +-
htslib/vcfutils.c | 63 +-
htslib/version.h | 2 +-
install-CGAT-tools.sh | 8 +-
pysam/TabProxies.pyx | 142 +-
pysam/__init__.py | 10 +-
pysam/calignmentfile.pxd | 30 +-
pysam/calignmentfile.pyx | 1109 ++++--
pysam/cbcf.pxd | 158 +
pysam/cbcf.pyx | 2419 +++++++++++++
pysam/cfaidx.pxd | 9 +-
pysam/cfaidx.pyx | 39 +-
pysam/chtslib.pxd | 891 ++++-
pysam/chtslib.pyx | 2 +-
pysam/csamtools.pyx | 2 +
pysam/ctabix.pyx | 4 +-
pysam/cvcf.pyx | 2 +-
pysam/htslib_util.c | 18 +
pysam/htslib_util.h | 20 +-
pysam/version.py | 6 +-
requires.txt | 2 +-
samtools/bam.h | 2 +-
samtools/bam2bcf_indel.c | 40 +-
samtools/bam2bcf_indel.c.pysam.c | 40 +-
samtools/bam2depth.c | 16 +-
samtools/bam2depth.c.pysam.c | 16 +-
samtools/bam_mate.c | 3 +-
samtools/bam_mate.c.pysam.c | 3 +-
samtools/bam_md.c | 1 -
samtools/bam_md.c.pysam.c | 1 -
samtools/bam_plcmd.c | 45 +-
samtools/bam_plcmd.c.pysam.c | 45 +-
samtools/bam_sort.c | 61 +-
samtools/bam_sort.c.pysam.c | 61 +-
samtools/bam_stat.c | 40 +-
samtools/bam_stat.c.pysam.c | 40 +-
samtools/errmod.c | 7 +-
samtools/errmod.c.pysam.c | 7 +-
samtools/kaln.c | 486 ---
samtools/kaln.c.pysam.c | 488 ---
samtools/kaln.h | 67 -
samtools/misc/ace2sam.c | 2 +-
samtools/misc/ace2sam.c.pysam.c | 2 +-
samtools/padding.c | 2 +-
samtools/padding.c.pysam.c | 2 +-
samtools/sam.c | 8 +-
samtools/sam.c.pysam.c | 8 +-
samtools/sam.h | 3 +-
samtools/sam_view.c | 27 +-
samtools/sam_view.c.pysam.c | 27 +-
samtools/stats.c | 21 +-
samtools/stats.c.pysam.c | 21 +-
samtools/test/merge/test_rtrans_build.c | 4 +-
samtools/test/merge/test_rtrans_build.c.pysam.c | 4 +-
samtools/test/merge/test_trans_tbl_init.c | 2 +-
samtools/test/merge/test_trans_tbl_init.c.pysam.c | 2 +-
samtools/version.h | 2 +-
setup.py | 169 +-
tests/AlignedSegment_test.py | 447 +++
tests/AlignmentFile_test.py | 1387 +++----
tests/SamFile_test.py | 14 +-
tests/TestUtils.py | 72 +-
tests/cython_flagstat.py | 1 -
tests/pysam_data/Makefile | 13 +-
tests/python_flagstat.py | 2 -
tests/samtools_test.py | 45 +-
tests/tabix_test.py | 7 +-
142 files changed, 18515 insertions(+), 4939 deletions(-)
diff --git a/.gitignore b/.gitignore
index b69096e..4bd469f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,7 +12,7 @@ tests/*.c
tests/*.pyxbldc
tests/*.sam
tests/*.fai
-tests/pysam_test_work
+tests/pysam_data
# cython files
pysam/TabProxies.c
diff --git a/AUTHORS b/AUTHORS
index 308641e..4b00536 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,10 +1,12 @@
List of contributors:
-Andreas Heger, Tildon Grant Belgard, Kevin B. Jacobs, Florian
-Finkernagel, Leo Goodstadt, Martin Goodson all contributed code
-to pysam.
+Andreas Heger, Tildon Grant Belgard, Florian Finkernagel, Leo
+Goodstadt, Martin Goodson all contributed code to pysam.
-Gerton Lunter provided a VCF parser.
+Kevin B. Jacobs implemented a Cython wrapper for the VCF/BCF
+reader/writer in htslib.
+
+Gerton Lunter provided a validating VCF parser.
Marcel Martin implemented python 3 compatibility.
Ben Schiller contributed a Windows compatible clone.
diff --git a/INSTALL b/INSTALL
index 2ce7add..865daa7 100644
--- a/INSTALL
+++ b/INSTALL
@@ -13,7 +13,7 @@ most of the modern Linux/Unix distributions. If you do not have this
library installed, you can still compile the rest of SAMtools by
manually modifying one line in Makefile.
-Pysam requires pyrex (0.9.8 or greater) and python (2.6 or greater).
+Pysam requires Python (2.6 or greater) and Cython (0.22 or greater).
It has not been tested on many other platforms.
Compilation
diff --git a/MANIFEST.in b/MANIFEST.in
index 7f2082e..9df7dae 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -26,6 +26,7 @@ include pysam.py
include tests/00README.txt
include tests/pysam_data
include tests/tabix_data
+include tests/*.py
#ex1.fa
#include tests/ex1.sam.gz
#include tests/ex3.sam
@@ -49,7 +50,6 @@ include tests/tabix_data
#include tests/issue100.bam
# tabix tests
-#include tests/tabix_test.py
#include tests/example.gtf.gz
#include tests/example.gtf.gz.tbi
#include tests/example.bed.gz
diff --git a/doc/api.rst b/doc/api.rst
index 38b7b7e..d700ac5 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -58,12 +58,13 @@ reads are represented as :class:`~pysam.PileupRead` objects in the
import pysam
samfile = pysam.AlignmentFile("ex1.bam", "rb" )
for pileupcolumn in samfile.pileup("chr1", 100, 120):
- print ("\ncoverage at base %s = %s" %
- (pileupcolumn.pos, pileupcolumn.n))
- for pileupread in pileupcolumn.pileups:
- print ('\tbase in read %s = %s' %
- (pileupread.alignment.query_name,
- pileupread.alignment.query_sequence[pileupread.query_position]))
+ print ("\ncoverage at base %s = %s" %
+ (pileupcolumn.pos, pileupcolumn.n))
+ for pileupread in pileupcolumn.pileups:
+ if not pileupread.is_del and not pileupread.is_refskip: # query position is None if is_del or is_refskip is set.
+ print ('\tbase in read %s = %s' %
+ (pileupread.alignment.query_name,
+ pileupread.alignment.query_sequence[pileupread.query_position]))
samfile.close()
diff --git a/doc/conf.py b/doc/conf.py
index c9b8ac6..19a4563 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -113,7 +113,7 @@ pygments_style = 'sphinx'
# The theme to use for HTML and HTML Help pages. Major themes that come with
# Sphinx are currently 'default' and 'sphinxdoc'.
-html_theme = 'default'
+html_theme = 'classic'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
diff --git a/doc/developer.rst b/doc/developer.rst
index 6abd22c..9931854 100644
--- a/doc/developer.rst
+++ b/doc/developer.rst
@@ -20,7 +20,7 @@ directories:
Code and data for testing
:file:`htslib`
- Source code from :term:`htslib` shipped with pysam. See
+ Source code from htslib_ shipped with pysam. See
:file:`setup.py` about importing.
:file:`samtools`
@@ -48,19 +48,12 @@ captpuring standard output.
Contributors
============
-The following people have contributed to pysam:
-
-* Andreas Heger
-* Tildon Grant Belgrad
-* Kevin Jacobs
-* Florian Finkernagel
-* Ben Schiller
-* Marcel Martin
-* Gerton Lunter
-* Martin Goodson
-* Leo Goodstadt
+Please see github for a list of all contributors:
+https://github.com/pysam-developers/pysam/graphs/contributors
+Many thanks to all contributors for helping in making pysam
+useful.
diff --git a/doc/faq.rst b/doc/faq.rst
index 5d2a048..412a647 100644
--- a/doc/faq.rst
+++ b/doc/faq.rst
@@ -2,6 +2,14 @@
FAQ
===
+How should I cite pysam
+=======================
+
+Pysam has not been published in print. When referring to pysam, please
+use the github URL: https://github.com/pysam-developers/pysam.
+As pysam is a wrapper around htslib and the samtools package, I
+suggest citing `Li et al (2009) <http://www.ncbi.nlm.nih.gov/pubmed/19505943>`_.
+
pysam coordinates are wrong
===========================
@@ -22,6 +30,78 @@ convention of the samtools command line utilities. The same is true
for any coordinates passed to the samtools command utilities directly,
such as :meth:`pysam.mpileup`.
+Calling pysam.fetch() confuses existing iterators
+=================================================
+
+The following code will cause unexpected behaviour::
+
+ samfile = pysam.AlignmentFile("pysam_ex1.bam", "rb")
+
+ iter1 = samfile.fetch("chr1")
+ print iter1.next().reference_id
+ iter2 = samfile.fetch("chr2")
+ print iter2.next().reference_id
+ print iter1.next().reference_id
+
+This will give the following output::
+
+ 0
+ 1
+ Traceback (most recent call last):
+ File "xx.py", line 8, in <module>
+ print iter1.next().reference_id
+ File "calignmentfile.pyx", line 1408, in
+ pysam.calignmentfile.IteratorRowRegion.__next__
+ (pysam/calignmentfile.c:16461)
+ StopIteration
+
+Note how the first iterator (``iter1``) stops as the file pointer has
+moved to chr2. The correct way to work with multiple iterators is::
+
+ samfile = pysam.AlignmentFile("pysam_ex1.bam", "rb")
+
+ iter1 = samfile.fetch("chr1", multiple_iterators=True)
+ print iter1.next().reference_id
+ iter2 = samfile.fetch("chr2")
+ print iter2.next().reference_id
+ print iter1.next().reference_id
+
+Here, the output is::
+
+ 0
+ 1
+ 0
+
+The reason for this behaviour is that every iterator needs to keep
+track of its current position in the file. Within pysam, each opened
+file can only keep track of one file position and hence there can only
+be one iterator per file. Using the option ``multiple_iterators=True``
+will return an iterator within a newly opened file. This iterator will
+not interfere with existing iterators as it has its own file handle
+associated with it.
+
+Note that re-opening files incurs a performance penalty which can
+become severe when calling :meth:`~pysam.AlignmentFile.fetch` often.
+Thus, ``multiple_iterators`` is set to ``False`` by default.
+
+AlignmentFile.fetch does not show unmapped reads
+================================================
+
+:meth:`~pysam.AlignmentFile.fetch` will only iterate over alignments
+in the SAM/BAM file. The following thus always works::
+
+ bf = pysam.AlignmentFile(fname, "rb")
+ for r in bf.fetch():
+ assert not r.is_unmapped
+
+If the SAM/BAM file contains unaligned reads, they can be included
+in the iteration by adding the ``until_eof=True`` flag::
+
+ bf = pysam.AlignmentFile(fname, "rb")
+ for r in bf.fetch(until_eof=True):
+ if r.is_unmapped:
+ print "read is unmapped"
+
BAM files with a large number of reference sequences is slow
============================================================
@@ -38,7 +118,7 @@ header. This might require a lot of jumping around in the file. To
avoid this, use::
track = pysam.AlignmentFile(fname, "rb")
- for aln in track.fetch( until_eof = True ):
+ for aln in track.fetch(until_eof=True):
pass
This will iterate through reads as they appear in the file.
@@ -66,7 +146,6 @@ the quality scores need to be taken. Consider trimming for example::
read.seq = read.seq[5:10]
read.qual = q[5:10]
-
Why is there no SNPCaller class anymore?
=========================================
@@ -108,7 +187,7 @@ to the iterator that remains alive::
for pp in p.pileups:
print pp
-Psyam won't compile
+Pysam won't compile
===================
Compiling pysam can be tricky as there are numerous variables that
diff --git a/doc/index.rst b/doc/index.rst
index 10aadb9..7b032cb 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -15,7 +15,7 @@ in *SAM/BAM* formatted files. Also included is an interface to the
samtools_ command line utilities and the tabix_ C-API for reading
compressed and indexed tabular data.
-The current version wraps *htslib-1.1* and *samtools-1.1*.
+The current version wraps *htslib-1.2.1* and *samtools-1.2*.
Contents
--------
diff --git a/doc/release.rst b/doc/release.rst
index eefbf04..29d21ab 100644
--- a/doc/release.rst
+++ b/doc/release.rst
@@ -2,10 +2,49 @@
Release notes
=============
+Release 0.8.3
+=============
+
+* samtools commands now accept the "catch_stdout" option.
+
+* get_aligned_pairs now works for soft-clipped reads.
+
+* query_position is now None when a PileupRead is not aligned
+ to a particular position.
+
+* AlignedSegments are now comparable and hashable.
+
+Release 0.8.2.1
+===============
+
+* Installation bugfix release.
+
+Release 0.8.2
+=============
+
+* Pysam now wraps htslib 1.2.1 and samtools version 1.2.
+
+* Added CRAM file support to pysam.
+
+* New alignment info interface.
+ * opt() and setTag are deprecated, use get_tag() and set_tag()
+ instead.
+ * added has_tag()
+ * tags is deprecated, use get_tags() and set_tags() instead.
+
+* FastqFile is now FastxFile to reflect that the latter permits
+ iteration over both fastq- and fasta-formatted files.
+
+* A Cython wrapper for htslib VCF/BCF reader/writer. The wrapper
+ provides a nearly complete Pythonic interface to VCF/BCF metadata
+ with reading and writing capability. However, the interface is still
+ incomplete and preliminary and lacks capability to mutate the
+ resulting data.
+
Release 0.8.1
=============
-* Pysam now wraps htslib and samtools versions 1.1
+* Pysam now wraps htslib and samtools versions 1.1.
* Bugfixes, most notable:
* issue #43: uncompressed BAM output
@@ -29,13 +68,13 @@ Release 0.8.1
* mapq -> mapping_quality
* rnext -> next_reference_id
* pnext -> next_reference_start
- * cigar = alignment
- * cigarstring = cigarstring
- * tlen -> query_length
+ * cigar -> cigartuples
+ * cigarstring -> cigarstring
+ * tlen -> template_length
* seq -> query_sequence
* qual -> query_qualities, now returns array
* qqual -> query_alignment_qualities, now returns array
- * tags = tags
+ * tags -> tags
* alen -> reference_length, reference is always "alignment", so removed
* aend -> reference_end
* rlen -> query_length
@@ -57,9 +96,9 @@ Release 0.8.1
as strings, no more bytes.
Other changes:
- * AlignmentFile.fetch(reopen) option is now multiple_iterators. The
- default changed to not reopen a file unless requested by the user.
- * FastaFile.getReferenceLength is now FastaFile.get_reference_length
+ * AlignmentFile.fetch(reopen) option is now multiple_iterators. The
+ default changed to not reopen a file unless requested by the user.
+ * FastaFile.getReferenceLength is now FastaFile.get_reference_length
Backwards incompatible changes
@@ -77,11 +116,11 @@ Release 0.8.0
=============
* Disabled features
- * IteratorColumn.setMask() disabled as htslib does not implement
- this functionality?
+ * IteratorColumn.setMask() disabled as htslib does not implement
+ this functionality?
* Not implemented yet:
- * reading SAM files without header
+ * reading SAM files without header
Tabix files between version 0.7.8 and 0.8.0 are
not compatible and need to be re-indexed.
diff --git a/doc/usage.rst b/doc/usage.rst
index 7e79298..f4dd4d5 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -118,7 +118,7 @@ Here, we use a header dictionary::
a.next_reference_id = 0
a.next_reference_start=199
a.template_length=167
- a.query_qualities="<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"
+ a.query_qualities = pysam.fromQualityString("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
a.tags = (("NM", 1),
("RG", "L1"))
outfile.write(a)
@@ -191,6 +191,18 @@ available using the :meth:`getMessages` method::
Note that only the output from the last invocation of a command
is stored.
+In order for pysam to make the output of samtools commands accessible
+the stdout stream needs to be redirected. This is the default
+behaviour, but can cause problems in environments such as the ipython
+notebook. A solution is to pass the ``catch_stdout`` keyword
+argument::
+
+ pysam.sort(catch_stdout=False)
+
+Note that this means that output from commands which produce output on
+stdout will not be available. The only solution is to run samtools
+commands through subprocess.
+
================================
Working with tabix-indexed files
================================
diff --git a/htslib/INSTALL b/htslib/INSTALL
index 1e8df2f..ba65bd3 100644
--- a/htslib/INSTALL
+++ b/htslib/INSTALL
@@ -1,25 +1,79 @@
-System Requirements
-===================
+Basic Installation
+==================
-HTSlib depends on the zlib library <http://zlib.net>. Building HTSlib requires
-zlib development files to be installed on the build machine; you may need to
-ensure a package such as zlib1g-dev (on Debian or Ubuntu Linux) or zlib-devel
-(on RPM/yum-based distributions) is installed.
+To build and install HTSlib, 'cd' to the htslib-1.x directory containing
+the package's source and type the following commands:
+ ./configure
+ make
+ make install
-Compilation
-===========
+The './configure' command checks your build environment and allows various
+optional functionality to be enabled (see Configuration below). If you
+don't want to select any optional functionality, you may wish to omit
+configure and just type 'make; make install' as for previous versions
+of HTSlib. However if the build fails you should run './configure' as
+it can diagnose the common reasons for build failures.
-'cd' to the htslib-1.x directory containing the package's source and type
-'make' to compile HTSlib.
+The 'make' command builds the HTSlib library and various useful
+utilities: bgzip, htsfile, and tabix. If compilation fails you should
+run './configure' as it can diagnose problems with your build environment
+that cause build failures.
+The 'make install' command installs the libraries, library header files,
+utilities, several manual pages, and a pkgconfig file to /usr/local.
+The installation location can be changed by configuring with --prefix=DIR
+or via 'make prefix=DIR install' (see Installation Locations below).
-Installation
-============
-Type 'make install' to install the bgzip and tabix utilities, library headers,
-library archives, several manual pages, and a pkgconfig file to /usr/local.
+Configuration
+=============
-Type 'make prefix=/path/to/dir install' to install everything under your
-choice of installation directory. The install target also understands
-DESTDIR and the other usual installation directory variables.
+By default, './configure' examines your build environment, checking for
+requirements such as the zlib development files, and arranges for a plain
+HTSlib build. The following configure options can be used to enable
+various features and specify further optional external requirements:
+
+--with-irods[=DIR]
+ Specifies the location of the iRODS client library to use to enable
+ access to data objects stored in iRODS (<http://irods.org/>) via file
+ paths like 'irods:DATAOBJ'. DIR is the base of an iRODS source tree
+ such that the library is present as DIR/lib/core/obj/libRodsAPI.* and
+ headers are present under DIR/lib/api/include and so on. If '=DIR' is
+ omitted, $IRODS_HOME will be used as a base directory.
+
+The configure script also accepts the usual options and environment variables
+for tuning installation locations and compilers: type './configure --help'
+for details. For example,
+
+ ./configure CC=icc --prefix=/opt/icc-compiled
+
+would specify that HTSlib is to be built with icc and installed into bin,
+lib, etc subdirectories under /opt/icc-compiled.
+
+
+Installation Locations
+======================
+
+By default, 'make install' installs HTSlib libraries under /usr/local/lib,
+HTSlib header files under /usr/local/include, utility programs under
+/usr/local/bin, etc. (To be precise, the header files are installed within
+a fixed 'htslib' subdirectory under the specified .../include location.)
+
+You can specify a different location to install HTSlib by configuring
+with --prefix=DIR or specify locations for particular parts of HTSlib by
+configuring with --libdir=DIR and so on. Type './configure --help' for
+the full list of such install directory options.
+
+Alternatively you can specify different locations at install time by
+typing 'make prefix=DIR install' or 'make libdir=DIR install' and so on.
+Consult the list of prefix/exec_prefix/etc variables near the top of the
+Makefile for the full list of such variables that can be overridden.
+
+You can also specify a staging area by typing 'make DESTDIR=DIR install',
+possibly in conjunction with other --prefix or prefix=DIR settings.
+For example,
+
+ make DESTDIR=/tmp/staging prefix=/opt
+
+would install into bin, lib, etc subdirectories under /tmp/staging/opt.
diff --git a/htslib/Makefile b/htslib/Makefile
index 6919903..5120b24 100644
--- a/htslib/Makefile
+++ b/htslib/Makefile
@@ -1,6 +1,6 @@
# Makefile for htslib, a C library for high-throughput sequencing data formats.
#
-# Copyright (C) 2013-2014 Genome Research Ltd.
+# Copyright (C) 2013-2015 Genome Research Ltd.
#
# Author: John Marshall <jm18 at sanger.ac.uk>
#
@@ -26,20 +26,38 @@ CC = gcc
AR = ar
RANLIB = ranlib
-# TODO: edit cram code to remove need for -DSAMTOOLS
-CPPFLAGS = -I. -DSAMTOOLS=1
+CPPFLAGS = -I.
# TODO: probably update cram code to make it compile cleanly with -Wc++-compat
CFLAGS = -g -Wall -O2
EXTRA_CFLAGS_PIC = -fpic
LDFLAGS =
LDLIBS =
-prefix = /ifs/apps/bio/htslib-1.1
+# For now these don't work too well as samtools also needs to know to
+# add -lbz2 and -llzma if linking against the static libhts.a library.
+# TODO This needs configury and adding to htslib.pc.in.
+#
+# # Bzip2 support; optionally used by CRAM.
+# HAVE_LIBBZ2 := $(shell echo -e "\#include <bzlib.h>\012int main(void){return 0;}" > .test.c && $(CC) $(CFLAGS) $(CPPFLAGS) -o .test .test.c -lbz2 2>/dev/null && echo yes)
+# ifeq "$(HAVE_LIBBZ2)" "yes"
+# CPPFLAGS += -DHAVE_LIBBZ2
+# LDLIBS += -lbz2
+# endif
+#
+# # Lzma support; optionally used by CRAM.
+# HAVE_LIBLZMA := $(shell echo -e "\#include <lzma.h>\012int main(void){return 0;}" > .test.c && $(CC) $(CFLAGS) $(CPPFLAGS) -o .test .test.c -llzma 2>/dev/null && echo yes)
+# ifeq "$(HAVE_LIBLZMA)" "yes"
+# CPPFLAGS += -DHAVE_LIBLZMA
+# LDLIBS += -llzma
+# endif
+
+prefix = /usr/local
exec_prefix = $(prefix)
bindir = $(exec_prefix)/bin
includedir = $(prefix)/include
libdir = $(exec_prefix)/lib
-mandir = $(prefix)/share/man
+datarootdir = $(prefix)/share
+mandir = $(datarootdir)/man
man1dir = $(mandir)/man1
man5dir = $(mandir)/man5
pkgconfigdir= $(libdir)/pkgconfig
@@ -52,12 +70,14 @@ INSTALL_DIR = $(MKDIR_P) -m 755
BUILT_PROGRAMS = \
bgzip \
+ htsfile \
tabix
BUILT_TEST_PROGRAMS = \
test/fieldarith \
test/hfile \
test/sam \
+ test/test-regidx \
test/test_view \
test/test-vcf-api \
test/test-vcf-sweep
@@ -81,7 +101,7 @@ lib-shared: libhts.so
endif
-PACKAGE_VERSION = 1.1
+PACKAGE_VERSION = 1.2.1
LIBHTS_SOVERSION = 1
@@ -113,6 +133,9 @@ endif
version.h:
echo '#define HTS_VERSION "$(PACKAGE_VERSION)"' > $@
+print-version:
+ @echo $(PACKAGE_VERSION)
+
.SUFFIXES: .c .o .pico
@@ -132,6 +155,7 @@ LIBHTS_OBJS = \
hfile.o \
hfile_net.o \
hts.o \
+ regidx.o \
sam.o \
synced_bcf_reader.o \
vcf_sweep.o \
@@ -150,12 +174,33 @@ LIBHTS_OBJS = \
cram/md5.o \
cram/open_trace_file.o \
cram/pooled_alloc.o \
+ cram/rANS_static.o \
cram/sam_header.o \
cram/string_alloc.o \
cram/thread_pool.o \
cram/vlen.o \
cram/zfio.o
+cram_h = cram/cram.h $(cram_samtools_h) $(cram_sam_header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h
+cram_io_h = cram/cram_io.h $(cram_misc_h)
+cram_misc_h = cram/misc.h cram/os.h
+cram_sam_header_h = cram/sam_header.h cram/string_alloc.h cram/pooled_alloc.h htslib/khash.h htslib/kstring.h
+cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) $(cram_sam_header_h)
+cram_structs_h = cram/cram_structs.h cram/thread_pool.h cram/string_alloc.h htslib/khash.h
+cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h
+hfile_internal_h = hfile_internal.h $(htslib_hfile_h)
+
+
+# To be effective, config.mk needs to appear after most Makefile variables are
+# set but before most rules appear, so that it can both use previously-set
+# variables in its own rules' prerequisites and also update variables for use
+# in later rules' prerequisites.
+
+# sinclude is GNU Make-specific. If you don't have GNU Make or another make
+# that understands sinclude, change this to 'include' if you are using the
+# configure script or just comment the line out if you are not.
+sinclude config.mk
+
libhts.a: $(LIBHTS_OBJS)
@-rm -f $@
@@ -181,35 +226,28 @@ libhts.dylib: $(LIBHTS_OBJS)
ln -sf $@ libhts.$(LIBHTS_SOVERSION).dylib
-cram_h = cram/cram.h $(cram_samtools_h) $(cram_sam_header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h
-cram_io_h = cram/cram_io.h $(cram_misc_h)
-cram_misc_h = cram/misc.h cram/os.h
-cram_sam_header_h = cram/sam_header.h cram/string_alloc.h cram/pooled_alloc.h htslib/khash.h htslib/kstring.h
-cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) $(cram_sam_header_h)
-cram_structs_h = cram/cram_structs.h cram/thread_pool.h cram/string_alloc.h htslib/khash.h
-cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h
-hfile_internal_h = hfile_internal.h $(htslib_hfile_h)
-
-bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) htslib/khash.h
+bgzf.o bgzf.pico: bgzf.c $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) htslib/khash.h
kstring.o kstring.pico: kstring.c htslib/kstring.h
knetfile.o knetfile.pico: knetfile.c htslib/knetfile.h
hfile.o hfile.pico: hfile.c $(htslib_hfile_h) $(hfile_internal_h)
+hfile_irods.o hfile_irods.pico: hfile_irods.c $(hfile_internal_h)
hfile_net.o hfile_net.pico: hfile_net.c $(hfile_internal_h) htslib/knetfile.h
hts.o hts.pico: hts.c version.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) htslib/khash.h htslib/kseq.h htslib/ksort.h
vcf.o vcf.pico: vcf.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) htslib/khash.h htslib/kseq.h htslib/kstring.h
sam.o sam.pico: sam.c $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) htslib/khash.h htslib/kseq.h htslib/kstring.h
tbx.o tbx.pico: tbx.c $(htslib_tbx_h) $(htslib_bgzf_h) htslib/khash.h
-faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) htslib/khash.h htslib/knetfile.h
+faidx.o faidx.pico: faidx.c $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) htslib/khash.h
synced_bcf_reader.o synced_bcf_reader.pico: synced_bcf_reader.c $(htslib_synced_bcf_reader_h) htslib/kseq.h htslib/khash_str2int.h
vcf_sweep.o vcf_sweep.pico: vcf_sweep.c $(htslib_vcf_sweep_h) $(htslib_bgzf_h)
vcfutils.o vcfutils.pico: vcfutils.c $(htslib_vcfutils_h)
kfunc.o kfunc.pico: kfunc.c htslib/kfunc.h
+regidx.o regidx.pico: regidx.c $(htslib_hts_h) $(HTSPREFIX)htslib/kstring.h $(HTSPREFIX)htslib/kseq.h $(HTSPREFIX)htslib/khash_str2int.h $(htslib_regidx_h)
cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c $(cram_h)
cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c $(cram_h) cram/os.h cram/md5.h
cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c $(cram_h) cram/os.h cram/md5.h
cram/cram_index.o cram/cram_index.pico: cram/cram_index.c $(htslib_hfile_h) $(cram_h) cram/os.h cram/zfio.h
-cram/cram_io.o cram/cram_io.pico: cram/cram_io.c $(cram_h) cram/os.h cram/md5.h $(cram_open_trace_file_h) $(htslib_hfile_h)
+cram/cram_io.o cram/cram_io.pico: cram/cram_io.c $(cram_h) cram/os.h cram/md5.h $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h)
cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c $(cram_h) $(htslib_sam_h)
cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c $(cram_h) cram/os.h
cram/files.o cram/files.pico: cram/files.c $(cram_misc_h)
@@ -217,6 +255,7 @@ cram/mFILE.o cram/mFILE.pico: cram/mFILE.c cram/os.h cram/mFILE.h cram/vlen.h
cram/md5.o cram/md5.pico: cram/md5.c cram/md5.h
cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h)
cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c cram/pooled_alloc.h
+cram/rANS_static.o cram/rANS_static.pico: cram/rANS_static.c cram/rANS_static.h cram/rANS_byte.h
cram/sam_header.o cram/sam_header.pico: cram/sam_header.c $(cram_sam_header_h) cram/string_alloc.h
cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c cram/string_alloc.h
cram/thread_pool.o cram/thread_pool.pico: cram/thread_pool.c cram/thread_pool.h
@@ -227,10 +266,14 @@ cram/zfio.o cram/zfio.pico: cram/zfio.c cram/os.h cram/zfio.h
bgzip: bgzip.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ bgzip.o libhts.a $(LDLIBS) -lz
+htsfile: htsfile.o libhts.a
+ $(CC) -pthread $(LDFLAGS) -o $@ htsfile.o libhts.a $(LDLIBS) -lz
+
tabix: tabix.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ tabix.o libhts.a $(LDLIBS) -lz
bgzip.o: bgzip.c $(htslib_bgzf_h) $(htslib_hts_h)
+htsfile.o: htsfile.c $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h)
tabix.o: tabix.c $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) htslib/kseq.h $(htslib_bgzf_h) $(htslib_hts_h)
@@ -239,7 +282,8 @@ tabix.o: tabix.c $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) htslib/kseq.h $
check test: $(BUILT_TEST_PROGRAMS)
test/fieldarith test/fieldarith.sam
test/hfile
- test/sam
+ test/sam test/ce.fa
+ test/test-regidx
cd test && REF_PATH=: ./test_view.pl
cd test && ./test.pl
@@ -252,6 +296,9 @@ test/hfile: test/hfile.o libhts.a
test/sam: test/sam.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ test/sam.o libhts.a $(LDLIBS) -lz
+test/test-regidx: test/test-regidx.o libhts.a
+ $(CC) -pthread $(LDFLAGS) -o $@ test/test-regidx.o libhts.a $(LDLIBS) -lz
+
test/test_view: test/test_view.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LDLIBS) -lz
@@ -263,7 +310,8 @@ test/test-vcf-sweep: test/test-vcf-sweep.o libhts.a
test/fieldarith.o: test/fieldarith.c $(htslib_sam_h)
test/hfile.o: test/hfile.c $(htslib_hfile_h) $(htslib_hts_defs_h)
-test/sam.o: test/sam.c $(htslib_sam_h) htslib/kstring.h
+test/test-regidx.o: test/test-regidx.c $(htslib_regidx_h)
+test/sam.o: test/sam.c $(htslib_sam_h) $(htslib_faidx_h) htslib/kstring.h
test/test_view.o: test/test_view.c $(cram_h) $(htslib_sam_h)
test/test-vcf-api.o: test/test-vcf-api.c $(htslib_hts_h) $(htslib_vcf_h) htslib/kstring.h
test/test-vcf-sweep.o: test/test-vcf-sweep.c $(htslib_vcf_sweep_h)
@@ -273,7 +321,7 @@ install: libhts.a $(BUILT_PROGRAMS) installdirs install-$(SHLIB_FLAVOUR) install
$(INSTALL_PROGRAM) $(BUILT_PROGRAMS) $(DESTDIR)$(bindir)
$(INSTALL_DATA) htslib/*.h $(DESTDIR)$(includedir)/htslib
$(INSTALL_DATA) libhts.a $(DESTDIR)$(libdir)/libhts.a
- $(INSTALL_DATA) tabix.1 $(DESTDIR)$(man1dir)
+ $(INSTALL_DATA) htsfile.1 tabix.1 $(DESTDIR)$(man1dir)
$(INSTALL_DATA) faidx.5 sam.5 vcf.5 $(DESTDIR)$(man5dir)
installdirs:
@@ -315,6 +363,7 @@ clean: mostlyclean clean-$(SHLIB_FLAVOUR)
-rm -f libhts.a $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS)
distclean: clean
+ -rm -f config.cache config.log config.mk config.status
-rm -f TAGS *-uninstalled.pc
clean-so:
@@ -332,6 +381,6 @@ force:
.PHONY: all check clean distclean force install install-pkgconfig installdirs
-.PHONY: lib-shared lib-static mostlyclean tags test testclean
+.PHONY: lib-shared lib-static mostlyclean print-version tags test testclean
.PHONY: clean-so install-so
.PHONY: clean-dylib install-dylib
diff --git a/htslib/NEWS b/htslib/NEWS
new file mode 100644
index 0000000..c135613
--- /dev/null
+++ b/htslib/NEWS
@@ -0,0 +1,50 @@
+Noteworthy changes in release 1.2.1 (3 February 2015)
+
+* Reinstated hts_file_type() and FT_* macros, which were available until 1.1
+ but briefly removed in 1.2. This function is deprecated and will be removed
+ in a future release -- you should use hts_detect_format() etc instead
+
+
+Noteworthy changes in release 1.2 (2 February 2015)
+
+* HTSlib now has a configure script which checks your build environment
+ and allows for selection of optional extras. See INSTALL for details
+
+* By default, reference sequences are fetched from the EBI CRAM Reference
+ Registry and cached in your $HOME cache directory. This behaviour can
+ be controlled by setting REF_PATH and REF_CACHE environment variables
+ (see the samtools(1) man page for details)
+
+* Numerous CRAM improvements:
+ - Support for CRAM v3.0, an upcoming revision to CRAM supporting
+ better compression and per-container checksums
+ - EOF checking for v2.1 and v3.0 (similar to checking BAM EOF blocks)
+ - Non-standard values for PNEXT and TLEN fields are now preserved
+ - hts_set_fai_filename() now provides a reference file when encoding
+ - Generated read names are now numbered from 1, rather than being
+ labelled 'slice:record-in-slice'
+ - Multi-threading and speed improvements
+
+* New htsfile command for identifying file formats, and corresponding
+ file format detection APIs
+
+* New tabix --regions FILE, --targets FILE options for filtering via BED files
+
+* Optional iRODS file access, disabled by default. Configure with --with-irods
+ to enable accessing iRODS data objects directly via 'irods:DATAOBJ'
+
+* All occurrences of 2^29 in the source have been eliminated, so indexing
+ and querying against reference sequences larger than 512Mbp works (when
+ using CSI indices)
+
+* Support for plain GZIP compression in various places
+
+* VCF header editing speed improvements
+
+* Added seq_nt16_int[] (equivalent to the samtools API's bam_nt16_nt4_table)
+
+* Reinstated faidx_fetch_nseq(), which was accidentally removed from 1.1.
+ Now faidx_fetch_nseq() and faidx_nseq() are equivalent; eventually
+ faidx_fetch_nseq() will be deprecated and removed [#156]
+
+* Fixed bugs #141, #152, #155, #158, #159, and various memory leaks
diff --git a/htslib/bgzf.c b/htslib/bgzf.c
index 090bec7..5306458 100644
--- a/htslib/bgzf.c
+++ b/htslib/bgzf.c
@@ -23,8 +23,6 @@
THE SOFTWARE.
*/
-#include "config.h"
-
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -39,6 +37,9 @@
#include "htslib/bgzf.h"
#include "htslib/hfile.h"
+#define BGZF_CACHE
+#define BGZF_MT
+
#define BLOCK_HEADER_LENGTH 18
#define BLOCK_FOOTER_LENGTH 8
@@ -108,8 +109,8 @@ static inline void packInt32(uint8_t *buffer, uint32_t value)
static BGZF *bgzf_read_init(hFILE *hfpr)
{
BGZF *fp;
- uint8_t magic[2];
- ssize_t n = hpeek(hfpr, magic, 2);
+ uint8_t magic[18];
+ ssize_t n = hpeek(hfpr, magic, 18);
if (n < 0) return NULL;
fp = (BGZF*)calloc(1, sizeof(BGZF));
@@ -119,17 +120,30 @@ static BGZF *bgzf_read_init(hFILE *hfpr)
fp->is_compressed = (n==2 && magic[0]==0x1f && magic[1]==0x8b);
fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+ fp->is_compressed = (n==18 && magic[0]==0x1f && magic[1]==0x8b) ? 1 : 0;
+ fp->is_gzip = ( !fp->is_compressed || ((magic[3]&4) && memcmp(&magic[12], "BC\2\0",4)==0) ) ? 0 : 1;
#ifdef BGZF_CACHE
fp->cache = kh_init(cache);
#endif
return fp;
}
-static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level, -2 plain uncompressed
+// get the compress level from the mode string: compress_level==-1 for the default level, -2 plain uncompressed
+static int mode2level(const char *__restrict mode)
+{
+ int i, compress_level = -1;
+ for (i = 0; mode[i]; ++i)
+ if (mode[i] >= '0' && mode[i] <= '9') break;
+ if (mode[i]) compress_level = (int)mode[i] - '0';
+ if (strchr(mode, 'u')) compress_level = -2;
+ return compress_level;
+}
+static BGZF *bgzf_write_init(const char *mode)
{
BGZF *fp;
fp = (BGZF*)calloc(1, sizeof(BGZF));
fp->is_write = 1;
+ int compress_level = mode2level(mode);
if ( compress_level==-2 )
{
fp->is_compressed = 0;
@@ -140,18 +154,17 @@ static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the d
fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
+ if ( strchr(mode,'g') )
+ {
+ // gzip output
+ fp->is_gzip = 1;
+ fp->gz_stream = (z_stream*)calloc(1,sizeof(z_stream));
+ fp->gz_stream->zalloc = NULL;
+ fp->gz_stream->zfree = NULL;
+ if ( deflateInit2(fp->gz_stream, fp->compress_level, Z_DEFLATED, 15|16, 8, Z_DEFAULT_STRATEGY)!=Z_OK ) return NULL;
+ }
return fp;
}
-// get the compress level from the mode string
-static int mode2level(const char *__restrict mode)
-{
- int i, compress_level = -1;
- for (i = 0; mode[i]; ++i)
- if (mode[i] >= '0' && mode[i] <= '9') break;
- if (mode[i]) compress_level = (int)mode[i] - '0';
- if (strchr(mode, 'u')) compress_level = -2;
- return compress_level;
-}
BGZF *bgzf_open(const char *path, const char *mode)
{
@@ -166,7 +179,7 @@ BGZF *bgzf_open(const char *path, const char *mode)
} else if (strchr(mode, 'w') || strchr(mode, 'a')) {
hFILE *fpw;
if ((fpw = hopen(path, mode)) == 0) return 0;
- fp = bgzf_write_init(mode2level(mode));
+ fp = bgzf_write_init(mode);
fp->fp = fpw;
}
else { errno = EINVAL; return 0; }
@@ -188,7 +201,7 @@ BGZF *bgzf_dopen(int fd, const char *mode)
} else if (strchr(mode, 'w') || strchr(mode, 'a')) {
hFILE *fpw;
if ((fpw = hdopen(fd, mode)) == 0) return 0;
- fp = bgzf_write_init(mode2level(mode));
+ fp = bgzf_write_init(mode);
fp->fp = fpw;
}
else { errno = EINVAL; return 0; }
@@ -205,7 +218,7 @@ BGZF *bgzf_hopen(hFILE *hfp, const char *mode)
fp = bgzf_read_init(hfp);
if (fp == NULL) return NULL;
} else if (strchr(mode, 'w') || strchr(mode, 'a')) {
- fp = bgzf_write_init(mode2level(mode));
+ fp = bgzf_write_init(mode);
}
else { errno = EINVAL; return 0; }
@@ -244,13 +257,6 @@ static int bgzf_gzip_compress(BGZF *fp, void *_dst, int *dlen, void *src, int sl
{
uint8_t *dst = (uint8_t*)_dst;
z_stream *zs = fp->gz_stream;
- if ( !zs )
- {
- zs = fp->gz_stream = (z_stream*)calloc(1,sizeof(z_stream));
- zs->zalloc = NULL;
- zs->zfree = NULL;
- if ( deflateInit2(zs, level, Z_DEFLATED, 15|16, 8, Z_DEFAULT_STRATEGY)!=Z_OK ) return -1; // gzip output
- }
int flush = slen ? Z_NO_FLUSH : Z_FINISH;
zs->next_in = (Bytef*)src;
zs->avail_in = slen;
@@ -433,7 +439,7 @@ int bgzf_read_block(BGZF *fp)
// Reading compressed file
int64_t block_address;
block_address = htell(fp->fp);
- if ( fp->is_gzip )
+ if ( fp->is_gzip && fp->gz_stream ) // is this is a initialized gzip stream?
{
count = inflate_gzip_block(fp, 0);
if ( count<0 )
diff --git a/htslib/bgzip.c b/htslib/bgzip.c
index a8a88af..2eeff3d 100644
--- a/htslib/bgzip.c
+++ b/htslib/bgzip.c
@@ -156,8 +156,8 @@ int main(int argc, char **argv)
strcpy(name, argv[optind]);
strcat(name, ".gz");
f_dst = write_open(name, is_forced);
- if (f_dst < 0) return 1;
free(name);
+ if (f_dst < 0) return 1;
}
}
else if (!pstdout && isatty(fileno((FILE *)stdout)) )
diff --git a/htslib/config.mk b/htslib/config.mk
new file mode 100644
index 0000000..c4452d2
--- /dev/null
+++ b/htslib/config.mk
@@ -0,0 +1,72 @@
+# Optional configure Makefile overrides for htslib.
+#
+# Copyright (C) 2015 Genome Research Ltd.
+#
+# Author: John Marshall <jm18 at sanger.ac.uk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# This is config.mk. Generated from config.mk.in by configure.
+#
+# If you use configure, this file overrides variables and augments rules
+# in the Makefile to reflect your configuration choices. If you don't run
+# configure, the main Makefile contains suitable conservative defaults.
+
+prefix = /usr/local
+exec_prefix = ${prefix}
+bindir = ${exec_prefix}/bin
+includedir = ${prefix}/include
+libdir = ${exec_prefix}/lib
+datarootdir = ${prefix}/share
+mandir = ${datarootdir}/man
+
+CC = gcc
+RANLIB = ranlib
+
+CFLAGS = -g -O2
+LDFLAGS =
+LDLIBS =
+
+
+# ifeq/.../endif, +=, and target-specific variables are GNU Make-specific.
+# If you don't have GNU Make, comment out this conditional and note that
+# to enable iRODS you will need to implement the following elsewhere.
+ifeq "iRODS-disabled" "iRODS-enabled"
+
+IRODS_HOME ?= /disabled
+
+EXTRA_CPPFLAGS_IRODS = \
+ -I$(IRODS_HOME)/lib/api/include \
+ -I$(IRODS_HOME)/lib/core/include \
+ -I$(IRODS_HOME)/lib/md5/include \
+ -I$(IRODS_HOME)/lib/sha1/include \
+ -I$(IRODS_HOME)/server/core/include \
+ -I$(IRODS_HOME)/server/drivers/include \
+ -I$(IRODS_HOME)/server/icat/include
+
+LDFLAGS += -L$(IRODS_HOME)/lib/core/obj
+LDLIBS += -lRodsAPIs -lgssapi_krb5
+
+LIBHTS_OBJS += hfile_irods.o
+
+hfile.o hfile.pico: CPPFLAGS += -DHAVE_IRODS
+
+hfile_irods.o hfile_irods.pico: CPPFLAGS += $(EXTRA_CPPFLAGS_IRODS)
+
+endif
diff --git a/htslib/config.mk.in b/htslib/config.mk.in
new file mode 100644
index 0000000..e058ee5
--- /dev/null
+++ b/htslib/config.mk.in
@@ -0,0 +1,72 @@
+# Optional configure Makefile overrides for htslib.
+#
+# Copyright (C) 2015 Genome Research Ltd.
+#
+# Author: John Marshall <jm18 at sanger.ac.uk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# This is @configure_input@
+#
+# If you use configure, this file overrides variables and augments rules
+# in the Makefile to reflect your configuration choices. If you don't run
+# configure, the main Makefile contains suitable conservative defaults.
+
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+bindir = @bindir@
+includedir = @includedir@
+libdir = @libdir@
+datarootdir = @datarootdir@
+mandir = @mandir@
+
+CC = @CC@
+RANLIB = @RANLIB@
+
+CFLAGS = @CFLAGS@
+LDFLAGS = @LDFLAGS@
+LDLIBS = @LIBS@
+
+
+# ifeq/.../endif, +=, and target-specific variables are GNU Make-specific.
+# If you don't have GNU Make, comment out this conditional and note that
+# to enable iRODS you will need to implement the following elsewhere.
+ifeq "iRODS-@irods@" "iRODS-enabled"
+
+@define_IRODS_HOME@
+
+EXTRA_CPPFLAGS_IRODS = \
+ -I$(IRODS_HOME)/lib/api/include \
+ -I$(IRODS_HOME)/lib/core/include \
+ -I$(IRODS_HOME)/lib/md5/include \
+ -I$(IRODS_HOME)/lib/sha1/include \
+ -I$(IRODS_HOME)/server/core/include \
+ -I$(IRODS_HOME)/server/drivers/include \
+ -I$(IRODS_HOME)/server/icat/include
+
+LDFLAGS += -L$(IRODS_HOME)/lib/core/obj
+LDLIBS += -lRodsAPIs -lgssapi_krb5
+
+LIBHTS_OBJS += hfile_irods.o
+
+hfile.o hfile.pico: CPPFLAGS += -DHAVE_IRODS
+
+hfile_irods.o hfile_irods.pico: CPPFLAGS += $(EXTRA_CPPFLAGS_IRODS)
+
+endif
diff --git a/htslib/configure b/htslib/configure
new file mode 100755
index 0000000..9fc4bd2
--- /dev/null
+++ b/htslib/configure
@@ -0,0 +1,4012 @@
+#! /bin/sh
+# Guess values for system-dependent variables and create Makefiles.
+# Generated by GNU Autoconf 2.68 for HTSlib 1.2.1.
+#
+# Report bugs to <samtools-help at lists.sourceforge.net>.
+#
+#
+# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
+# 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free Software
+# Foundation, Inc.
+#
+#
+# This configure script is free software; the Free Software Foundation
+# gives unlimited permission to copy, distribute and modify it.
+#
+# Portions copyright (C) 2015 Genome Research Ltd.
+#
+# This configure script is free software: you are free to change and
+# redistribute it. There is NO WARRANTY, to the extent permitted by law.
+## -------------------- ##
+## M4sh Initialization. ##
+## -------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+ setopt NO_GLOB_SUBST
+else
+ case `(set -o) 2>/dev/null` in #(
+ *posix*) :
+ set -o posix ;; #(
+ *) :
+ ;;
+esac
+fi
+
+
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+# Prefer a ksh shell builtin over an external printf program on Solaris,
+# but without wasting forks for bash or zsh.
+if test -z "$BASH_VERSION$ZSH_VERSION" \
+ && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then
+ as_echo='print -r --'
+ as_echo_n='print -rn --'
+elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+ as_echo='printf %s\n'
+ as_echo_n='printf %s'
+else
+ if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+ as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+ as_echo_n='/usr/ucb/echo -n'
+ else
+ as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+ as_echo_n_body='eval
+ arg=$1;
+ case $arg in #(
+ *"$as_nl"*)
+ expr "X$arg" : "X\\(.*\\)$as_nl";
+ arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+ esac;
+ expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+ '
+ export as_echo_n_body
+ as_echo_n='sh -c $as_echo_n_body as_echo'
+ fi
+ export as_echo_body
+ as_echo='sh -c $as_echo_body as_echo'
+fi
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+ PATH_SEPARATOR=:
+ (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+ (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+ PATH_SEPARATOR=';'
+ }
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order. Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+IFS=" "" $as_nl"
+
+# Find who we are. Look in the path if we contain no directory separator.
+as_myself=
+case $0 in #((
+ *[\\/]* ) as_myself=$0 ;;
+ *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+ done
+IFS=$as_save_IFS
+
+ ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+ as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+ $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+ exit 1
+fi
+
+# Unset variables that we do not need and which cause bugs (e.g. in
+# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1"
+# suppresses any "Segmentation fault" message there. '((' could
+# trigger a bug in pdksh 5.2.14.
+for as_var in BASH_ENV ENV MAIL MAILPATH
+do eval test x\${$as_var+set} = xset \
+ && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# CDPATH.
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
+
+if test "x$CONFIG_SHELL" = x; then
+ as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then :
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '\${1+\"\$@\"}'='\"\$@\"'
+ setopt NO_GLOB_SUBST
+else
+ case \`(set -o) 2>/dev/null\` in #(
+ *posix*) :
+ set -o posix ;; #(
+ *) :
+ ;;
+esac
+fi
+"
+ as_required="as_fn_return () { (exit \$1); }
+as_fn_success () { as_fn_return 0; }
+as_fn_failure () { as_fn_return 1; }
+as_fn_ret_success () { return 0; }
+as_fn_ret_failure () { return 1; }
+
+exitcode=0
+as_fn_success || { exitcode=1; echo as_fn_success failed.; }
+as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; }
+as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; }
+as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; }
+if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then :
+
+else
+ exitcode=1; echo positional parameters were not saved.
+fi
+test x\$exitcode = x0 || exit 1"
+ as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO
+ as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO
+ eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" &&
+ test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1"
+ if (eval "$as_required") 2>/dev/null; then :
+ as_have_required=yes
+else
+ as_have_required=no
+fi
+ if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then :
+
+else
+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+as_found=false
+for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ as_found=:
+ case $as_dir in #(
+ /*)
+ for as_base in sh bash ksh sh5; do
+ # Try only shells that exist, to save several forks.
+ as_shell=$as_dir/$as_base
+ if { test -f "$as_shell" || test -f "$as_shell.exe"; } &&
+ { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then :
+ CONFIG_SHELL=$as_shell as_have_required=yes
+ if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then :
+ break 2
+fi
+fi
+ done;;
+ esac
+ as_found=false
+done
+$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } &&
+ { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then :
+ CONFIG_SHELL=$SHELL as_have_required=yes
+fi; }
+IFS=$as_save_IFS
+
+
+ if test "x$CONFIG_SHELL" != x; then :
+ # We cannot yet assume a decent shell, so we have to provide a
+ # neutralization value for shells without unset; and this also
+ # works around shells that cannot unset nonexistent variables.
+ # Preserve -v and -x to the replacement shell.
+ BASH_ENV=/dev/null
+ ENV=/dev/null
+ (unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
+ export CONFIG_SHELL
+ case $- in # ((((
+ *v*x* | *x*v* ) as_opts=-vx ;;
+ *v* ) as_opts=-v ;;
+ *x* ) as_opts=-x ;;
+ * ) as_opts= ;;
+ esac
+ exec "$CONFIG_SHELL" $as_opts "$as_myself" ${1+"$@"}
+fi
+
+ if test x$as_have_required = xno; then :
+ $as_echo "$0: This script requires a shell more modern than all"
+ $as_echo "$0: the shells that I found on your system."
+ if test x${ZSH_VERSION+set} = xset ; then
+ $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should"
+ $as_echo "$0: be upgraded to zsh 4.3.4 or later."
+ else
+ $as_echo "$0: Please tell bug-autoconf at gnu.org and
+$0: samtools-help at lists.sourceforge.net about your system,
+$0: including any error possibly output before this
+$0: message. Then install a modern shell, or manually run
+$0: the script under such a shell if you do have one."
+ fi
+ exit 1
+fi
+fi
+fi
+SHELL=${CONFIG_SHELL-/bin/sh}
+export SHELL
+# Unset more variables known to interfere with behavior of common tools.
+CLICOLOR_FORCE= GREP_OPTIONS=
+unset CLICOLOR_FORCE GREP_OPTIONS
+
+## --------------------- ##
+## M4sh Shell Functions. ##
+## --------------------- ##
+# as_fn_unset VAR
+# ---------------
+# Portably unset VAR.
+as_fn_unset ()
+{
+ { eval $1=; unset $1;}
+}
+as_unset=as_fn_unset
+
+# as_fn_set_status STATUS
+# -----------------------
+# Set $? to STATUS, without forking.
+as_fn_set_status ()
+{
+ return $1
+} # as_fn_set_status
+
+# as_fn_exit STATUS
+# -----------------
+# Exit the shell with STATUS, even in a "trap 0" or "set -e" context.
+as_fn_exit ()
+{
+ set +e
+ as_fn_set_status $1
+ exit $1
+} # as_fn_exit
+
+# as_fn_mkdir_p
+# -------------
+# Create "$as_dir" as a directory, including parents if necessary.
+as_fn_mkdir_p ()
+{
+
+ case $as_dir in #(
+ -*) as_dir=./$as_dir;;
+ esac
+ test -d "$as_dir" || eval $as_mkdir_p || {
+ as_dirs=
+ while :; do
+ case $as_dir in #(
+ *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+ *) as_qdir=$as_dir;;
+ esac
+ as_dirs="'$as_qdir' $as_dirs"
+ as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$as_dir" : 'X\(//\)[^/]' \| \
+ X"$as_dir" : 'X\(//\)$' \| \
+ X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_dir" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ test -d "$as_dir" && break
+ done
+ test -z "$as_dirs" || eval "mkdir $as_dirs"
+ } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir"
+
+
+} # as_fn_mkdir_p
+# as_fn_append VAR VALUE
+# ----------------------
+# Append the text in VALUE to the end of the definition contained in VAR. Take
+# advantage of any shell optimizations that allow amortized linear growth over
+# repeated appends, instead of the typical quadratic growth present in naive
+# implementations.
+if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then :
+ eval 'as_fn_append ()
+ {
+ eval $1+=\$2
+ }'
+else
+ as_fn_append ()
+ {
+ eval $1=\$$1\$2
+ }
+fi # as_fn_append
+
+# as_fn_arith ARG...
+# ------------------
+# Perform arithmetic evaluation on the ARGs, and store the result in the
+# global $as_val. Take advantage of shells that can avoid forks. The arguments
+# must be portable across $(()) and expr.
+if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
+ eval 'as_fn_arith ()
+ {
+ as_val=$(( $* ))
+ }'
+else
+ as_fn_arith ()
+ {
+ as_val=`expr "$@" || test $? -eq 1`
+ }
+fi # as_fn_arith
+
+
+# as_fn_error STATUS ERROR [LINENO LOG_FD]
+# ----------------------------------------
+# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are
+# provided, also output the error to LOG_FD, referencing LINENO. Then exit the
+# script with STATUS, using 1 if that was 0.
+as_fn_error ()
+{
+ as_status=$1; test $as_status -eq 0 && as_status=1
+ if test "$4"; then
+ as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+ fi
+ $as_echo "$as_me: error: $2" >&2
+ as_fn_exit $as_status
+} # as_fn_error
+
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+ test "X`expr 00001 : '.*\(...\)'`" = X001; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+ as_basename=basename
+else
+ as_basename=false
+fi
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+ as_dirname=dirname
+else
+ as_dirname=false
+fi
+
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+ X"$0" : 'X\(//\)$' \| \
+ X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X/"$0" |
+ sed '/^.*\/\([^/][^/]*\)\/*$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+
+ as_lineno_1=$LINENO as_lineno_1a=$LINENO
+ as_lineno_2=$LINENO as_lineno_2a=$LINENO
+ eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" &&
+ test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || {
+ # Blame Lee E. McMahon (1931-1989) for sed's syntax. :-)
+ sed -n '
+ p
+ /[$]LINENO/=
+ ' <$as_myself |
+ sed '
+ s/[$]LINENO.*/&-/
+ t lineno
+ b
+ :lineno
+ N
+ :loop
+ s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/
+ t loop
+ s/-\n.*//
+ ' >$as_me.lineno &&
+ chmod +x "$as_me.lineno" ||
+ { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; }
+
+ # Don't try to exec as it changes $[0], causing all sort of problems
+ # (the dirname of $[0] is not the place where we might find the
+ # original and so on. Autoconf is especially sensitive to this).
+ . "./$as_me.lineno"
+ # Exit status is that of the last command.
+ exit
+}
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in #(((((
+-n*)
+ case `echo 'xy\c'` in
+ *c*) ECHO_T=' ';; # ECHO_T is single tab character.
+ xy) ECHO_C='\c';;
+ *) echo `echo ksh88 bug on AIX 6.1` > /dev/null
+ ECHO_T=' ';;
+ esac;;
+*)
+ ECHO_N='-n';;
+esac
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+ rm -f conf$$.dir/conf$$.file
+else
+ rm -f conf$$.dir
+ mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+ if ln -s conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s='ln -s'
+ # ... but there are two gotchas:
+ # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+ # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+ # In both cases, we have to default to `cp -p'.
+ ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+ as_ln_s='cp -p'
+ elif ln conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s=ln
+ else
+ as_ln_s='cp -p'
+ fi
+else
+ as_ln_s='cp -p'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+if mkdir -p . 2>/dev/null; then
+ as_mkdir_p='mkdir -p "$as_dir"'
+else
+ test -d ./-p && rmdir ./-p
+ as_mkdir_p=false
+fi
+
+if test -x / >/dev/null 2>&1; then
+ as_test_x='test -x'
+else
+ if ls -dL / >/dev/null 2>&1; then
+ as_ls_L_option=L
+ else
+ as_ls_L_option=
+ fi
+ as_test_x='
+ eval sh -c '\''
+ if test -d "$1"; then
+ test -d "$1/.";
+ else
+ case $1 in #(
+ -*)set "./$1";;
+ esac;
+ case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in #((
+ ???[sx]*):;;*)false;;esac;fi
+ '\'' sh
+ '
+fi
+as_executable_p=$as_test_x
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+test -n "$DJDIR" || exec 7<&0 </dev/null
+exec 6>&1
+
+# Name of the host.
+# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status,
+# so uname gets run too.
+ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q`
+
+#
+# Initializations.
+#
+ac_default_prefix=/usr/local
+ac_clean_files=
+ac_config_libobj_dir=.
+LIBOBJS=
+cross_compiling=no
+subdirs=
+MFLAGS=
+MAKEFLAGS=
+
+# Identity of this package.
+PACKAGE_NAME='HTSlib'
+PACKAGE_TARNAME='htslib'
+PACKAGE_VERSION='1.2.1'
+PACKAGE_STRING='HTSlib 1.2.1'
+PACKAGE_BUGREPORT='samtools-help at lists.sourceforge.net'
+PACKAGE_URL='http://www.htslib.org/'
+
+ac_unique_file="hts.c"
+ac_subst_vars='LTLIBOBJS
+LIBOBJS
+define_IRODS_HOME
+irods
+RANLIB
+OBJEXT
+EXEEXT
+ac_ct_CC
+CPPFLAGS
+LDFLAGS
+CFLAGS
+CC
+target_alias
+host_alias
+build_alias
+LIBS
+ECHO_T
+ECHO_N
+ECHO_C
+DEFS
+mandir
+localedir
+libdir
+psdir
+pdfdir
+dvidir
+htmldir
+infodir
+docdir
+oldincludedir
+includedir
+localstatedir
+sharedstatedir
+sysconfdir
+datadir
+datarootdir
+libexecdir
+sbindir
+bindir
+program_transform_name
+prefix
+exec_prefix
+PACKAGE_URL
+PACKAGE_BUGREPORT
+PACKAGE_STRING
+PACKAGE_VERSION
+PACKAGE_TARNAME
+PACKAGE_NAME
+PATH_SEPARATOR
+SHELL'
+ac_subst_files=''
+ac_user_opts='
+enable_option_checking
+with_irods
+'
+ ac_precious_vars='build_alias
+host_alias
+target_alias
+CC
+CFLAGS
+LDFLAGS
+LIBS
+CPPFLAGS'
+
+
+# Initialize some variables set by options.
+ac_init_help=
+ac_init_version=false
+ac_unrecognized_opts=
+ac_unrecognized_sep=
+# The variables have the same names as the options, with
+# dashes changed to underlines.
+cache_file=/dev/null
+exec_prefix=NONE
+no_create=
+no_recursion=
+prefix=NONE
+program_prefix=NONE
+program_suffix=NONE
+program_transform_name=s,x,x,
+silent=
+site=
+srcdir=
+verbose=
+x_includes=NONE
+x_libraries=NONE
+
+# Installation directory options.
+# These are left unexpanded so users can "make install exec_prefix=/foo"
+# and all the variables that are supposed to be based on exec_prefix
+# by default will actually change.
+# Use braces instead of parens because sh, perl, etc. also accept them.
+# (The list follows the same order as the GNU Coding Standards.)
+bindir='${exec_prefix}/bin'
+sbindir='${exec_prefix}/sbin'
+libexecdir='${exec_prefix}/libexec'
+datarootdir='${prefix}/share'
+datadir='${datarootdir}'
+sysconfdir='${prefix}/etc'
+sharedstatedir='${prefix}/com'
+localstatedir='${prefix}/var'
+includedir='${prefix}/include'
+oldincludedir='/usr/include'
+docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
+infodir='${datarootdir}/info'
+htmldir='${docdir}'
+dvidir='${docdir}'
+pdfdir='${docdir}'
+psdir='${docdir}'
+libdir='${exec_prefix}/lib'
+localedir='${datarootdir}/locale'
+mandir='${datarootdir}/man'
+
+# Parse the command line.
+# "for ac_option" with no "in" list iterates over the positional parameters.
+# Options come in two forms:
+#   --name=value  handled directly by the "name=*" branch of each pair;
+#   --name value  the "name" branch stores the target variable in ac_prev,
+#                 and the next iteration assigns the following word to it.
+# Each option also accepts its abbreviations, listed explicitly.
+ac_prev=
+ac_dashdash=
+for ac_option
+do
+ # If the previous option needs an argument, assign it.
+ if test -n "$ac_prev"; then
+ eval $ac_prev=\$ac_option
+ ac_prev=
+ continue
+ fi
+
+ # Split off the value: text after the first "=" if any, empty for a
+ # trailing "=", and "yes" when the option has no "=" at all.
+ case $ac_option in
+ *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;;
+ *=) ac_optarg= ;;
+ *) ac_optarg=yes ;;
+ esac
+
+ # Accept the important Cygnus configure options, so we can diagnose typos.
+
+ # Once "--" has been seen, ac_dashdash=yes is prepended to the word being
+ # matched, so none of the "-..." patterns can match any more and the word
+ # falls through to the VAR=VALUE or bare-argument branches at the end.
+ case $ac_dashdash$ac_option in
+ --)
+ ac_dashdash=yes ;;
+
+ -bindir | --bindir | --bindi | --bind | --bin | --bi)
+ ac_prev=bindir ;;
+ -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
+ bindir=$ac_optarg ;;
+
+ -build | --build | --buil | --bui | --bu)
+ ac_prev=build_alias ;;
+ -build=* | --build=* | --buil=* | --bui=* | --bu=*)
+ build_alias=$ac_optarg ;;
+
+ -cache-file | --cache-file | --cache-fil | --cache-fi \
+ | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
+ ac_prev=cache_file ;;
+ -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
+ | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
+ cache_file=$ac_optarg ;;
+
+ --config-cache | -C)
+ cache_file=config.cache ;;
+
+ -datadir | --datadir | --datadi | --datad)
+ ac_prev=datadir ;;
+ -datadir=* | --datadir=* | --datadi=* | --datad=*)
+ datadir=$ac_optarg ;;
+
+ -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \
+ | --dataroo | --dataro | --datar)
+ ac_prev=datarootdir ;;
+ -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \
+ | --dataroot=* | --dataroo=* | --dataro=* | --datar=*)
+ datarootdir=$ac_optarg ;;
+
+ # --disable-FEATURE sets enable_FEATURE=no (with -, + and . in FEATURE
+ # mapped to _). A feature not listed in the newline-separated
+ # $ac_user_opts is remembered in ac_unrecognized_opts for a later
+ # diagnostic, but the variable is still set.
+ -disable-* | --disable-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ as_fn_error $? "invalid feature name: $ac_useropt"
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"enable_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval enable_$ac_useropt=no ;;
+
+ -docdir | --docdir | --docdi | --doc | --do)
+ ac_prev=docdir ;;
+ -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*)
+ docdir=$ac_optarg ;;
+
+ -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv)
+ ac_prev=dvidir ;;
+ -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*)
+ dvidir=$ac_optarg ;;
+
+ # --enable-FEATURE[=ARG] sets enable_FEATURE to ARG ("yes" when no ARG
+ # is given); name checking is the same as for --disable-FEATURE.
+ -enable-* | --enable-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ as_fn_error $? "invalid feature name: $ac_useropt"
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"enable_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval enable_$ac_useropt=\$ac_optarg ;;
+
+ -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
+ | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
+ | --exec | --exe | --ex)
+ ac_prev=exec_prefix ;;
+ -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
+ | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
+ | --exec=* | --exe=* | --ex=*)
+ exec_prefix=$ac_optarg ;;
+
+ -gas | --gas | --ga | --g)
+ # Obsolete; use --with-gas.
+ with_gas=yes ;;
+
+ -help | --help | --hel | --he | -h)
+ ac_init_help=long ;;
+ -help=r* | --help=r* | --hel=r* | --he=r* | -hr*)
+ ac_init_help=recursive ;;
+ -help=s* | --help=s* | --hel=s* | --he=s* | -hs*)
+ ac_init_help=short ;;
+
+ -host | --host | --hos | --ho)
+ ac_prev=host_alias ;;
+ -host=* | --host=* | --hos=* | --ho=*)
+ host_alias=$ac_optarg ;;
+
+ -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht)
+ ac_prev=htmldir ;;
+ -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \
+ | --ht=*)
+ htmldir=$ac_optarg ;;
+
+ -includedir | --includedir | --includedi | --included | --include \
+ | --includ | --inclu | --incl | --inc)
+ ac_prev=includedir ;;
+ -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
+ | --includ=* | --inclu=* | --incl=* | --inc=*)
+ includedir=$ac_optarg ;;
+
+ -infodir | --infodir | --infodi | --infod | --info | --inf)
+ ac_prev=infodir ;;
+ -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
+ infodir=$ac_optarg ;;
+
+ -libdir | --libdir | --libdi | --libd)
+ ac_prev=libdir ;;
+ -libdir=* | --libdir=* | --libdi=* | --libd=*)
+ libdir=$ac_optarg ;;
+
+ -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
+ | --libexe | --libex | --libe)
+ ac_prev=libexecdir ;;
+ -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
+ | --libexe=* | --libex=* | --libe=*)
+ libexecdir=$ac_optarg ;;
+
+ -localedir | --localedir | --localedi | --localed | --locale)
+ ac_prev=localedir ;;
+ -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*)
+ localedir=$ac_optarg ;;
+
+ -localstatedir | --localstatedir | --localstatedi | --localstated \
+ | --localstate | --localstat | --localsta | --localst | --locals)
+ ac_prev=localstatedir ;;
+ -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
+ | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*)
+ localstatedir=$ac_optarg ;;
+
+ -mandir | --mandir | --mandi | --mand | --man | --ma | --m)
+ ac_prev=mandir ;;
+ -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
+ mandir=$ac_optarg ;;
+
+ -nfp | --nfp | --nf)
+ # Obsolete; use --without-fp.
+ with_fp=no ;;
+
+ -no-create | --no-create | --no-creat | --no-crea | --no-cre \
+ | --no-cr | --no-c | -n)
+ no_create=yes ;;
+
+ -no-recursion | --no-recursion | --no-recursio | --no-recursi \
+ | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
+ no_recursion=yes ;;
+
+ -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
+ | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
+ | --oldin | --oldi | --old | --ol | --o)
+ ac_prev=oldincludedir ;;
+ -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
+ | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
+ | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
+ oldincludedir=$ac_optarg ;;
+
+ -prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
+ ac_prev=prefix ;;
+ -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
+ prefix=$ac_optarg ;;
+
+ -program-prefix | --program-prefix | --program-prefi | --program-pref \
+ | --program-pre | --program-pr | --program-p)
+ ac_prev=program_prefix ;;
+ -program-prefix=* | --program-prefix=* | --program-prefi=* \
+ | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
+ program_prefix=$ac_optarg ;;
+
+ -program-suffix | --program-suffix | --program-suffi | --program-suff \
+ | --program-suf | --program-su | --program-s)
+ ac_prev=program_suffix ;;
+ -program-suffix=* | --program-suffix=* | --program-suffi=* \
+ | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
+ program_suffix=$ac_optarg ;;
+
+ -program-transform-name | --program-transform-name \
+ | --program-transform-nam | --program-transform-na \
+ | --program-transform-n | --program-transform- \
+ | --program-transform | --program-transfor \
+ | --program-transfo | --program-transf \
+ | --program-trans | --program-tran \
+ | --progr-tra | --program-tr | --program-t)
+ ac_prev=program_transform_name ;;
+ -program-transform-name=* | --program-transform-name=* \
+ | --program-transform-nam=* | --program-transform-na=* \
+ | --program-transform-n=* | --program-transform-=* \
+ | --program-transform=* | --program-transfor=* \
+ | --program-transfo=* | --program-transf=* \
+ | --program-trans=* | --program-tran=* \
+ | --progr-tra=* | --program-tr=* | --program-t=*)
+ program_transform_name=$ac_optarg ;;
+
+ -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd)
+ ac_prev=pdfdir ;;
+ -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*)
+ pdfdir=$ac_optarg ;;
+
+ -psdir | --psdir | --psdi | --psd | --ps)
+ ac_prev=psdir ;;
+ -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*)
+ psdir=$ac_optarg ;;
+
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil)
+ silent=yes ;;
+
+ -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
+ ac_prev=sbindir ;;
+ -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
+ | --sbi=* | --sb=*)
+ sbindir=$ac_optarg ;;
+
+ -sharedstatedir | --sharedstatedir | --sharedstatedi \
+ | --sharedstated | --sharedstate | --sharedstat | --sharedsta \
+ | --sharedst | --shareds | --shared | --share | --shar \
+ | --sha | --sh)
+ ac_prev=sharedstatedir ;;
+ -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
+ | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
+ | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
+ | --sha=* | --sh=*)
+ sharedstatedir=$ac_optarg ;;
+
+ -site | --site | --sit)
+ ac_prev=site ;;
+ -site=* | --site=* | --sit=*)
+ site=$ac_optarg ;;
+
+ -srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
+ ac_prev=srcdir ;;
+ -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
+ srcdir=$ac_optarg ;;
+
+ -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
+ | --syscon | --sysco | --sysc | --sys | --sy)
+ ac_prev=sysconfdir ;;
+ -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
+ | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
+ sysconfdir=$ac_optarg ;;
+
+ -target | --target | --targe | --targ | --tar | --ta | --t)
+ ac_prev=target_alias ;;
+ -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
+ target_alias=$ac_optarg ;;
+
+ -v | -verbose | --verbose | --verbos | --verbo | --verb)
+ verbose=yes ;;
+
+ -version | --version | --versio | --versi | --vers | -V)
+ ac_init_version=: ;;
+
+ # --with-PACKAGE[=ARG] / --without-PACKAGE mirror the --enable/--disable
+ # handling above, but set with_PACKAGE instead of enable_FEATURE.
+ -with-* | --with-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ as_fn_error $? "invalid package name: $ac_useropt"
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"with_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval with_$ac_useropt=\$ac_optarg ;;
+
+ -without-* | --without-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ as_fn_error $? "invalid package name: $ac_useropt"
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"with_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval with_$ac_useropt=no ;;
+
+ --x)
+ # Obsolete; use --with-x.
+ with_x=yes ;;
+
+ -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
+ | --x-incl | --x-inc | --x-in | --x-i)
+ ac_prev=x_includes ;;
+ -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
+ | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
+ x_includes=$ac_optarg ;;
+
+ -x-libraries | --x-libraries | --x-librarie | --x-librari \
+ | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
+ ac_prev=x_libraries ;;
+ -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
+ | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
+ x_libraries=$ac_optarg ;;
+
+ # Any other word starting with "-" is a hard error.
+ -*) as_fn_error $? "unrecognized option: \`$ac_option'
+Try \`$0 --help' for more information"
+ ;;
+
+ # VAR=VALUE: after validating VAR as a shell identifier, assign it and
+ # export it to the environment.
+ *=*)
+ ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='`
+ # Reject names that are not valid shell variable names.
+ case $ac_envvar in #(
+ '' | [0-9]* | *[!_$as_cr_alnum]* )
+ as_fn_error $? "invalid variable name: \`$ac_envvar'" ;;
+ esac
+ eval $ac_envvar=\$ac_optarg
+ export $ac_envvar ;;
+
+ # A bare word is treated as a system type: it becomes the value of
+ # whichever of build_alias/host_alias/target_alias is still unset
+ # (the ${var=value} expansions only assign to unset variables).
+ *)
+ # FIXME: should be removed in autoconf 3.0.
+ $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2
+ expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null &&
+ $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2
+ : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}"
+ ;;
+
+ esac
+done
+
+# If ac_prev is still set, the last option on the command line was of the
+# "--name value" form but no value followed it. Rebuild the option name
+# from the variable name (underscores back to dashes) for the message.
+if test -n "$ac_prev"; then
+ ac_option=--`echo $ac_prev | sed 's/_/-/g'`
+ as_fn_error $? "missing argument to $ac_option"
+fi
+
+# Report --enable/--disable/--with/--without options that were not in
+# $ac_user_opts. $enable_option_checking selects the severity:
+# "no" is silent, "fatal" aborts, anything else prints a warning.
+if test -n "$ac_unrecognized_opts"; then
+ case $enable_option_checking in
+ no) ;;
+ fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;;
+ *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;;
+ esac
+fi
+
+# Check all directory arguments for consistency.
+# Each value has trailing slashes removed and must then look absolute;
+# otherwise configure stops with an error naming the offending option.
+for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \
+ datadir sysconfdir sharedstatedir localstatedir includedir \
+ oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
+ libdir localedir mandir
+do
+ eval ac_val=\$$ac_var
+ # Remove trailing slashes.
+ # (If the value consists only of slashes the first expr pattern fails
+ # and the second one keeps the value unchanged.)
+ case $ac_val in
+ */ )
+ ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'`
+ eval $ac_var=\$ac_val;;
+ esac
+ # Be sure to have absolute directory names.
+ # Accepted: values starting with "/", "\" or "$" (the latter covers the
+ # unexpanded ${prefix}-style defaults), and drive-letter paths such as
+ # "C:/". NONE or empty is accepted only for the *prefix variables.
+ case $ac_val in
+ [\\/$]* | ?:[\\/]* ) continue;;
+ NONE | '' ) case $ac_var in *prefix ) continue;; esac;;
+ esac
+ as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val"
+done
+
+# There might be people who depend on the old broken behavior: `$host'
+# used to hold the argument of --host etc.
+# FIXME: To remove some day.
+build=$build_alias
+host=$host_alias
+target=$target_alias
+
+# FIXME: To remove some day.
+# Decide the initial cross_compiling state from --host/--build:
+#   --host without --build      -> "maybe", plus a warning
+#   --host different from --build -> "yes"
+# In all other cases cross_compiling is left untouched here.
+if test "x$host_alias" != x; then
+ if test "x$build_alias" = x; then
+ cross_compiling=maybe
+ $as_echo "$as_me: WARNING: if you wanted to set the --build type, don't use --host.
+ If a cross compiler is detected then cross compile mode will be used" >&2
+ elif test "x$build_alias" != "x$host_alias"; then
+ cross_compiling=yes
+ fi
+fi
+
+# With --host=TRIPLET, ac_tool_prefix becomes "TRIPLET-"; otherwise empty.
+ac_tool_prefix=
+test -n "$host_alias" && ac_tool_prefix=$host_alias-
+
+# --silent/--quiet: discard everything written to file descriptor 6,
+# which is where the "checking for ..." progress messages are sent.
+test "$silent" = yes && exec 6>/dev/null
+
+
+ac_pwd=`pwd` && test -n "$ac_pwd" &&
+ac_ls_di=`ls -di .` &&
+ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` ||
+ as_fn_error $? "working directory cannot be determined"
+test "X$ac_ls_di" = "X$ac_pwd_ls_di" ||
+ as_fn_error $? "pwd does not report name of working directory"
+
+
+# Find the source files, if location was not specified.
+# $ac_unique_file is the file whose presence identifies the source tree.
+if test -z "$srcdir"; then
+ ac_srcdir_defaulted=yes
+ # Try the directory containing this script, then the parent directory.
+ # The directory part of $as_myself is computed by the first of
+ # $as_dirname, $as_expr, or the sed script that succeeds.
+ ac_confdir=`$as_dirname -- "$as_myself" ||
+$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$as_myself" : 'X\(//\)[^/]' \| \
+ X"$as_myself" : 'X\(//\)$' \| \
+ X"$as_myself" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_myself" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ srcdir=$ac_confdir
+ if test ! -r "$srcdir/$ac_unique_file"; then
+ srcdir=..
+ fi
+else
+ ac_srcdir_defaulted=no
+fi
+# Whether srcdir was given or guessed, the unique file must be readable.
+# For a guessed srcdir the error message names both locations tried.
+if test ! -r "$srcdir/$ac_unique_file"; then
+ test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .."
+ as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir"
+fi
+ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work"
+# Resolve srcdir to an absolute name in a subshell so that the current
+# directory of the main script is not changed.
+ac_abs_confdir=`(
+ cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg"
+ pwd)`
+# When building in place, set srcdir=.
+if test "$ac_abs_confdir" = "$ac_pwd"; then
+ srcdir=.
+fi
+# Remove unnecessary trailing slashes from srcdir.
+# Double slashes in file names in object file debugging info
+# mess up M-x gdb in Emacs.
+case $srcdir in
+*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;;
+esac
+# For every variable named in $ac_precious_vars, record whether it is
+# currently set ("set" or empty) and its value, twice: under ac_env_* and
+# under ac_cv_env_*. The two copies are compared with each other later,
+# after the cache file has been loaded.
+for ac_var in $ac_precious_vars; do
+ eval ac_env_${ac_var}_set=\${${ac_var}+set}
+ eval ac_env_${ac_var}_value=\$${ac_var}
+ eval ac_cv_env_${ac_var}_set=\${${ac_var}+set}
+ eval ac_cv_env_${ac_var}_value=\$${ac_var}
+done
+
+#
+# Report the --help message.
+#
+# This first part (generic options and installation directories) is only
+# printed for plain --help, i.e. when ac_init_help is "long".
+if test "$ac_init_help" = "long"; then
+ # Omit some internal or obsolete options to make the list less imposing.
+ # This message is too long to be a string in the A/UX 3.1 sh.
+ # The delimiter is unquoted, so $0 and $ac_default_prefix are expanded
+ # in the text below; backquotes and literal dollars are backslash-escaped.
+ cat <<_ACEOF
+\`configure' configures HTSlib 1.2.1 to adapt to many kinds of systems.
+
+Usage: $0 [OPTION]... [VAR=VALUE]...
+
+To assign environment variables (e.g., CC, CFLAGS...), specify them as
+VAR=VALUE. See below for descriptions of some of the useful variables.
+
+Defaults for the options are specified in brackets.
+
+Configuration:
+ -h, --help display this help and exit
+ --help=short display options specific to this package
+ --help=recursive display the short help of all the included packages
+ -V, --version display version information and exit
+ -q, --quiet, --silent do not print \`checking ...' messages
+ --cache-file=FILE cache test results in FILE [disabled]
+ -C, --config-cache alias for \`--cache-file=config.cache'
+ -n, --no-create do not create output files
+ --srcdir=DIR find the sources in DIR [configure dir or \`..']
+
+Installation directories:
+ --prefix=PREFIX install architecture-independent files in PREFIX
+ [$ac_default_prefix]
+ --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX
+ [PREFIX]
+
+By default, \`make install' will install all the files in
+\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify
+an installation prefix other than \`$ac_default_prefix' using \`--prefix',
+for instance \`--prefix=\$HOME'.
+
+For better control, use the options below.
+
+Fine tuning of the installation directories:
+ --bindir=DIR user executables [EPREFIX/bin]
+ --sbindir=DIR system admin executables [EPREFIX/sbin]
+ --libexecdir=DIR program executables [EPREFIX/libexec]
+ --sysconfdir=DIR read-only single-machine data [PREFIX/etc]
+ --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com]
+ --localstatedir=DIR modifiable single-machine data [PREFIX/var]
+ --libdir=DIR object code libraries [EPREFIX/lib]
+ --includedir=DIR C header files [PREFIX/include]
+ --oldincludedir=DIR C header files for non-gcc [/usr/include]
+ --datarootdir=DIR read-only arch.-independent data root [PREFIX/share]
+ --datadir=DIR read-only architecture-independent data [DATAROOTDIR]
+ --infodir=DIR info documentation [DATAROOTDIR/info]
+ --localedir=DIR locale-dependent data [DATAROOTDIR/locale]
+ --mandir=DIR man documentation [DATAROOTDIR/man]
+ --docdir=DIR documentation root [DATAROOTDIR/doc/htslib]
+ --htmldir=DIR html documentation [DOCDIR]
+ --dvidir=DIR dvi documentation [DOCDIR]
+ --pdfdir=DIR pdf documentation [DOCDIR]
+ --psdir=DIR ps documentation [DOCDIR]
+_ACEOF
+
+ # This second here-document has no content, so it prints nothing.
+ cat <<\_ACEOF
+_ACEOF
+fi
+
+if test -n "$ac_init_help"; then
+ case $ac_init_help in
+ short | recursive ) echo "Configuration of HTSlib 1.2.1:";;
+ esac
+ cat <<\_ACEOF
+
+Optional Packages:
+ --with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
+ --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
+ --with-irods[=DIR] use RodsAPIs library (in DIR) to support iRODS URLs
+
+Some influential environment variables:
+ CC C compiler command
+ CFLAGS C compiler flags
+ LDFLAGS linker flags, e.g. -L<lib dir> if you have libraries in a
+ nonstandard directory <lib dir>
+ LIBS libraries to pass to the linker, e.g. -l<library>
+ CPPFLAGS (Objective) C/C++ preprocessor flags, e.g. -I<include dir> if
+ you have headers in a nonstandard directory <include dir>
+
+Use these variables to override the choices made by `configure' or to help
+it to find libraries and programs with nonstandard names/locations.
+
+Report bugs to <samtools-help at lists.sourceforge.net>.
+HTSlib home page: <http://www.htslib.org/>.
+_ACEOF
+ac_status=$?
+fi
+
+# --help=recursive: additionally run "configure --help=recursive" in every
+# directory listed in $ac_subdirs_all that exists, either relative to the
+# current directory or, failing that, relative to $srcdir.
+if test "$ac_init_help" = "recursive"; then
+ # If there are subdirs, report their specific --help.
+ # The leading ":" keeps the word list non-empty; it is skipped right away.
+ for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue
+ test -d "$ac_dir" ||
+ { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } ||
+ continue
+ ac_builddir=.
+
+# Work out the path from $ac_dir back up to the top of the build tree.
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+ ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+ # A ".." for each directory in $ac_dir_suffix.
+ ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+ case $ac_top_builddir_sub in
+ "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+ *) ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+ esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+# Derive the matching source-tree paths; how depends on the form of $srcdir.
+case $srcdir in
+ .) # We are building in place.
+ ac_srcdir=.
+ ac_top_srcdir=$ac_top_builddir_sub
+ ac_abs_top_srcdir=$ac_pwd ;;
+ [\\/]* | ?:[\\/]* ) # Absolute name.
+ ac_srcdir=$srcdir$ac_dir_suffix;
+ ac_top_srcdir=$srcdir
+ ac_abs_top_srcdir=$srcdir ;;
+ *) # Relative name.
+ ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+ ac_top_srcdir=$ac_top_build_prefix$srcdir
+ ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+ cd "$ac_dir" || { ac_status=$?; continue; }
+ # Check for guested configure.
+ # configure.gnu is preferred over configure when both exist.
+ if test -f "$ac_srcdir/configure.gnu"; then
+ echo &&
+ $SHELL "$ac_srcdir/configure.gnu" --help=recursive
+ elif test -f "$ac_srcdir/configure"; then
+ echo &&
+ $SHELL "$ac_srcdir/configure" --help=recursive
+ else
+ $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2
+ fi || ac_status=$?
+ cd "$ac_pwd" || { ac_status=$?; break; }
+ done
+fi
+
+# Any kind of help request ends the script here, with the status collected
+# while printing the help.
+test -n "$ac_init_help" && exit $ac_status
+# --version: ac_init_version holds a command name, "false" by default and
+# ":" (always true) once -V/--version has been seen, so it can be run
+# directly as the condition. Print the version banner and exit.
+if $ac_init_version; then
+ cat <<\_ACEOF
+HTSlib configure 1.2.1
+generated by GNU Autoconf 2.68
+
+Copyright (C) 2010 Free Software Foundation, Inc.
+This configure script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it.
+
+Portions copyright (C) 2015 Genome Research Ltd.
+
+This configure script is free software: you are free to change and
+redistribute it. There is NO WARRANTY, to the extent permitted by law.
+_ACEOF
+ exit
+fi
+
+## ------------------------ ##
+## Autoconf initialization. ##
+## ------------------------ ##
+
+# ac_fn_c_try_compile LINENO
+# --------------------------
+# Try to compile conftest.$ac_ext, and return whether this succeeded.
+#
+# Runs the command held in $ac_compile, logging the command line, its
+# stderr output and its exit status to file descriptor 5. The attempt
+# counts as successful only if the command exits 0, conftest.$ac_objext
+# is non-empty, and (when $ac_c_werror_flag is set) nothing was written
+# to stderr. On failure the test program is copied to the log.
+# The result (0 or 1) is passed to as_fn_set_status, presumably so that
+# it becomes the function's return status.
+ac_fn_c_try_compile ()
+{
+ # Use $1 as the line number for log messages unless one is already set,
+ # and remember the previous state so it can be restored on exit.
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ rm -f conftest.$ac_objext
+ if { { ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_compile") 2>conftest.err
+ ac_status=$?
+ # Drop lines starting with "+" (shell trace output) from the captured
+ # stderr before logging it and before testing whether it is empty.
+ if test -s conftest.err; then
+ grep -v '^ *+' conftest.err >conftest.er1
+ cat conftest.er1 >&5
+ mv -f conftest.er1 conftest.err
+ fi
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then :
+ ac_retval=0
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_retval=1
+fi
+ # Restore the saved line-number state; as_lineno is unset again when no
+ # outer caller had set it.
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+ as_fn_set_status $ac_retval
+
+} # ac_fn_c_try_compile
+
+# ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES
+# -------------------------------------------------------
+# Tests whether HEADER exists and can be compiled using the include files in
+# INCLUDES, setting the cache variable VAR accordingly.
+#
+# $1 = LINENO, $2 = HEADER, $3 = name of the cache variable, $4 = INCLUDES.
+# VAR is set to "yes" or "no". If VAR is already set, the compile test is
+# skipped and "(cached)" is printed before the result. Progress goes to
+# file descriptor 6 and a copy of each message to file descriptor 5.
+ac_fn_c_check_header_compile ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+# "${VAR+:} false" runs ":" (success) when VAR is set, "false" otherwise.
+if eval \${$3+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ # Test program: confdefs.h, then INCLUDES, then the header under test.
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$4
+#include <$2>
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ eval "$3=yes"
+else
+ eval "$3=no"
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_header_compile
+
+# ac_fn_c_try_link LINENO
+# -----------------------
+# Try to link conftest.$ac_ext, and return whether this succeeded.
+#
+# Runs the command held in $ac_link with the same logging as the compile
+# helper. Success requires exit status 0, a non-empty conftest$ac_exeext,
+# no stderr output when $ac_c_werror_flag is set, and, unless
+# cross_compiling is "yes", that $as_test_x accepts the resulting file.
+# The result (0 or 1) is passed to as_fn_set_status, presumably so that
+# it becomes the function's return status.
+ac_fn_c_try_link ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ rm -f conftest.$ac_objext conftest$ac_exeext
+ if { { ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_link") 2>conftest.err
+ ac_status=$?
+ # Drop lines starting with "+" (shell trace output) from the captured
+ # stderr before logging it and before testing whether it is empty.
+ if test -s conftest.err; then
+ grep -v '^ *+' conftest.err >conftest.er1
+ cat conftest.er1 >&5
+ mv -f conftest.er1 conftest.err
+ fi
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest$ac_exeext && {
+ test "$cross_compiling" = yes ||
+ $as_test_x conftest$ac_exeext
+ }; then :
+ ac_retval=0
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_retval=1
+fi
+ # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information
+ # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would
+ # interfere with the next link command; also delete a directory that is
+ # left behind by Apple's compiler. We do this before executing the actions.
+ rm -rf conftest.dSYM conftest_ipa8_conftest.oo
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+ as_fn_set_status $ac_retval
+
+} # ac_fn_c_try_link
+cat >config.log <<_ACEOF
+This file contains any messages produced by compilers while
+running configure, to aid debugging if configure makes a mistake.
+
+It was created by HTSlib $as_me 1.2.1, which was
+generated by GNU Autoconf 2.68. Invocation command line was
+
+ $ $0 $@
+
+_ACEOF
+exec 5>>config.log
+{
+cat <<_ASUNAME
+## --------- ##
+## Platform. ##
+## --------- ##
+
+hostname = `(hostname || uname -n) 2>/dev/null | sed 1q`
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown`
+/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown`
+
+/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown`
+/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown`
+/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown`
+/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown`
+/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown`
+/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown`
+
+_ASUNAME
+
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ $as_echo "PATH: $as_dir"
+ done
+IFS=$as_save_IFS
+
+} >&5
+
+cat >&5 <<_ACEOF
+
+
+## ----------- ##
+## Core tests. ##
+## ----------- ##
+
+_ACEOF
+
+
+# Keep a trace of the command line.
+# Strip out --no-create and --no-recursion so they do not pile up.
+# Strip out --silent because we don't want to record it for future runs.
+# Also quote any args containing shell meta-characters.
+# Make two passes to allow for proper duplicate-argument suppression.
+# Pass 1 collects every kept argument in ac_configure_args0. Pass 2
+# builds the final list in ac_configure_args, using ac_configure_args1 as
+# the "seen so far" prefix.
+ac_configure_args=
+ac_configure_args0=
+ac_configure_args1=
+ac_must_keep_next=false
+for ac_pass in 1 2
+do
+ for ac_arg
+ do
+ case $ac_arg in
+ -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;;
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil)
+ continue ;;
+ *\'*)
+ ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
+ esac
+ case $ac_pass in
+ 1) as_fn_append ac_configure_args0 " '$ac_arg'" ;;
+ 2)
+ as_fn_append ac_configure_args1 " '$ac_arg'"
+ if test $ac_must_keep_next = true; then
+ ac_must_keep_next=false # Got value, back to normal.
+ else
+ # Self-contained arguments (VAR=VALUE, --opt=value, flag-like
+ # options) are dropped when the same text occurs again later in
+ # the full list, so only the last occurrence is recorded. Any
+ # other option starting with "-" is assumed to take the next word
+ # as its value, so that word is kept unconditionally.
+ case $ac_arg in
+ *=* | --config-cache | -C | -disable-* | --disable-* \
+ | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \
+ | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \
+ | -with-* | --with-* | -without-* | --without-* | --x)
+ case "$ac_configure_args0 " in
+ "$ac_configure_args1"*" '$ac_arg' "* ) continue ;;
+ esac
+ ;;
+ -* ) ac_must_keep_next=true ;;
+ esac
+ fi
+ as_fn_append ac_configure_args " '$ac_arg'"
+ ;;
+ esac
+ done
+done
+{ ac_configure_args0=; unset ac_configure_args0;}
+{ ac_configure_args1=; unset ac_configure_args1;}
+
+# When interrupted or exit'd, cleanup temporary files, and complete
+# config.log. We remove comments because anyway the quotes in there
+# would cause problems or look ugly.
+# WARNING: Use '\'' to represent an apostrophe within the trap.
+# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug.
+#
+# The exit trap (signal 0) appends to config.log, via file descriptor 5:
+# the cache variables (names containing _cv_), the output variables named
+# in $ac_subst_vars, the file substitutions in $ac_subst_files (if any),
+# the contents of confdefs.h (if non-empty), the caught signal (if any)
+# and the exit status. It then removes core files and the conftest*,
+# confdefs*, conf$$* and $ac_clean_files temporaries, and exits with the
+# original status.
+trap 'exit_status=$?
+ # Save into config.log some information that might help in debugging.
+ {
+ echo
+
+ $as_echo "## ---------------- ##
+## Cache variables. ##
+## ---------------- ##"
+ echo
+ # The following way of writing the cache mishandles newlines in values,
+(
+ for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do
+ eval ac_val=\$$ac_var
+ case $ac_val in #(
+ *${as_nl}*)
+ case $ac_var in #(
+ *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+ esac
+ case $ac_var in #(
+ _ | IFS | as_nl) ;; #(
+ BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+ *) { eval $ac_var=; unset $ac_var;} ;;
+ esac ;;
+ esac
+ done
+ (set) 2>&1 |
+ case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #(
+ *${as_nl}ac_space=\ *)
+ sed -n \
+ "s/'\''/'\''\\\\'\'''\''/g;
+ s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p"
+ ;; #(
+ *)
+ sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+ ;;
+ esac |
+ sort
+)
+ echo
+
+ $as_echo "## ----------------- ##
+## Output variables. ##
+## ----------------- ##"
+ echo
+ for ac_var in $ac_subst_vars
+ do
+ eval ac_val=\$$ac_var
+ case $ac_val in
+ *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+ esac
+ $as_echo "$ac_var='\''$ac_val'\''"
+ done | sort
+ echo
+
+ if test -n "$ac_subst_files"; then
+ $as_echo "## ------------------- ##
+## File substitutions. ##
+## ------------------- ##"
+ echo
+ for ac_var in $ac_subst_files
+ do
+ eval ac_val=\$$ac_var
+ case $ac_val in
+ *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+ esac
+ $as_echo "$ac_var='\''$ac_val'\''"
+ done | sort
+ echo
+ fi
+
+ if test -s confdefs.h; then
+ $as_echo "## ----------- ##
+## confdefs.h. ##
+## ----------- ##"
+ echo
+ cat confdefs.h
+ echo
+ fi
+ test "$ac_signal" != 0 &&
+ $as_echo "$as_me: caught signal $ac_signal"
+ $as_echo "$as_me: exit $exit_status"
+ } >&5
+ rm -f core *.core core.conftest.* &&
+ rm -f -r conftest* confdefs* conf$$* $ac_clean_files &&
+ exit $exit_status
+' 0
+# Signals 1, 2, 13 and 15 record their number in ac_signal and then call
+# as_fn_exit 1, which presumably exits and so runs the exit trap above.
+# ac_signal=0 means "no signal caught".
+for ac_signal in 1 2 13 15; do
+ trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal
+done
+ac_signal=0
+
+# confdefs.h avoids OS command line length limits that DEFS can exceed.
+rm -f -r conftest* confdefs.h
+
+$as_echo "/* confdefs.h */" > confdefs.h
+
+# Predefined preprocessor variables.
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_NAME "$PACKAGE_NAME"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_TARNAME "$PACKAGE_TARNAME"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_VERSION "$PACKAGE_VERSION"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_STRING "$PACKAGE_STRING"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_URL "$PACKAGE_URL"
+_ACEOF
+
+
+# Let the site file select an alternate cache file if it wants to.
+# Prefer an explicitly selected file to automatically selected ones.
+# Candidates, in order of precedence:
+#   1. $CONFIG_SITE, if set (only that one file);
+#   2. share/config.site and etc/config.site under --prefix, if given;
+#   3. the same two files under $ac_default_prefix.
+ac_site_file1=NONE
+ac_site_file2=NONE
+if test -n "$CONFIG_SITE"; then
+ # We do not want a PATH search for config.site.
+ # A name without a slash, or one starting with "-", gets "./" prepended.
+ case $CONFIG_SITE in #((
+ -*) ac_site_file1=./$CONFIG_SITE;;
+ */*) ac_site_file1=$CONFIG_SITE;;
+ *) ac_site_file1=./$CONFIG_SITE;;
+ esac
+elif test "x$prefix" != xNONE; then
+ ac_site_file1=$prefix/share/config.site
+ ac_site_file2=$prefix/etc/config.site
+else
+ ac_site_file1=$ac_default_prefix/share/config.site
+ ac_site_file2=$ac_default_prefix/etc/config.site
+fi
+# Source each readable candidate (never /dev/null), copying its text to
+# the log first. A site script that fails to load is a fatal error.
+for ac_site_file in "$ac_site_file1" "$ac_site_file2"
+do
+ test "x$ac_site_file" = xNONE && continue
+ if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5
+$as_echo "$as_me: loading site script $ac_site_file" >&6;}
+ sed 's/^/| /' "$ac_site_file" >&5
+ . "$ac_site_file" \
+ || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "failed to load site script $ac_site_file
+See \`config.log' for more details" "$LINENO" 5; }
+ fi
+done
+
+if test -r "$cache_file"; then
+ # Some versions of bash will fail to source /dev/null (special files
+ # actually), so we avoid doing that. DJGPP emulates it as a regular file.
+ if test /dev/null != "$cache_file" && test -f "$cache_file"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5
+$as_echo "$as_me: loading cache $cache_file" >&6;}
+ case $cache_file in
+ [\\/]* | ?:[\\/]* ) . "$cache_file";;
+ *) . "./$cache_file";;
+ esac
+ fi
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5
+$as_echo "$as_me: creating cache $cache_file" >&6;}
+ >$cache_file
+fi
+
+# Check that the precious variables saved in the cache have kept the same
+# value.
+ac_cache_corrupted=false
+for ac_var in $ac_precious_vars; do
+ eval ac_old_set=\$ac_cv_env_${ac_var}_set
+ eval ac_new_set=\$ac_env_${ac_var}_set
+ eval ac_old_val=\$ac_cv_env_${ac_var}_value
+ eval ac_new_val=\$ac_env_${ac_var}_value
+ case $ac_old_set,$ac_new_set in
+ set,)
+ { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5
+$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;}
+ ac_cache_corrupted=: ;;
+ ,set)
+ { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5
+$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;}
+ ac_cache_corrupted=: ;;
+ ,);;
+ *)
+ if test "x$ac_old_val" != "x$ac_new_val"; then
+ # differences in whitespace do not lead to failure.
+ ac_old_val_w=`echo x $ac_old_val`
+ ac_new_val_w=`echo x $ac_new_val`
+ if test "$ac_old_val_w" != "$ac_new_val_w"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5
+$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;}
+ ac_cache_corrupted=:
+ else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5
+$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;}
+ eval $ac_var=\$ac_old_val
+ fi
+ { $as_echo "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5
+$as_echo "$as_me: former value: \`$ac_old_val'" >&2;}
+ { $as_echo "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5
+$as_echo "$as_me: current value: \`$ac_new_val'" >&2;}
+ fi;;
+ esac
+ # Pass precious variables to config.status.
+ if test "$ac_new_set" = set; then
+ case $ac_new_val in
+ *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;;
+ *) ac_arg=$ac_var=$ac_new_val ;;
+ esac
+ case " $ac_configure_args " in
+ *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy.
+ *) as_fn_append ac_configure_args " '$ac_arg'" ;;
+ esac
+ fi
+done
+if $ac_cache_corrupted; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+ { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5
+$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;}
+ as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5
+fi
+## -------------------- ##
+## Main body of script. ##
+## -------------------- ##
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+
+
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}gcc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_CC="${ac_tool_prefix}gcc"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_CC"; then
+ ac_ct_CC=$CC
+ # Extract the first word of "gcc", so it can be a program name with args.
+set dummy gcc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_CC+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$ac_ct_CC"; then
+ ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_ac_ct_CC="gcc"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+ if test "x$ac_ct_CC" = x; then
+ CC=""
+ else
+ case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+ CC=$ac_ct_CC
+ fi
+else
+ CC="$ac_cv_prog_CC"
+fi
+
+if test -z "$CC"; then
+ if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}cc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_CC="${ac_tool_prefix}cc"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+ fi
+fi
+if test -z "$CC"; then
+ # Extract the first word of "cc", so it can be a program name with args.
+set dummy cc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+ ac_prog_rejected=no
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then
+ ac_prog_rejected=yes
+ continue
+ fi
+ ac_cv_prog_CC="cc"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+if test $ac_prog_rejected = yes; then
+ # We found a bogon in the path, so make sure we never use it.
+ set dummy $ac_cv_prog_CC
+ shift
+ if test $# != 0; then
+ # We chose a different compiler from the bogus one.
+ # However, it has the same basename, so the bogon will be chosen
+ # first if we set CC to just the basename; use the full file name.
+ shift
+ ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@"
+ fi
+fi
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$CC"; then
+ if test -n "$ac_tool_prefix"; then
+ for ac_prog in cl.exe
+ do
+ # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_CC="$ac_tool_prefix$ac_prog"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+ test -n "$CC" && break
+ done
+fi
+if test -z "$CC"; then
+ ac_ct_CC=$CC
+ for ac_prog in cl.exe
+do
+ # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_CC+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$ac_ct_CC"; then
+ ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_ac_ct_CC="$ac_prog"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+ test -n "$ac_ct_CC" && break
+done
+
+ if test "x$ac_ct_CC" = x; then
+ CC=""
+ else
+ case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+ CC=$ac_ct_CC
+ fi
+fi
+
+fi
+
+
+test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "no acceptable C compiler found in \$PATH
+See \`config.log' for more details" "$LINENO" 5; }
+
+# Provide some information about the compiler.
+$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5
+set X $ac_compile
+ac_compiler=$2
+for ac_option in --version -v -V -qversion; do
+ { { ac_try="$ac_compiler $ac_option >&5"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_compiler $ac_option >&5") 2>conftest.err
+ ac_status=$?
+ if test -s conftest.err; then
+ sed '10a\
+... rest of stderr output deleted ...
+ 10q' conftest.err >conftest.er1
+ cat conftest.er1 >&5
+ fi
+ rm -f conftest.er1 conftest.err
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }
+done
+
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out"
+# Try to create an executable without -o first, disregard a.out.
+# It will help us diagnose broken compilers, and finding out an intuition
+# of exeext.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5
+$as_echo_n "checking whether the C compiler works... " >&6; }
+ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'`
+
+# The possible output files:
+ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*"
+
+ac_rmfiles=
+for ac_file in $ac_files
+do
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+ * ) ac_rmfiles="$ac_rmfiles $ac_file";;
+ esac
+done
+rm -f $ac_rmfiles
+
+if { { ac_try="$ac_link_default"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_link_default") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then :
+ # Autoconf-2.13 could set the ac_cv_exeext variable to `no'.
+# So ignore a value of `no', otherwise this would lead to `EXEEXT = no'
+# in a Makefile. We should not override ac_cv_exeext if it was cached,
+# so that the user can short-circuit this test for compilers unknown to
+# Autoconf.
+for ac_file in $ac_files ''
+do
+ test -f "$ac_file" || continue
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj )
+ ;;
+ [ab].out )
+ # We found the default executable, but exeext='' is most
+ # certainly right.
+ break;;
+ *.* )
+ if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no;
+ then :; else
+ ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+ fi
+ # We set ac_cv_exeext here because the later test for it is not
+ # safe: cross compilers may not add the suffix if given an `-o'
+ # argument, so we may need to know it at that point already.
+ # Even if this section looks crufty: it has the advantage of
+ # actually working.
+ break;;
+ * )
+ break;;
+ esac
+done
+test "$ac_cv_exeext" = no && ac_cv_exeext=
+
+else
+ ac_file=''
+fi
+if test -z "$ac_file"; then :
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+$as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "C compiler cannot create executables
+See \`config.log' for more details" "$LINENO" 5; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5
+$as_echo_n "checking for C compiler default output file name... " >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5
+$as_echo "$ac_file" >&6; }
+ac_exeext=$ac_cv_exeext
+
+rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out
+ac_clean_files=$ac_clean_files_save
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5
+$as_echo_n "checking for suffix of executables... " >&6; }
+if { { ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_link") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then :
+ # If both `conftest.exe' and `conftest' are `present' (well, observable)
+# catch `conftest.exe'. For instance with Cygwin, `ls conftest' will
+# work properly (i.e., refer to `conftest.exe'), while it won't with
+# `rm'.
+for ac_file in conftest.exe conftest conftest.*; do
+ test -f "$ac_file" || continue
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+ *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+ break;;
+ * ) break;;
+ esac
+done
+else
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot compute suffix of executables: cannot compile and link
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+rm -f conftest conftest$ac_cv_exeext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5
+$as_echo "$ac_cv_exeext" >&6; }
+
+rm -f conftest.$ac_ext
+EXEEXT=$ac_cv_exeext
+ac_exeext=$EXEEXT
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <stdio.h>
+int
+main ()
+{
+FILE *f = fopen ("conftest.out", "w");
+ return ferror (f) || fclose (f) != 0;
+
+ ;
+ return 0;
+}
+_ACEOF
+ac_clean_files="$ac_clean_files conftest.out"
+# Check that the compiler produces executables we can run. If not, either
+# the compiler is broken, or we cross compile.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5
+$as_echo_n "checking whether we are cross compiling... " >&6; }
+if test "$cross_compiling" != yes; then
+ { { ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_link") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }
+ if { ac_try='./conftest$ac_cv_exeext'
+ { { case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_try") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; }; then
+ cross_compiling=no
+ else
+ if test "$cross_compiling" = maybe; then
+ cross_compiling=yes
+ else
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot run C compiled programs.
+If you meant to cross compile, use \`--host'.
+See \`config.log' for more details" "$LINENO" 5; }
+ fi
+ fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5
+$as_echo "$cross_compiling" >&6; }
+
+rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out
+ac_clean_files=$ac_clean_files_save
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5
+$as_echo_n "checking for suffix of object files... " >&6; }
+if ${ac_cv_objext+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.o conftest.obj
+if { { ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_compile") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then :
+ for ac_file in conftest.o conftest.obj conftest.*; do
+ test -f "$ac_file" || continue;
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;;
+ *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'`
+ break;;
+ esac
+done
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot compute suffix of object files: cannot compile
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+rm -f conftest.$ac_cv_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5
+$as_echo "$ac_cv_objext" >&6; }
+OBJEXT=$ac_cv_objext
+ac_objext=$OBJEXT
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5
+$as_echo_n "checking whether we are using the GNU C compiler... " >&6; }
+if ${ac_cv_c_compiler_gnu+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+#ifndef __GNUC__
+ choke me
+#endif
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ ac_compiler_gnu=yes
+else
+ ac_compiler_gnu=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ac_cv_c_compiler_gnu=$ac_compiler_gnu
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5
+$as_echo "$ac_cv_c_compiler_gnu" >&6; }
+if test $ac_compiler_gnu = yes; then
+ GCC=yes
+else
+ GCC=
+fi
+ac_test_CFLAGS=${CFLAGS+set}
+ac_save_CFLAGS=$CFLAGS
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5
+$as_echo_n "checking whether $CC accepts -g... " >&6; }
+if ${ac_cv_prog_cc_g+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ ac_save_c_werror_flag=$ac_c_werror_flag
+ ac_c_werror_flag=yes
+ ac_cv_prog_cc_g=no
+ CFLAGS="-g"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ ac_cv_prog_cc_g=yes
+else
+ CFLAGS=""
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+else
+ ac_c_werror_flag=$ac_save_c_werror_flag
+ CFLAGS="-g"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ ac_cv_prog_cc_g=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ ac_c_werror_flag=$ac_save_c_werror_flag
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5
+$as_echo "$ac_cv_prog_cc_g" >&6; }
+if test "$ac_test_CFLAGS" = set; then
+ CFLAGS=$ac_save_CFLAGS
+elif test $ac_cv_prog_cc_g = yes; then
+ if test "$GCC" = yes; then
+ CFLAGS="-g -O2"
+ else
+ CFLAGS="-g"
+ fi
+else
+ if test "$GCC" = yes; then
+ CFLAGS="-O2"
+ else
+ CFLAGS=
+ fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5
+$as_echo_n "checking for $CC option to accept ISO C89... " >&6; }
+if ${ac_cv_prog_cc_c89+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ ac_cv_prog_cc_c89=no
+ac_save_CC=$CC
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <stdarg.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */
+struct buf { int x; };
+FILE * (*rcsopen) (struct buf *, struct stat *, int);
+static char *e (p, i)
+ char **p;
+ int i;
+{
+ return p[i];
+}
+static char *f (char * (*g) (char **, int), char **p, ...)
+{
+ char *s;
+ va_list v;
+ va_start (v,p);
+ s = g (p, va_arg (v,int));
+ va_end (v);
+ return s;
+}
+
+/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has
+ function prototypes and stuff, but not '\xHH' hex character constants.
+ These don't provoke an error unfortunately, instead are silently treated
+ as 'x'. The following induces an error, until -std is added to get
+ proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an
+ array size at least. It's necessary to write '\x00'==0 to get something
+ that's true only with -std. */
+int osf4_cc_array ['\x00' == 0 ? 1 : -1];
+
+/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters
+ inside strings and character constants. */
+#define FOO(x) 'x'
+int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1];
+
+int test (int i, double x);
+struct s1 {int (*f) (int a);};
+struct s2 {int (*f) (double a);};
+int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int);
+int argc;
+char **argv;
+int
+main ()
+{
+return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1];
+ ;
+ return 0;
+}
+_ACEOF
+for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \
+ -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
+do
+ CC="$ac_save_CC $ac_arg"
+ if ac_fn_c_try_compile "$LINENO"; then :
+ ac_cv_prog_cc_c89=$ac_arg
+fi
+rm -f core conftest.err conftest.$ac_objext
+ test "x$ac_cv_prog_cc_c89" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CC=$ac_save_CC
+
+fi
+# AC_CACHE_VAL
+case "x$ac_cv_prog_cc_c89" in
+ x)
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+$as_echo "none needed" >&6; } ;;
+ xno)
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+$as_echo "unsupported" >&6; } ;;
+ *)
+ CC="$CC $ac_cv_prog_cc_c89"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5
+$as_echo "$ac_cv_prog_cc_c89" >&6; } ;;
+esac
+if test "x$ac_cv_prog_cc_c89" != xno; then :
+
+fi
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args.
+set dummy ${ac_tool_prefix}ranlib; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_RANLIB+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$RANLIB"; then
+ ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+RANLIB=$ac_cv_prog_RANLIB
+if test -n "$RANLIB"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $RANLIB" >&5
+$as_echo "$RANLIB" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_RANLIB"; then
+ ac_ct_RANLIB=$RANLIB
+ # Extract the first word of "ranlib", so it can be a program name with args.
+set dummy ranlib; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_RANLIB+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$ac_ct_RANLIB"; then
+ ac_cv_prog_ac_ct_RANLIB="$ac_ct_RANLIB" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_ac_ct_RANLIB="ranlib"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_RANLIB=$ac_cv_prog_ac_ct_RANLIB
+if test -n "$ac_ct_RANLIB"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_RANLIB" >&5
+$as_echo "$ac_ct_RANLIB" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+ if test "x$ac_ct_RANLIB" = x; then
+ RANLIB=":"
+ else
+ case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+ RANLIB=$ac_ct_RANLIB
+ fi
+else
+ RANLIB="$ac_cv_prog_RANLIB"
+fi
+
+
+
+# Check whether --with-irods was given.
+if test "${with_irods+set}" = set; then :
+ withval=$with_irods; case $withval in
+ no) irods=disabled ;;
+ yes) irods=enabled ;;
+ *) irods=enabled; IRODS_HOME=$withval ;;
+ esac
+else
+ irods=disabled
+fi
+
+
+save_LIBS=$LIBS
+zlib_devel=ok
+
+ac_fn_c_check_header_compile "$LINENO" "zlib.h" "ac_cv_header_zlib_h" ";
+"
+if test "x$ac_cv_header_zlib_h" = xyes; then :
+
+else
+ zlib_devel=missing
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for inflate in -lz" >&5
+$as_echo_n "checking for inflate in -lz... " >&6; }
+if ${ac_cv_lib_z_inflate+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ ac_check_lib_save_LIBS=$LIBS
+LIBS="-lz $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+/* Override any GCC internal prototype to avoid an error.
+ Use char because int might match the return type of a GCC
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char inflate ();
+int
+main ()
+{
+return inflate ();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ ac_cv_lib_z_inflate=yes
+else
+ ac_cv_lib_z_inflate=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_z_inflate" >&5
+$as_echo "$ac_cv_lib_z_inflate" >&6; }
+if test "x$ac_cv_lib_z_inflate" = xyes; then :
+ cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBZ 1
+_ACEOF
+
+ LIBS="-lz $LIBS"
+
+else
+ zlib_devel=missing
+fi
+
+LIBS=$save_LIBS
+
+if test $zlib_devel != ok; then
+ as_fn_error $? "zlib development files not found
+
+HTSlib uses compression routines from the zlib library <http://zlib.net>.
+Building HTSlib requires zlib development files to be installed on the build
+machine; you may need to ensure a package such as zlib1g-dev (on Debian or
+Ubuntu Linux) or zlib-devel (on RPM-based Linux distributions) is installed.
+
+FAILED. This error must be resolved in order to build HTSlib successfully." "$LINENO" 5
+fi
+
+if test $irods = enabled; then
+ # TODO Also test whether we require libgssapi_krb5 and AC_CHECK_LIB it
+ save_LDFLAGS=$LDFLAGS
+ LDFLAGS="$LDFLAGS -L$IRODS_HOME/lib/core/obj"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for getRodsEnvFileName in -lRodsAPIs" >&5
+$as_echo_n "checking for getRodsEnvFileName in -lRodsAPIs... " >&6; }
+if ${ac_cv_lib_RodsAPIs_getRodsEnvFileName+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ ac_check_lib_save_LIBS=$LIBS
+LIBS="-lRodsAPIs -lgssapi_krb5 -lpthread $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+/* Override any GCC internal prototype to avoid an error.
+ Use char because int might match the return type of a GCC
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char getRodsEnvFileName ();
+int
+main ()
+{
+return getRodsEnvFileName ();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ ac_cv_lib_RodsAPIs_getRodsEnvFileName=yes
+else
+ ac_cv_lib_RodsAPIs_getRodsEnvFileName=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_RodsAPIs_getRodsEnvFileName" >&5
+$as_echo "$ac_cv_lib_RodsAPIs_getRodsEnvFileName" >&6; }
+if test "x$ac_cv_lib_RodsAPIs_getRodsEnvFileName" = xyes; then :
+ case $with_irods in
+ yes) define_IRODS_HOME='# Uses $(IRODS_HOME) from the environment' ;;
+ *) define_IRODS_HOME="IRODS_HOME = $with_irods" ;;
+ esac
+else
+ as_fn_error $? "iRODS development files not found
+
+Support for iRODS URLs requires the libRodsAPI client library and headers.
+Configure with --with-irods=DIR (or just --with-irods if \$IRODS_HOME has
+been exported with a suitable value), where DIR is the base of an iRODS tree
+such that the library is present as DIR/lib/core/obj/libRodsAPI.* and headers
+are present under DIR/lib/api/include and so on." "$LINENO" 5
+fi
+
+ LDFLAGS=$save_LDFLAGS
+else
+ define_IRODS_HOME='IRODS_HOME ?= /disabled'
+fi
+
+
+
+ac_config_files="$ac_config_files config.mk"
+
+cat >confcache <<\_ACEOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs, see configure's option --config-cache.
+# It is not useful on other systems. If it contains results you don't
+# want to keep, you may remove or edit it.
+#
+# config.status only pays attention to the cache file if you give it
+# the --recheck option to rerun configure.
+#
+# `ac_cv_env_foo' variables (set or unset) will be overridden when
+# loading this file, other *unset* `ac_cv_foo' will be assigned the
+# following values.
+
+_ACEOF
+
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, we kill variables containing newlines.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(
+ for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do
+ eval ac_val=\$$ac_var
+ case $ac_val in #(
+ *${as_nl}*)
+ case $ac_var in #(
+ *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+ esac
+ case $ac_var in #(
+ _ | IFS | as_nl) ;; #(
+ BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+ *) { eval $ac_var=; unset $ac_var;} ;;
+ esac ;;
+ esac
+ done
+
+ (set) 2>&1 |
+ case $as_nl`(ac_space=' '; set) 2>&1` in #(
+ *${as_nl}ac_space=\ *)
+ # `set' does not quote correctly, so add quotes: double-quote
+ # substitution turns \\\\ into \\, and sed turns \\ into \.
+ sed -n \
+ "s/'/'\\\\''/g;
+ s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p"
+ ;; #(
+ *)
+ # `set' quotes correctly as required by POSIX, so do not add quotes.
+ sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+ ;;
+ esac |
+ sort
+) |
+ sed '
+ /^ac_cv_env_/b end
+ t clear
+ :clear
+ s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/
+ t end
+ s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/
+ :end' >>confcache
+if diff "$cache_file" confcache >/dev/null 2>&1; then :; else
+ if test -w "$cache_file"; then
+ if test "x$cache_file" != "x/dev/null"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5
+$as_echo "$as_me: updating cache $cache_file" >&6;}
+ if test ! -f "$cache_file" || test -h "$cache_file"; then
+ cat confcache >"$cache_file"
+ else
+ case $cache_file in #(
+ */* | ?:*)
+ mv -f confcache "$cache_file"$$ &&
+ mv -f "$cache_file"$$ "$cache_file" ;; #(
+ *)
+ mv -f confcache "$cache_file" ;;
+ esac
+ fi
+ fi
+ else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5
+$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;}
+ fi
+fi
+rm -f confcache
+
+test "x$prefix" = xNONE && prefix=$ac_default_prefix
+# Let make expand exec_prefix.
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
+
+# Transform confdefs.h into DEFS.
+# Protect against shell expansion while executing Makefile rules.
+# Protect against Makefile macro expansion.
+#
+# If the first sed substitution is executed (which looks for macros that
+# take arguments), then branch to the quote section. Otherwise,
+# look for a macro that doesn't take arguments.
+ac_script='
+:mline
+/\\$/{
+ N
+ s,\\\n,,
+ b mline
+}
+t clear
+:clear
+s/^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\)/-D\1=\2/g
+t quote
+s/^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)/-D\1=\2/g
+t quote
+b any
+:quote
+s/[ `~#$^&*(){}\\|;'\''"<>?]/\\&/g
+s/\[/\\&/g
+s/\]/\\&/g
+s/\$/$$/g
+H
+:any
+${
+ g
+ s/^\n//
+ s/\n/ /g
+ p
+}
+'
+DEFS=`sed -n "$ac_script" confdefs.h`
+
+
+ac_libobjs=
+ac_ltlibobjs=
+U=
+for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue
+ # 1. Remove the extension, and $U if already installed.
+ ac_script='s/\$U\././;s/\.o$//;s/\.obj$//'
+ ac_i=`$as_echo "$ac_i" | sed "$ac_script"`
+ # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR
+ # will be set to the directory where LIBOBJS objects are built.
+ as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext"
+ as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo'
+done
+LIBOBJS=$ac_libobjs
+
+LTLIBOBJS=$ac_ltlibobjs
+
+
+
+: "${CONFIG_STATUS=./config.status}"
+ac_write_fail=0
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files $CONFIG_STATUS"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5
+$as_echo "$as_me: creating $CONFIG_STATUS" >&6;}
+as_write_fail=0
+cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1
+#! $SHELL
+# Generated by $as_me.
+# Run this file to recreate the current configuration.
+# Compiler output produced by configure, useful for debugging
+# configure, is in config.log if it exists.
+
+debug=false
+ac_cs_recheck=false
+ac_cs_silent=false
+
+SHELL=\${CONFIG_SHELL-$SHELL}
+export SHELL
+_ASEOF
+cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1
+## -------------------- ##
+## M4sh Initialization. ##
+## -------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+ setopt NO_GLOB_SUBST
+else
+ case `(set -o) 2>/dev/null` in #(
+ *posix*) :
+ set -o posix ;; #(
+ *) :
+ ;;
+esac
+fi
+
+
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+# Prefer a ksh shell builtin over an external printf program on Solaris,
+# but without wasting forks for bash or zsh.
+if test -z "$BASH_VERSION$ZSH_VERSION" \
+ && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then
+ as_echo='print -r --'
+ as_echo_n='print -rn --'
+elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+ as_echo='printf %s\n'
+ as_echo_n='printf %s'
+else
+ if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+ as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+ as_echo_n='/usr/ucb/echo -n'
+ else
+ as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+ as_echo_n_body='eval
+ arg=$1;
+ case $arg in #(
+ *"$as_nl"*)
+ expr "X$arg" : "X\\(.*\\)$as_nl";
+ arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+ esac;
+ expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+ '
+ export as_echo_n_body
+ as_echo_n='sh -c $as_echo_n_body as_echo'
+ fi
+ export as_echo_body
+ as_echo='sh -c $as_echo_body as_echo'
+fi
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+ PATH_SEPARATOR=:
+ (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+ (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+ PATH_SEPARATOR=';'
+ }
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order. Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+IFS=" "" $as_nl"
+
+# Find who we are. Look in the path if we contain no directory separator.
+as_myself=
+case $0 in #((
+ *[\\/]* ) as_myself=$0 ;;
+ *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+ done
+IFS=$as_save_IFS
+
+ ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+ as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+ $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+ exit 1
+fi
+
+# Unset variables that we do not need and which cause bugs (e.g. in
+# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1"
+# suppresses any "Segmentation fault" message there. '((' could
+# trigger a bug in pdksh 5.2.14.
+for as_var in BASH_ENV ENV MAIL MAILPATH
+do eval test x\${$as_var+set} = xset \
+ && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# CDPATH.
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
+
+
+# as_fn_error STATUS ERROR [LINENO LOG_FD]
+# ----------------------------------------
+# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are
+# provided, also output the error to LOG_FD, referencing LINENO. Then exit the
+# script with STATUS, using 1 if that was 0.
+as_fn_error ()
+{
+ as_status=$1; test $as_status -eq 0 && as_status=1
+ if test "$4"; then
+ as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+ fi
+ $as_echo "$as_me: error: $2" >&2
+ as_fn_exit $as_status
+} # as_fn_error
+
+
+# as_fn_set_status STATUS
+# -----------------------
+# Set $? to STATUS, without forking.
+as_fn_set_status ()
+{
+ return $1
+} # as_fn_set_status
+
+# as_fn_exit STATUS
+# -----------------
+# Exit the shell with STATUS, even in a "trap 0" or "set -e" context.
+as_fn_exit ()
+{
+ set +e
+ as_fn_set_status $1
+ exit $1
+} # as_fn_exit
+
+# as_fn_unset VAR
+# ---------------
+# Portably unset VAR.
+as_fn_unset ()
+{
+ { eval $1=; unset $1;}
+}
+as_unset=as_fn_unset
+# as_fn_append VAR VALUE
+# ----------------------
+# Append the text in VALUE to the end of the definition contained in VAR. Take
+# advantage of any shell optimizations that allow amortized linear growth over
+# repeated appends, instead of the typical quadratic growth present in naive
+# implementations.
+if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then :
+ eval 'as_fn_append ()
+ {
+ eval $1+=\$2
+ }'
+else
+ as_fn_append ()
+ {
+ eval $1=\$$1\$2
+ }
+fi # as_fn_append
+
+# as_fn_arith ARG...
+# ------------------
+# Perform arithmetic evaluation on the ARGs, and store the result in the
+# global $as_val. Take advantage of shells that can avoid forks. The arguments
+# must be portable across $(()) and expr.
+if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
+ eval 'as_fn_arith ()
+ {
+ as_val=$(( $* ))
+ }'
+else
+ as_fn_arith ()
+ {
+ as_val=`expr "$@" || test $? -eq 1`
+ }
+fi # as_fn_arith
+
+
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+ test "X`expr 00001 : '.*\(...\)'`" = X001; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+ as_basename=basename
+else
+ as_basename=false
+fi
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+ as_dirname=dirname
+else
+ as_dirname=false
+fi
+
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+ X"$0" : 'X\(//\)$' \| \
+ X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X/"$0" |
+ sed '/^.*\/\([^/][^/]*\)\/*$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in #(((((
+-n*)
+ case `echo 'xy\c'` in
+ *c*) ECHO_T=' ';; # ECHO_T is single tab character.
+ xy) ECHO_C='\c';;
+ *) echo `echo ksh88 bug on AIX 6.1` > /dev/null
+ ECHO_T=' ';;
+ esac;;
+*)
+ ECHO_N='-n';;
+esac
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+ rm -f conf$$.dir/conf$$.file
+else
+ rm -f conf$$.dir
+ mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+ if ln -s conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s='ln -s'
+ # ... but there are two gotchas:
+ # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+ # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+ # In both cases, we have to default to `cp -p'.
+ ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+ as_ln_s='cp -p'
+ elif ln conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s=ln
+ else
+ as_ln_s='cp -p'
+ fi
+else
+ as_ln_s='cp -p'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+
+# as_fn_mkdir_p
+# -------------
+# Create "$as_dir" as a directory, including parents if necessary.
+as_fn_mkdir_p ()
+{
+
+ case $as_dir in #(
+ -*) as_dir=./$as_dir;;
+ esac
+ test -d "$as_dir" || eval $as_mkdir_p || {
+ as_dirs=
+ while :; do
+ case $as_dir in #(
+ *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+ *) as_qdir=$as_dir;;
+ esac
+ as_dirs="'$as_qdir' $as_dirs"
+ as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$as_dir" : 'X\(//\)[^/]' \| \
+ X"$as_dir" : 'X\(//\)$' \| \
+ X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_dir" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ test -d "$as_dir" && break
+ done
+ test -z "$as_dirs" || eval "mkdir $as_dirs"
+ } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir"
+
+
+} # as_fn_mkdir_p
+if mkdir -p . 2>/dev/null; then
+ as_mkdir_p='mkdir -p "$as_dir"'
+else
+ test -d ./-p && rmdir ./-p
+ as_mkdir_p=false
+fi
+
+if test -x / >/dev/null 2>&1; then
+ as_test_x='test -x'
+else
+ if ls -dL / >/dev/null 2>&1; then
+ as_ls_L_option=L
+ else
+ as_ls_L_option=
+ fi
+ as_test_x='
+ eval sh -c '\''
+ if test -d "$1"; then
+ test -d "$1/.";
+ else
+ case $1 in #(
+ -*)set "./$1";;
+ esac;
+ case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in #((
+ ???[sx]*):;;*)false;;esac;fi
+ '\'' sh
+ '
+fi
+as_executable_p=$as_test_x
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+exec 6>&1
+## ----------------------------------- ##
+## Main body of $CONFIG_STATUS script. ##
+## ----------------------------------- ##
+_ASEOF
+test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# Save the log message, to keep $0 and so on meaningful, and to
+# report actual input values of CONFIG_FILES etc. instead of their
+# values after options handling.
+ac_log="
+This file was extended by HTSlib $as_me 1.2.1, which was
+generated by GNU Autoconf 2.68. Invocation command line was
+
+ CONFIG_FILES = $CONFIG_FILES
+ CONFIG_HEADERS = $CONFIG_HEADERS
+ CONFIG_LINKS = $CONFIG_LINKS
+ CONFIG_COMMANDS = $CONFIG_COMMANDS
+ $ $0 $@
+
+on `(hostname || uname -n) 2>/dev/null | sed 1q`
+"
+
+_ACEOF
+
+case $ac_config_files in *"
+"*) set x $ac_config_files; shift; ac_config_files=$*;;
+esac
+
+
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+# Files that config.status was made for.
+config_files="$ac_config_files"
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+ac_cs_usage="\
+\`$as_me' instantiates files and other configuration actions
+from templates according to the current configuration. Unless the files
+and actions are specified as TAGs, all are instantiated by default.
+
+Usage: $0 [OPTION]... [TAG]...
+
+ -h, --help print this help, then exit
+ -V, --version print version number and configuration settings, then exit
+ --config print configuration, then exit
+ -q, --quiet, --silent
+ do not print progress messages
+ -d, --debug don't remove temporary files
+ --recheck update $as_me by reconfiguring in the same conditions
+ --file=FILE[:TEMPLATE]
+ instantiate the configuration file FILE
+
+Configuration files:
+$config_files
+
+Report bugs to <samtools-help at lists.sourceforge.net>.
+HTSlib home page: <http://www.htslib.org/>."
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
+ac_cs_version="\\
+HTSlib config.status 1.2.1
+configured by $0, generated by GNU Autoconf 2.68,
+ with options \\"\$ac_cs_config\\"
+
+Copyright (C) 2010 Free Software Foundation, Inc.
+This config.status script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it."
+
+ac_pwd='$ac_pwd'
+srcdir='$srcdir'
+test -n "\$AWK" || AWK=awk
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# The default lists apply if the user does not specify any file.
+ac_need_defaults=:
+while test $# != 0
+do
+ case $1 in
+ --*=?*)
+ ac_option=`expr "X$1" : 'X\([^=]*\)='`
+ ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'`
+ ac_shift=:
+ ;;
+ --*=)
+ ac_option=`expr "X$1" : 'X\([^=]*\)='`
+ ac_optarg=
+ ac_shift=:
+ ;;
+ *)
+ ac_option=$1
+ ac_optarg=$2
+ ac_shift=shift
+ ;;
+ esac
+
+ case $ac_option in
+ # Handling of the options.
+ -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
+ ac_cs_recheck=: ;;
+ --version | --versio | --versi | --vers | --ver | --ve | --v | -V )
+ $as_echo "$ac_cs_version"; exit ;;
+ --config | --confi | --conf | --con | --co | --c )
+ $as_echo "$ac_cs_config"; exit ;;
+ --debug | --debu | --deb | --de | --d | -d )
+ debug=: ;;
+ --file | --fil | --fi | --f )
+ $ac_shift
+ case $ac_optarg in
+ *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
+ '') as_fn_error $? "missing file argument" ;;
+ esac
+ as_fn_append CONFIG_FILES " '$ac_optarg'"
+ ac_need_defaults=false;;
+ --he | --h | --help | --hel | -h )
+ $as_echo "$ac_cs_usage"; exit ;;
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil | --si | --s)
+ ac_cs_silent=: ;;
+
+ # This is an error.
+ -*) as_fn_error $? "unrecognized option: \`$1'
+Try \`$0 --help' for more information." ;;
+
+ *) as_fn_append ac_config_targets " $1"
+ ac_need_defaults=false ;;
+
+ esac
+ shift
+done
+
+ac_configure_extra_args=
+
+if $ac_cs_silent; then
+ exec 6>/dev/null
+ ac_configure_extra_args="$ac_configure_extra_args --silent"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+if \$ac_cs_recheck; then
+ set X '$SHELL' '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion
+ shift
+ \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6
+ CONFIG_SHELL='$SHELL'
+ export CONFIG_SHELL
+ exec "\$@"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+exec 5>>config.log
+{
+ echo
+ sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX
+## Running $as_me. ##
+_ASBOX
+ $as_echo "$ac_log"
+} >&5
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+
+# Handling of arguments.
+for ac_config_target in $ac_config_targets
+do
+ case $ac_config_target in
+ "config.mk") CONFIG_FILES="$CONFIG_FILES config.mk" ;;
+
+ *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
+ esac
+done
+
+
+# If the user did not use the arguments to specify the items to instantiate,
+# then the envvar interface is used. Set only those that are not.
+# We use the long form for the default assignment because of an extremely
+# bizarre bug on SunOS 4.1.3.
+if $ac_need_defaults; then
+ test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files
+fi
+
+# Have a temporary directory for convenience. Make it in the build tree
+# simply because there is no reason against having it here, and in addition,
+# creating and moving files from /tmp can sometimes cause problems.
+# Hook for its removal unless debugging.
+# Note that there is a small window in which the directory will not be cleaned:
+# after its creation but before its name has been assigned to `$tmp'.
+$debug ||
+{
+ tmp= ac_tmp=
+ trap 'exit_status=$?
+ : "${ac_tmp:=$tmp}"
+ { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status
+' 0
+ trap 'as_fn_exit 1' 1 2 13 15
+}
+# Create a (secure) tmp directory for tmp files.
+
+{
+ tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` &&
+ test -d "$tmp"
+} ||
+{
+ tmp=./conf$$-$RANDOM
+ (umask 077 && mkdir "$tmp")
+} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5
+ac_tmp=$tmp
+
+# Set up the scripts for CONFIG_FILES section.
+# No need to generate them if there are no CONFIG_FILES.
+# This happens for instance with `./config.status config.h'.
+if test -n "$CONFIG_FILES"; then
+
+
+ac_cr=`echo X | tr X '\015'`
+# On cygwin, bash can eat \r inside `` if the user requested igncr.
+# But we know of no other shell where ac_cr would be empty at this
+# point, so we can use a bashism as a fallback.
+if test "x$ac_cr" = x; then
+ eval ac_cr=\$\'\\r\'
+fi
+ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' </dev/null 2>/dev/null`
+if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then
+ ac_cs_awk_cr='\\r'
+else
+ ac_cs_awk_cr=$ac_cr
+fi
+
+echo 'BEGIN {' >"$ac_tmp/subs1.awk" &&
+_ACEOF
+
+
+{
+ echo "cat >conf$$subs.awk <<_ACEOF" &&
+ echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' &&
+ echo "_ACEOF"
+} >conf$$subs.sh ||
+ as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'`
+ac_delim='%!_!# '
+for ac_last_try in false false false false false :; do
+ . ./conf$$subs.sh ||
+ as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+
+ ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X`
+ if test $ac_delim_n = $ac_delim_num; then
+ break
+ elif $ac_last_try; then
+ as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+ else
+ ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
+ fi
+done
+rm -f conf$$subs.sh
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK &&
+_ACEOF
+sed -n '
+h
+s/^/S["/; s/!.*/"]=/
+p
+g
+s/^[^!]*!//
+:repl
+t repl
+s/'"$ac_delim"'$//
+t delim
+:nl
+h
+s/\(.\{148\}\)..*/\1/
+t more1
+s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/
+p
+n
+b repl
+:more1
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t nl
+:delim
+h
+s/\(.\{148\}\)..*/\1/
+t more2
+s/["\\]/\\&/g; s/^/"/; s/$/"/
+p
+b
+:more2
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t delim
+' <conf$$subs.awk | sed '
+/^[^""]/{
+ N
+ s/\n//
+}
+' >>$CONFIG_STATUS || ac_write_fail=1
+rm -f conf$$subs.awk
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+_ACAWK
+cat >>"\$ac_tmp/subs1.awk" <<_ACAWK &&
+ for (key in S) S_is_set[key] = 1
+ FS = ""
+
+}
+{
+ line = $ 0
+ nfields = split(line, field, "@")
+ substed = 0
+ len = length(field[1])
+ for (i = 2; i < nfields; i++) {
+ key = field[i]
+ keylen = length(key)
+ if (S_is_set[key]) {
+ value = S[key]
+ line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3)
+ len += length(value) + length(field[++i])
+ substed = 1
+ } else
+ len += 1 + keylen
+ }
+
+ print line
+}
+
+_ACAWK
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then
+ sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g"
+else
+ cat
+fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \
+ || as_fn_error $? "could not setup config files machinery" "$LINENO" 5
+_ACEOF
+
+# VPATH may cause trouble with some makes, so we remove sole $(srcdir),
+# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and
+# trailing colons and then remove the whole line if VPATH becomes empty
+# (actually we leave an empty line to preserve line numbers).
+if test "x$srcdir" = x.; then
+ ac_vpsub='/^[ ]*VPATH[ ]*=[ ]*/{
+h
+s///
+s/^/:/
+s/[ ]*$/:/
+s/:\$(srcdir):/:/g
+s/:\${srcdir}:/:/g
+s/:@srcdir@:/:/g
+s/^:*//
+s/:*$//
+x
+s/\(=[ ]*\).*/\1/
+G
+s/\n//
+s/^[^=]*=[ ]*$//
+}'
+fi
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+fi # test -n "$CONFIG_FILES"
+
+
+eval set X " :F $CONFIG_FILES "
+shift
+for ac_tag
+do
+ case $ac_tag in
+ :[FHLC]) ac_mode=$ac_tag; continue;;
+ esac
+ case $ac_mode$ac_tag in
+ :[FHL]*:*);;
+ :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5;;
+ :[FH]-) ac_tag=-:-;;
+ :[FH]*) ac_tag=$ac_tag:$ac_tag.in;;
+ esac
+ ac_save_IFS=$IFS
+ IFS=:
+ set x $ac_tag
+ IFS=$ac_save_IFS
+ shift
+ ac_file=$1
+ shift
+
+ case $ac_mode in
+ :L) ac_source=$1;;
+ :[FH])
+ ac_file_inputs=
+ for ac_f
+ do
+ case $ac_f in
+ -) ac_f="$ac_tmp/stdin";;
+ *) # Look for the file first in the build tree, then in the source tree
+ # (if the path is not absolute). The absolute path cannot be DOS-style,
+ # because $ac_f cannot contain `:'.
+ test -f "$ac_f" ||
+ case $ac_f in
+ [\\/$]*) false;;
+ *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";;
+ esac ||
+ as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;;
+ esac
+ case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac
+ as_fn_append ac_file_inputs " '$ac_f'"
+ done
+
+ # Let's still pretend it is `configure' which instantiates (i.e., don't
+ # use $as_me), people would be surprised to read:
+ # /* config.h. Generated by config.status. */
+ configure_input='Generated from '`
+ $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g'
+ `' by configure.'
+ if test x"$ac_file" != x-; then
+ configure_input="$ac_file. $configure_input"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5
+$as_echo "$as_me: creating $ac_file" >&6;}
+ fi
+ # Neutralize special characters interpreted by sed in replacement strings.
+ case $configure_input in #(
+ *\&* | *\|* | *\\* )
+ ac_sed_conf_input=`$as_echo "$configure_input" |
+ sed 's/[\\\\&|]/\\\\&/g'`;; #(
+ *) ac_sed_conf_input=$configure_input;;
+ esac
+
+ case $ac_tag in
+ *:-:* | *:-) cat >"$ac_tmp/stdin" \
+ || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;;
+ esac
+ ;;
+ esac
+
+ ac_dir=`$as_dirname -- "$ac_file" ||
+$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$ac_file" : 'X\(//\)[^/]' \| \
+ X"$ac_file" : 'X\(//\)$' \| \
+ X"$ac_file" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$ac_file" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ as_dir="$ac_dir"; as_fn_mkdir_p
+ ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+ ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+ # A ".." for each directory in $ac_dir_suffix.
+ ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+ case $ac_top_builddir_sub in
+ "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+ *) ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+ esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+ .) # We are building in place.
+ ac_srcdir=.
+ ac_top_srcdir=$ac_top_builddir_sub
+ ac_abs_top_srcdir=$ac_pwd ;;
+ [\\/]* | ?:[\\/]* ) # Absolute name.
+ ac_srcdir=$srcdir$ac_dir_suffix;
+ ac_top_srcdir=$srcdir
+ ac_abs_top_srcdir=$srcdir ;;
+ *) # Relative name.
+ ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+ ac_top_srcdir=$ac_top_build_prefix$srcdir
+ ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+
+ case $ac_mode in
+ :F)
+ #
+ # CONFIG_FILE
+ #
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# If the template does not know about datarootdir, expand it.
+# FIXME: This hack should be removed a few years after 2.60.
+ac_datarootdir_hack=; ac_datarootdir_seen=
+ac_sed_dataroot='
+/datarootdir/ {
+ p
+ q
+}
+/@datadir@/p
+/@docdir@/p
+/@infodir@/p
+/@localedir@/p
+/@mandir@/p'
+case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in
+*datarootdir*) ac_datarootdir_seen=yes;;
+*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*)
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5
+$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;}
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ ac_datarootdir_hack='
+ s&@datadir@&$datadir&g
+ s&@docdir@&$docdir&g
+ s&@infodir@&$infodir&g
+ s&@localedir@&$localedir&g
+ s&@mandir@&$mandir&g
+ s&\\\${datarootdir}&$datarootdir&g' ;;
+esac
+_ACEOF
+
+# Neutralize VPATH when `$srcdir' = `.'.
+# Shell code in configure.ac might set extrasub.
+# FIXME: do we really want to maintain this feature?
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_sed_extra="$ac_vpsub
+$extrasub
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+:t
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b
+s|@configure_input@|$ac_sed_conf_input|;t t
+s&@top_builddir@&$ac_top_builddir_sub&;t t
+s&@top_build_prefix@&$ac_top_build_prefix&;t t
+s&@srcdir@&$ac_srcdir&;t t
+s&@abs_srcdir@&$ac_abs_srcdir&;t t
+s&@top_srcdir@&$ac_top_srcdir&;t t
+s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t
+s&@builddir@&$ac_builddir&;t t
+s&@abs_builddir@&$ac_abs_builddir&;t t
+s&@abs_top_builddir@&$ac_abs_top_builddir&;t t
+$ac_datarootdir_hack
+"
+eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \
+ >$ac_tmp/out || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+
+test -z "$ac_datarootdir_hack$ac_datarootdir_seen" &&
+ { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } &&
+ { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \
+ "$ac_tmp/out"`; test -z "$ac_out"; } &&
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined. Please make sure it is defined" >&5
+$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined. Please make sure it is defined" >&2;}
+
+ rm -f "$ac_tmp/stdin"
+ case $ac_file in
+ -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";;
+ *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";;
+ esac \
+ || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+ ;;
+
+
+
+ esac
+
+done # for ac_tag
+
+
+as_fn_exit 0
+_ACEOF
+ac_clean_files=$ac_clean_files_save
+
+test $ac_write_fail = 0 ||
+ as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5
+
+
+# configure is writing to config.log, and then calls config.status.
+# config.status does its own redirection, appending to config.log.
+# Unfortunately, on DOS this fails, as config.log is still kept open
+# by configure, so config.status won't be able to write to it; its
+# output is simply discarded. So we exec the FD to /dev/null,
+# effectively closing config.log, so it can be properly (re)opened and
+# appended to by config.status. When coming back to configure, we
+# need to make the FD available again.
+if test "$no_create" != yes; then
+ ac_cs_success=:
+ ac_config_status_args=
+ test "$silent" = yes &&
+ ac_config_status_args="$ac_config_status_args --quiet"
+ exec 5>/dev/null
+ $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false
+ exec 5>>config.log
+ # Use ||, not &&, to avoid exiting from the if with $? = 1, which
+ # would make configure fail if this is the last instruction.
+ $ac_cs_success || as_fn_exit 1
+fi
+if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5
+$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
+fi
+
diff --git a/htslib/configure.ac b/htslib/configure.ac
new file mode 100644
index 0000000..77ce99c
--- /dev/null
+++ b/htslib/configure.ac
@@ -0,0 +1,93 @@
+# Configure script for htslib, a C library for high-throughput sequencing data.
+#
+# Copyright (C) 2015 Genome Research Ltd.
+#
+# Author: John Marshall <jm18 at sanger.ac.uk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+dnl Process this file with autoconf to produce a configure script
+AC_INIT([HTSlib], m4_esyscmd_s([make print-version]),
+ [samtools-help@lists.sourceforge.net], [], [http://www.htslib.org/])
+AC_PREREQ(2.63) dnl This version introduced 4-argument AC_CHECK_HEADER
+AC_CONFIG_SRCDIR(hts.c)
+
+dnl Copyright notice to be copied into the generated configure script
+AC_COPYRIGHT([Portions copyright (C) 2015 Genome Research Ltd.
+
+This configure script is free software: you are free to change and
+redistribute it. There is NO WARRANTY, to the extent permitted by law.])
+
+AC_PROG_CC
+AC_PROG_RANLIB
+
+AC_ARG_WITH([irods],
+ [AS_HELP_STRING([[--with-irods[=DIR]]],
+ [use RodsAPIs library (in DIR) to support iRODS URLs])],
+ [case $withval in
+ no) irods=disabled ;;
+ yes) irods=enabled ;;
+ *) irods=enabled; IRODS_HOME=$withval ;;
+ esac],
+ [irods=disabled])
+
+save_LIBS=$LIBS
+zlib_devel=ok
+dnl Set a trivial non-empty INCLUDES to avoid excess default includes tests
+AC_CHECK_HEADER([zlib.h], [], [zlib_devel=missing], [;])
+AC_CHECK_LIB(z, inflate, [], [zlib_devel=missing])
+LIBS=$save_LIBS
+
+if test $zlib_devel != ok; then
+ AC_MSG_ERROR([zlib development files not found
+
+HTSlib uses compression routines from the zlib library <http://zlib.net>.
+Building HTSlib requires zlib development files to be installed on the build
+machine; you may need to ensure a package such as zlib1g-dev (on Debian or
+Ubuntu Linux) or zlib-devel (on RPM-based Linux distributions) is installed.
+
+FAILED. This error must be resolved in order to build HTSlib successfully.])
+fi
+
+if test $irods = enabled; then
+ # TODO Also test whether we require libgssapi_krb5 and AC_CHECK_LIB it
+ save_LDFLAGS=$LDFLAGS
+ LDFLAGS="$LDFLAGS -L$IRODS_HOME/lib/core/obj"
+ AC_CHECK_LIB([RodsAPIs], [getRodsEnvFileName],
+ [case $with_irods in
+ yes) define_IRODS_HOME='# Uses $(IRODS_HOME) from the environment' ;;
+ *) define_IRODS_HOME="IRODS_HOME = $with_irods" ;;
+ esac],
+ [AC_MSG_ERROR([iRODS development files not found
+
+Support for iRODS URLs requires the libRodsAPIs client library and headers.
+Configure with --with-irods=DIR (or just --with-irods if \$IRODS_HOME has
+been exported with a suitable value), where DIR is the base of an iRODS tree
+such that the library is present as DIR/lib/core/obj/libRodsAPIs.* and headers
+are present under DIR/lib/api/include and so on.])],
+ [-lgssapi_krb5 -lpthread])
+ LDFLAGS=$save_LDFLAGS
+else
+ define_IRODS_HOME='IRODS_HOME ?= /disabled'
+fi
+AC_SUBST([irods])
+AC_SUBST([define_IRODS_HOME])
+
+AC_CONFIG_FILES(config.mk)
+AC_OUTPUT
diff --git a/htslib/cram/cram.h b/htslib/cram/cram.h
index 0b8b291..02f7774 100644
--- a/htslib/cram/cram.h
+++ b/htslib/cram/cram.h
@@ -31,8 +31,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*! \file
* CRAM interface.
*
- * Consider using the higher level scram_*() API for programs that wish to
- * be file format agnostic.
+ * Consider using the higher level hts_*() API for programs that wish to
+ * be file format agnostic (see htslib/hts.h).
*
* This API should be used for CRAM specific code. The specifics of the
* public API are implemented in cram_io.h, cram_encode.h and cram_decode.h
@@ -43,13 +43,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern "C" {
#endif
-#ifdef SAMTOOLS
-# include "cram/cram_samtools.h"
-#endif
-
#ifndef _CRAM_H_
#define _CRAM_H_
+#include "cram/cram_samtools.h"
#include "cram/sam_header.h"
#include "cram_structs.h"
#include "cram_io.h"
diff --git a/htslib/cram/cram_codecs.c b/htslib/cram/cram_codecs.c
index 3c3d13f..c6bfb16 100644
--- a/htslib/cram/cram_codecs.c
+++ b/htslib/cram/cram_codecs.c
@@ -271,8 +271,7 @@ static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) {
}
}
-
-
+ /* fits in current bit-field */
if (nbits <= block->bit+1) {
block->data[block->byte] |= (val << (block->bit+1-nbits));
if ((block->bit-=nbits) == -1) {
@@ -330,11 +329,11 @@ int cram_external_decode_int(cram_slice *slice, cram_codec *c,
/* Find the external block */
if (slice->block_by_id) {
if (!(b = slice->block_by_id[c->external.content_id]))
- return -1;
+ return *out_size?-1:0;
} else {
for (i = 0; i < slice->hdr->num_blocks; i++) {
b = slice->block[i];
- if (b->content_type == EXTERNAL &&
+ if (b && b->content_type == EXTERNAL &&
b->content_id == c->external.content_id) {
break;
}
@@ -361,11 +360,11 @@ int cram_external_decode_char(cram_slice *slice, cram_codec *c,
/* Find the external block */
if (slice->block_by_id) {
if (!(b = slice->block_by_id[c->external.content_id]))
- return -1;
+ return *out_size?-1:0;
} else {
for (i = 0; i < slice->hdr->num_blocks; i++) {
b = slice->block[i];
- if (b->content_type == EXTERNAL &&
+ if (b && b->content_type == EXTERNAL &&
b->content_id == c->external.content_id) {
break;
}
@@ -382,9 +381,9 @@ int cram_external_decode_char(cram_slice *slice, cram_codec *c,
return 0;
}
-int cram_external_decode_block(cram_slice *slice, cram_codec *c,
- cram_block *in, char *out_,
- int *out_size) {
+static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
+ cram_block *in, char *out_,
+ int *out_size) {
int i;
char *cp;
cram_block *b = NULL;
@@ -393,11 +392,11 @@ int cram_external_decode_block(cram_slice *slice, cram_codec *c,
/* Find the external block */
if (slice->block_by_id) {
if (!(b = slice->block_by_id[c->external.content_id]))
- return -1;
+ return *out_size?-1:0;
} else {
for (i = 0; i < slice->hdr->num_blocks; i++) {
b = slice->block[i];
- if (b->content_type == EXTERNAL &&
+ if (b && b->content_type == EXTERNAL &&
b->content_id == c->external.content_id) {
break;
}
@@ -450,11 +449,17 @@ cram_codec *cram_external_decode_init(char *data, int size,
return c;
}
-int cram_external_encode(cram_slice *slice, cram_codec *c,
- cram_block *out, char *in, int in_size) {
+int cram_external_encode_int(cram_slice *slice, cram_codec *c,
+ char *in, int in_size) {
uint32_t *i32 = (uint32_t *)in;
- itf8_put_blk(out, *i32);
+ itf8_put_blk(c->out, *i32);
+ return 0;
+}
+
+int cram_external_encode_char(cram_slice *slice, cram_codec *c,
+ char *in, int in_size) {
+ BLOCK_APPEND(c->out, in, in_size);
return 0;
}
@@ -495,7 +500,12 @@ cram_codec *cram_external_encode_init(cram_stats *st,
return NULL;
c->codec = E_EXTERNAL;
c->free = cram_external_encode_free;
- c->encode = cram_external_encode;
+ if (option == E_INT || option == E_LONG)
+ c->encode = cram_external_encode_int;
+ else if (option == E_BYTE_ARRAY || option == E_BYTE)
+ c->encode = cram_external_encode_char;
+ else
+ abort();
c->store = cram_external_encode_store;
c->e_external.content_id = (size_t)dat;
@@ -516,7 +526,7 @@ int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char
out_i[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset;
} else {
for (i = 0, n = *out_size; i < n; i++)
- out_i[i] = 0;
+ out_i[i] = -c->beta.offset;
}
return 0;
@@ -530,7 +540,7 @@ int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char
out[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset;
} else {
for (i = 0, n = *out_size; i < n; i++)
- out[i] = 0;
+ out[i] = -c->beta.offset;
}
return 0;
@@ -591,23 +601,25 @@ int cram_beta_encode_store(cram_codec *c, cram_block *b,
}
int cram_beta_encode_int(cram_slice *slice, cram_codec *c,
- cram_block *out, char *in, int in_size) {
+ char *in, int in_size) {
int *syms = (int *)in;
int i, r = 0;
for (i = 0; i < in_size; i++)
- r |= store_bits_MSB(out, syms[i] + c->e_beta.offset, c->e_beta.nbits);
+ r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset,
+ c->e_beta.nbits);
return r;
}
int cram_beta_encode_char(cram_slice *slice, cram_codec *c,
- cram_block *out, char *in, int in_size) {
+ char *in, int in_size) {
unsigned char *syms = (unsigned char *)in;
int i, r = 0;
for (i = 0; i < in_size; i++)
- r |= store_bits_MSB(out, syms[i] + c->e_beta.offset, c->e_beta.nbits);
+ r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset,
+ c->e_beta.nbits);
return r;
}
@@ -859,9 +871,9 @@ int cram_huffman_decode_char(cram_slice *slice, cram_codec *c,
//val <<= dlen;
//val |= get_bits_MSB(in, dlen);
- //last_len = (len += dlen);
+ //last_len = (len += dlen);
- last_len = (len += dlen);
+ last_len = (len += dlen);
for (; dlen; dlen--) GET_BIT_MSB(in, val);
idx = val - codes[idx].p;
@@ -909,9 +921,9 @@ int cram_huffman_decode_int(cram_slice *slice, cram_codec *c,
//val <<= dlen;
//val |= get_bits_MSB(in, dlen);
- //last_len = (len += dlen);
+ //last_len = (len += dlen);
- last_len = (len += dlen);
+ last_len = (len += dlen);
for (; dlen; dlen--) GET_BIT_MSB(in, val);
idx = val - codes[idx].p;
@@ -1051,12 +1063,12 @@ cram_codec *cram_huffman_decode_init(char *data, int size,
}
int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c,
- cram_block *out, char *in, int in_size) {
+ char *in, int in_size) {
return 0;
}
int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
- cram_block *out, char *in, int in_size) {
+ char *in, int in_size) {
int i, code, len, r = 0;
unsigned char *syms = (unsigned char *)in;
@@ -1080,19 +1092,19 @@ int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
len = c->e_huffman.codes[i].len;
}
- r |= store_bits_MSB(out, code, len);
+ r |= store_bits_MSB(c->out, code, len);
} while (--in_size);
return r;
}
int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c,
- cram_block *out, char *in, int in_size) {
+ char *in, int in_size) {
return 0;
}
int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
- cram_block *out, char *in, int in_size) {
+ char *in, int in_size) {
int i, code, len, r = 0;
int *syms = (int *)in;
@@ -1117,7 +1129,7 @@ int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
len = c->e_huffman.codes[i].len;
}
- r |= store_bits_MSB(out, code, len);
+ r |= store_bits_MSB(c->out, code, len);
} while (--in_size);
return r;
@@ -1428,19 +1440,37 @@ cram_codec *cram_byte_array_len_decode_init(char *data, int size,
}
int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c,
- cram_block *out, char *in, int in_size) {
- return -1; // not imp.
+ char *in, int in_size) {
+ int32_t i32 = in_size;
+ int r = 0;
+
+ r |= c->e_byte_array_len.len_codec->encode(slice,
+ c->e_byte_array_len.len_codec,
+ (char *)&i32, 1);
+ r |= c->e_byte_array_len.val_codec->encode(slice,
+ c->e_byte_array_len.val_codec,
+ in, in_size);
+ return r;
}
void cram_byte_array_len_encode_free(cram_codec *c) {
if (!c)
return;
+
+ if (c->e_byte_array_len.len_codec)
+ c->e_byte_array_len.len_codec->free(c->e_byte_array_len.len_codec);
+
+ if (c->e_byte_array_len.val_codec)
+ c->e_byte_array_len.val_codec->free(c->e_byte_array_len.val_codec);
+
free(c);
}
int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
char *prefix, int version) {
- int len = 0;
+ int len = 0, len2, len3;
+ cram_codec *tc;
+ cram_block *b_len, *b_val;
if (prefix) {
size_t l = strlen(prefix);
@@ -1448,16 +1478,23 @@ int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
len += l;
}
+ tc = c->e_byte_array_len.len_codec;
+ b_len = cram_new_block(0, 0);
+ len2 = tc->store(tc, b_len, NULL, version);
+
+ tc = c->e_byte_array_len.val_codec;
+ b_val = cram_new_block(0, 0);
+ len3 = tc->store(tc, b_val, NULL, version);
+
len += itf8_put_blk(b, c->codec);
- len += itf8_put_blk(b, c->e_byte_array_len.len_len +
- c->e_byte_array_len.val_len);
- BLOCK_APPEND(b, c->e_byte_array_len.len_dat, c->e_byte_array_len.len_len);
- len += c->e_byte_array_len.len_len;
+ len += itf8_put_blk(b, len2+len3);
+ BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
+ BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val));
- BLOCK_APPEND(b, c->e_byte_array_len.val_dat, c->e_byte_array_len.val_len);
- len += c->e_byte_array_len.val_len;
+ cram_free_block(b_len);
+ cram_free_block(b_val);
- return len;
+ return len + len2 + len3;
}
cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
@@ -1475,10 +1512,14 @@ cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
c->encode = cram_byte_array_len_encode;
c->store = cram_byte_array_len_encode_store;
- c->e_byte_array_len.len_len = e->len_len;
- c->e_byte_array_len.len_dat = e->len_dat;
- c->e_byte_array_len.val_len = e->val_len;
- c->e_byte_array_len.val_dat = e->val_dat;
+ c->e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding,
+ NULL, E_INT,
+ e->len_dat,
+ version);
+ c->e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding,
+ NULL, E_BYTE_ARRAY,
+ e->val_dat,
+ version);
return c;
}
@@ -1487,20 +1528,20 @@ cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
* ---------------------------------------------------------------------------
* BYTE_ARRAY_STOP
*/
-int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
- cram_block *in, char *out,
- int *out_size) {
+static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
+ cram_block *in, char *out,
+ int *out_size) {
int i;
cram_block *b = NULL;
char *cp, ch;
if (slice->block_by_id) {
if (!(b = slice->block_by_id[c->byte_array_stop.content_id]))
- return -1;
+ return *out_size?-1:0;
} else {
for (i = 0; i < slice->hdr->num_blocks; i++) {
b = slice->block[i];
- if (b->content_type == EXTERNAL &&
+ if (b && b->content_type == EXTERNAL &&
b->content_id == c->byte_array_stop.content_id) {
break;
}
@@ -1529,20 +1570,19 @@ int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
cram_block *in, char *out_,
int *out_size) {
- int space = 256;
cram_block *b = NULL;
cram_block *out = (cram_block *)out_;
- char *cp, ch, *out_cp, *cp_end, *out_end;
+ char *cp, *out_cp, *cp_end;
char stop;
if (slice->block_by_id) {
if (!(b = slice->block_by_id[c->byte_array_stop.content_id]))
- return -1;
+ return *out_size?-1:0;
} else {
int i;
for (i = 0; i < slice->hdr->num_blocks; i++) {
b = slice->block[i];
- if (b->content_type == EXTERNAL &&
+ if (b && b->content_type == EXTERNAL &&
b->content_id == c->byte_array_stop.content_id) {
break;
}
@@ -1555,25 +1595,20 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
return -1;
cp = (char *)b->data + b->idx;
cp_end = (char *)b->data + b->uncomp_size;
- BLOCK_GROW(out, space);
out_cp = (char *)BLOCK_END(out);
- out_end = out_cp + space;
stop = c->byte_array_stop.stop;
- while ((ch = *cp) != stop) {
- if (cp++ == cp_end)
- return -1;
- *out_cp++ = ch;
-
- if (out_cp == out_end) {
- BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out);
- space *= 2;
- BLOCK_GROW(out, space);
- out_cp = (char *)BLOCK_END(out);
- out_end = out_cp + space;
- }
+ if (cp_end - cp < out->alloc - out->byte) { /* copy straight into out */
+ while (cp != cp_end && *cp != stop) /* bound check before deref */
+ *out_cp++ = *cp++;
+ BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out);
+ } else {
+ char *cp_start;
+ for (cp_start = cp; cp != cp_end && *cp != stop; cp++) /* find end */
+ ;
+ BLOCK_APPEND(out, cp_start, cp - cp_start);
+ BLOCK_GROW(out, cp - cp_start);
}
- BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out);
*out_size = cp - (char *)(b->data + b->idx);
b->idx = cp - (char *)b->data + 1;
@@ -1603,7 +1638,7 @@ cram_codec *cram_byte_array_stop_decode_init(char *data, int size,
c->free = cram_byte_array_stop_decode_free;
c->byte_array_stop.stop = *cp++;
- if (version == CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(version) == 1) {
c->byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16)
+ (cp[3]<<24);
cp += 4;
@@ -1621,8 +1656,10 @@ cram_codec *cram_byte_array_stop_decode_init(char *data, int size,
}
int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c,
- cram_block *out, char *in, int in_size) {
- return -1; // not imp.
+ char *in, int in_size) {
+ BLOCK_APPEND(c->out, in, in_size);
+ BLOCK_APPEND_CHAR(c->out, c->e_byte_array_stop.stop);
+ return 0;
}
void cram_byte_array_stop_encode_free(cram_codec *c) {
@@ -1644,7 +1681,7 @@ int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b,
cp += itf8_put(cp, c->codec);
- if (version == CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(version) == 1) {
cp += itf8_put(cp, 5);
*cp++ = c->e_byte_array_stop.stop;
*cp++ = (c->e_byte_array_stop.content_id >> 0) & 0xff;
@@ -1756,9 +1793,54 @@ cram_codec *cram_encoder_init(enum cram_encoding codec,
return NULL;
if (encode_init[codec]) {
- return encode_init[codec](st, option, dat, version);
+ cram_codec *r;
+ if ((r = encode_init[codec](st, option, dat, version)))
+ r->out = NULL;
+ return r;
} else {
fprintf(stderr, "Unimplemented codec of type %s\n", codec2str(codec));
abort();
}
}
+
+/*
+ * Returns the content_id of the block used by codec c: -1 for the CORE block
+ * or -2 if no block is needed. For BYTE_ARRAY_LEN this is the length codec's
+ * id and *id2 (when non-NULL) gets the value codec's id; otherwise *id2 = -2.
+ */
+int cram_codec_to_id(cram_codec *c, int *id2) {
+ int bnum1, bnum2 = -2;
+
+ switch (c->codec) {
+ case E_HUFFMAN:
+ bnum1 = c->huffman.ncodes == 1 ? -2 : -1;
+ break;
+ case E_GOLOMB:
+ case E_BETA:
+ case E_SUBEXP:
+ case E_GOLOMB_RICE:
+ case E_GAMMA:
+ bnum1 = -1;
+ break;
+ case E_EXTERNAL:
+ bnum1 = c->external.content_id;
+ break;
+ case E_BYTE_ARRAY_LEN:
+ bnum1 = cram_codec_to_id(c->byte_array_len.len_codec, NULL);
+ bnum2 = cram_codec_to_id(c->byte_array_len.value_codec, NULL);
+ break;
+ case E_BYTE_ARRAY_STOP:
+ bnum1 = c->byte_array_stop.content_id;
+ break;
+ case E_NULL:
+ bnum1 = -2;
+ break;
+ default:
+ fprintf(stderr, "Unknown codec type %d\n", c->codec);
+ bnum1 = -1;
+ }
+
+ if (id2)
+ *id2 = bnum2;
+ return bnum1;
+}
diff --git a/htslib/cram/cram_codecs.h b/htslib/cram/cram_codecs.h
index 7037814..e047901 100644
--- a/htslib/cram/cram_codecs.h
+++ b/htslib/cram/cram_codecs.h
@@ -97,10 +97,12 @@ typedef struct {
} cram_byte_array_stop_decoder;
typedef struct {
- uint32_t len_len;
- unsigned char *len_dat;
- uint32_t val_len;
- unsigned char *val_dat;
+ enum cram_encoding len_encoding;
+ enum cram_encoding val_encoding;
+ void *len_dat;
+ void *val_dat;
+ struct cram_codec *len_codec;
+ struct cram_codec *val_codec;
} cram_byte_array_len_encoder;
/*
@@ -108,11 +110,12 @@ typedef struct {
*/
typedef struct cram_codec {
enum cram_encoding codec;
+ cram_block *out;
void (*free)(struct cram_codec *codec);
int (*decode)(cram_slice *slice, struct cram_codec *codec,
cram_block *in, char *out, int *out_size);
int (*encode)(cram_slice *slice, struct cram_codec *codec,
- cram_block *out, char *in, int in_size);
+ char *in, int in_size);
int (*store)(struct cram_codec *codec, cram_block *b, char *prefix,
int version);
union {
@@ -146,7 +149,14 @@ cram_codec *cram_encoder_init(enum cram_encoding codec, cram_stats *st,
//#define GET_BIT_MSB(b,v) (void)(v<<=1, v|=(b->data[b->byte] >> b->bit)&1, (--b->bit == -1) && (b->bit = 7, b->byte++))
-#define GET_BIT_MSB(b,v) (void)(v<<=1, v|=(b->data[b->byte] >> b->bit)&1, b->byte += (b->bit==0), b->bit+=(b->bit==0)*8-1)
+#define GET_BIT_MSB(b,v) (void)(v<<=1, v|=(b->data[b->byte] >> b->bit)&1, b->byte += (--b->bit<0), b->bit&=7)
+
+/*
+ * Returns the content_id of the block used by codec c: -1 for the CORE block
+ * or -2 if no block is needed. For BYTE_ARRAY_LEN this is the length codec's
+ * id and *id2 (when non-NULL) gets the value codec's id; otherwise *id2 = -2.
+ */
+int cram_codec_to_id(cram_codec *c, int *id2);
#ifdef __cplusplus
}
diff --git a/htslib/cram/cram_decode.c b/htslib/cram/cram_decode.c
index e002ac9..1d6281e 100644
--- a/htslib/cram/cram_decode.c
+++ b/htslib/cram/cram_decode.c
@@ -133,13 +133,15 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
return NULL;
if (b->method != RAW) {
- if (cram_uncompress_block(b))
+ if (cram_uncompress_block(b)) {
+ free(hdr);
return NULL;
+ }
}
cp = (char *)b->data;
- if (fd->version == CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
cp += itf8_get(cp, &hdr->ref_seq_id);
cp += itf8_get(cp, &hdr->ref_seq_start);
cp += itf8_get(cp, &hdr->ref_seq_span);
@@ -367,179 +369,212 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
*/
if (key[0] == 'B' && key[1] == 'F') {
- if (!(hdr->BF_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_BF] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'C' && key[1] == 'F') {
- if (!(hdr->CF_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_CF] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'R' && key[1] == 'I') {
- if (!(hdr->RI_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_RI] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'R' && key[1] == 'L') {
- if (!(hdr->RL_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_RL] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'A' && key[1] == 'P') {
- if (!(hdr->AP_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_AP] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'R' && key[1] == 'G') {
- if (!(hdr->RG_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_RG] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'M' && key[1] == 'F') {
- if (!(hdr->MF_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_MF] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'N' && key[1] == 'S') {
- if (!(hdr->NS_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_NS] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'N' && key[1] == 'P') {
- if (!(hdr->NP_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_NP] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'T' && key[1] == 'S') {
- if (!(hdr->TS_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_TS] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'N' && key[1] == 'F') {
- if (!(hdr->NF_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_NF] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'T' && key[1] == 'C') {
- if (!(hdr->TC_codec = cram_decoder_init(encoding, cp, size, E_BYTE,
- fd->version))) {
+ if (!(hdr->codecs[DS_TC] = cram_decoder_init(encoding, cp, size,
+ E_BYTE,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'T' && key[1] == 'N') {
- if (!(hdr->TN_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_TN] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'F' && key[1] == 'N') {
- if (!(hdr->FN_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_FN] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'F' && key[1] == 'C') {
- if (!(hdr->FC_codec = cram_decoder_init(encoding, cp, size, E_BYTE,
- fd->version))) {
+ if (!(hdr->codecs[DS_FC] = cram_decoder_init(encoding, cp, size,
+ E_BYTE,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'F' && key[1] == 'P') {
- if (!(hdr->FP_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_FP] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'B' && key[1] == 'S') {
- if (!(hdr->BS_codec = cram_decoder_init(encoding, cp, size, E_BYTE,
- fd->version))) {
+ if (!(hdr->codecs[DS_BS] = cram_decoder_init(encoding, cp, size,
+ E_BYTE,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'I' && key[1] == 'N') {
- if (!(hdr->IN_codec = cram_decoder_init(encoding, cp, size,
- E_BYTE_ARRAY,
- fd->version))) {
+ if (!(hdr->codecs[DS_IN] = cram_decoder_init(encoding, cp, size,
+ E_BYTE_ARRAY,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'S' && key[1] == 'C') {
- if (!(hdr->SC_codec = cram_decoder_init(encoding, cp, size,
- E_BYTE_ARRAY,
- fd->version))) {
+ if (!(hdr->codecs[DS_SC] = cram_decoder_init(encoding, cp, size,
+ E_BYTE_ARRAY,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'D' && key[1] == 'L') {
- if (!(hdr->DL_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_DL] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'B' && key[1] == 'A') {
- if (!(hdr->BA_codec = cram_decoder_init(encoding, cp, size, E_BYTE,
- fd->version))) {
+ if (!(hdr->codecs[DS_BA] = cram_decoder_init(encoding, cp, size,
+ E_BYTE,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'B' && key[1] == 'B') {
+ if (!(hdr->codecs[DS_BB] = cram_decoder_init(encoding, cp, size,
+ E_BYTE_ARRAY,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'R' && key[1] == 'S') {
- if (!(hdr->RS_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_RS] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'P' && key[1] == 'D') {
- if (!(hdr->PD_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_PD] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'H' && key[1] == 'C') {
- if (!(hdr->HC_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_HC] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'M' && key[1] == 'Q') {
- if (!(hdr->MQ_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_MQ] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'R' && key[1] == 'N') {
- if (!(hdr->RN_codec = cram_decoder_init(encoding, cp, size,
- E_BYTE_ARRAY_BLOCK,
- fd->version))) {
+ if (!(hdr->codecs[DS_RN] = cram_decoder_init(encoding, cp, size,
+ E_BYTE_ARRAY_BLOCK,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'Q' && key[1] == 'S') {
- if (!(hdr->QS_codec = cram_decoder_init(encoding, cp, size, E_BYTE,
- fd->version))) {
+ if (!(hdr->codecs[DS_QS] = cram_decoder_init(encoding, cp, size,
+ E_BYTE,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
- if (!(hdr->Qs_codec = cram_decoder_init(encoding, cp, size,
- E_BYTE_ARRAY,
- fd->version))) {
+ } else if (key[0] == 'Q' && key[1] == 'Q') {
+ if (!(hdr->codecs[DS_QQ] = cram_decoder_init(encoding, cp, size,
+ E_BYTE_ARRAY,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'T' && key[1] == 'L') {
- if (!(hdr->TL_codec = cram_decoder_init(encoding, cp, size, E_INT,
- fd->version))) {
+ if (!(hdr->codecs[DS_TL] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
@@ -601,6 +636,323 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
return hdr;
}
+/*
+ * Computes hdr->data_series, the set of data series that must be decoded
+ * for slice s given fd->required_fields, and uncompresses their blocks.
+ *
+ * As well as code prerequisites (eg QS needs RL) we scan the record and
+ * tag encoding maps to see which data series share the same block, either
+ * external or CORE.  For example if we need BF but MQ and CF are also in
+ * the same block then they become dependencies too; this is repeated
+ * until no further data series get added.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_dependent_data_series(cram_fd *fd,
+                               cram_block_compression_hdr *hdr,
+                               cram_slice *s) {
+    int *block_used;             // per-block flag: 1 if a needed series uses it
+    int core_used = 0;
+    int i;
+    static int i_to_id[] = {     // bit i of data_series <-> codecs[i_to_id[i]]
+        DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
+        DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
+        DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
+        DS_HC, DS_SC, DS_BB, DS_QQ,
+    };
+    uint32_t orig_ds;
+
+    /*
+     * Set the data_series bit field based on fd->required_fields
+     * contents.  Unset (0) or INT_MAX means decode everything.
+     */
+    if (fd->required_fields && fd->required_fields != INT_MAX) {
+        hdr->data_series = 0;
+
+        if (fd->required_fields & SAM_QNAME)
+            hdr->data_series |= CRAM_RN;
+
+        if (fd->required_fields & SAM_FLAG)
+            hdr->data_series |= CRAM_BF;
+
+        if (fd->required_fields & SAM_RNAME)
+            hdr->data_series |= CRAM_RI | CRAM_BF;
+
+        if (fd->required_fields & SAM_POS)
+            hdr->data_series |= CRAM_AP | CRAM_BF;
+
+        if (fd->required_fields & SAM_MAPQ)
+            hdr->data_series |= CRAM_MQ;
+
+        if (fd->required_fields & SAM_CIGAR)
+            hdr->data_series |= CRAM_CIGAR;
+
+        if (fd->required_fields & SAM_RNEXT)
+            hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS | CRAM_BF;
+
+        if (fd->required_fields & SAM_PNEXT)
+            hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
+
+        if (fd->required_fields & SAM_TLEN)
+            hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
+                CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
+
+        if (fd->required_fields & SAM_SEQ)
+            hdr->data_series |= CRAM_SEQ;
+
+        if (!(fd->required_fields & SAM_AUX))
+            // No easy way to get MD/NM without other tags at present
+            fd->decode_md = 0;
+
+        if (fd->required_fields & SAM_QUAL)
+            hdr->data_series |= CRAM_SEQ;
+
+        if (fd->required_fields & SAM_AUX)
+            hdr->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
+
+        if (fd->required_fields & SAM_RGAUX)
+            hdr->data_series |= CRAM_RG | CRAM_BF;
+
+        // Always uncompress CORE block
+        if (cram_uncompress_block(s->block[0]))
+            return -1;
+    } else {
+        hdr->data_series = CRAM_ALL;
+
+        for (i = 0; i < s->hdr->num_blocks; i++) {
+            if (cram_uncompress_block(s->block[i]))
+                return -1;
+        }
+
+        return 0;
+    }
+
+    block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
+    if (!block_used)
+        return -1;
+
+    do {
+        /*
+         * Also set data_series based on code prerequisites. Eg if we need
+         * CRAM_QS then we also need to know CRAM_RL so we know how long it
+         * is, or if we need FC/FP then we also need FN (number of features).
+         *
+         * It's not reciprocal though. We may be needing to decode FN
+         * but have no need to decode FC, FP and cigar ops.
+         */
+        if (hdr->data_series & CRAM_RS) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_PD) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_HC) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_QS) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_IN) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_SC) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_DL) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_BA) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_BB) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_QQ) hdr->data_series |= CRAM_FC|CRAM_FP;
+
+        // cram_decode_seq() needs seq[] array
+        if (hdr->data_series & (CRAM_SEQ|CRAM_CIGAR)) hdr->data_series |= CRAM_RL;
+
+        if (hdr->data_series & CRAM_FP) hdr->data_series |= CRAM_FC;
+        if (hdr->data_series & CRAM_FC) hdr->data_series |= CRAM_FN;
+        if (hdr->data_series & CRAM_aux) hdr->data_series |= CRAM_TL;
+        if (hdr->data_series & CRAM_MF) hdr->data_series |= CRAM_CF;
+        if (hdr->data_series & CRAM_MQ) hdr->data_series |= CRAM_BF;
+        if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_RI;
+        if (hdr->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
+            hdr->data_series |= CRAM_CF;
+        if (!hdr->read_names_included && hdr->data_series & CRAM_RN)
+            hdr->data_series |= CRAM_CF | CRAM_NF;
+        if (hdr->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
+            hdr->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
+
+        orig_ds = hdr->data_series; // snapshot; loop ends when nothing is added
+
+        // Find which blocks are in use.
+        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
+            int bnum1, bnum2, j;
+            cram_codec *c = hdr->codecs[i_to_id[i]];
+
+            if (!(hdr->data_series & (1<<i)))
+                continue;
+
+            if (!c)
+                continue;
+
+            bnum1 = cram_codec_to_id(c, &bnum2);
+
+            for (;;) {
+                switch (bnum1) {
+                case -2: // nothing to mark for this id
+                    break;
+
+                case -1: // handled as the CORE block
+                    core_used = 1;
+                    break;
+
+                default: // matched against external block content ids
+                    for (j = 0; j < s->hdr->num_blocks; j++) {
+                        if (s->block[j]->content_type == EXTERNAL &&
+                            s->block[j]->content_id == bnum1) {
+                            block_used[j] = 1;
+                            if (cram_uncompress_block(s->block[j])) {
+                                free(block_used);
+                                return -1;
+                            }
+                        }
+                    }
+                    break;
+                }
+
+                if (bnum2 == -2 || bnum1 == bnum2)
+                    break;
+
+                bnum1 = bnum2; // 2nd pass
+            }
+        }
+
+        // Tags too
+        if ((fd->required_fields & SAM_AUX) ||
+            (hdr->data_series & CRAM_aux)) {
+            for (i = 0; i < CRAM_MAP_HASH; i++) {
+                int bnum1, bnum2, j;
+                cram_map *m;
+
+                // Step in the for-header so "continue" cannot spin forever
+                for (m = hdr->tag_encoding_map[i]; m; m = m->next) {
+                    cram_codec *c = m->codec;
+                    if (!c)
+                        continue;
+
+                    bnum1 = cram_codec_to_id(c, &bnum2);
+
+                    for (;;) {
+                        switch (bnum1) {
+                        case -2:
+                            break;
+
+                        case -1:
+                            core_used = 1;
+                            break;
+
+                        default:
+                            for (j = 0; j < s->hdr->num_blocks; j++) {
+                                if (s->block[j]->content_type == EXTERNAL &&
+                                    s->block[j]->content_id == bnum1) {
+                                    block_used[j] = 1;
+                                    if (cram_uncompress_block(s->block[j])) {
+                                        free(block_used);
+                                        return -1;
+                                    }
+                                }
+                            }
+                            break;
+                        }
+
+                        if (bnum2 == -2 || bnum1 == bnum2)
+                            break;
+
+                        bnum1 = bnum2; // 2nd pass
+                    }
+                }
+            }
+        }
+
+        // We now know which blocks are in use, so repeat and find
+        // which other data series need to be added.
+        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
+            int bnum1, bnum2, j;
+            cram_codec *c = hdr->codecs[i_to_id[i]];
+
+            if (!c)
+                continue;
+
+            bnum1 = cram_codec_to_id(c, &bnum2);
+
+            for (;;) {
+                switch (bnum1) {
+                case -2:
+                    break;
+
+                case -1:
+                    if (core_used) {
+                        //printf(" + data series %08x:\n", 1<<i);
+                        hdr->data_series |= 1<<i;
+                    }
+                    break;
+
+                default:
+                    for (j = 0; j < s->hdr->num_blocks; j++) {
+                        if (s->block[j]->content_type == EXTERNAL &&
+                            s->block[j]->content_id == bnum1) {
+                            if (block_used[j]) {
+                                //printf(" + data series %08x:\n", 1<<i);
+                                hdr->data_series |= 1<<i;
+                            }
+                        }
+                    }
+                    break;
+                }
+
+                if (bnum2 == -2 || bnum1 == bnum2)
+                    break;
+
+                bnum1 = bnum2; // 2nd pass
+            }
+        }
+
+        // Tags too
+        for (i = 0; i < CRAM_MAP_HASH; i++) {
+            int bnum1, bnum2, j;
+            cram_map *m;
+
+            for (m = hdr->tag_encoding_map[i]; m; m = m->next) {
+                cram_codec *c = m->codec;
+                if (!c)
+                    continue; // m still advances via the for-header
+
+                bnum1 = cram_codec_to_id(c, &bnum2);
+
+                for (;;) {
+                    switch (bnum1) {
+                    case -2:
+                        break;
+
+                    case -1:
+                        //printf(" + data series %08x:\n", CRAM_aux);
+                        hdr->data_series |= CRAM_aux;
+                        break;
+
+                    default:
+                        for (j = 0; j < s->hdr->num_blocks; j++) {
+                            if (s->block[j]->content_type == EXTERNAL &&
+                                s->block[j]->content_id == bnum1) {
+                                if (block_used[j]) {
+                                    //printf(" + data series %08x:\n",
+                                    // CRAM_aux);
+                                    hdr->data_series |= CRAM_aux;
+                                }
+                            }
+                        }
+                        break;
+                    }
+
+                    if (bnum2 == -2 || bnum1 == bnum2)
+                        break;
+
+                    bnum1 = bnum2; // 2nd pass
+                }
+            }
+        }
+    } while (orig_ds != hdr->data_series);
+
+    free(block_used);
+    return 0;
+}
+
/* ----------------------------------------------------------------------
* CRAM slices
*/
@@ -630,8 +982,15 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
cp += itf8_get(cp, &hdr->ref_seq_span);
}
cp += itf8_get(cp, &hdr->num_records);
- if (fd->version != CRAM_1_VERS)
- cp += itf8_get(cp, &hdr->record_counter);
+ hdr->record_counter = 0;
+ if (CRAM_MAJOR_VERS(fd->version) == 2) {
+ int32_t i32;
+ cp += itf8_get(cp, &i32);
+ hdr->record_counter = i32;
+ } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+ cp += ltf8_get(cp, &hdr->record_counter);
+ }
+
cp += itf8_get(cp, &hdr->num_blocks);
cp += itf8_get(cp, &hdr->num_content_ids);
@@ -649,7 +1008,7 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
cp += itf8_get(cp, &hdr->ref_base_id);
}
- if (fd->version != CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) != 1) {
memcpy(hdr->md5, cp, 16);
} else {
memset(hdr->md5, 0, 16);
@@ -707,10 +1066,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
uint32_t cigar_alloc = s->cigar_alloc;
uint32_t nm = 0, md_dist = 0;
int orig_aux = 0;
- int decode_md = fd->decode_md;
- char buf[20];
+ int decode_md = fd->decode_md && s->ref;
+ uint32_t ds = c->comp_hdr->data_series;
- if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
+ if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
memset(qual, 30, cr->len);
}
@@ -719,14 +1078,22 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
BLOCK_APPEND(s->aux_blk, "MDZ", 3);
}
- if (!c->comp_hdr->FN_codec) return -1;
- r |= c->comp_hdr->FN_codec->decode(s,c->comp_hdr->FN_codec, blk,
- (char *)&fn, &out_sz);
+ if (ds & CRAM_FN) {
+ if (!c->comp_hdr->codecs[DS_FN]) return -1;
+ r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN],
+ blk, (char *)&fn, &out_sz);
+ } else {
+ fn = 0;
+ }
ref_pos--; // count from 0
cr->cigar = ncigar;
+
+ if (!(ds & (CRAM_FC | CRAM_FP)))
+ goto skip_cigar;
+
for (f = 0; f < fn; f++) {
- int32_t pos;
+ int32_t pos = 0;
char op;
if (ncigar+2 >= cigar_alloc) {
@@ -736,12 +1103,22 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
return -1;
}
- if (!c->comp_hdr->FC_codec) return -1;
- r |= c->comp_hdr->FC_codec->decode(s, c->comp_hdr->FC_codec, blk,
- &op, &out_sz);
- if (!c->comp_hdr->FP_codec) return -1;
- r |= c->comp_hdr->FP_codec->decode(s, c->comp_hdr->FP_codec, blk,
- (char *)&pos, &out_sz);
+ if (ds & CRAM_FC) {
+ if (!c->comp_hdr->codecs[DS_FC]) return -1;
+ r |= c->comp_hdr->codecs[DS_FC]->decode(s,
+ c->comp_hdr->codecs[DS_FC],
+ blk,
+ &op, &out_sz);
+ }
+
+ if (!(ds & CRAM_FP))
+ continue;
+
+ if (!c->comp_hdr->codecs[DS_FP]) return -1;
+ r |= c->comp_hdr->codecs[DS_FP]->decode(s,
+ c->comp_hdr->codecs[DS_FP],
+ blk,
+ (char *)&pos, &out_sz);
pos += prev_pos;
if (pos > seq_pos) {
@@ -781,6 +1158,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
prev_pos = pos;
+ if (!(ds & CRAM_FC))
+ goto skip_cigar;
+
+ if (!(ds & CRAM_FC))
+ continue;
+
switch(op) {
case 'S': { // soft clip: IN
int32_t out_sz2 = 1;
@@ -789,20 +1172,36 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
cigar[ncigar++] = (cig_len<<4) + cig_op;
cig_len = 0;
}
- if (fd->version == CRAM_1_VERS) {
- r |= c->comp_hdr->IN_codec
- ? c->comp_hdr->IN_codec->decode(s, c->comp_hdr->IN_codec,
- blk, &seq[pos-1], &out_sz2)
- : (seq[pos-1] = 'N', out_sz2 = 1, 0);
- } else {
- r |= c->comp_hdr->SC_codec
- ? c->comp_hdr->SC_codec->decode(s, c->comp_hdr->SC_codec,
- blk, &seq[pos-1], &out_sz2)
- : (seq[pos-1] = 'N', out_sz2 = 1, 0);
- }
- cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
- cig_op = BAM_CSOFT_CLIP;
- seq_pos += out_sz2;
+ if (ds & CRAM_IN) {
+ switch (CRAM_MAJOR_VERS(fd->version)) {
+ case 1:
+ r |= c->comp_hdr->codecs[DS_IN]
+ ? c->comp_hdr->codecs[DS_IN]
+ ->decode(s, c->comp_hdr->codecs[DS_IN],
+ blk, &seq[pos-1], &out_sz2)
+ : (seq[pos-1] = 'N', out_sz2 = 1, 0);
+ break;
+
+ case 2:
+ default:
+ r |= c->comp_hdr->codecs[DS_SC]
+ ? c->comp_hdr->codecs[DS_SC]
+ ->decode(s, c->comp_hdr->codecs[DS_SC],
+ blk, &seq[pos-1], &out_sz2)
+ : (seq[pos-1] = 'N', out_sz2 = 1, 0);
+ break;
+
+// default:
+// r |= c->comp_hdr->codecs[DS_BB]
+// ? c->comp_hdr->codecs[DS_BB]
+// ->decode(s, c->comp_hdr->codecs[DS_BB],
+// blk, &seq[pos-1], &out_sz2)
+// : (seq[pos-1] = 'N', out_sz2 = 1, 0);
+ }
+ cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
+ cig_op = BAM_CSOFT_CLIP;
+ seq_pos += out_sz2;
+ }
break;
}
@@ -813,10 +1212,13 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
cigar[ncigar++] = (cig_len<<4) + cig_op;
cig_len = 0;
}
- if (!c->comp_hdr->BS_codec) return -1;
- r |= c->comp_hdr->BS_codec->decode(s, c->comp_hdr->BS_codec, blk,
- (char *)&base, &out_sz);
- seq[pos-1] = 'N'; // FIXME look up BS=base value
+ if (ds & CRAM_BS) {
+ if (!c->comp_hdr->codecs[DS_BS]) return -1;
+ r |= c->comp_hdr->codecs[DS_BS]
+ ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
+ (char *)&base, &out_sz);
+ seq[pos-1] = 'N'; // FIXME look up BS=base value
+ }
cig_op = BAM_CBASE_MISMATCH;
#else
int ref_base;
@@ -824,18 +1226,23 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
cigar[ncigar++] = (cig_len<<4) + cig_op;
cig_len = 0;
}
- if (!c->comp_hdr->BS_codec) return -1;
- r |= c->comp_hdr->BS_codec->decode(s, c->comp_hdr->BS_codec, blk,
- (char *)&base, &out_sz);
- if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
- seq[pos-1] = 'N';
- } else {
- ref_base = fd->L1[(uc)s->ref[ref_pos - s->ref_start +1]];
- seq[pos-1] = c->comp_hdr->substitution_matrix[ref_base][base];
- if (decode_md) {
- BLOCK_APPENDF_2(s->aux_blk, buf, "%d%c",
- md_dist, s->ref[ref_pos-s->ref_start +1]);
- md_dist = 0;
+ if (ds & CRAM_BS) {
+ if (!c->comp_hdr->codecs[DS_BS]) return -1;
+ r |= c->comp_hdr->codecs[DS_BS]
+ ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
+ (char *)&base, &out_sz);
+ if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
+ seq[pos-1] = 'N';
+ } else {
+ ref_base = fd->L1[(uc)s->ref[ref_pos - s->ref_start +1]];
+ seq[pos-1] = c->comp_hdr->
+ substitution_matrix[ref_base][base];
+ if (decode_md) {
+ BLOCK_APPEND_UINT(s->aux_blk, md_dist);
+ BLOCK_APPEND_CHAR(s->aux_blk,
+ s->ref[ref_pos-s->ref_start +1]);
+ md_dist = 0;
+ }
}
}
cig_op = BAM_CMATCH;
@@ -852,20 +1259,25 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
cigar[ncigar++] = (cig_len<<4) + cig_op;
cig_len = 0;
}
- if (!c->comp_hdr->DL_codec) return -1;
- r |= c->comp_hdr->DL_codec->decode(s, c->comp_hdr->DL_codec, blk,
- (char *)&i32, &out_sz);
- if (decode_md) {
- BLOCK_APPENDF_1(s->aux_blk, buf, "%d^", md_dist);
- BLOCK_APPEND(s->aux_blk, &s->ref[ref_pos - s->ref_start +1],
- i32);
- md_dist = 0;
- }
- cig_op = BAM_CDEL;
- cig_len += i32;
- ref_pos += i32;
- nm += i32;
- //printf(" %d: DL = %d (ret %d)\n", f, i32, r);
+ if (ds & CRAM_DL) {
+ if (!c->comp_hdr->codecs[DS_DL]) return -1;
+ r |= c->comp_hdr->codecs[DS_DL]
+ ->decode(s, c->comp_hdr->codecs[DS_DL], blk,
+ (char *)&i32, &out_sz);
+ if (decode_md) {
+ BLOCK_APPEND_UINT(s->aux_blk, md_dist);
+ BLOCK_APPEND_CHAR(s->aux_blk, '^');
+ BLOCK_APPEND(s->aux_blk,
+ &s->ref[ref_pos - s->ref_start +1],
+ i32);
+ md_dist = 0;
+ }
+ cig_op = BAM_CDEL;
+ cig_len += i32;
+ ref_pos += i32;
+ nm += i32;
+ //printf(" %d: DL = %d (ret %d)\n", f, i32, r);
+ }
break;
}
@@ -877,14 +1289,17 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
cig_len = 0;
}
- if (!c->comp_hdr->IN_codec) return -1;
- r |= c->comp_hdr->IN_codec->decode(s, c->comp_hdr->IN_codec, blk,
- &seq[pos-1], &out_sz2);
- cig_op = BAM_CINS;
- cig_len += out_sz2;
- seq_pos += out_sz2;
- nm += out_sz2;
- //printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
+ if (ds & CRAM_IN) {
+ if (!c->comp_hdr->codecs[DS_IN]) return -1;
+ r |= c->comp_hdr->codecs[DS_IN]
+ ->decode(s, c->comp_hdr->codecs[DS_IN], blk,
+ &seq[pos-1], &out_sz2);
+ cig_op = BAM_CINS;
+ cig_len += out_sz2;
+ seq_pos += out_sz2;
+ nm += out_sz2;
+ //printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
+ }
break;
}
@@ -893,14 +1308,64 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
cigar[ncigar++] = (cig_len<<4) + cig_op;
cig_len = 0;
}
- if (!c->comp_hdr->BA_codec) return -1;
- r |= c->comp_hdr->BA_codec->decode(s, c->comp_hdr->BA_codec, blk,
- (char *)&seq[pos-1], &out_sz);
+ if (ds & CRAM_BA) {
+ if (!c->comp_hdr->codecs[DS_BA]) return -1;
+ r |= c->comp_hdr->codecs[DS_BA]
+ ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
+ (char *)&seq[pos-1], &out_sz);
+ }
cig_op = BAM_CINS;
cig_len++;
seq_pos++;
nm++;
- //printf(" %d: BA = %c (ret %d)\n", f, seq[pos-1], r);
+ break;
+ }
+
+ case 'b': { // Several bases
+ int32_t len = 1;
+
+ if (cig_len && cig_op != BAM_CMATCH) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+
+ if (ds & CRAM_BB) {
+ if (!c->comp_hdr->codecs[DS_BB]) return -1;
+ r |= c->comp_hdr->codecs[DS_BB]
+ ->decode(s, c->comp_hdr->codecs[DS_BB], blk,
+ (char *)&seq[pos-1], &len);
+ }
+
+ cig_op = BAM_CMATCH;
+
+ cig_len+=len;
+ seq_pos+=len;
+ ref_pos+=len;
+ //prev_pos+=len;
+ break;
+ }
+
+ case 'q': { // Several quality values
+ int32_t len = 1;
+
+ if (cig_len && cig_op != BAM_CMATCH) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+
+ if (ds & CRAM_QQ) {
+ if (!c->comp_hdr->codecs[DS_QQ]) return -1;
+ r |= c->comp_hdr->codecs[DS_QQ]
+ ->decode(s, c->comp_hdr->codecs[DS_QQ], blk,
+ (char *)&qual[pos-1], &len);
+ }
+
+ cig_op = BAM_CMATCH;
+
+ cig_len+=len;
+ seq_pos+=len;
+ ref_pos+=len;
+ //prev_pos+=len;
break;
}
@@ -916,12 +1381,18 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
cig_len = 0;
}
#endif
- if (!c->comp_hdr->BA_codec) return -1;
- r |= c->comp_hdr->BA_codec->decode(s, c->comp_hdr->BA_codec, blk,
- (char *)&seq[pos-1], &out_sz);
- if (!c->comp_hdr->QS_codec) return -1;
- r |= c->comp_hdr->QS_codec->decode(s, c->comp_hdr->QS_codec, blk,
- (char *)&qual[pos-1], &out_sz);
+ if (ds & CRAM_BA) {
+ if (!c->comp_hdr->codecs[DS_BA]) return -1;
+ r |= c->comp_hdr->codecs[DS_BA]
+ ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
+ (char *)&seq[pos-1], &out_sz);
+ }
+ if (ds & CRAM_QS) {
+ if (!c->comp_hdr->codecs[DS_QS]) return -1;
+ r |= c->comp_hdr->codecs[DS_QS]
+ ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
+ (char *)&qual[pos-1], &out_sz);
+ }
#ifdef USE_X
cig_op = BAM_CBASE_MISMATCH;
#else
@@ -935,10 +1406,13 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
}
case 'Q': { // Quality score; QS
- if (!c->comp_hdr->QS_codec) return -1;
- r |= c->comp_hdr->QS_codec->decode(s, c->comp_hdr->QS_codec, blk,
- (char *)&qual[pos-1], &out_sz);
- //printf(" %d: QS = %d (ret %d)\n", f, qc, r);
+ if (ds & CRAM_QS) {
+ if (!c->comp_hdr->codecs[DS_QS]) return -1;
+ r |= c->comp_hdr->codecs[DS_QS]
+ ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
+ (char *)&qual[pos-1], &out_sz);
+ //printf(" %d: QS = %d (ret %d)\n", f, qc, r);
+ }
break;
}
@@ -947,12 +1421,15 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
cigar[ncigar++] = (cig_len<<4) + cig_op;
cig_len = 0;
}
- if (!c->comp_hdr->HC_codec) return -1;
- r |= c->comp_hdr->HC_codec->decode(s, c->comp_hdr->HC_codec, blk,
- (char *)&i32, &out_sz);
- cig_op = BAM_CHARD_CLIP;
- cig_len += i32;
- nm += i32;
+ if (ds & CRAM_HC) {
+ if (!c->comp_hdr->codecs[DS_HC]) return -1;
+ r |= c->comp_hdr->codecs[DS_HC]
+ ->decode(s, c->comp_hdr->codecs[DS_HC], blk,
+ (char *)&i32, &out_sz);
+ cig_op = BAM_CHARD_CLIP;
+ cig_len += i32;
+ nm += i32;
+ }
break;
}
@@ -961,12 +1438,15 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
cigar[ncigar++] = (cig_len<<4) + cig_op;
cig_len = 0;
}
- if (!c->comp_hdr->PD_codec) return -1;
- r |= c->comp_hdr->PD_codec->decode(s, c->comp_hdr->PD_codec, blk,
- (char *)&i32, &out_sz);
- cig_op = BAM_CPAD;
- cig_len += i32;
- nm += i32;
+ if (ds & CRAM_PD) {
+ if (!c->comp_hdr->codecs[DS_PD]) return -1;
+ r |= c->comp_hdr->codecs[DS_PD]
+ ->decode(s, c->comp_hdr->codecs[DS_PD], blk,
+ (char *)&i32, &out_sz);
+ cig_op = BAM_CPAD;
+ cig_len += i32;
+ nm += i32;
+ }
break;
}
@@ -975,13 +1455,16 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
cigar[ncigar++] = (cig_len<<4) + cig_op;
cig_len = 0;
}
- if (!c->comp_hdr->RS_codec) return -1;
- r |= c->comp_hdr->RS_codec->decode(s, c->comp_hdr->RS_codec, blk,
- (char *)&i32, &out_sz);
- cig_op = BAM_CREF_SKIP;
- cig_len += i32;
- ref_pos += i32;
- nm += i32;
+ if (ds & CRAM_RS) {
+ if (!c->comp_hdr->codecs[DS_RS]) return -1;
+ r |= c->comp_hdr->codecs[DS_RS]
+ ->decode(s, c->comp_hdr->codecs[DS_RS], blk,
+ (char *)&i32, &out_sz);
+ cig_op = BAM_CREF_SKIP;
+ cig_len += i32;
+ ref_pos += i32;
+ nm += i32;
+ }
break;
}
@@ -990,8 +1473,11 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
}
}
+ if (!(ds & CRAM_FC))
+ goto skip_cigar;
+
/* An implement match op for any unaccounted for bases */
- if (cr->len >= seq_pos) {
+ if ((ds & CRAM_FN) && cr->len >= seq_pos) {
if (s->ref) {
if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
static int whinged = 0;
@@ -1027,8 +1513,11 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
#endif
cig_len += cr->len - seq_pos+1;
}
- if (decode_md) {
- BLOCK_APPENDF_1(s->aux_blk, buf, "%d", md_dist);
+
+ skip_cigar:
+
+ if ((ds & CRAM_FN) && decode_md) {
+ BLOCK_APPEND_UINT(s->aux_blk, md_dist);
}
if (cig_len) {
@@ -1047,16 +1536,24 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
//printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
- if (!c->comp_hdr->MQ_codec) return -1;
- r |= c->comp_hdr->MQ_codec->decode(s, c->comp_hdr->MQ_codec, blk,
- (char *)&cr->mqual, &out_sz);
+ if (ds & CRAM_MQ) {
+ if (!c->comp_hdr->codecs[DS_MQ]) return -1;
+ r |= c->comp_hdr->codecs[DS_MQ]
+ ->decode(s, c->comp_hdr->codecs[DS_MQ], blk,
+ (char *)&cr->mqual, &out_sz);
+ } else {
+ cr->mqual = 40;
+ }
- if (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
+ if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
int32_t out_sz2 = cr->len;
- if (!c->comp_hdr->Qs_codec) return -1;
- r |= c->comp_hdr->Qs_codec->decode(s, c->comp_hdr->Qs_codec, blk,
- qual, &out_sz2);
+ if (ds & CRAM_QS) {
+ if (!c->comp_hdr->codecs[DS_QS]) return -1;
+ r |= c->comp_hdr->codecs[DS_QS]
+ ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
+ qual, &out_sz2);
+ }
}
s->cigar = cigar;
@@ -1100,9 +1597,9 @@ static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
int i, r = 0, out_sz = 1;
unsigned char ntags;
- if (!c->comp_hdr->TC_codec) return -1;
- r |= c->comp_hdr->TC_codec->decode(s, c->comp_hdr->TC_codec, blk,
- (char *)&ntags, &out_sz);
+ if (!c->comp_hdr->codecs[DS_TC]) return -1;
+ r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
+ (char *)&ntags, &out_sz);
cr->ntags = ntags;
//printf("TC=%d\n", cr->ntags);
@@ -1115,9 +1612,9 @@ static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
cram_map *m;
//printf("Tag %d/%d\n", i+1, cr->ntags);
- if (!c->comp_hdr->TN_codec) return -1;
- r |= c->comp_hdr->TN_codec->decode(s, c->comp_hdr->TN_codec,
- blk, (char *)&id, &out_sz);
+ if (!c->comp_hdr->codecs[DS_TN]) return -1;
+ r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
+ blk, (char *)&id, &out_sz);
if (out_sz == 3) {
tag_data[0] = ((char *)&id)[0];
tag_data[1] = ((char *)&id)[1];
@@ -1145,12 +1642,19 @@ static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
static int cram_decode_aux(cram_container *c, cram_slice *s,
cram_block *blk, cram_record *cr) {
int i, r = 0, out_sz = 1;
- int32_t TL;
+ int32_t TL = 0;
unsigned char *TN;
+ uint32_t ds = c->comp_hdr->data_series;
- if (!c->comp_hdr->TL_codec) return -1;
- r |= c->comp_hdr->TL_codec->decode(s, c->comp_hdr->TL_codec, blk,
- (char *)&TL, &out_sz);
+ if (!(ds & (CRAM_TL|CRAM_aux))) {
+ cr->aux = 0;
+ cr->aux_size = 0;
+ return 0;
+ }
+
+ if (!c->comp_hdr->codecs[DS_TL]) return -1;
+ r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
+ (char *)&TL, &out_sz);
if (r || TL < 0 || TL >= c->comp_hdr->nTL)
return -1;
@@ -1161,6 +1665,9 @@ static int cram_decode_aux(cram_container *c, cram_slice *s,
cr->aux_size = 0;
cr->aux = BLOCK_SIZE(s->aux_blk);
+ if (!(ds & CRAM_aux))
+ return 0;
+
for (i = 0; i < cr->ntags; i++) {
int32_t id, out_sz = 1;
unsigned char tag_data[3];
@@ -1186,9 +1693,21 @@ static int cram_decode_aux(cram_container *c, cram_slice *s,
}
/* Resolve mate pair cross-references between recs within this slice */
-static void cram_decode_slice_xref(cram_slice *s) {
+static void cram_decode_slice_xref(cram_slice *s, int required_fields) {
int rec;
+ if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
+ for (rec = 0; rec < s->hdr->num_records; rec++) {
+ cram_record *cr = &s->crecs[rec];
+
+ cr->tlen = 0;
+ cr->mate_pos = 0;
+ cr->mate_ref_id = -1;
+ }
+
+ return;
+ }
+
for (rec = 0; rec < s->hdr->num_records; rec++) {
cram_record *cr = &s->crecs[rec];
@@ -1213,9 +1732,14 @@ static void cram_decode_slice_xref(cram_slice *s) {
int tlen;
int ref = cr->ref_id;
+ // number of segments starting at the same point.
+ int left_cnt = 0;
+
do {
if (aleft > s->crecs[id2].apos)
- aleft = s->crecs[id2].apos;
+ aleft = s->crecs[id2].apos, left_cnt = 1;
+ else if (aleft == s->crecs[id2].apos)
+ left_cnt++;
if (aright < s->crecs[id2].aend)
aright = s->crecs[id2].aend;
if (s->crecs[id2].mate_line == -1) {
@@ -1239,9 +1763,8 @@ static void cram_decode_slice_xref(cram_slice *s) {
* bit flags instead, as a tie breaker.
*/
if (s->crecs[id2].apos == aleft) {
- if (s->crecs[id2].aend != aright)
- s->crecs[id2].tlen = tlen;
- else if (s->crecs[id2].flags & BAM_FREAD1)
+ if (left_cnt == 1 ||
+ (s->crecs[id2].flags & BAM_FREAD1))
s->crecs[id2].tlen = tlen;
else
s->crecs[id2].tlen = -tlen;
@@ -1252,9 +1775,8 @@ static void cram_decode_slice_xref(cram_slice *s) {
id2 = s->crecs[id2].mate_line;
while (id2 != id1) {
if (s->crecs[id2].apos == aleft) {
- if (s->crecs[id2].aend != aright)
- s->crecs[id2].tlen = tlen;
- else if (s->crecs[id2].flags & BAM_FREAD1)
+ if (left_cnt == 1 ||
+ (s->crecs[id2].flags & BAM_FREAD1))
s->crecs[id2].tlen = tlen;
else
s->crecs[id2].tlen = -tlen;
@@ -1339,15 +1861,16 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
unsigned char cf;
int out_sz, r = 0;
int rec;
- char *seq, *qual;
+ char *seq = NULL, *qual = NULL;
int unknown_rg = -1;
- int id, embed_ref;
+ int embed_ref;
char **refs = NULL;
+ uint32_t ds;
- for (id = 0; id < s->hdr->num_blocks; id++) {
- if (cram_uncompress_block(s->block[id]))
- return -1;
- }
+ if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
+ return -1;
+
+ ds = c->comp_hdr->data_series;
blk->bit = 7; // MSB first
@@ -1378,6 +1901,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
if (!s->block_by_id ||
!(b = s->block_by_id[s->hdr->ref_base_id]))
return -1;
+ cram_uncompress_block(b);
s->ref = (char *)BLOCK_DATA(b);
s->ref_start = s->hdr->ref_seq_start;
s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
@@ -1386,10 +1910,11 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
//s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
//s->ref_start = 1;
- s->ref =
- cram_get_ref(fd, s->hdr->ref_seq_id,
- s->hdr->ref_seq_start,
- s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
+ if (fd->required_fields & SAM_SEQ)
+ s->ref =
+ cram_get_ref(fd, s->hdr->ref_seq_id,
+ s->hdr->ref_seq_start,
+ s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
s->ref_start = s->hdr->ref_seq_start;
s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
@@ -1400,7 +1925,8 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
}
pthread_mutex_lock(&fd->ref_lock);
pthread_mutex_lock(&fd->refs->lock);
- if (s->ref_end > fd->refs->ref_id[ref_id]->length) {
+ if ((fd->required_fields & SAM_SEQ) &&
+ s->ref_end > fd->refs->ref_id[ref_id]->length) {
fprintf(stderr, "Slice ends beyond reference end.\n");
s->ref_end = fd->refs->ref_id[ref_id]->length;
}
@@ -1409,14 +1935,17 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
}
}
- if (s->ref == NULL && s->hdr->ref_seq_id >= 0 && !fd->no_ref) {
+ if ((fd->required_fields & SAM_SEQ) &&
+ s->ref == NULL && s->hdr->ref_seq_id >= 0 && !fd->no_ref) {
fprintf(stderr, "Unable to fetch reference #%d %d..%d\n",
s->hdr->ref_seq_id, s->hdr->ref_seq_start,
s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
return -1;
}
- if (fd->version != CRAM_1_VERS && s->hdr->ref_seq_id >= 0
+ if (CRAM_MAJOR_VERS(fd->version) != 1
+ && (fd->required_fields & SAM_SEQ)
+ && s->hdr->ref_seq_id >= 0
&& !fd->ignore_md5
&& memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
MD5_CTX md5;
@@ -1483,67 +2012,96 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
cr->s = s;
out_sz = 1; /* decode 1 item */
- if (!c->comp_hdr->BF_codec) return -1;
- r |= c->comp_hdr->BF_codec->decode(s, c->comp_hdr->BF_codec, blk,
- (char *)&bf, &out_sz);
- if (bf < 0 ||
- bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
- return -1;
- bf = fd->bam_flag_swap[bf];
- cr->flags = bf;
-
- if (fd->version == CRAM_1_VERS) {
- /* CF is byte in 1.0, int32 in 2.0 */
- if (!c->comp_hdr->CF_codec) return -1;
- r |= c->comp_hdr->CF_codec->decode(s, c->comp_hdr->CF_codec, blk,
- (char *)&cf, &out_sz);
- cr->cram_flags = cf;
+ if (ds & CRAM_BF) {
+ if (!c->comp_hdr->codecs[DS_BF]) return -1;
+ r |= c->comp_hdr->codecs[DS_BF]
+ ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
+ (char *)&bf, &out_sz);
+ if (bf < 0 ||
+ bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
+ return -1;
+ bf = fd->bam_flag_swap[bf];
+ cr->flags = bf;
} else {
- if (!c->comp_hdr->CF_codec) return -1;
- r |= c->comp_hdr->CF_codec->decode(s, c->comp_hdr->CF_codec, blk,
- (char *)&cr->cram_flags,
- &out_sz);
- cf = cr->cram_flags;
+ cr->flags = bf = 0x4; // unmapped
+ }
+
+ if (ds & CRAM_CF) {
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
+ /* CF is byte in 1.0, int32 in 2.0 */
+ if (!c->comp_hdr->codecs[DS_CF]) return -1;
+ r |= c->comp_hdr->codecs[DS_CF]
+ ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
+ (char *)&cf, &out_sz);
+ cr->cram_flags = cf;
+ } else {
+ if (!c->comp_hdr->codecs[DS_CF]) return -1;
+ r |= c->comp_hdr->codecs[DS_CF]
+ ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
+ (char *)&cr->cram_flags,
+ &out_sz);
+ cf = cr->cram_flags;
+ }
}
- if (fd->version != CRAM_1_VERS && ref_id == -2) {
- if (!c->comp_hdr->RI_codec) return -1;
- r |= c->comp_hdr->RI_codec->decode(s, c->comp_hdr->RI_codec, blk,
- (char *)&cr->ref_id, &out_sz);
- if (cr->ref_id >= 0) {
- if (!fd->no_ref) {
- if (!refs[cr->ref_id])
- refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id, 1, 0);
- s->ref = refs[cr->ref_id];
+ if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
+ if (ds & CRAM_RI) {
+ if (!c->comp_hdr->codecs[DS_RI]) return -1;
+ r |= c->comp_hdr->codecs[DS_RI]
+ ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
+ (char *)&cr->ref_id, &out_sz);
+ if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
+ && cr->ref_id >= 0) {
+ if (!fd->no_ref) {
+ if (!refs[cr->ref_id])
+ refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id,
+ 1, 0);
+ s->ref = refs[cr->ref_id];
+ }
+ s->ref_start = 1;
+ pthread_mutex_lock(&fd->ref_lock);
+ pthread_mutex_lock(&fd->refs->lock);
+ s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
+ pthread_mutex_unlock(&fd->refs->lock);
+ pthread_mutex_unlock(&fd->ref_lock);
}
- s->ref_start = 1;
- pthread_mutex_lock(&fd->ref_lock);
- pthread_mutex_lock(&fd->refs->lock);
- s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
- pthread_mutex_unlock(&fd->refs->lock);
- pthread_mutex_unlock(&fd->ref_lock);
+ } else {
+ cr->ref_id = 0;
}
} else {
cr->ref_id = ref_id; // Forced constant in CRAM 1.0
}
- if (!c->comp_hdr->RL_codec) return -1;
- r |= c->comp_hdr->RL_codec->decode(s, c->comp_hdr->RL_codec, blk,
- (char *)&cr->len, &out_sz);
+ if (ds & CRAM_RL) {
+ if (!c->comp_hdr->codecs[DS_RL]) return -1;
+ r |= c->comp_hdr->codecs[DS_RL]
+ ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
+ (char *)&cr->len, &out_sz);
+ }
- if (!c->comp_hdr->AP_codec) return -1;
- r |= c->comp_hdr->AP_codec->decode(s, c->comp_hdr->AP_codec, blk,
- (char *)&cr->apos, &out_sz);
- if (c->comp_hdr->AP_delta)
- cr->apos += s->last_apos;
- s->last_apos= cr->apos;
+ if (ds & CRAM_AP) {
+ if (!c->comp_hdr->codecs[DS_AP]) return -1;
+ r |= c->comp_hdr->codecs[DS_AP]
+ ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
+ (char *)&cr->apos, &out_sz);
+ if (c->comp_hdr->AP_delta)
+ cr->apos += s->last_apos;
+ s->last_apos= cr->apos;
+ } else {
+ cr->apos = c->ref_seq_start;
+ }
- if (!c->comp_hdr->RG_codec) return -1;
- r |= c->comp_hdr->RG_codec->decode(s, c->comp_hdr->RG_codec, blk,
- (char *)&cr->rg, &out_sz);
- if (cr->rg == unknown_rg)
+ if (ds & CRAM_RG) {
+ if (!c->comp_hdr->codecs[DS_RG]) return -1;
+ r |= c->comp_hdr->codecs[DS_RG]
+ ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
+ (char *)&cr->rg, &out_sz);
+ if (cr->rg == unknown_rg)
+ cr->rg = -1;
+ } else {
cr->rg = -1;
+ }
cr->name_len = 0;
@@ -1552,28 +2110,38 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
// Read directly into name cram_block
cr->name = BLOCK_SIZE(s->name_blk);
- if (!c->comp_hdr->RN_codec) return -1;
- r |= c->comp_hdr->RN_codec->decode(s, c->comp_hdr->RN_codec, blk,
- (char *)s->name_blk, &out_sz2);
- cr->name_len = out_sz2;
+ if (ds & CRAM_RN) {
+ if (!c->comp_hdr->codecs[DS_RN]) return -1;
+ r |= c->comp_hdr->codecs[DS_RN]
+ ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
+ (char *)s->name_blk, &out_sz2);
+ cr->name_len = out_sz2;
+ }
}
+ cr->mate_pos = 0;
cr->mate_line = -1;
cr->mate_ref_id = -1;
- if (cf & CRAM_FLAG_DETACHED) {
- if (fd->version == CRAM_1_VERS) {
- /* MF is byte in 1.0, int32 in 2.0 */
- unsigned char mf;
- if (!c->comp_hdr->MF_codec) return -1;
- r |= c->comp_hdr->MF_codec->decode(s, c->comp_hdr->MF_codec,
- blk, (char *)&mf, &out_sz);
- cr->mate_flags = mf;
+ if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
+ if (ds & CRAM_MF) {
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
+ /* MF is byte in 1.0, int32 in 2.0 */
+ unsigned char mf;
+ if (!c->comp_hdr->codecs[DS_MF]) return -1;
+ r |= c->comp_hdr->codecs[DS_MF]
+ ->decode(s, c->comp_hdr->codecs[DS_MF],
+ blk, (char *)&mf, &out_sz);
+ cr->mate_flags = mf;
+ } else {
+ if (!c->comp_hdr->codecs[DS_MF]) return -1;
+ r |= c->comp_hdr->codecs[DS_MF]
+ ->decode(s, c->comp_hdr->codecs[DS_MF],
+ blk,
+ (char *)&cr->mate_flags,
+ &out_sz);
+ }
} else {
- if (!c->comp_hdr->MF_codec) return -1;
- r |= c->comp_hdr->MF_codec->decode(s, c->comp_hdr->MF_codec,
- blk,
- (char *)&cr->mate_flags,
- &out_sz);
+ cr->mate_flags = 0;
}
if (!c->comp_hdr->read_names_included) {
@@ -1581,16 +2149,22 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
// Read directly into name cram_block
cr->name = BLOCK_SIZE(s->name_blk);
- if (!c->comp_hdr->RN_codec) return -1;
- r |= c->comp_hdr->RN_codec->decode(s, c->comp_hdr->RN_codec,
- blk, (char *)s->name_blk,
- &out_sz2);
- cr->name_len = out_sz2;
+ if (ds & CRAM_RN) {
+ if (!c->comp_hdr->codecs[DS_RN]) return -1;
+ r |= c->comp_hdr->codecs[DS_RN]
+ ->decode(s, c->comp_hdr->codecs[DS_RN],
+ blk, (char *)s->name_blk,
+ &out_sz2);
+ cr->name_len = out_sz2;
+ }
}
- if (!c->comp_hdr->NS_codec) return -1;
- r |= c->comp_hdr->NS_codec->decode(s, c->comp_hdr->NS_codec, blk,
- (char *)&cr->mate_ref_id, &out_sz);
+ if (ds & CRAM_NS) {
+ if (!c->comp_hdr->codecs[DS_NS]) return -1;
+ r |= c->comp_hdr->codecs[DS_NS]
+ ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
+ (char *)&cr->mate_ref_id, &out_sz);
+ }
// Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
// if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
@@ -1598,25 +2172,40 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
// cr->flags |= BAM_FMUNMAP;
// }
- if (!c->comp_hdr->NP_codec) return -1;
- r |= c->comp_hdr->NP_codec->decode(s, c->comp_hdr->NP_codec, blk,
- (char *)&cr->mate_pos, &out_sz);
- if (!c->comp_hdr->TS_codec) return -1;
- r |= c->comp_hdr->TS_codec->decode(s, c->comp_hdr->TS_codec, blk,
- (char *)&cr->tlen, &out_sz);
- } else if (cf & CRAM_FLAG_MATE_DOWNSTREAM) {
- if (!c->comp_hdr->NF_codec) return -1;
- r |= c->comp_hdr->NF_codec->decode(s, c->comp_hdr->NF_codec, blk,
- (char *)&cr->mate_line, &out_sz);
- cr->mate_line += rec + 1;
-
- //cr->name_len = sprintf(name, "%d", name_id++);
- //cr->name = DSTRING_LEN(name_ds);
- //dstring_nappend(name_ds, name, cr->name_len);
+ if (ds & CRAM_NP) {
+ if (!c->comp_hdr->codecs[DS_NP]) return -1;
+ r |= c->comp_hdr->codecs[DS_NP]
+ ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
+ (char *)&cr->mate_pos, &out_sz);
+ }
- cr->mate_ref_id = -1;
- cr->tlen = INT_MIN;
- cr->mate_pos = 0;
+ if (ds & CRAM_TS) {
+ if (!c->comp_hdr->codecs[DS_TS]) return -1;
+ r |= c->comp_hdr->codecs[DS_TS]
+ ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
+ (char *)&cr->tlen, &out_sz);
+ } else {
+ cr->tlen = INT_MIN;
+ }
+ } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
+ if (ds & CRAM_NF) {
+ if (!c->comp_hdr->codecs[DS_NF]) return -1;
+ r |= c->comp_hdr->codecs[DS_NF]
+ ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
+ (char *)&cr->mate_line, &out_sz);
+ cr->mate_line += rec + 1;
+
+ //cr->name_len = sprintf(name, "%d", name_id++);
+ //cr->name = DSTRING_LEN(name_ds);
+ //dstring_nappend(name_ds, name, cr->name_len);
+
+ cr->mate_ref_id = -1;
+ cr->tlen = INT_MIN;
+ cr->mate_pos = 0;
+ } else {
+ cr->mate_flags = 0;
+ cr->tlen = INT_MIN;
+ }
} else {
cr->mate_flags = 0;
cr->tlen = INT_MIN;
@@ -1635,31 +2224,40 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
*/
/* Auxiliary tags */
- if (fd->version == CRAM_1_VERS)
+ if (CRAM_MAJOR_VERS(fd->version) == 1)
r |= cram_decode_aux_1_0(c, s, blk, cr);
else
r |= cram_decode_aux(c, s, blk, cr);
/* Fake up dynamic string growth and appending */
- cr->seq = BLOCK_SIZE(s->seqs_blk);
- BLOCK_GROW(s->seqs_blk, cr->len);
- seq = (char *)BLOCK_END(s->seqs_blk);
- BLOCK_SIZE(s->seqs_blk) += cr->len;
+ if (ds & CRAM_RL) {
+ cr->seq = BLOCK_SIZE(s->seqs_blk);
+ BLOCK_GROW(s->seqs_blk, cr->len);
+ seq = (char *)BLOCK_END(s->seqs_blk);
+ BLOCK_SIZE(s->seqs_blk) += cr->len;
- if (!seq)
- return -1;
+ if (!seq)
+ return -1;
- cr->qual = BLOCK_SIZE(s->qual_blk);
- BLOCK_GROW(s->qual_blk, cr->len);
- qual = (char *)BLOCK_END(s->qual_blk);
- BLOCK_SIZE(s->qual_blk) += cr->len;
+ cr->qual = BLOCK_SIZE(s->qual_blk);
+ BLOCK_GROW(s->qual_blk, cr->len);
+ qual = (char *)BLOCK_END(s->qual_blk);
+ BLOCK_SIZE(s->qual_blk) += cr->len;
- if (!s->ref)
- memset(seq, '=', cr->len);
+ if (!s->ref)
+ memset(seq, '=', cr->len);
+ }
if (!(bf & BAM_FUNMAP)) {
/* Decode sequence and generate CIGAR */
- r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual);
+ if (ds & (CRAM_SEQ | CRAM_MQ)) {
+ r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual);
+ } else {
+ cr->cigar = 0;
+ cr->ncigar = 0;
+ cr->aend = cr->apos;
+ cr->mqual = 0;
+ }
} else {
int out_sz2 = cr->len;
@@ -1669,17 +2267,24 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
cr->aend = cr->apos;
cr->mqual = 0;
- if (!c->comp_hdr->BA_codec) return -1;
- r |= c->comp_hdr->BA_codec->decode(s, c->comp_hdr->BA_codec, blk,
- (char *)seq, &out_sz2);
+ if (ds & CRAM_BA) {
+ if (!c->comp_hdr->codecs[DS_BA]) return -1;
+ r |= c->comp_hdr->codecs[DS_BA]
+ ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
+ (char *)seq, &out_sz2);
+ }
- if (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
+ if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
out_sz2 = cr->len;
- if (!c->comp_hdr->Qs_codec) return -1;
- r |= c->comp_hdr->Qs_codec->decode(s, c->comp_hdr->Qs_codec,
- blk, qual, &out_sz2);
+ if (ds & CRAM_QS) {
+ if (!c->comp_hdr->codecs[DS_QS]) return -1;
+ r |= c->comp_hdr->codecs[DS_QS]
+ ->decode(s, c->comp_hdr->codecs[DS_QS],
+ blk, qual, &out_sz2);
+ }
} else {
- memset(qual, 30, cr->len);
+ if (ds & CRAM_RL)
+ memset(qual, 30, cr->len);
}
}
}
@@ -1698,7 +2303,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
pthread_mutex_unlock(&fd->ref_lock);
/* Resolve mate pair cross-references between recs within this slice */
- cram_decode_slice_xref(s);
+ cram_decode_slice_xref(s, fd->required_fields);
return r;
}
@@ -1738,7 +2343,7 @@ int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
j->s = s;
j->h = bfd;
- nonblock = t_pool_results_queue_len(fd->rqueue) ? 0 : 1;
+ nonblock = t_pool_results_queue_sz(fd->rqueue) ? 1 : 0;
if (-1 == t_pool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
j, nonblock)) {
@@ -1776,21 +2381,33 @@ static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
char name_a[1024], *name;
int name_len;
char *aux, *aux_orig;
+ char *seq, *qual;
/* Assign names if not explicitly set */
- if (cr->name_len) {
- name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
- name_len = cr->name_len;
+ if (fd->required_fields & SAM_QNAME) {
+ if (cr->name_len) {
+ name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
+ name_len = cr->name_len;
+ } else {
+ name = name_a;
+ name_len = strlen(fd->prefix);
+ memcpy(name, fd->prefix, name_len);
+ name += name_len;
+ *name++ = ':';
+ if (cr->mate_line >= 0 && cr->mate_line < rec)
+ name = (char *)append_uint64((unsigned char *)name,
+ s->hdr->record_counter +
+ cr->mate_line + 1);
+ else
+ name = (char *)append_uint64((unsigned char *)name,
+ s->hdr->record_counter +
+ rec + 1);
+ name_len = name - name_a;
+ name = name_a;
+ }
} else {
- // FIXME: add prefix, container number, slice number, etc
- name = name_a;
-
- if (cr->mate_line >= 0 && cr->mate_line < rec)
- name_len = sprintf(name_a, "%s:%"PRId64":%d",
- fd->prefix, s->id, cr->mate_line);
- else
- name_len = sprintf(name_a, "%s:%"PRId64":%d",
- fd->prefix, s->id, rec);
+ name = "?";
+ name_len = 1;
}
/* Generate BAM record */
@@ -1798,10 +2415,23 @@ static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
return -1;
rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
- if (!BLOCK_DATA(s->seqs_blk))
- return -1;
- if (!BLOCK_DATA(s->qual_blk))
- return -1;
+ if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
+ if (!BLOCK_DATA(s->seqs_blk))
+ return -1;
+ seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
+ } else {
+ seq = "*";
+ cr->len = 1;
+ }
+
+
+ if (fd->required_fields & SAM_QUAL) {
+ if (!BLOCK_DATA(s->qual_blk))
+ return -1;
+ qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
+ } else {
+ qual = NULL;
+ }
bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len,
name, name_len,
@@ -1815,8 +2445,8 @@ static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
cr->mate_pos,
cr->tlen,
cr->len,
- (char *)BLOCK_DATA(s->seqs_blk) + cr->seq,
- (char *)BLOCK_DATA(s->qual_blk) + cr->qual);
+ seq,
+ qual);
if (bam_idx == -1)
return -1;
@@ -1837,12 +2467,6 @@ static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
*aux++ = 0;
}
-#ifndef SAMTOOLS
- bam_set_blk_size(*bam, bam_blk_size(*bam) + (aux - aux_orig));
-#endif
-
- *aux++ = 0;
-
return bam_idx + (aux - aux_orig);
}
@@ -1853,8 +2477,6 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
cram_container *c;
cram_slice *s = NULL;
- fd->eof = 0;
-
if (!(c = fd->ctr)) {
// Load first container.
do {
@@ -1899,8 +2521,11 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
}
}
- if ((s = c->slice))
+ if ((s = c->slice)) {
+ c->slice = NULL;
cram_free_slice(s);
+ s = NULL;
+ }
if (c->curr_slice == c->max_slice) {
cram_free_container(c);
@@ -1934,14 +2559,22 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
/* Skip containers not yet spanning our range */
if (fd->range.refid != -2 && c->ref_seq_id != -2) {
+ fd->required_fields |= SAM_POS;
+
if (c->ref_seq_id != fd->range.refid) {
+ cram_free_container(c);
+ fd->ctr = NULL;
+ fd->ooc = 1;
fd->eof = 1;
- return NULL;
+ break;
}
if (c->ref_seq_start > fd->range.end) {
+ cram_free_container(c);
+ fd->ctr = NULL;
+ fd->ooc = 1;
fd->eof = 1;
- return NULL;
+ break;
}
if (c->ref_seq_start + c->ref_seq_span-1 <
@@ -2028,7 +2661,9 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
if (!fd->pool || fd->job_pending)
break;
- if (t_pool_results_queue_sz(fd->rqueue) > fd->pool->qsize)
+ // Push it a bit far, to qsize in queue rather than pending arrival,
+ // as cram tends to be a bit bursty in decode timings.
+ if (t_pool_results_queue_len(fd->rqueue) > fd->pool->qsize)
break;
}
@@ -2054,6 +2689,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
c = j->c;
s = j->s;
+ fd->ctr = c;
+
t_pool_delete_result(res, 1);
}
diff --git a/htslib/cram/cram_encode.c b/htslib/cram/cram_encode.c
index 94c2ceb..8057e9c 100644
--- a/htslib/cram/cram_encode.c
+++ b/htslib/cram/cram_encode.c
@@ -47,25 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cram/os.h"
#include "cram/md5.h"
-#ifdef SAMTOOLS
-# define bam_copy(dst, src) bam_copy1(*(dst), (src))
-#else
-void bam_copy(bam_seq_t **bt, bam_seq_t *bf) {
- size_t a;
-
- if (bf->alloc > (*bt)->alloc) {
- a = ((int)((bf->alloc+15)/16))*16;
- *bt = realloc(*bt, a);
- memcpy(*bt, bf, bf->alloc);
- } else {
- a = (*bt)->alloc;
- memcpy(*bt, bf, bf->alloc);
- }
-
- (*bt)->alloc = a;
-}
-#endif
-
#define Z_CRAM_STRAT Z_FILTERED
//#define Z_CRAM_STRAT Z_RLE
//#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
@@ -111,7 +92,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
*/
// Duplicated from container itself, and removed in 1.1
- if (fd->version == CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
itf8_put_blk(cb, h->ref_seq_id);
itf8_put_blk(cb, h->ref_seq_start);
itf8_put_blk(cb, h->ref_seq_span);
@@ -135,7 +116,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 1;
- if (fd->version == CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
k = kh_put(map, h->preservation_map, "PI", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 0;
@@ -269,160 +250,197 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
/* rec encoding map */
mc = 0;
BLOCK_SIZE(map) = 0;
- if (h->BF_codec) {
- if (-1 == h->BF_codec->store(h->BF_codec, map, "BF", fd->version))
+ if (h->codecs[DS_BF]) {
+ if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_CF]) {
+ if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
+ fd->version))
return NULL;
mc++;
}
- if (h->CF_codec) {
- if (-1 == h->CF_codec->store(h->CF_codec, map, "CF", fd->version))
+ if (h->codecs[DS_RL]) {
+ if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
+ fd->version))
return NULL;
mc++;
}
- if (h->RL_codec) {
- if (-1 == h->RL_codec->store(h->RL_codec, map, "RL", fd->version))
+ if (h->codecs[DS_AP]) {
+ if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
+ fd->version))
return NULL;
mc++;
}
- if (h->AP_codec) {
- if (-1 == h->AP_codec->store(h->AP_codec, map, "AP", fd->version))
+ if (h->codecs[DS_RG]) {
+ if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
+ fd->version))
return NULL;
mc++;
}
- if (h->RG_codec) {
- if (-1 == h->RG_codec->store(h->RG_codec, map, "RG", fd->version))
+ if (h->codecs[DS_MF]) {
+ if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
+ fd->version))
return NULL;
mc++;
}
- if (h->MF_codec) {
- if (-1 == h->MF_codec->store(h->MF_codec, map, "MF", fd->version))
+ if (h->codecs[DS_NS]) {
+ if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
+ fd->version))
return NULL;
mc++;
}
- if (h->NS_codec) {
- if (-1 == h->NS_codec->store(h->NS_codec, map, "NS", fd->version))
+ if (h->codecs[DS_NP]) {
+ if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
+ fd->version))
return NULL;
mc++;
}
- if (h->NP_codec) {
- if (-1 == h->NP_codec->store(h->NP_codec, map, "NP", fd->version))
+ if (h->codecs[DS_TS]) {
+ if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
+ fd->version))
return NULL;
mc++;
}
- if (h->TS_codec) {
- if (-1 == h->TS_codec->store(h->TS_codec, map, "TS", fd->version))
+ if (h->codecs[DS_NF]) {
+ if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
+ fd->version))
return NULL;
mc++;
}
- if (h->NF_codec) {
- if (-1 == h->NF_codec->store(h->NF_codec, map, "NF", fd->version))
+ if (h->codecs[DS_TC]) {
+ if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
+ fd->version))
return NULL;
mc++;
}
- if (h->TC_codec) {
- if (-1 == h->TC_codec->store(h->TC_codec, map, "TC", fd->version))
+ if (h->codecs[DS_TN]) {
+ if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
+ fd->version))
return NULL;
mc++;
}
- if (h->TN_codec) {
- if (-1 == h->TN_codec->store(h->TN_codec, map, "TN", fd->version))
+ if (h->codecs[DS_TL]) {
+ if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
+ fd->version))
return NULL;
mc++;
}
- if (h->TL_codec) {
- if (-1 == h->TL_codec->store(h->TL_codec, map, "TL", fd->version))
+ if (h->codecs[DS_FN]) {
+ if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
+ fd->version))
return NULL;
mc++;
}
- if (h->FN_codec) {
- if (-1 == h->FN_codec->store(h->FN_codec, map, "FN", fd->version))
+ if (h->codecs[DS_FC]) {
+ if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
+ fd->version))
return NULL;
mc++;
}
- if (h->FC_codec) {
- if (-1 == h->FC_codec->store(h->FC_codec, map, "FC", fd->version))
+ if (h->codecs[DS_FP]) {
+ if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
+ fd->version))
return NULL;
mc++;
}
- if (h->FP_codec) {
- if (-1 == h->FP_codec->store(h->FP_codec, map, "FP", fd->version))
+ if (h->codecs[DS_BS]) {
+ if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
+ fd->version))
return NULL;
mc++;
}
- if (h->BS_codec) {
- if (-1 == h->BS_codec->store(h->BS_codec, map, "BS", fd->version))
+ if (h->codecs[DS_IN]) {
+ if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
+ fd->version))
return NULL;
mc++;
}
- if (h->IN_codec) {
- if (-1 == h->IN_codec->store(h->IN_codec, map, "IN", fd->version))
+ if (h->codecs[DS_DL]) {
+ if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
+ fd->version))
return NULL;
mc++;
}
- if (h->DL_codec) {
- if (-1 == h->DL_codec->store(h->DL_codec, map, "DL", fd->version))
+ if (h->codecs[DS_BA]) {
+ if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
+ fd->version))
return NULL;
mc++;
}
- if (h->BA_codec) {
- if (-1 == h->BA_codec->store(h->BA_codec, map, "BA", fd->version))
+ if (h->codecs[DS_BB]) {
+ if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
+ fd->version))
return NULL;
mc++;
}
- if (h->MQ_codec) {
- if (-1 == h->MQ_codec->store(h->MQ_codec, map, "MQ", fd->version))
+ if (h->codecs[DS_MQ]) {
+ if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
+ fd->version))
return NULL;
mc++;
}
- if (h->RN_codec) {
- if (-1 == h->RN_codec->store(h->RN_codec, map, "RN", fd->version))
+ if (h->codecs[DS_RN]) {
+ if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
+ fd->version))
return NULL;
mc++;
}
- if (h->QS_codec) {
- if (-1 == h->QS_codec->store(h->QS_codec, map, "QS", fd->version))
+ if (h->codecs[DS_QS]) {
+ if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
+ fd->version))
return NULL;
mc++;
}
- if (h->Qs_codec) {
- if (-1 == h->Qs_codec->store(h->Qs_codec, map, "Qs", fd->version))
+ if (h->codecs[DS_QQ]) {
+ if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
+ fd->version))
return NULL;
mc++;
}
- if (h->RI_codec) {
- if (-1 == h->RI_codec->store(h->RI_codec, map, "RI", fd->version))
+ if (h->codecs[DS_RI]) {
+ if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
+ fd->version))
return NULL;
mc++;
}
- if (fd->version != CRAM_1_VERS) {
- if (h->SC_codec) {
- if (-1 == h->SC_codec->store(h->SC_codec, map, "SC", fd->version))
+ if (CRAM_MAJOR_VERS(fd->version) != 1) {
+ if (h->codecs[DS_SC]) {
+ if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
+ fd->version))
return NULL;
mc++;
}
- if (h->RS_codec) {
- if (-1 == h->RS_codec->store(h->RS_codec, map, "RS", fd->version))
+ if (h->codecs[DS_RS]) {
+ if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
+ fd->version))
return NULL;
mc++;
}
- if (h->PD_codec) {
- if (-1 == h->PD_codec->store(h->PD_codec, map, "PD", fd->version))
+ if (h->codecs[DS_PD]) {
+ if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
+ fd->version))
return NULL;
mc++;
}
- if (h->HC_codec) {
- if (-1 == h->HC_codec->store(h->HC_codec, map, "HC", fd->version))
+ if (h->codecs[DS_HC]) {
+ if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
+ fd->version))
return NULL;
mc++;
}
}
- if (h->TM_codec) {
- if (-1 == h->TM_codec->store(h->TM_codec, map, "TM", fd->version))
+ if (h->codecs[DS_TM]) {
+ if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
+ fd->version))
return NULL;
mc++;
}
- if (h->TV_codec) {
- if (-1 == h->TV_codec->store(h->TV_codec, map, "TV", fd->version))
+ if (h->codecs[DS_TV]) {
+ if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
+ fd->version))
return NULL;
mc++;
}
@@ -458,7 +476,10 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
if (c->tags_used) {
khint_t k;
+#define TAG_ID(a) ((#a[0]<<8)+#a[1])
+
for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
+ int key;
if (!kh_exist(c->tags_used, k))
continue;
@@ -466,23 +487,73 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
itf8_put_blk(map, kh_key(c->tags_used, k));
// use block content id 4
- switch(kh_key(c->tags_used, k) & 0xff) {
+ switch((key = kh_key(c->tags_used, k)) & 0xff) {
case 'Z': case 'H':
// string as byte_array_stop
- if (fd->version == CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
BLOCK_APPEND(map,
"\005" // BYTE_ARRAY_STOP
"\005" // len
"\t" // stop-byte is also SAM separator
- CRAM_EXT_TAG_S "\000\000\000",
+ DS_aux_S "\000\000\000",
7);
} else {
- BLOCK_APPEND(map,
- "\005" // BYTE_ARRAY_STOP
- "\002" // len
- "\t" // stop-byte is also SAM separator
- CRAM_EXT_TAG_S,
- 4);
+ if (key>>8 == TAG_ID(OQ))
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_OQ_S,
+ 4);
+ else if (key>>8 == TAG_ID(BQ))
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_BQ_S,
+ 4);
+ else if (key>>8 == TAG_ID(BD))
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_BD_S,
+ 4);
+ else if (key>>8 == TAG_ID(BI))
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_BI_S,
+ 4);
+ else if ((key>>8 == TAG_ID(Q2)) ||
+ (key>>8 == TAG_ID(U2)) ||
+ (key>>8 == TAG_ID(QT)) ||
+ (key>>8 == TAG_ID(CQ)))
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_oq_S,
+ 4);
+ else if ((key>>8 == TAG_ID(R2)) ||
+ (key>>8 == TAG_ID(E2)) ||
+ (key>>8 == TAG_ID(CS)) ||
+ (key>>8 == TAG_ID(BC)) ||
+ (key>>8 == TAG_ID(RT)))
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_os_S,
+ 4);
+ else
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_oz_S,
+ 4);
}
break;
@@ -499,7 +570,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
"\000" // length=0
"\001" // EXTERNAL (val)
"\001" // external-len
- CRAM_EXT_TAG_S,// content-id
+ DS_aux_S,// content-id
11);
break;
@@ -516,7 +587,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
"\000" // length=0
"\001" // EXTERNAL (val)
"\001" // external-len
- CRAM_EXT_TAG_S,// content-id
+ DS_aux_S,// content-id
11);
break;
@@ -533,7 +604,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
"\000" // length=0
"\001" // EXTERNAL (val)
"\001" // external-len
- CRAM_EXT_TAG_S,// content-id
+ DS_aux_S,// content-id
11);
break;
@@ -543,16 +614,28 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
// after slice header construction). So we use
// BYTE_ARRAY_LEN with the length codec being external
// too.
- BLOCK_APPEND(map,
- "\004" // BYTE_ARRAY_LEN
- "\006" // length
- "\001" // EXTERNAL (len)
- "\001" // external-len
- "\004" // content-id
- "\001" // EXTERNAL (val)
- "\001" // external-len
- CRAM_EXT_TAG_S,// content-id
- 8);
+ if ((key>>8 == TAG_ID(FZ)) || (key>>8 == TAG_ID(ZM)))
+ BLOCK_APPEND(map,
+ "\004" // BYTE_ARRAY_LEN
+ "\006" // length
+ "\001" // EXTERNAL (len)
+ "\001" // external-len
+ DS_aux_FZ_S // content-id
+ "\001" // EXTERNAL (val)
+ "\001" // external-len
+ DS_aux_FZ_S,// content-id
+ 8);
+ else
+ BLOCK_APPEND(map,
+ "\004" // BYTE_ARRAY_LEN
+ "\006" // length
+ "\001" // EXTERNAL (len)
+ "\001" // external-len
+ DS_aux_S // content-id
+ "\001" // EXTERNAL (val)
+ "\001" // external-len
+ DS_aux_S,// content-id
+ 8);
break;
default:
@@ -603,8 +686,10 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
cp += itf8_put(cp, s->hdr->ref_seq_start);
cp += itf8_put(cp, s->hdr->ref_seq_span);
cp += itf8_put(cp, s->hdr->num_records);
- if (fd->version != CRAM_1_VERS)
+ if (CRAM_MAJOR_VERS(fd->version) == 2)
cp += itf8_put(cp, s->hdr->record_counter);
+ else if (CRAM_MAJOR_VERS(fd->version) >= 3)
+ cp += ltf8_put(cp, s->hdr->record_counter);
cp += itf8_put(cp, s->hdr->num_blocks);
cp += itf8_put(cp, s->hdr->num_content_ids);
for (j = 0; j < s->hdr->num_content_ids; j++) {
@@ -613,7 +698,7 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
if (s->hdr->content_type == MAPPED_SLICE)
cp += itf8_put(cp, s->hdr->ref_base_id);
- if (fd->version != CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) != 1) {
memcpy(cp, s->hdr->md5, 16); cp += 16;
}
@@ -627,451 +712,506 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
/*
- * Encodes a single slice from a container
- * FIXME: break into smaller components.
+ * Encodes a single read.
*
* Returns 0 on success
* -1 on failure
*/
-static int cram_encode_slice(cram_fd *fd, cram_container *c,
- cram_block_compression_hdr *h, cram_slice *s) {
- int rec, r = 0, last_pos;
- cram_block *core;
- int nblk, embed_ref;
+static int cram_encode_slice_read(cram_fd *fd,
+ cram_container *c,
+ cram_block_compression_hdr *h,
+ cram_slice *s,
+ cram_record *cr,
+ int *last_pos) {
+ int r = 0;
+ int32_t i32;
+ unsigned char uc;
- embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0;
+ //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
- /*
- * Slice external blocks:
- * ID 0 => base calls (insertions, soft-clip)
- * ID 1 => qualities
- * ID 2 => names
- * ID 3 => TS (insert size), NP (next frag)
- * ID 4 => tag values
- * ID 5 => BA, ifdef BA_external
- * ID 6 => tag IDs (TN), ifdef TN_external and CRAM_1_VERS
- * ID 7 => TD tag dictionary, if !CRAM_1_VERS
- */
+ //printf("BF=0x%x\n", cr->flags);
+ // bf = cram_flag_swap[cr->flags];
+ i32 = fd->cram_flag_swap[cr->flags & 0xfff];
+ r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
- /* Create cram slice header, num_blocks etc */
- s->hdr->ref_base_id = embed_ref ? CRAM_EXT_REF : -1;
- s->hdr->record_counter = c->num_records + c->record_counter;
- c->num_records += s->hdr->num_records;
- nblk = (fd->version == CRAM_1_VERS) ? 5 : 6;
-#ifdef BA_external
- nblk++;
-#endif
-#ifdef TN_external
- if (fd->version == CRAM_1_VERS) {
- nblk++;
- }
-#endif
- if (embed_ref)
- nblk++;
-
- s->hdr->num_content_ids = nblk;
- s->hdr->num_blocks = s->hdr->num_content_ids+1;
- s->block = calloc(s->hdr->num_blocks, sizeof(s->block[0]));
- s->hdr->block_content_ids = malloc(s->hdr->num_content_ids *
- sizeof(int32_t));
- if (!s->block || !s->hdr->block_content_ids)
- return -1;
- s->hdr->block_content_ids[0] = 0; // core
- s->hdr->block_content_ids[1] = CRAM_EXT_QUAL;
- s->hdr->block_content_ids[2] = CRAM_EXT_NAME;
- s->hdr->block_content_ids[3] = CRAM_EXT_TS_NP;
- s->hdr->block_content_ids[4] = CRAM_EXT_TAG;
- s->hdr->block_content_ids[5] = CRAM_EXT_SC;
- nblk = (fd->version == CRAM_1_VERS) ? 5 : 6;
-#ifdef BA_external
- s->hdr->block_content_ids[(s->ba_id = ++nblk)-1] = CRAM_EXT_BA;
-#endif
-#ifdef TN_external
- if (fd->version == CRAM_1_VERS) {
- s->hdr->block_content_ids[(s->tn_id = ++nblk)-1] = CRAM_EXT_TN;
- }
-#endif
- if (embed_ref)
- s->hdr->block_content_ids[(s->ref_id = ++nblk)-1] = CRAM_EXT_REF;
-
- if (!(s->block[0] = cram_new_block(CORE, 0))) return -1;
- if (!(s->block[1] = cram_new_block(EXTERNAL, CRAM_EXT_IN))) return -1;
- if (!(s->block[2] = cram_new_block(EXTERNAL, CRAM_EXT_QUAL))) return -1;
- if (!(s->block[3] = cram_new_block(EXTERNAL, CRAM_EXT_NAME))) return -1;
- if (!(s->block[4] = cram_new_block(EXTERNAL, CRAM_EXT_TS_NP))) return -1;
- if (!(s->block[5] = cram_new_block(EXTERNAL, CRAM_EXT_TAG))) return -1;
- if (fd->version != CRAM_1_VERS) {
- if (!(s->block[6] = cram_new_block(EXTERNAL, CRAM_EXT_SC)))
- return -1;
- }
-#ifdef BA_external
- if (!(s->block[s->ba_id] = cram_new_block(EXTERNAL, CRAM_EXT_BA)))
- return -1;
-#endif
-#ifdef TN_external
- if (fd->version == CRAM_1_VERS) {
- if (!(s->block[s->tn_id] = cram_new_block(EXTERNAL, CRAM_EXT_TN)))
- return -1;
- }
-#endif
- if (embed_ref) {
- if (!(s->block[s->ref_id] = cram_new_block(EXTERNAL, CRAM_EXT_REF)))
- return -1;
- BLOCK_APPEND(s->block[s->ref_id],
- c->ref + c->first_base - c->ref_start,
- c->last_base - c->first_base + 1);
- }
+ i32 = cr->cram_flags;
+ r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
- core = s->block[0];
-
- /* Create a formal method for stealing from dstrings! */
- s->block[4]->data = calloc(10, s->hdr->num_records); // NP TS
- if (!s->block[4]->data)
- return -1;
- s->block[4]->comp_size = s->block[4]->uncomp_size = 0;
+ if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
+ r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
-#ifdef BA_external
- s->block[s->ba_id]->data = calloc(1, s->BA_len);
- if (!s->block[s->ba_id]->data)
- return -1;
- s->block[s->ba_id]->comp_size = s->block[s->ba_id]->uncomp_size = 0;
-#endif
+ r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
- /* Generate core block */
- if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
- return -1;
+ if (c->pos_sorted) {
+ i32 = cr->apos - *last_pos;
+ r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
+ *last_pos = cr->apos;
+ } else {
+ i32 = cr->apos;
+ r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
+ }
- last_pos = s->hdr->ref_seq_start;
- for (rec = 0; rec < s->hdr->num_records; rec++) {
- cram_record *cr = &s->crecs[rec];
- int32_t i32;
- unsigned char uc;
+ r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
- //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
+ if (c->comp_hdr->read_names_included) {
+ // RN codec: Already stored in block[3].
+ }
- //printf("BF=0x%x\n", cr->flags);
- // bf = cram_flag_swap[cr->flags];
- i32 = fd->cram_flag_swap[cr->flags & 0xfff];
- r |= h->BF_codec->encode(s, h->BF_codec, core, (char *)&i32, 1);
+ if (cr->cram_flags & CRAM_FLAG_DETACHED) {
+ i32 = cr->mate_flags;
+ r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
- i32 = cr->cram_flags;
- r |= h->CF_codec->encode(s, h->CF_codec, core,
- (char *)&i32, 1);
+ if (!c->comp_hdr->read_names_included) {
+ // RN codec: Already stored in block[3].
+ }
- if (fd->version != CRAM_1_VERS)
- r |= h->RI_codec->encode(s, h->RI_codec, core,
- (char *)&cr->ref_id, 1);
+ r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
+ (char *)&cr->mate_ref_id, 1);
- r |= h->RL_codec->encode(s, h->RL_codec, core,
- (char *)&cr->len, 1);
+ r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
+ (char *)&cr->mate_pos, 1);
- if (c->pos_sorted) {
- i32 = cr->apos - last_pos;
- r |= h->AP_codec->encode(s, h->AP_codec, core, (char *)&i32, 1);
- last_pos = cr->apos;
- } else {
- i32 = cr->apos;
- r |= h->AP_codec->encode(s, h->AP_codec, core, (char *)&i32, 1);
- }
+ r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
+ (char *)&cr->tlen, 1);
+ } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
+ r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
+ (char *)&cr->mate_line, 1);
+ }
- r |= h->RG_codec->encode(s, h->RG_codec, core,
- (char *)&cr->rg, 1);
+ /* Aux tags */
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
+ int j;
+ uc = cr->ntags;
+ r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
- if (c->comp_hdr->read_names_included) {
- // RN codec: Already stored in block[3].
+ for (j = 0; j < cr->ntags; j++) {
+ uint32_t i32 = s->TN[cr->TN_idx + j]; // id
+ r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
}
+ } else {
+ r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
+ }
- if (cr->cram_flags & CRAM_FLAG_DETACHED) {
- i32 = cr->mate_flags;
- r |= h->MF_codec->encode(s, h->MF_codec, core, (char *)&i32, 1);
+ // qual
+ // QS codec : Already stored in block[2].
- if (!c->comp_hdr->read_names_included) {
- // RN codec: Already stored in block[3].
- }
+ // features (diffs)
+ if (!(cr->flags & BAM_FUNMAP)) {
+ int prev_pos = 0, j;
-#ifndef NS_external
- r |= h->NS_codec->encode(s, h->NS_codec, core,
- (char *)&cr->mate_ref_id, 1);
-#else
- s->block[4]->uncomp_size +=
- itf8_put(&s->block[4]->data[s->block[4]->uncomp_size],
- cr->mate_ref_id);
-#endif
+ r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
+ (char *)&cr->nfeature, 1);
+ for (j = 0; j < cr->nfeature; j++) {
+ cram_feature *f = &s->features[cr->feature + j];
-#ifndef TS_external
- r |= h->NP_codec->encode(s, h->NP_codec, core,
- (char *)&cr->mate_pos, 1);
+ uc = f->X.code;
+ r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
+ i32 = f->X.pos - prev_pos;
+ r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
+ prev_pos = f->X.pos;
- r |= h->TS_codec->encode(s, h->TS_codec, core,
- (char *)&cr->tlen, 1);
-#else
- s->block[4]->uncomp_size +=
- itf8_put((char *)&s->block[4]->data[s->block[4]->uncomp_size],
- cr->mate_pos);
- s->block[4]->uncomp_size +=
- itf8_put((char *)&s->block[4]->data[s->block[4]->uncomp_size],
- cr->tlen);
-#endif
- } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
- r |= h->NF_codec->encode(s, h->NF_codec, core,
- (char *)&cr->mate_line, 1);
- }
+ switch(f->X.code) {
+ //char *seq;
- /* Aux tags */
- if (fd->version == CRAM_1_VERS) {
- uc = cr->ntags;
- r |= h->TC_codec->encode(s, h->TC_codec, core, (char *)&uc, 1);
-#ifndef TN_external
- {
- int j;
- for (j = 0; j < cr->ntags; j++) {
- uint32_t i32 = s->TN[cr->TN_idx + j]; // id
- r |= h->TN_codec->encode(s, h->TN_codec, core,
- (char *)&i32, 1);
- }
- }
-#endif
- } else {
- r |= h->TL_codec->encode(s, h->TL_codec, core, (char *)&cr->TL, 1);
- }
-
- // qual
- // QS codec : Already stored in block[2].
+ case 'X':
+ //fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
+
+ uc = f->X.base;
+ r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
+ (char *)&uc, 1);
+ break;
+ case 'S':
+ // Already done
+// r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC],
+// BLOCK_DATA(s->soft_blk) + f->S.seq_idx,
+// f->S.len);
+
+// if (IS_CRAM_3_VERS(fd)) {
+// r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
+// BLOCK_DATA(s->seqs_blk) + f->S.seq_idx,
+// f->S.len);
+// }
+ break;
+ case 'I':
+ //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
+ //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
+ // seq, f->S.len);
+// if (IS_CRAM_3_VERS(fd)) {
+// r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
+// BLOCK_DATA(s->seqs_blk) + f->I.seq_idx,
+// f->I.len);
+// }
+ break;
+ case 'i':
+ uc = f->i.base;
+ r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
+ (char *)&uc, 1);
+ //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
+ //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
+ // seq, 1);
+ break;
+ case 'D':
+ i32 = f->D.len;
+ r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
+ (char *)&i32, 1);
+ break;
- // features (diffs)
- if (!(cr->flags & BAM_FUNMAP)) {
- int prev_pos = 0, j;
+ case 'B':
+ // // Used when we try to store a non ACGTN base or an N
+ // // that aligns against a non ACGTN reference
- r |= h->FN_codec->encode(s, h->FN_codec, core,
- (char *)&cr->nfeature, 1);
- for (j = 0; j < cr->nfeature; j++) {
- cram_feature *f = &s->features[cr->feature + j];
+ uc = f->B.base;
+ r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
+ (char *)&uc, 1);
- uc = f->X.code;
- r |= h->FC_codec->encode(s, h->FC_codec, core,
- (char *)&uc, 1);
- i32 = f->X.pos - prev_pos;
- r |= h->FP_codec->encode(s, h->FP_codec, core,
- (char *)&i32, 1);
- prev_pos = f->X.pos;
+ // Already added
+ // uc = f->B.qual;
+ // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
+ // (char *)&uc, 1);
+ break;
- switch(f->X.code) {
- //char *seq;
+ case 'b':
+ // string of bases
+ r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
+ (char *)BLOCK_DATA(s->seqs_blk)
+ + f->b.seq_idx,
+ f->b.len);
+ break;
- case 'X':
- //fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
-
- uc = f->X.base;
- r |= h->BS_codec->encode(s, h->BS_codec, core,
- (char *)&uc, 1);
- break;
- case 'S':
- //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
- //r |= h->SC_codec->encode(s, h->SC_codec, core,
- // seq, f->S.len);
- break;
- case 'I':
- //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
- //r |= h->IN_codec->encode(s, h->IN_codec, core,
- // seq, f->S.len);
- break;
- case 'i':
- uc = f->i.base;
-#ifdef BA_external
- s->block[s->ba_id]->data[s->block[s->ba_id]->uncomp_size++] = uc;
-#else
- r |= h->BA_codec->encode(s, h->BA_codec, core,
- (char *)&uc, 1);
-#endif
- //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
- //r |= h->IN_codec->encode(s, h->IN_codec, core,
- // seq, 1);
- break;
- case 'D':
- i32 = f->D.len;
- r |= h->DL_codec->encode(s, h->DL_codec, core,
- (char *)&i32, 1);
- break;
-
- case 'B':
-// // Used when we try to store a non ACGTN base or an N
-// // that aligns against a non ACGTN reference
-
- uc = f->B.base;
-#ifdef BA_external
- s->block[s->ba_id]->data[s->block[s->ba_id]->uncomp_size++] = uc;
-#else
- r |= h->BA_codec->encode(s, h->BA_codec, core,
- (char *)&uc, 1);
-#endif
+ case 'Q':
+ // Already added
+ // uc = f->B.qual;
+ // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
+ // (char *)&uc, 1);
+ break;
-// Already added
-// uc = f->B.qual;
-// r |= h->QS_codec->encode(s, h->QS_codec, core,
-// (char *)&uc, 1);
- break;
-
- case 'Q':
-// Already added
-// uc = f->B.qual;
-// r |= h->QS_codec->encode(s, h->QS_codec, core,
-// (char *)&uc, 1);
- break;
-
- case 'N':
- i32 = f->N.len;
- r |= h->RS_codec->encode(s, h->RS_codec, core,
- (char *)&i32, 1);
- break;
+ case 'N':
+ i32 = f->N.len;
+ r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
+ (char *)&i32, 1);
+ break;
- case 'P':
- i32 = f->P.len;
- r |= h->PD_codec->encode(s, h->PD_codec, core,
- (char *)&i32, 1);
- break;
+ case 'P':
+ i32 = f->P.len;
+ r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
+ (char *)&i32, 1);
+ break;
- case 'H':
- i32 = f->H.len;
- r |= h->HC_codec->encode(s, h->HC_codec, core,
- (char *)&i32, 1);
- break;
+ case 'H':
+ i32 = f->H.len;
+ r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
+ (char *)&i32, 1);
+ break;
- default:
- fprintf(stderr, "unhandled feature code %c\n",
- f->X.code);
- return -1;
- }
+ default:
+ fprintf(stderr, "unhandled feature code %c\n",
+ f->X.code);
+ return -1;
}
-
- r |= h->MQ_codec->encode(s, h->MQ_codec, core,
- (char *)&cr->mqual, 1);
- } else {
- char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
-#ifdef BA_external
- memcpy(&s->block[s->ba_id]->data[s->block[s->ba_id]->uncomp_size],
- seq, cr->len);
- s->block[s->ba_id]->uncomp_size += cr->len;
-#else
- r |= h->BA_codec->encode(s, h->BA_codec, core, seq, cr->len);
-#endif
}
- if (r)
- return -1;
- }
- s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
- s->block[0]->comp_size = s->block[0]->uncomp_size;
-
- // FIXME: we should avoid creating these in the first place and just
- // point them to s->base_blk et al.
- cram_free_block(s->block[1]);
- cram_free_block(s->block[2]);
- cram_free_block(s->block[3]);
- cram_free_block(s->block[5]);
- if (fd->version != CRAM_1_VERS) {
- cram_free_block(s->block[6]);
- BLOCK_UPLEN(s->soft_blk);
- s->block[6] = s->soft_blk;
- s->soft_blk = NULL;
- }
- BLOCK_UPLEN(s->base_blk); s->block[1] = s->base_blk; s->base_blk = NULL;
- BLOCK_UPLEN(s->qual_blk); s->block[2] = s->qual_blk; s->qual_blk = NULL;
- BLOCK_UPLEN(s->name_blk); s->block[3] = s->name_blk; s->name_blk = NULL;
- BLOCK_UPLEN(s->aux_blk); s->block[5] = s->aux_blk; s->aux_blk = NULL;
-
-#ifdef TN_external
- if (fd->version == CRAM_1_VERS) {
- cram_free_block(s->block[s->tn_id]);
- BLOCK_UPLEN(s->tn_blk); s->block[s->tn_id] = s->tn_blk;
- s->tn_blk = NULL;
+ r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
+ (char *)&cr->mqual, 1);
+ } else {
+ char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
+ r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
}
-#endif
- s->block[4]->comp_size = s->block[4]->uncomp_size;
-
-#ifdef BA_external
- s->block[s->ba_id]->comp_size = s->block[s->ba_id]->uncomp_size;
-#endif
+ return r ? -1 : 0;
+}
- /* Compress the CORE Block too, with minimal zlib level */
- if (fd->level > 5)
- cram_compress_block(fd, s->block[0], NULL, 1, Z_CRAM_STRAT, -1, -1);
-#define USE_METRICS
+/*
+ * Applies various compression methods to specific blocks, depending on
+ * known observations of how data series compress.
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+static int cram_compress_slice(cram_fd *fd, cram_slice *s) {
+ int level = fd->level, i;
+ int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method;
-#ifdef USE_METRICS
-# define LEVEL2 1
-# define STRAT2 Z_RLE
-#else
-# define LEVEL2 -1
-# define STRAT2 -1
-#endif
+ /* Compress the CORE Block too, with minimal zlib level */
+ if (level > 5 && s->block[0]->uncomp_size > 500)
+ cram_compress_block(fd, s->block[0], NULL, GZIP, 1);
+
+ if (fd->use_bz2)
+ method |= 1<<BZIP2;
+
+ if (fd->use_rans)
+ method |= (1<<RANS0) | (1<<RANS1);
+
+ if (fd->use_lzma)
+ method |= (1<<LZMA);
+
+ /* Faster method for data series we only need entropy encoding on */
+ methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
+ if (level >= 6)
+ methodF = method;
+
- /* Compress the other blocks */
- if (cram_compress_block(fd, s->block[1], NULL, //IN (seq)
- fd->level, Z_CRAM_STRAT,
- -1, -1))
+ /* Specific compression methods for certain block types */
+ if (cram_compress_block(fd, s->block[DS_IN], fd->m[DS_IN], //IN (seq)
+ method, level))
return -1;
if (fd->level == 0) {
/* Do nothing */
} else if (fd->level == 1) {
- if (cram_compress_block(fd, s->block[2], fd->m[1], //qual
- 1, Z_RLE, -1, -1))
- return -1;
- if (cram_compress_block(fd, s->block[5], fd->m[4], //Tags
- 1, Z_RLE, -1, -1))
+ if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
+ methodF, 1))
return -1;
+ for (i = DS_aux; i <= DS_aux_oz; i++) {
+ if (s->block[i])
+ if (cram_compress_block(fd, s->block[i], fd->m[i],
+ method, 1))
+ return -1;
+ }
} else if (fd->level < 3) {
- if (cram_compress_block(fd, s->block[2], fd->m[1], //qual
- 1, Z_RLE,
- 1, Z_HUFFMAN_ONLY))
+ if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
+ method, 1))
return -1;
- if (cram_compress_block(fd, s->block[5], fd->m[4], //Tags
- 1, Z_RLE,
- 1, Z_HUFFMAN_ONLY))
+ if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA],
+ method, 1))
return -1;
+ if (s->block[DS_BB])
+ if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB],
+ method, 1))
+ return -1;
+ for (i = DS_aux; i <= DS_aux_oz; i++) {
+ if (s->block[i])
+ if (cram_compress_block(fd, s->block[i], fd->m[i],
+ method, level))
+ return -1;
+ }
} else {
- if (cram_compress_block(fd, s->block[2], fd->m[1], //qual
- fd->level, Z_CRAM_STRAT,
- LEVEL2, STRAT2))
+ if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
+ method, level))
return -1;
- if (cram_compress_block(fd, s->block[5], fd->m[4], //Tags
- fd->level, Z_CRAM_STRAT,
- LEVEL2, STRAT2))
+ if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA],
+ method, level))
return -1;
+ if (s->block[DS_BB])
+ if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB],
+ method, level))
+ return -1;
+ for (i = DS_aux; i <= DS_aux_oz; i++) {
+ if (s->block[i])
+ if (cram_compress_block(fd, s->block[i], fd->m[i],
+ method, level))
+ return -1;
+ }
}
- if (cram_compress_block(fd, s->block[3], NULL, //Name
- fd->level, Z_CRAM_STRAT,
- -1, -1))
- return -1;
- if (cram_compress_block(fd, s->block[4], NULL, //TS, NP
- fd->level, Z_CRAM_STRAT,
- -1, -1))
+
+ // NAME: best is generally xz, bzip2, zlib then rans1
+ // It benefits well from a little bit extra compression level.
+ if (cram_compress_block(fd, s->block[DS_RN], fd->m[DS_RN],
+ method & ~(1<<RANS0 | 1<<GZIP_RLE),
+ MIN(9,level)))
return -1;
- if (fd->version != CRAM_1_VERS) {
- if (cram_compress_block(fd, s->block[6], NULL, //SC (seq)
- fd->level, Z_CRAM_STRAT,
- -1, -1))
+
+ // NS shows strong local correlation as rearrangements are localised
+ if (s->block[DS_NS] != s->block[0])
+ if (cram_compress_block(fd, s->block[DS_NS], fd->m[DS_NS],
+ method, level))
return -1;
+
+
+ /*
+ * Minimal compression of any block still uncompressed, bar CORE
+ */
+ {
+ int i;
+ for (i = 1; i < DS_END; i++) {
+ if (!s->block[i] || s->block[i] == s->block[0])
+ continue;
+
+ // fast methods only
+ if (s->block[i]->method == RAW) {
+ cram_compress_block(fd, s->block[i], fd->m[i],
+ methodF, level);
+ }
+ }
}
-#ifdef BA_external
- if (cram_compress_block(fd, s->block[s->ba_id], NULL,
- fd->level, Z_CRAM_STRAT, -1, -1))
+
+ return 0;
+}
+
+/*
+ * Encodes a single slice from a container
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+static int cram_encode_slice(cram_fd *fd, cram_container *c,
+ cram_block_compression_hdr *h, cram_slice *s) {
+ int rec, r = 0, last_pos;
+ int embed_ref;
+ enum cram_DS_ID id;
+
+ embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0;
+
+ /*
+ * Slice external blocks:
+ * ID 0 => base calls (insertions, soft-clip)
+ * ID 1 => qualities
+ * ID 2 => names
+ * ID 3 => TS (insert size), NP (next frag)
+ * ID 4 => tag values
+ * ID 6 => tag IDs (TN), if CRAM_V1.0
+ * ID 7 => TD tag dictionary, if !CRAM_V1.0
+ */
+
+ /* Create cram slice header */
+ s->hdr->ref_base_id = embed_ref ? DS_ref : -1;
+ s->hdr->record_counter = c->num_records + c->record_counter;
+ c->num_records += s->hdr->num_records;
+
+ s->block = calloc(DS_END, sizeof(s->block[0]));
+ s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
+ if (!s->block || !s->hdr->block_content_ids)
return -1;
-#endif
-#ifdef TN_external
- if (fd->version == CRAM_1_VERS) {
- if (cram_compress_block(fd, s->block[s->tn_id], NULL,
- fd->level, Z_DEFAULT_STRATEGY, -1, -1))
- return -1;
+
+ // Create first fixed blocks, always external.
+ // CORE
+ if (!(s->block[0] = cram_new_block(CORE, 0)))
+ return -1;
+
+ // TN block for CRAM v1
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
+ if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
+ if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1;
+ h->codecs[DS_TN]->external.content_id = DS_TN;
+ } else {
+ s->block[DS_TN] = s->block[0];
+ }
+ s->block[DS_TN] = s->block[DS_TN];
}
-#endif
+
+ // Embedded reference
if (embed_ref) {
- BLOCK_UPLEN(s->block[s->ref_id]);
- if (cram_compress_block(fd, s->block[s->ref_id], NULL,
- fd->level, Z_DEFAULT_STRATEGY, -1, -1))
+ if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
+ return -1;
+ s->ref_id = DS_ref; // needed?
+ BLOCK_APPEND(s->block[DS_ref],
+ c->ref + c->first_base - c->ref_start,
+ c->last_base - c->first_base + 1);
+ }
+
+ /*
+ * All the data-series blocks if appropriate.
+ */
+ for (id = DS_BF; id < DS_TN; id++) {
+ if (h->codecs[id] && (h->codecs[id]->codec == E_EXTERNAL ||
+ h->codecs[id]->codec == E_BYTE_ARRAY_STOP ||
+ h->codecs[id]->codec == E_BYTE_ARRAY_LEN)) {
+ switch (h->codecs[id]->codec) {
+ case E_EXTERNAL:
+ if (!(s->block[id] = cram_new_block(EXTERNAL, id)))
+ return -1;
+ h->codecs[id]->external.content_id = id;
+ break;
+
+ case E_BYTE_ARRAY_STOP:
+ if (!(s->block[id] = cram_new_block(EXTERNAL, id)))
+ return -1;
+ h->codecs[id]->byte_array_stop.content_id = id;
+ break;
+
+ case E_BYTE_ARRAY_LEN: {
+ cram_codec *cc;
+
+ cc = h->codecs[id]->e_byte_array_len.len_codec;
+ if (cc->codec == E_EXTERNAL) {
+ int eid = cc->external.content_id;
+ if (!(s->block[eid] = cram_new_block(EXTERNAL, eid)))
+ return -1;
+ cc->external.content_id = eid;
+ cc->out = s->block[eid];
+ }
+
+ cc = h->codecs[id]->e_byte_array_len.val_codec;
+ if (cc->codec == E_EXTERNAL) {
+ int eid = cc->external.content_id;
+ if (!s->block[eid])
+ if (!(s->block[eid] = cram_new_block(EXTERNAL, eid)))
+ return -1;
+ cc->external.content_id = eid;
+ cc->out = s->block[eid];
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ } else {
+ if (!(id == DS_BB && !h->codecs[DS_BB]))
+ s->block[id] = s->block[0];
+ }
+ if (h->codecs[id])
+ h->codecs[id]->out = s->block[id];
+ }
+
+ /* Encode reads */
+ last_pos = s->hdr->ref_seq_start;
+ for (rec = 0; rec < s->hdr->num_records; rec++) {
+ cram_record *cr = &s->crecs[rec];
+ if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
+ return -1;
+ }
+
+ s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
+ s->block[0]->comp_size = s->block[0]->uncomp_size;
+
+ // Make sure the fixed blocks point to the correct sources
+ s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
+ s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
+ s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
+ s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
+ s->block[DS_aux]= s->aux_blk; s->aux_blk = NULL;
+ s->block[DS_aux_OQ]= s->aux_OQ_blk; s->aux_OQ_blk = NULL;
+ s->block[DS_aux_BQ]= s->aux_BQ_blk; s->aux_BQ_blk = NULL;
+ s->block[DS_aux_BD]= s->aux_BD_blk; s->aux_BD_blk = NULL;
+ s->block[DS_aux_BI]= s->aux_BI_blk; s->aux_BI_blk = NULL;
+ s->block[DS_aux_FZ]= s->aux_FZ_blk; s->aux_FZ_blk = NULL;
+ s->block[DS_aux_oq]= s->aux_oq_blk; s->aux_oq_blk = NULL;
+ s->block[DS_aux_os]= s->aux_os_blk; s->aux_os_blk = NULL;
+ s->block[DS_aux_oz]= s->aux_oz_blk; s->aux_oz_blk = NULL;
+
+ // Ensure block sizes are up to date.
+ for (id = 1; id < DS_END; id++) {
+ if (!s->block[id] || s->block[id] == s->block[0])
+ continue;
+
+ if (s->block[id]->uncomp_size == 0)
+ BLOCK_UPLEN(s->block[id]);
+ }
+
+ // Compress it all
+ if (cram_compress_slice(fd, s) == -1)
+ return -1;
+
+ // Collapse empty blocks and create hdr_block
+ {
+ int i, j;
+ for (i = j = 1; i < DS_END; i++) {
+ if (!s->block[i] || s->block[i] == s->block[0])
+ continue;
+ if (s->block[i]->uncomp_size == 0) {
+ cram_free_block(s->block[i]);
+ s->block[i] = NULL;
+ continue;
+ }
+ s->block[j] = s->block[i];
+ s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
+ j++;
+ }
+ s->hdr->num_content_ids = j-1;
+ s->hdr->num_blocks = j;
+
+ if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
return -1;
}
@@ -1096,14 +1236,14 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
nref = fd->refs->nref;
pthread_mutex_unlock(&fd->ref_lock);
- if (c->refs_used) {
+ if (!fd->no_ref && c->refs_used) {
for (i = 0; i < nref; i++) {
- if (c->refs_used[i]) {
+ if (c->refs_used[i])
cram_get_ref(fd, i, 1, 0);
- }
}
}
+ /* To create M5 strings */
/* Fetch reference sequence */
if (!fd->no_ref) {
bam_seq_t *b = c->bams[0];
@@ -1123,7 +1263,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
c->ref_seq_id = c->ref_id; // FIXME remove one var!
}
} else {
- c->ref_seq_id = c->ref_id; // FIXME remove one var!
+ c->ref_id = bam_ref(c->bams[0]);
+ cram_ref_incr(fd->refs, c->ref_id);
+ c->ref_seq_id = c->ref_id;
}
/* Turn bams into cram_records and gather basic stats */
@@ -1183,6 +1325,11 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
s->hdr->num_records = r2;
}
+ if (c->multi_seq && !fd->no_ref) {
+ if (c->ref_seq_id >= 0)
+ cram_ref_decr(fd->refs, c->ref_seq_id);
+ }
+
/* Link our bams[] array onto the spare bam list for reuse */
spares = malloc(sizeof(*spares));
pthread_mutex_lock(&fd->bam_list_lock);
@@ -1193,8 +1340,8 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
c->bams = NULL;
/* Detect if a multi-seq container */
- cram_stats_encoding(fd, c->RI_stats);
- multi_ref = c->RI_stats->nvals > 1;
+ cram_stats_encoding(fd, c->stats[DS_RI]);
+ multi_ref = c->stats[DS_RI]->nvals > 1;
if (multi_ref) {
if (fd->verbose)
@@ -1209,7 +1356,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
for (i = 0; i < c->curr_slice; i++) {
cram_slice *s = c->slices[i];
- if (fd->version != CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) != 1) {
if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) {
MD5_CTX md5;
MD5_Init(&md5);
@@ -1228,201 +1375,203 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
c->length = 0;
//fprintf(stderr, "=== BF ===\n");
- h->BF_codec = cram_encoder_init(cram_stats_encoding(fd, c->BF_stats),
- c->BF_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]),
+ c->stats[DS_BF], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== CF ===\n");
- h->CF_codec = cram_encoder_init(cram_stats_encoding(fd, c->CF_stats),
- c->CF_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]),
+ c->stats[DS_CF], E_INT, NULL,
+ fd->version);
// fprintf(stderr, "=== RN ===\n");
-// h->RN_codec = cram_encoder_init(cram_stats_encoding(fd, c->RN_stats),
-// c->RN_stats, E_BYTE_ARRAY, NULL,
+// h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]),
+// c->stats[DS_RN], E_BYTE_ARRAY, NULL,
// fd->version);
//fprintf(stderr, "=== AP ===\n");
if (c->pos_sorted) {
- h->AP_codec = cram_encoder_init(cram_stats_encoding(fd, c->AP_stats),
- c->AP_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]),
+ c->stats[DS_AP], E_INT, NULL,
+ fd->version);
} else {
int p[2] = {0, c->max_apos};
- h->AP_codec = cram_encoder_init(E_BETA, NULL, E_INT, p, fd->version);
+ h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p,
+ fd->version);
}
//fprintf(stderr, "=== RG ===\n");
- h->RG_codec = cram_encoder_init(cram_stats_encoding(fd, c->RG_stats),
- c->RG_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]),
+ c->stats[DS_RG], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== MQ ===\n");
- h->MQ_codec = cram_encoder_init(cram_stats_encoding(fd, c->MQ_stats),
- c->MQ_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]),
+ c->stats[DS_MQ], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== NS ===\n");
-#ifdef NS_external
- h->NS_codec = cram_encoder_init(E_EXTERNAL, NULL, E_INT,
- (void *)CRAM_EXT_NS,
- fd->version);
-#else
- h->NS_codec = cram_encoder_init(cram_stats_encoding(fd, c->NS_stats),
- c->NS_stats, E_INT, NULL,
- fd->version);
-#endif
+ h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]),
+ c->stats[DS_NS], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== MF ===\n");
- h->MF_codec = cram_encoder_init(cram_stats_encoding(fd, c->MF_stats),
- c->MF_stats, E_INT, NULL,
- fd->version);
-
-#ifdef TS_external
- h->TS_codec = cram_encoder_init(E_EXTERNAL, NULL, E_INT,
- (void *)CRAM_EXT_TS_NP,
- fd->version);
- h->NP_codec = cram_encoder_init(E_EXTERNAL, NULL, E_INT,
- (void *)CRAM_EXT_TS_NP,
- fd->version);
-#else
+ h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]),
+ c->stats[DS_MF], E_INT, NULL,
+ fd->version);
+
//fprintf(stderr, "=== TS ===\n");
- h->TS_codec = cram_encoder_init(cram_stats_encoding(fd, c->TS_stats),
- c->TS_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]),
+ c->stats[DS_TS], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== NP ===\n");
- h->NP_codec = cram_encoder_init(cram_stats_encoding(fd, c->NP_stats),
- c->NP_stats, E_INT, NULL,
- fd->version);
-#endif
-
+ h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]),
+ c->stats[DS_NP], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== NF ===\n");
- h->NF_codec = cram_encoder_init(cram_stats_encoding(fd, c->NF_stats),
- c->NF_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]),
+ c->stats[DS_NF], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== RL ===\n");
- h->RL_codec = cram_encoder_init(cram_stats_encoding(fd, c->RL_stats),
- c->RL_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]),
+ c->stats[DS_RL], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== FN ===\n");
- h->FN_codec = cram_encoder_init(cram_stats_encoding(fd, c->FN_stats),
- c->FN_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]),
+ c->stats[DS_FN], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== FC ===\n");
- h->FC_codec = cram_encoder_init(cram_stats_encoding(fd, c->FC_stats),
- c->FC_stats, E_BYTE, NULL,
- fd->version);
+ h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]),
+ c->stats[DS_FC], E_BYTE, NULL,
+ fd->version);
//fprintf(stderr, "=== FP ===\n");
- h->FP_codec = cram_encoder_init(cram_stats_encoding(fd, c->FP_stats),
- c->FP_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]),
+ c->stats[DS_FP], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== DL ===\n");
- h->DL_codec = cram_encoder_init(cram_stats_encoding(fd, c->DL_stats),
- c->DL_stats, E_INT, NULL,
- fd->version);
-
-#ifdef BA_external
- h->BA_codec = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
- (void *)CRAM_EXT_BA,
- fd->version);
-#else
+ h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]),
+ c->stats[DS_DL], E_INT, NULL,
+ fd->version);
+
//fprintf(stderr, "=== BA ===\n");
- h->BA_codec = cram_encoder_init(cram_stats_encoding(fd, c->BA_stats),
- c->BA_stats, E_BYTE, NULL,
- fd->version);
-#endif
+ h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]),
+ c->stats[DS_BA], E_BYTE, NULL,
+ fd->version);
+
+ if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+ cram_byte_array_len_encoder e;
+
+ e.len_encoding = E_EXTERNAL;
+ e.len_dat = (void *)DS_BB_len;
+ //e.len_dat = (void *)DS_BB;
+
+ e.val_encoding = E_EXTERNAL;
+ e.val_dat = (void *)DS_BB;
+
+ h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
+ E_BYTE_ARRAY, (void *)&e,
+ fd->version);
+ } else {
+ h->codecs[DS_BB] = NULL;
+ }
//fprintf(stderr, "=== BS ===\n");
- h->BS_codec = cram_encoder_init(cram_stats_encoding(fd, c->BS_stats),
- c->BS_stats, E_BYTE, NULL,
- fd->version);
-
- if (fd->version == CRAM_1_VERS) {
- h->TL_codec = NULL;
- h->RI_codec = NULL;
- h->RS_codec = NULL;
- h->PD_codec = NULL;
- h->HC_codec = NULL;
- h->SC_codec = NULL;
+ h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]),
+ c->stats[DS_BS], E_BYTE, NULL,
+ fd->version);
+
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
+ h->codecs[DS_TL] = NULL;
+ h->codecs[DS_RI] = NULL;
+ h->codecs[DS_RS] = NULL;
+ h->codecs[DS_PD] = NULL;
+ h->codecs[DS_HC] = NULL;
+ h->codecs[DS_SC] = NULL;
//fprintf(stderr, "=== TC ===\n");
- h->TC_codec = cram_encoder_init(cram_stats_encoding(fd, c->TC_stats),
- c->TC_stats, E_BYTE, NULL,
- fd->version);
+ h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]),
+ c->stats[DS_TC], E_BYTE, NULL,
+ fd->version);
//fprintf(stderr, "=== TN ===\n");
-#ifdef TN_external
- h->TN_codec = cram_encoder_init(E_EXTERNAL, NULL, E_INT,
- (void *)CRAM_EXT_TN,
- fd->version);
-#else
- h->TN_codec = cram_encoder_init(cram_stats_encoding(fd, c->TN_stats),
- c->TN_stats, E_INT, NULL,
- fd->version);
-#endif
+ h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]),
+ c->stats[DS_TN], E_INT, NULL,
+ fd->version);
} else {
- int i2[2] = {0, CRAM_EXT_SC};
-
- h->TC_codec = NULL;
- h->TN_codec = NULL;
+ h->codecs[DS_TC] = NULL;
+ h->codecs[DS_TN] = NULL;
//fprintf(stderr, "=== TL ===\n");
- h->TL_codec = cram_encoder_init(cram_stats_encoding(fd, c->TL_stats),
- c->TL_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]),
+ c->stats[DS_TL], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== RI ===\n");
- h->RI_codec = cram_encoder_init(cram_stats_encoding(fd, c->RI_stats),
- c->RI_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]),
+ c->stats[DS_RI], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== RS ===\n");
- h->RS_codec = cram_encoder_init(cram_stats_encoding(fd, c->RS_stats),
- c->RS_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]),
+ c->stats[DS_RS], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== PD ===\n");
- h->PD_codec = cram_encoder_init(cram_stats_encoding(fd, c->PD_stats),
- c->PD_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]),
+ c->stats[DS_PD], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== HC ===\n");
- h->HC_codec = cram_encoder_init(cram_stats_encoding(fd, c->HC_stats),
- c->HC_stats, E_INT, NULL,
- fd->version);
+ h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]),
+ c->stats[DS_HC], E_INT, NULL,
+ fd->version);
//fprintf(stderr, "=== SC ===\n");
- h->SC_codec = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
- E_BYTE_ARRAY, (void *)i2,
- fd->version);
+ if (1) {
+ int i2[2] = {0, DS_SC};
+
+ h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
+ E_BYTE_ARRAY, (void *)i2,
+ fd->version);
+ } else {
+ // Appears to be no practical benefit to using this method,
+ // but it may work better if we start mixing SC, IN and BB
+ // elements into the same external block.
+ cram_byte_array_len_encoder e;
+
+ e.len_encoding = E_EXTERNAL;
+ e.len_dat = (void *)DS_SC_len;
+
+ e.val_encoding = E_EXTERNAL;
+ e.val_dat = (void *)DS_SC;
+
+ h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
+ E_BYTE_ARRAY, (void *)&e,
+ fd->version);
+ }
}
//fprintf(stderr, "=== IN ===\n");
{
- int i2[2] = {0, CRAM_EXT_IN};
- h->IN_codec = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
- E_BYTE_ARRAY, (void *)i2,
- fd->version);
+ int i2[2] = {0, DS_IN};
+ h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
+ E_BYTE_ARRAY, (void *)i2,
+ fd->version);
}
+ h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
+ (void *)DS_QS,
+ fd->version);
{
- //int i2[2] = {0, 1};
- //h->QS_codec = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, (void *)i2,
- // fd->version);
- h->QS_codec = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
- (void *)CRAM_EXT_QUAL,
- fd->version);
- }
- {
- int i2[2] = {0, CRAM_EXT_NAME};
- h->RN_codec = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
- E_BYTE_ARRAY, (void *)i2,
- fd->version);
+ int i2[2] = {0, DS_RN};
+ h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
+ E_BYTE_ARRAY, (void *)i2,
+ fd->version);
}
@@ -1465,7 +1614,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
slice_offset = c_hdr->method == RAW
? c_hdr->uncomp_size
: c_hdr->comp_size;
- slice_offset += 2 +
+ slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
itf8_size(c_hdr->content_id) +
itf8_size(c_hdr->comp_size) +
itf8_size(c_hdr->uncomp_size);
@@ -1490,13 +1639,13 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
? s->hdr_block->uncomp_size
: s->hdr_block->comp_size;
- slice_offset += 2 +
+ slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
itf8_size(s->hdr_block->content_id) +
itf8_size(s->hdr_block->comp_size) +
itf8_size(s->hdr_block->uncomp_size);
for (j = 0; j < s->hdr->num_blocks; j++) {
- slice_offset += 2 +
+ slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
itf8_size(s->block[j]->content_id) +
itf8_size(s->block[j]->comp_size) +
itf8_size(s->block[j]->uncomp_size);
@@ -1515,7 +1664,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
}
/* Cache references up-front if we have unsorted access patterns */
- if (c->refs_used) {
+ if (!fd->no_ref && c->refs_used) {
for (i = 0; i < fd->refs->nref; i++) {
if (c->refs_used[i])
cram_ref_decr(fd->refs, i);
@@ -1546,12 +1695,12 @@ static int cram_add_feature(cram_container *c, cram_slice *s,
if (!r->nfeature++) {
r->feature = s->nfeatures;
- cram_stats_add(c->FP_stats, f->X.pos);
+ cram_stats_add(c->stats[DS_FP], f->X.pos);
} else {
- cram_stats_add(c->FP_stats,
+ cram_stats_add(c->stats[DS_FP],
f->X.pos - s->features[r->feature + r->nfeature-2].X.pos);
}
- cram_stats_add(c->FC_stats, f->X.code);
+ cram_stats_add(c->stats[DS_FC], f->X.code);
s->features[s->nfeatures++] = *f;
@@ -1568,19 +1717,32 @@ static int cram_add_substitution(cram_fd *fd, cram_container *c,
f.X.pos = pos+1;
f.X.code = 'X';
f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
- cram_stats_add(c->BS_stats, f.X.base);
+ cram_stats_add(c->stats[DS_BS], f.X.base);
} else {
f.B.pos = pos+1;
f.B.code = 'B';
f.B.base = base;
f.B.qual = qual;
- cram_stats_add(c->BA_stats, f.B.base);
- cram_stats_add(c->QS_stats, f.B.qual);
+ cram_stats_add(c->stats[DS_BA], f.B.base);
+ cram_stats_add(c->stats[DS_QS], f.B.qual);
BLOCK_APPEND_CHAR(s->qual_blk, qual);
}
return cram_add_feature(c, s, r, &f);
}
+static int cram_add_bases(cram_fd *fd, cram_container *c,
+ cram_slice *s, cram_record *r,
+ int pos, int len, char *base) {
+ cram_feature f;
+
+ f.b.pos = pos+1;
+ f.b.code = 'b';
+ f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
+ f.b.len = len;
+
+ return cram_add_feature(c, s, r, &f);
+}
+
static int cram_add_base(cram_fd *fd, cram_container *c,
cram_slice *s, cram_record *r,
int pos, char base, char qual) {
@@ -1589,12 +1751,8 @@ static int cram_add_base(cram_fd *fd, cram_container *c,
f.B.code = 'B';
f.B.base = base;
f.B.qual = qual;
-#ifdef BA_external
- s->BA_len++;
-#else
- cram_stats_add(c->BA_stats, base);
-#endif
- cram_stats_add(c->QS_stats, qual);
+ cram_stats_add(c->stats[DS_BA], base);
+ cram_stats_add(c->stats[DS_QS], qual);
BLOCK_APPEND_CHAR(s->qual_blk, qual);
return cram_add_feature(c, s, r, &f);
}
@@ -1606,7 +1764,7 @@ static int cram_add_quality(cram_fd *fd, cram_container *c,
f.Q.pos = pos+1;
f.Q.code = 'Q';
f.Q.qual = qual;
- cram_stats_add(c->QS_stats, qual);
+ cram_stats_add(c->stats[DS_QS], qual);
BLOCK_APPEND_CHAR(s->qual_blk, qual);
return cram_add_feature(c, s, r, &f);
}
@@ -1617,7 +1775,7 @@ static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
f.D.pos = pos+1;
f.D.code = 'D';
f.D.len = len;
- cram_stats_add(c->DL_stats, len);
+ cram_stats_add(c->stats[DS_DL], len);
return cram_add_feature(c, s, r, &f);
}
@@ -1627,11 +1785,15 @@ static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
f.S.pos = pos+1;
f.S.code = 'S';
f.S.len = len;
- if (version == CRAM_1_VERS) {
+ switch (CRAM_MAJOR_VERS(version)) {
+ case 1:
f.S.seq_idx = BLOCK_SIZE(s->base_blk);
BLOCK_APPEND(s->base_blk, base, len);
BLOCK_APPEND_CHAR(s->base_blk, '\0');
- } else {
+ break;
+
+ case 2:
+ default:
f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
if (base) {
BLOCK_APPEND(s->soft_blk, base, len);
@@ -1641,6 +1803,11 @@ static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
BLOCK_APPEND_CHAR(s->soft_blk, 'N');
}
BLOCK_APPEND_CHAR(s->soft_blk, '\0');
+ break;
+
+// default:
+// // v3.0 onwards uses BB data-series
+// f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
}
return cram_add_feature(c, s, r, &f);
}
@@ -1651,7 +1818,7 @@ static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
f.S.pos = pos+1;
f.S.code = 'H';
f.S.len = len;
- cram_stats_add(c->HC_stats, len);
+ cram_stats_add(c->stats[DS_HC], len);
return cram_add_feature(c, s, r, &f);
}
@@ -1661,7 +1828,7 @@ static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
f.S.pos = pos+1;
f.S.code = 'N';
f.S.len = len;
- cram_stats_add(c->RS_stats, len);
+ cram_stats_add(c->stats[DS_RS], len);
return cram_add_feature(c, s, r, &f);
}
@@ -1671,7 +1838,7 @@ static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
f.S.pos = pos+1;
f.S.code = 'P';
f.S.len = len;
- cram_stats_add(c->PD_stats, len);
+ cram_stats_add(c->stats[DS_PD], len);
return cram_add_feature(c, s, r, &f);
}
@@ -1683,11 +1850,7 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
char b = base ? *base : 'N';
f.i.code = 'i';
f.i.base = b;
-#ifdef BA_external
- s->BA_len++;
-#else
- cram_stats_add(c->BA_stats, b);
-#endif
+ cram_stats_add(c->stats[DS_BA], b);
} else {
f.I.code = 'I';
f.I.len = len;
@@ -1711,7 +1874,7 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
*/
static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c,
cram_slice *s, cram_record *cr) {
- char *aux, *tmp, *rg = NULL, *tmp_tn;
+ char *aux, *tmp, *rg = NULL;
int aux_size = bam_blk_size(b) -
((char *)bam_aux(b) - (char *)&bam_ref(b));
@@ -1719,15 +1882,9 @@ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c,
BLOCK_GROW(s->aux_blk, aux_size*1.34+1);
tmp = (char *)BLOCK_END(s->aux_blk);
-#ifdef TN_external
- BLOCK_GROW(s->tn_blk, aux_size);
- tmp_tn = (char *)BLOCK_END(s->tn_blk);
-#endif
-
aux = (char *)bam_aux(b);
-#ifndef TN_external
cr->TN_idx = s->nTN;
-#endif
+
while (aux[0] != 0) {
int32_t i32;
int r;
@@ -1759,17 +1916,13 @@ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c,
if (-1 == r)
return NULL;
-#ifndef TN_external
if (s->nTN >= s->aTN) {
s->aTN = s->aTN ? s->aTN*2 : 1024;
if (!(s->TN = realloc(s->TN, s->aTN * sizeof(*s->TN))))
return NULL;
}
s->TN[s->nTN++] = i32;
- cram_stats_add(c->TN_stats, i32);
-#else
- tmp_tn += itf8_put(tmp_tn, i32);
-#endif
+ cram_stats_add(c->stats[DS_TN], i32);
switch(aux[2]) {
case 'A': case 'C': case 'c':
@@ -1842,19 +1995,13 @@ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c,
return NULL;
}
}
- cram_stats_add(c->TC_stats, cr->ntags);
+ cram_stats_add(c->stats[DS_TC], cr->ntags);
cr->aux = BLOCK_SIZE(s->aux_blk);
cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux);
BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk);
assert(s->aux_blk->byte <= s->aux_blk->alloc);
-#ifdef TN_external
- cr->tn = BLOCK_SIZE(s->tn_blk);
- BLOCK_SIZE(s->tn_blk) = (uc *)tmp_tn - BLOCK_DATA(s->tn_blk);
- assert(s->tn_blk->byte <= s->tn_blk->alloc);
-#endif
-
return rg;
}
@@ -1868,12 +2015,7 @@ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c,
static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
cram_slice *s, cram_record *cr) {
char *aux, *orig, *tmp, *rg = NULL;
-#ifdef SAMTOOLS
int aux_size = bam_get_l_aux(b);
-#else
- int aux_size = bam_blk_size(b) -
- ((char *)bam_aux(b) - (char *)&bam_ref(b));
-#endif
cram_block *td_b = c->comp_hdr->TD_blk;
int TD_blk_size = BLOCK_SIZE(td_b), new;
char *key;
@@ -1920,6 +2062,150 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
if (-1 == r)
return NULL;
+ // BQ:Z
+ if (aux[0] == 'B' && aux[1] == 'Q' && aux[2] == 'Z') {
+ char *tmp;
+ if (!s->aux_BQ_blk)
+ if (!(s->aux_BQ_blk = cram_new_block(EXTERNAL, DS_aux_BQ)))
+ return NULL;
+ BLOCK_GROW(s->aux_BQ_blk, aux_size*1.34+1);
+ tmp = (char *)BLOCK_END(s->aux_BQ_blk);
+ aux += 3;
+ while ((*tmp++=*aux++));
+ *tmp++ = '\t';
+ BLOCK_SIZE(s->aux_BQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BQ_blk);
+ continue;
+ }
+
+ // BD:Z
+ if (aux[0] == 'B' && aux[1]=='D' && aux[2] == 'Z') {
+ char *tmp;
+ if (!s->aux_BD_blk)
+ if (!(s->aux_BD_blk = cram_new_block(EXTERNAL, DS_aux_BD)))
+ return NULL;
+ BLOCK_GROW(s->aux_BD_blk, aux_size*1.34+1);
+ tmp = (char *)BLOCK_END(s->aux_BD_blk);
+ aux += 3;
+ while ((*tmp++=*aux++));
+ *tmp++ = '\t';
+ BLOCK_SIZE(s->aux_BD_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BD_blk);
+ continue;
+ }
+
+ // BI:Z
+ if (aux[0] == 'B' && aux[1]=='I' && aux[2] == 'Z') {
+ char *tmp;
+ if (!s->aux_BI_blk)
+ if (!(s->aux_BI_blk = cram_new_block(EXTERNAL, DS_aux_BI)))
+ return NULL;
+ BLOCK_GROW(s->aux_BI_blk, aux_size*1.34+1);
+ tmp = (char *)BLOCK_END(s->aux_BI_blk);
+ aux += 3;
+ while ((*tmp++=*aux++));
+ *tmp++ = '\t';
+ BLOCK_SIZE(s->aux_BI_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BI_blk);
+ continue;
+ }
+
+ // OQ:Z:
+ if (aux[0] == 'O' && aux[1] == 'Q' && aux[2] == 'Z') {
+ char *tmp;
+ if (!s->aux_OQ_blk)
+ if (!(s->aux_OQ_blk = cram_new_block(EXTERNAL, DS_aux_OQ)))
+ return NULL;
+ BLOCK_GROW(s->aux_OQ_blk, aux_size*1.34+1);
+ tmp = (char *)BLOCK_END(s->aux_OQ_blk);
+ aux += 3;
+ while ((*tmp++=*aux++));
+ *tmp++ = '\t';
+ BLOCK_SIZE(s->aux_OQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_OQ_blk);
+ continue;
+ }
+
+ // FZ:B or ZM:B
+ if ((aux[0] == 'F' && aux[1] == 'Z' && aux[2] == 'B') ||
+ (aux[0] == 'Z' && aux[1] == 'M' && aux[2] == 'B')) {
+ int type = aux[3], blen;
+ uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
+ (((unsigned char *)aux)[5]<< 8) +
+ (((unsigned char *)aux)[6]<<16) +
+ (((unsigned char *)aux)[7]<<24));
+ char *tmp;
+ if (!s->aux_FZ_blk)
+ if (!(s->aux_FZ_blk = cram_new_block(EXTERNAL, DS_aux_FZ)))
+ return NULL;
+ BLOCK_GROW(s->aux_FZ_blk, aux_size*1.34+1);
+ tmp = (char *)BLOCK_END(s->aux_FZ_blk);
+
+ // skip TN field
+ aux+=3;
+
+ // We use BYTE_ARRAY_LEN with external length, so store that first
+ switch (type) {
+ case 'c': case 'C':
+ blen = count;
+ break;
+ case 's': case 'S':
+ blen = 2*count;
+ break;
+ case 'i': case 'I': case 'f':
+ blen = 4*count;
+ break;
+ default:
+ fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
+ type);
+ return NULL;
+
+ }
+
+ blen += 5; // sub-type & length
+ tmp += itf8_put(tmp, blen);
+
+ // The tag data itself
+ memcpy(tmp, aux, blen); tmp += blen; aux += blen;
+
+ BLOCK_SIZE(s->aux_FZ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_FZ_blk);
+ continue;
+ }
+
+ // Other quality data - {Q2,E2,U2,CQ}:Z and similar
+ if (((aux[0] == 'Q' && aux[1] == '2') ||
+ (aux[0] == 'U' && aux[1] == '2') ||
+ (aux[0] == 'Q' && aux[1] == 'T') ||
+ (aux[0] == 'C' && aux[1] == 'Q')) && aux[2] == 'Z') {
+ char *tmp;
+ if (!s->aux_oq_blk)
+ if (!(s->aux_oq_blk = cram_new_block(EXTERNAL, DS_aux_oq)))
+ return NULL;
+ BLOCK_GROW(s->aux_oq_blk, aux_size*1.34+1);
+ tmp = (char *)BLOCK_END(s->aux_oq_blk);
+ aux += 3;
+ while ((*tmp++=*aux++));
+ *tmp++ = '\t';
+ BLOCK_SIZE(s->aux_oq_blk) = (uc *)tmp - BLOCK_DATA(s->aux_oq_blk);
+ continue;
+ }
+
+ // Other sequence data - {R2,E2,CS,BC,RT}:Z and similar
+ if (((aux[0] == 'R' && aux[1] == '2') ||
+ (aux[0] == 'E' && aux[1] == '2') ||
+ (aux[0] == 'C' && aux[1] == 'S') ||
+ (aux[0] == 'B' && aux[1] == 'C') ||
+ (aux[0] == 'R' && aux[1] == 'T')) && aux[2] == 'Z') {
+ char *tmp;
+ if (!s->aux_os_blk)
+ if (!(s->aux_os_blk = cram_new_block(EXTERNAL, DS_aux_os)))
+ return NULL;
+ BLOCK_GROW(s->aux_os_blk, aux_size*1.34+1);
+ tmp = (char *)BLOCK_END(s->aux_os_blk);
+ aux += 3;
+ while ((*tmp++=*aux++));
+ *tmp++ = '\t';
+ BLOCK_SIZE(s->aux_os_blk) = (uc *)tmp - BLOCK_DATA(s->aux_os_blk);
+ continue;
+ }
+
+
switch(aux[2]) {
case 'A': case 'C': case 'c':
aux+=3;
@@ -1940,11 +2226,22 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
+ break;
case 'Z': case 'H':
- aux+=3;
- while ((*tmp++=*aux++));
- *tmp++ = '\t'; // stop byte
+ {
+ char *tmp;
+ if (!s->aux_oz_blk)
+ if (!(s->aux_oz_blk = cram_new_block(EXTERNAL, DS_aux_oz)))
+ return NULL;
+ BLOCK_GROW(s->aux_oz_blk, aux_size*1.34+1);
+ tmp = (char *)BLOCK_END(s->aux_oz_blk);
+ aux += 3;
+ while ((*tmp++=*aux++));
+ *tmp++ = '\t';
+ BLOCK_SIZE(s->aux_oz_blk) = (uc *)tmp -
+ BLOCK_DATA(s->aux_oz_blk);
+ }
break;
case 'B': {
@@ -1974,10 +2271,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
}
- tmp += itf8_put(tmp, blen+5);
-
- *tmp++=*aux++; // sub-type & length
- *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
+ blen += 5; // sub-type & length
+ tmp += itf8_put(tmp, blen);
// The tag data itself
memcpy(tmp, aux, blen); tmp += blen; aux += blen;
@@ -2011,7 +2306,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
}
cr->TL = kh_val(c->comp_hdr->TD_hash, k);
- cram_stats_add(c->TL_stats, cr->TL);
+ cram_stats_add(c->stats[DS_TL], cr->TL);
cr->aux = BLOCK_SIZE(s->aux_blk);
cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux);
@@ -2135,13 +2430,14 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
static int process_one_read(cram_fd *fd, cram_container *c,
cram_slice *s, cram_record *cr,
bam_seq_t *b, int rnum) {
- int i, fake_qual = 0;
+ int i, fake_qual = -1;
char *cp, *rg;
char *ref, *seq, *qual;
// FIXME: multi-ref containers
ref = c->ref;
+ cr->len = bam_seq_len(b); cram_stats_add(c->stats[DS_RL], cr->len);
//fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
@@ -2149,8 +2445,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,
//cr->mate_line; // index to another cram_record
//cr->mate_flags; // MF
//cr->ntags; // TC
- cr->ntags = 0; //cram_stats_add(c->TC_stats, cr->ntags);
- if (fd->version == CRAM_1_VERS)
+ cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
+ if (CRAM_MAJOR_VERS(fd->version) == 1)
rg = cram_encode_aux_1_0(fd, b, c, s, cr);
else
rg = cram_encode_aux(fd, b, c, s, cr);
@@ -2163,45 +2459,46 @@ static int process_one_read(cram_fd *fd, cram_container *c,
if (rg) {
SAM_RG *brg = sam_hdr_find_rg(fd->header, rg);
cr->rg = brg ? brg->id : -1;
- } else if (fd->version == CRAM_1_VERS) {
+ } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN");
assert(brg);
} else {
cr->rg = -1;
}
- cram_stats_add(c->RG_stats, cr->rg);
+ cram_stats_add(c->stats[DS_RG], cr->rg);
- cr->ref_id = bam_ref(b); cram_stats_add(c->RI_stats, cr->ref_id);
+ cr->ref_id = bam_ref(b); cram_stats_add(c->stats[DS_RI], cr->ref_id);
cr->flags = bam_flag(b);
if (bam_cigar_len(b) == 0)
cr->flags |= BAM_FUNMAP;
- cram_stats_add(c->BF_stats, fd->cram_flag_swap[cr->flags & 0xfff]);
+ cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]);
- if (!fd->no_ref)
+ // Non reference based encoding means storing the bases verbatim as features, which in
+ // turn means every base also has a quality already stored.
+ if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
cr->cram_flags = CRAM_FLAG_PRESERVE_QUAL_SCORES;
else
cr->cram_flags = 0;
- //cram_stats_add(c->CF_stats, cr->cram_flags);
+ //cram_stats_add(c->stats[DS_CF], cr->cram_flags);
- cr->len = bam_seq_len(b); cram_stats_add(c->RL_stats, cr->len);
c->num_bases += cr->len;
cr->apos = bam_pos(b)+1;
if (c->pos_sorted) {
if (cr->apos < s->last_apos) {
c->pos_sorted = 0;
} else {
- cram_stats_add(c->AP_stats, cr->apos - s->last_apos);
+ cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos);
s->last_apos = cr->apos;
}
} else {
- //cram_stats_add(c->AP_stats, cr->apos);
+ //cram_stats_add(c->stats[DS_AP], cr->apos);
}
c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
cr->name = BLOCK_SIZE(s->name_blk);
cr->name_len = bam_name_len(b);
- cram_stats_add(c->RN_stats, cr->name_len);
+ cram_stats_add(c->stats[DS_RN], cr->name_len);
BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
@@ -2209,7 +2506,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
/*
* This seqs_ds is largely pointless and it could reuse the same memory
* over and over.
- * s->base_ds is what we need for encoding.
+ * s->base_blk is what we need for encoding.
*/
cr->seq = BLOCK_SIZE(s->seqs_blk);
cr->qual = BLOCK_SIZE(s->qual_blk);
@@ -2218,14 +2515,57 @@ static int process_one_read(cram_fd *fd, cram_container *c,
seq = cp = (char *)BLOCK_END(s->seqs_blk);
*seq = 0;
- for (i = 0; i < cr->len; i++) {
- // FIXME: do 2 char at a time for efficiency
-#ifdef SAMTOOLS
- cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
+#ifdef ALLOW_UAC
+ {
+ // Convert seq 2 bases at a time for speed.
+ static const uint16_t code2base[256] = {
+ 15677, 16701, 17213, 19773, 18237, 21053, 21309, 22077,
+ 21565, 22333, 22845, 18493, 19261, 17469, 16957, 20029,
+ 15681, 16705, 17217, 19777, 18241, 21057, 21313, 22081,
+ 21569, 22337, 22849, 18497, 19265, 17473, 16961, 20033,
+ 15683, 16707, 17219, 19779, 18243, 21059, 21315, 22083,
+ 21571, 22339, 22851, 18499, 19267, 17475, 16963, 20035,
+ 15693, 16717, 17229, 19789, 18253, 21069, 21325, 22093,
+ 21581, 22349, 22861, 18509, 19277, 17485, 16973, 20045,
+ 15687, 16711, 17223, 19783, 18247, 21063, 21319, 22087,
+ 21575, 22343, 22855, 18503, 19271, 17479, 16967, 20039,
+ 15698, 16722, 17234, 19794, 18258, 21074, 21330, 22098,
+ 21586, 22354, 22866, 18514, 19282, 17490, 16978, 20050,
+ 15699, 16723, 17235, 19795, 18259, 21075, 21331, 22099,
+ 21587, 22355, 22867, 18515, 19283, 17491, 16979, 20051,
+ 15702, 16726, 17238, 19798, 18262, 21078, 21334, 22102,
+ 21590, 22358, 22870, 18518, 19286, 17494, 16982, 20054,
+ 15700, 16724, 17236, 19796, 18260, 21076, 21332, 22100,
+ 21588, 22356, 22868, 18516, 19284, 17492, 16980, 20052,
+ 15703, 16727, 17239, 19799, 18263, 21079, 21335, 22103,
+ 21591, 22359, 22871, 18519, 19287, 17495, 16983, 20055,
+ 15705, 16729, 17241, 19801, 18265, 21081, 21337, 22105,
+ 21593, 22361, 22873, 18521, 19289, 17497, 16985, 20057,
+ 15688, 16712, 17224, 19784, 18248, 21064, 21320, 22088,
+ 21576, 22344, 22856, 18504, 19272, 17480, 16968, 20040,
+ 15691, 16715, 17227, 19787, 18251, 21067, 21323, 22091,
+ 21579, 22347, 22859, 18507, 19275, 17483, 16971, 20043,
+ 15684, 16708, 17220, 19780, 18244, 21060, 21316, 22084,
+ 21572, 22340, 22852, 18500, 19268, 17476, 16964, 20036,
+ 15682, 16706, 17218, 19778, 18242, 21058, 21314, 22082,
+ 21570, 22338, 22850, 18498, 19266, 17474, 16962, 20034,
+ 15694, 16718, 17230, 19790, 18254, 21070, 21326, 22094,
+ 21582, 22350, 22862, 18510, 19278, 17486, 16974, 20046
+ };
+
+ int l2 = cr->len / 2;
+ unsigned char *from = (unsigned char *)bam_seq(b);
+ uint16_t *cpi = (uint16_t *)cp;
+ cp[0] = 0;
+ for (i = 0; i < l2; i++)
+ cpi[i] = le_int2(code2base[from[i]]);
+ if ((i *= 2) < cr->len)
+ cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
+ }
#else
- cp[i] = bam_nt16_rev_table[bam_seqi(bam_seq(b), i)];
+ for (i = 0; i < cr->len; i++)
+ cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
#endif
- }
BLOCK_SIZE(s->seqs_blk) += cr->len;
qual = cp = (char *)bam_qual(b);
@@ -2269,24 +2609,63 @@ static int process_one_read(cram_fd *fd, cram_container *c,
if (!fd->no_ref && cr->len) {
int end = cig_len+apos < c->ref_end
? cig_len : c->ref_end - apos;
- for (l = 0; l < end && seq[spos]; l++, apos++, spos++) {
- if (ref[apos] != seq[spos]) {
- //fprintf(stderr, "Subst: %d; %c vs %c\n",
- // spos, ref[apos], seq[spos]);
- if (cram_add_substitution(fd, c, s, cr, spos,
- seq[spos], qual[spos],
- ref[apos]))
- return -1;
+ char *sp = &seq[spos];
+ char *rp = &ref[apos];
+ char *qp = &qual[spos];
+ for (l = 0; l < end; l++) {
+ if (rp[l] != sp[l]) {
+ if (!sp[l])
+ break;
+ if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
+ // Disabled for the time being as it doesn't
+ // seem to gain us much.
+ int ol=l;
+ while (l<end && rp[l] != sp[l])
+ l++;
+ if (l-ol > 1) {
+ if (cram_add_bases(fd, c, s, cr, spos+ol,
+ l-ol, &seq[spos+ol]))
+ return -1;
+ l--;
+ } else {
+ l = ol;
+ if (cram_add_substitution(fd, c, s, cr,
+ spos+l, sp[l],
+ qp[l], rp[l]))
+ return -1;
+ }
+ } else {
+ if (cram_add_substitution(fd, c, s, cr, spos+l,
+ sp[l], qp[l], rp[l]))
+ return -1;
+ }
}
}
+ spos += l;
+ apos += l;
}
if (l < cig_len && cr->len) {
- /* off end of sequence or non-ref based output */
- for (; l < cig_len && seq[spos]; l++, spos++) {
- if (cram_add_base(fd, c, s, cr, spos,
- seq[spos], qual[spos]))
- return -1;
+ if (fd->no_ref) {
+ if (CRAM_MAJOR_VERS(fd->version) == 3) {
+ if (cram_add_bases(fd, c, s, cr, spos,
+ cig_len-l, &seq[spos]))
+ return -1;
+ spos += cig_len-l;
+ } else {
+ for (; l < cig_len && seq[spos]; l++, spos++) {
+ if (cram_add_base(fd, c, s, cr, spos,
+ seq[spos], qual[spos]))
+ return -1;
+ }
+ }
+ } else {
+ /* off end of sequence or non-ref based output */
+ for (; l < cig_len && seq[spos]; l++, spos++) {
+ if (cram_add_base(fd, c, s, cr, spos,
+ seq[spos], qual[spos]))
+ return -1;
+ }
}
apos += cig_len;
} else if (!cr->len) {
@@ -2326,7 +2705,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,
cr->len ? &seq[spos] : NULL,
fd->version))
return -1;
- if (fd->no_ref) {
+ if (fd->no_ref &&
+ !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
if (cr->len) {
for (l = 0; l < cig_len; l++, spos++) {
cram_add_quality(fd, c, s, cr, spos, qual[spos]);
@@ -2354,7 +2734,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
}
fake_qual = spos;
cr->aend = MIN(apos, c->ref_end);
- cram_stats_add(c->FN_stats, cr->nfeature);
+ cram_stats_add(c->stats[DS_FN], cr->nfeature);
} else {
// Unmapped
cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
@@ -2362,12 +2742,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,
cr->ncigar = 0;
cr->nfeature = 0;
cr->aend = cr->apos;
-#ifdef BA_external
- s->BA_len += cr->len;
-#else
for (i = 0; i < cr->len; i++)
- cram_stats_add(c->BA_stats, seq[i]);
-#endif
+ cram_stats_add(c->stats[DS_BA], seq[i]);
}
/*
@@ -2378,7 +2754,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
/* Special case of seq "*" */
if (cr->len == 0) {
- cram_stats_add(c->RL_stats, cr->len = fake_qual);
+ cram_stats_add(c->stats[DS_RL], cr->len = fake_qual);
BLOCK_GROW(s->qual_blk, cr->len);
cp = (char *)BLOCK_END(s->qual_blk);
memset(cp, 255, cr->len);
@@ -2393,7 +2769,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,
BLOCK_SIZE(s->qual_blk) += cr->len;
} else {
if (cr->len == 0) {
- cram_stats_add(c->RL_stats, cr->len = cr->aend - cr->apos + 1);
+ cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
+ cram_stats_add(c->stats[DS_RL], cr->len);
}
}
@@ -2401,6 +2778,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
{
int new;
khint_t k;
+ int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
//fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
// cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
@@ -2411,63 +2789,116 @@ static int process_one_read(cram_fd *fd, cram_container *c,
if (!key)
return -1;
- k = kh_put(m_s2i, s->pair, key, &new);
+ k = kh_put(m_s2i, s->pair[sec], key, &new);
if (-1 == new)
return -1;
else if (new > 0)
- kh_val(s->pair, k) = rnum;
+ kh_val(s->pair[sec], k) = rnum;
} else {
new = 1;
}
if (new == 0) {
- cram_record *p = &s->crecs[kh_val(s->pair, k)];
+ cram_record *p = &s->crecs[kh_val(s->pair[sec], k)];
+ int aleft, aright, sign;
+
+ aleft = MIN(cr->apos, p->apos);
+ aright = MAX(cr->aend, p->aend);
+ if (cr->apos < p->apos) {
+ sign = 1;
+ } else if (cr->apos > p->apos) {
+ sign = -1;
+ } else if (cr->flags & BAM_FREAD1) {
+ sign = 1;
+ } else {
+ sign = -1;
+ }
- //fprintf(stderr, "paired %"PRId64"\n", kh_val(s->pair, k));
+ //fprintf(stderr, "paired %"PRId64"\n", kh_val(s->pair[sec], k));
- // copy from p to cr
- cr->mate_pos = p->apos;
- cram_stats_add(c->NP_stats, cr->mate_pos);
+ // This vs p: tlen, matepos, flags
+ if (bam_ins_size(b) != sign*(aright-aleft+1))
+ goto detached;
- cr->tlen = cr->aend - p->apos;
- cram_stats_add(c->TS_stats, cr->tlen);
+ if (MAX(bam_mate_pos(b)+1, 0) != p->apos)
+ goto detached;
- cr->mate_flags =
- ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
- ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
- cram_stats_add(c->MF_stats, cr->mate_flags);
+ if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
+ ((p->flags & BAM_FUNMAP) != 0))
+ goto detached;
+
+ if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
+ ((p->flags & BAM_FREVERSE) != 0))
+ goto detached;
+
+
+ // p vs this: tlen, matepos, flags
+ if (p->tlen != -sign*(aright-aleft+1))
+ goto detached;
+
+ if (p->mate_pos != cr->apos)
+ goto detached;
- // copy from cr to p
- cram_stats_del(c->NP_stats, p->mate_pos);
- p->mate_pos = cr->apos;
- cram_stats_add(c->NP_stats, p->mate_pos);
+ if (((p->flags & BAM_FMUNMAP) != 0) !=
+ ((p->mate_flags & CRAM_M_UNMAP) != 0))
+ goto detached;
- cram_stats_del(c->MF_stats, p->mate_flags);
- p->mate_flags =
- ((cr->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
- ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
- cram_stats_add(c->MF_stats, p->mate_flags);
+ if (((p->flags & BAM_FMREVERSE) != 0) !=
+ ((p->mate_flags & CRAM_M_REVERSE) != 0))
+ goto detached;
- cram_stats_del(c->TS_stats, p->tlen);
- p->tlen = p->apos - cr->aend;
- cram_stats_add(c->TS_stats, p->tlen);
+ // Supplementary reads are just too ill defined
+ if ((cr->flags & BAM_FSUPPLEMENTARY) ||
+ (p->flags & BAM_FSUPPLEMENTARY))
+ goto detached;
+
+ /*
+ * The fields below are unused when encoding this read as it is
+ * no longer detached. In theory they may get referred to when
+ * processing a 3rd or 4th read in this template?, so we set them
+ * here just to be sure.
+ *
+ * They do not need cram_stats_add() calls those as they are
+ * not emitted.
+ */
+ cr->mate_pos = p->apos;
+ cr->tlen = sign*(aright-aleft+1);
+ cr->mate_flags =
+ ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
+ ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
+
+ // Decrement statistics aggregated earlier
+ cram_stats_del(c->stats[DS_NP], p->mate_pos);
+ cram_stats_del(c->stats[DS_MF], p->mate_flags);
+ cram_stats_del(c->stats[DS_TS], p->tlen);
+ cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
+
+ /* Similarly we could correct the p-> values too, but these will no
+ * longer have any code that refers back to them as the new 'p'
+ * for this template is our current 'cr'.
+ */
+ //p->mate_pos = cr->apos;
+ //p->mate_flags =
+ // ((cr->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
+ // ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
+ //p->tlen = p->apos - cr->aend;
// Clear detached from cr flags
- //cram_stats_del(c->CF_stats, cr->cram_flags);
cr->cram_flags &= ~CRAM_FLAG_DETACHED;
- cram_stats_add(c->CF_stats, cr->cram_flags);
+ cram_stats_add(c->stats[DS_CF], cr->cram_flags);
// Clear detached from p flags and set downstream
- cram_stats_del(c->CF_stats, p->cram_flags);
+ cram_stats_del(c->stats[DS_CF], p->cram_flags);
p->cram_flags &= ~CRAM_FLAG_DETACHED;
p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM;
- cram_stats_add(c->CF_stats, p->cram_flags);
+ cram_stats_add(c->stats[DS_CF], p->cram_flags);
- p->mate_line = rnum - (kh_val(s->pair, k) + 1);
- cram_stats_add(c->NF_stats, p->mate_line);
+ p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1);
+ cram_stats_add(c->stats[DS_NF], p->mate_line);
- kh_val(s->pair, k) = rnum;
+ kh_val(s->pair[sec], k) = rnum;
} else {
+ detached:
//fprintf(stderr, "unpaired\n");
/* Derive mate flags from this flag */
@@ -2477,24 +2908,24 @@ static int process_one_read(cram_fd *fd, cram_container *c,
if (bam_flag(b) & BAM_FMREVERSE)
cr->mate_flags |= CRAM_M_REVERSE;
- cram_stats_add(c->MF_stats, cr->mate_flags);
+ cram_stats_add(c->stats[DS_MF], cr->mate_flags);
cr->mate_pos = MAX(bam_mate_pos(b)+1, 0);
- cram_stats_add(c->NP_stats, cr->mate_pos);
+ cram_stats_add(c->stats[DS_NP], cr->mate_pos);
cr->tlen = bam_ins_size(b);
- cram_stats_add(c->TS_stats, cr->tlen);
+ cram_stats_add(c->stats[DS_TS], cr->tlen);
cr->cram_flags |= CRAM_FLAG_DETACHED;
- cram_stats_add(c->CF_stats, cr->cram_flags);
+ cram_stats_add(c->stats[DS_CF], cr->cram_flags);
+ cram_stats_add(c->stats[DS_NS], bam_mate_ref(b));
}
}
cr->mqual = bam_map_qual(b);
- cram_stats_add(c->MQ_stats, cr->mqual);
+ cram_stats_add(c->stats[DS_MQ], cr->mqual);
cr->mate_ref_id = bam_mate_ref(b);
- cram_stats_add(c->NS_stats, cr->mate_ref_id);
if (!(bam_flag(b) & BAM_FUNMAP)) {
if (c->first_base > cr->apos)
@@ -2549,10 +2980,17 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
slice_rec = c->slice_rec;
curr_rec = c->curr_rec;
- if (fd->version == CRAM_1_VERS ||
- c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice)
- if (NULL == (c = cram_next_container(fd, b)))
+ if (CRAM_MAJOR_VERS(fd->version) == 1 ||
+ c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice) {
+ if (NULL == (c = cram_next_container(fd, b))) {
+ if (fd->ctr) {
+ // prevent cram_close attempting to flush
+ cram_free_container(fd->ctr);
+ fd->ctr = NULL;
+ }
return -1;
+ }
+ }
/*
* Due to our processing order, some things we've already done we
@@ -2579,7 +3017,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
// Have we seen this reference before?
if (bam_ref(b) >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref &&
- !fd->unsorted) {
+ !fd->unsorted && multi_seq) {
if (!c->refs_used) {
pthread_mutex_lock(&fd->ref_lock);
@@ -2618,7 +3056,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
/* Copy or alloc+copy the bam record, for later encoding */
if (c->bams[c->curr_c_rec])
- bam_copy(&c->bams[c->curr_c_rec], b);
+ bam_copy1(c->bams[c->curr_c_rec], b);
else
c->bams[c->curr_c_rec] = bam_dup(b);
diff --git a/htslib/cram/cram_index.c b/htslib/cram/cram_index.c
index d16f601..8667223 100644
--- a/htslib/cram/cram_index.c
+++ b/htslib/cram/cram_index.c
@@ -86,6 +86,52 @@ static void dump_index(cram_fd *fd) {
}
#endif
+/*
+ * Parses a decimal 32-bit integer out of kstring 'k', starting at offset
+ * *pos.  Leading spaces and tabs are skipped and a single leading '-' is
+ * accepted.
+ *
+ * On success *pos is advanced to the first byte after the last digit and
+ * the value is stored in *val_p.  On failure neither *pos nor *val_p is
+ * modified.
+ *
+ * Returns 0 on success;
+ *        -1 if no digit is found or the value does not fit in an int32_t.
+ */
+static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
+    int sign = 1;
+    int64_t val = 0;  /* wider than the result so overflow can be detected */
+    size_t p = *pos;
+
+    /* Skip leading spaces and tabs */
+    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
+        p++;
+
+    if (p < k->l && k->s[p] == '-') {
+        sign = -1;
+        p++;
+    }
+
+    /* At least one digit is mandatory */
+    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
+        return -1;
+
+    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
+        val = val*10 + (k->s[p++]-'0');
+
+        /*
+         * A magnitude of INT32_MAX+1 is only valid for INT32_MIN; anything
+         * larger can never fit, so stop before 'val' itself can overflow.
+         */
+        if (val > (int64_t)INT32_MAX + 1)
+            return -1;
+    }
+
+    val *= sign;
+    if (val > INT32_MAX)
+        return -1;
+
+    *pos = p;
+    *val_p = (int32_t)val;
+
+    return 0;
+}
+
+/*
+ * Parses a decimal 64-bit integer out of kstring 'k', starting at offset
+ * *pos.  Leading spaces and tabs are skipped and a single leading '-' is
+ * accepted.
+ *
+ * On success *pos is advanced to the first byte after the last digit and
+ * the value is stored in *val_p.  On failure neither *pos nor *val_p is
+ * modified.
+ *
+ * Returns 0 on success;
+ *        -1 if no digit is found or the magnitude exceeds INT64_MAX
+ *           (so INT64_MIN itself is rejected rather than overflowing).
+ */
+static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
+    int sign = 1;
+    int64_t val = 0;
+    size_t p = *pos;
+
+    /* Skip leading spaces and tabs */
+    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
+        p++;
+
+    if (p < k->l && k->s[p] == '-') {
+        sign = -1;
+        p++;
+    }
+
+    /* At least one digit is mandatory */
+    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
+        return -1;
+
+    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
+        int digit = k->s[p++]-'0';
+
+        /* Check before multiplying: signed overflow is undefined in C */
+        if (val > (INT64_MAX - digit) / 10)
+            return -1;
+
+        val = val*10 + digit;
+    }
+
+    *pos = p;
+    *val_p = sign*val;
+
+    return 0;
+}
+
/*
* Loads a CRAM .crai index into memory.
*
@@ -162,21 +208,24 @@ int cram_index_load(cram_fd *fd, const char *fn) {
// Parse it line at a time
do {
- int nchars;
- char *line = &kstr.s[pos];
-
/* 1.1 layout */
- if (sscanf(line, "%d\t%d\t%d\t%"PRId64"\t%d\t%d%n",
- &e.refid,
- &e.start,
- &e.end,
- &e.offset,
- &e.slice,
- &e.len,
- &nchars) != 6) {
- free(kstr.s);
- free(idx_stack);
- return -1;
+ if (kget_int32(&kstr, &pos, &e.refid) == -1) {
+ free(kstr.s); free(idx_stack); return -1;
+ }
+ if (kget_int32(&kstr, &pos, &e.start) == -1) {
+ free(kstr.s); free(idx_stack); return -1;
+ }
+ if (kget_int32(&kstr, &pos, &e.end) == -1) {
+ free(kstr.s); free(idx_stack); return -1;
+ }
+ if (kget_int64(&kstr, &pos, &e.offset) == -1) {
+ free(kstr.s); free(idx_stack); return -1;
+ }
+ if (kget_int32(&kstr, &pos, &e.slice) == -1) {
+ free(kstr.s); free(idx_stack); return -1;
+ }
+ if (kget_int32(&kstr, &pos, &e.len) == -1) {
+ free(kstr.s); free(idx_stack); return -1;
}
e.end += e.start-1;
@@ -227,7 +276,6 @@ int cram_index_load(cram_fd *fd, const char *fn) {
}
idx_stack[idx_stack_ptr] = idx;
- pos += nchars;
while (pos < kstr.l && kstr.s[pos] != '\n')
pos++;
pos++;
@@ -313,6 +361,9 @@ cram_index *cram_index_query(cram_fd *fd, int refid, int pos,
continue;
}
}
+ // i==j or i==j-1. Check if j is better.
+ if (from->e[j].start < pos && from->e[j].refid == refid)
+ i = j;
/* The above found *a* bin overlapping, but not necessarily the first */
while (i > 0 && from->e[i-1].end >= pos)
@@ -359,6 +410,7 @@ int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
if (fd->ctr) {
cram_free_container(fd->ctr);
fd->ctr = NULL;
+ fd->ooc = 0;
}
return 0;
diff --git a/htslib/cram/cram_io.c b/htslib/cram/cram_io.c
index c5a4c4e..5efc92d 100644
--- a/htslib/cram/cram_io.c
+++ b/htslib/cram/cram_io.c
@@ -57,6 +57,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef HAVE_LIBBZ2
#include <bzlib.h>
#endif
+#ifdef HAVE_LIBLZMA
+#include <lzma.h>
+#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <math.h>
@@ -66,6 +69,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cram/os.h"
#include "cram/md5.h"
#include "cram/open_trace_file.h"
+#include "cram/rANS_static.h"
//#define REF_DEBUG
@@ -78,19 +82,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define RP(...)
#endif
-#ifdef SAMTOOLS
#include "htslib/hfile.h"
-#define paranoid_hclose(fp) (hclose(fp))
-#else
-#define hclose_abruptly(fp) (fclose(fp))
-#define hflush(fp) (fflush(fp))
-#define hgetc(fp) (getc(fp))
-#define hputc(c, fp) (putc((c), (fp)))
-#define hread(fp, buffer, nbytes) (fread((buffer), 1, (nbytes), (fp)))
-#define hseek(fp, offset, whence) (fseeko((fp), (offset), (whence)))
-#define hwrite(fp, buffer, nbytes) (fwrite((buffer), 1, (nbytes), (fp)))
-#define paranoid_hclose(fp) (paranoid_fclose(fp))
-#endif
+#include "htslib/bgzf.h"
+#include "htslib/faidx.h"
+
+#define TRIAL_SPAN 50
+#define NTRIALS 3
+
/* ----------------------------------------------------------------------
* ITF8 encoding and decoding.
@@ -644,6 +642,90 @@ static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
return (char *)cdata;
}
+#ifdef HAVE_LIBLZMA
+/* ------------------------------------------------------------------------ */
+/*
+ * Data compression routines using liblzma (xz)
+ *
+ * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
+ * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
+ * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
+ * as compression times.
+ *
+ * For now we disable this functionality. If it's to be reenabled make sure you
+ * improve the mem_inflate implementation as it's just a test hack at the
+ * moment.
+ */
+
+/*
+ * Compresses 'size' bytes of 'data' into a newly malloc()ed xz stream
+ * using liblzma preset 'level' and a CRC32 integrity check.
+ *
+ * Returns the compressed buffer (caller frees) with its length stored in
+ *         *cdata_size on success;
+ *         NULL on failure, with *cdata_size set to 0.
+ */
+static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
+                              int level) {
+    char *out;
+    size_t out_size = lzma_stream_buffer_bound(size);
+    *cdata_size = 0;
+
+    /* A bound of zero means the worst case size does not fit in size_t */
+    if (out_size == 0)
+        return NULL;
+
+    out = malloc(out_size);
+    if (!out)
+        return NULL;
+
+    /* Single call compression */
+    if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
+                                           (uint8_t *)data, size,
+                                           (uint8_t *)out, cdata_size,
+                                           out_size)) {
+        /* Do not leak the output buffer, nor report a partial length */
+        free(out);
+        *cdata_size = 0;
+        return NULL;
+    }
+
+    return out;
+}
+
+/*
+ * Decompresses the 'csize' byte xz stream in 'cdata' into a newly
+ * allocated buffer, growing it as required.  The decoder is created with
+ * a memory limit of 50000000 bytes.
+ *
+ * Returns the uncompressed buffer (caller frees) with its length stored
+ *         in *size on success;
+ *         NULL on failure (*size is left untouched).
+ *
+ * NOTE(review): the loop stops as soon as all input has been consumed, so
+ * output still pending inside the decoder relies on the single
+ * LZMA_FINISH call below fitting in the remaining buffer space.  Callers
+ * should keep validating *size against the expected length.
+ */
+static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
+    lzma_stream strm = LZMA_STREAM_INIT;
+    size_t out_size = 0, out_pos = 0;
+    char *out = NULL, *tmp;
+    int r;
+
+    /* Initiate the decoder */
+    if (LZMA_OK != lzma_stream_decoder(&strm, 50000000, 0))
+        return NULL;
+
+    /* Decode loop */
+    strm.avail_in = csize;
+    strm.next_in = (uint8_t *)cdata;
+
+    for (;strm.avail_in;) {
+        if (strm.avail_in > out_size - out_pos) {
+            out_size += strm.avail_in * 4 + 32768;
+            /* Keep 'out' valid on failure so it can still be freed */
+            if (!(tmp = realloc(out, out_size)))
+                goto err;
+            out = tmp;
+        }
+        strm.avail_out = out_size - out_pos;
+        strm.next_out = (uint8_t *)&out[out_pos];
+
+        r = lzma_code(&strm, LZMA_RUN);
+        if (LZMA_OK != r && LZMA_STREAM_END != r) {
+            fprintf(stderr, "r=%d\n", r);
+            fprintf(stderr, "mem=%"PRId64"\n", (int64_t)lzma_memusage(&strm));
+            goto err;
+        }
+
+        out_pos = strm.total_out;
+
+        if (r == LZMA_STREAM_END)
+            break;
+    }
+
+    /* finish up any unflushed data; necessary? */
+    r = lzma_code(&strm, LZMA_FINISH);
+    if (r != LZMA_OK && r != LZMA_STREAM_END) {
+        fprintf(stderr, "r=%d\n", r);
+        goto err;
+    }
+
+    /*
+     * Trim the buffer to the decoded length.  Never request zero bytes
+     * (realloc(p, 0) is implementation defined) and fall back to the
+     * untrimmed buffer if shrinking fails.
+     */
+    tmp = realloc(out, strm.total_out ? strm.total_out : 1);
+    if (tmp)
+        out = tmp;
+    if (!out)
+        goto err;
+
+    *size = strm.total_out;
+
+    lzma_end(&strm);
+
+    return out;
+
+ err:
+    /* Release both the decoder state and any partial output */
+    lzma_end(&strm);
+    free(out);
+    return NULL;
+}
+#endif
+
/* ----------------------------------------------------------------------
* CRAM blocks - the dynamically growable data block. We have code to
* create, update, (un)compress and read/write.
@@ -716,6 +798,32 @@ cram_block *cram_read_block(cram_fd *fd) {
}
}
+ if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+ unsigned char dat[100], *cp = dat;;
+ uint32_t crc;
+
+
+ if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
+ free(b);
+ return NULL;
+ }
+
+ *cp++ = b->method;
+ *cp++ = b->content_type;
+ cp += itf8_put(cp, b->content_id);
+ cp += itf8_put(cp, b->comp_size);
+ cp += itf8_put(cp, b->uncomp_size);
+ crc = crc32(0L, dat, cp-dat);
+ crc = crc32(crc, b->data ? b->data : (uc *)"", b->alloc);
+
+ if (crc != b->crc32) {
+ fprintf(stderr, "Block CRC32 failure\n");
+ free(b->data);
+ free(b);
+ return NULL;
+ }
+ }
+
b->orig_method = b->method;
b->idx = 0;
b->byte = 0;
@@ -746,6 +854,27 @@ int cram_write_block(cram_fd *fd, cram_block *b) {
return -1;
}
+ if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+ unsigned char dat[100], *cp = dat;;
+ uint32_t crc;
+
+ *cp++ = b->method;
+ *cp++ = b->content_type;
+ cp += itf8_put(cp, b->content_id);
+ cp += itf8_put(cp, b->comp_size);
+ cp += itf8_put(cp, b->uncomp_size);
+ crc = crc32(0L, dat, cp-dat);
+
+ if (b->method == RAW) {
+ b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
+ } else {
+ b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
+ }
+
+ if (-1 == int32_encode(fd, b->crc32))
+ return -1;
+ }
+
return 0;
}
@@ -775,15 +904,16 @@ int cram_uncompress_block(cram_block *b) {
switch (b->method) {
case RAW:
- b->uncomp_size = b->comp_size;
return 0;
case GZIP:
uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
if (!uncomp)
return -1;
- if ((int)uncomp_size != b->uncomp_size)
+ if ((int)uncomp_size != b->uncomp_size) {
+ free(uncomp);
return -1;
+ }
free(b->data);
b->data = (unsigned char *)uncomp;
b->alloc = uncomp_size;
@@ -801,6 +931,7 @@ int cram_uncompress_block(cram_block *b) {
free(uncomp);
return -1;
}
+ free(b->data);
b->data = (unsigned char *)uncomp;
b->alloc = usize;
b->method = RAW;
@@ -814,7 +945,39 @@ int cram_uncompress_block(cram_block *b) {
return -1;
#endif
- case BM_ERROR:
+#ifdef HAVE_LIBLZMA
+ case LZMA:
+ uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
+ if (!uncomp)
+ return -1;
+ if ((int)uncomp_size != b->uncomp_size)
+ return -1;
+ free(b->data);
+ b->data = (unsigned char *)uncomp;
+ b->alloc = uncomp_size;
+ b->method = RAW;
+ break;
+#else
+ case LZMA:
+ fprintf(stderr, "Lzma compression is not compiled into this "
+ "version.\nPlease rebuild and try again.\n");
+ return -1;
+ break;
+#endif
+
+ case RANS: {
+ unsigned int usize = b->uncomp_size, usize2;
+ uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
+ assert(usize == usize2);
+ free(b->data);
+ b->data = (unsigned char *)uncomp;
+ b->alloc = usize2;
+ b->method = RAW;
+ b->uncomp_size = usize2; // Just incase it differs
+ //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
+ break;
+ }
+
default:
return -1;
}
@@ -822,38 +985,68 @@ int cram_uncompress_block(cram_block *b) {
return 0;
}
+static char *cram_compress_by_method(char *in, size_t in_size,
+ size_t *out_size,
+ enum cram_block_method method,
+ int level, int strat) {
+ switch (method) {
+ case GZIP:
+ return zlib_mem_deflate(in, in_size, out_size, level, strat);
+
+ case BZIP2: {
#ifdef HAVE_LIBBZ2
-static int cram_compress_block_bzip2(cram_fd *fd, cram_block *b,
- cram_metrics *metrics, int level) {
- unsigned int comp_size = b->uncomp_size*1.01 + 600;
- char *comp = malloc(comp_size);
- char *data = (char *)b->data;
+ unsigned int comp_size = in_size*1.01 + 600;
+ char *comp = malloc(comp_size);
+ if (!comp)
+ return NULL;
- if (!comp)
- return -1;
+ if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
+ in, in_size,
+ level, 0, 30)) {
+ free(comp);
+ return NULL;
+ }
+ *out_size = comp_size;
+ return comp;
+#else
+ return NULL;
+#endif
+ }
- if (!data)
- data = "";
+ case LZMA:
+#ifdef HAVE_LIBLZMA
+ return lzma_mem_deflate(in, in_size, out_size, level);
+#else
+ return NULL;
+#endif
- if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
- data, b->uncomp_size,
- level, 0, 30)) {
- free(comp);
- return -1;
+ case RANS0: {
+ unsigned int out_size_i;
+ unsigned char *cp;
+ cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 0);
+ *out_size = out_size_i;
+ return (char *)cp;
}
- free(b->data);
- b->data = (unsigned char *)comp;
- b->method = BZIP2;
- b->comp_size = comp_size;
+ case RANS1: {
+ unsigned int out_size_i;
+ unsigned char *cp;
+
+ cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 1);
+ *out_size = out_size_i;
+ return (char *)cp;
+ }
- if (fd->verbose)
- fprintf(stderr, "Compressed block ID %d from %d to %d\n",
- b->content_id, b->uncomp_size, b->comp_size);
+ case RAW:
+ break;
- return 0;
+ default:
+ return NULL;
+ }
+
+ return NULL;
}
-#endif
+
/*
* Compresses a block using one of two different zlib strategies. If we only
@@ -864,114 +1057,347 @@ static int cram_compress_block_bzip2(cram_fd *fd, cram_block *b,
* significantly faster.
*/
int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
- int level, int strat,
- int level2, int strat2) {
+ int method, int level) {
+
char *comp = NULL;
size_t comp_size = 0;
+ int strat;
+
+ //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
- if (level == 0) {
+ if (method == RAW || level == 0 || b->uncomp_size == 0) {
b->method = RAW;
b->comp_size = b->uncomp_size;
+ //fprintf(stderr, "Skip block id %d\n", b->content_id);
return 0;
}
- if (b->method != RAW) {
- fprintf(stderr, "Attempt to compress an already compressed block.\n");
- return 0;
- }
+ if (metrics) {
+ pthread_mutex_lock(&fd->metrics_lock);
+ if (metrics->trial > 0 || --metrics->next_trial <= 0) {
+ size_t sz_best = INT_MAX;
+ size_t sz_gz_rle = 0;
+ size_t sz_gz_def = 0;
+ size_t sz_rans0 = 0;
+ size_t sz_rans1 = 0;
+ size_t sz_bzip2 = 0;
+ size_t sz_lzma = 0;
+ int method_best = 0;
+ char *c_best = NULL, *c = NULL;
+
+ if (metrics->revised_method)
+ method = metrics->revised_method;
+ else
+ metrics->revised_method = method;
+
+ if (metrics->next_trial == 0) {
+ metrics->next_trial = TRIAL_SPAN;
+ metrics->trial = NTRIALS;
+ metrics->sz_gz_rle /= 2;
+ metrics->sz_gz_def /= 2;
+ metrics->sz_rans0 /= 2;
+ metrics->sz_rans1 /= 2;
+ metrics->sz_bzip2 /= 2;
+ metrics->sz_lzma /= 2;
+ }
-#ifdef HAVE_LIBBZ2
- if (fd->use_bz2)
- // metrics ignored for bzip2
- return cram_compress_block_bzip2(fd, b, metrics, level);
-#endif
+ pthread_mutex_unlock(&fd->metrics_lock);
+
+ if (method & (1<<GZIP_RLE)) {
+ c = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &sz_gz_rle, GZIP, 1, Z_RLE);
+ if (c && sz_best > sz_gz_rle) {
+ sz_best = sz_gz_rle;
+ method_best = GZIP_RLE;
+ if (c_best)
+ free(c_best);
+ c_best = c;
+ } else if (c) {
+ free(c);
+ } else {
+ sz_gz_rle = b->uncomp_size*2+1000;
+ }
- pthread_mutex_lock(&fd->metrics_lock);
- if (strat2 >= 0)
- if (fd->verbose > 1)
- fprintf(stderr, "metrics trial %d, next_trial %d, m1 %d, m2 %d\n",
- metrics->trial, metrics->next_trial,
- metrics->m1, metrics->m2);
-
- if (strat2 >= 0 && (metrics->trial > 0 || --metrics->next_trial <= 0)) {
- char *c1, *c2;
- size_t s1, s2;
-
- if (metrics->next_trial == 0) {
- metrics->next_trial = 100;
- metrics->trial = 3;
- metrics->m1 = metrics->m2 = 0;
- }
- pthread_mutex_unlock(&fd->metrics_lock);
-
- c1 = zlib_mem_deflate((char *)b->data, b->uncomp_size,
- &s1, level, strat);
- c2 = zlib_mem_deflate((char *)b->data, b->uncomp_size,
- &s2, level2, strat2);
- if (!c1 || !c2)
- return -1;
-
- //fprintf(stderr, "1: %6d 2: %6d %5.1f\n", s1, s2, 100.0*s1/s2);
+ //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle);
+ }
- pthread_mutex_lock(&fd->metrics_lock);
- if (s1 < 0.98 * s2) { // 2nd one should be faster alternative
- if (fd->verbose > 1)
- fprintf(stderr, "M1 wins %d vs %d\n", (int)s1, (int)s2);
- comp = c1; comp_size = s1;
- free(c2);
- metrics->m1++;
+ if (method & (1<<GZIP)) {
+ c = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &sz_gz_def, GZIP, level,
+ Z_FILTERED);
+ if (c && sz_best > sz_gz_def) {
+ sz_best = sz_gz_def;
+ method_best = GZIP;
+ if (c_best)
+ free(c_best);
+ c_best = c;
+ } else if (c) {
+ free(c);
+ } else {
+ sz_gz_def = b->uncomp_size*2+1000;
+ }
+
+ //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def);
+ }
+
+ if (method & (1<<RANS0)) {
+ c = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &sz_rans0, RANS0, 0, 0);
+ if (c && sz_best > sz_rans0) {
+ sz_best = sz_rans0;
+ method_best = RANS0;
+ if (c_best)
+ free(c_best);
+ c_best = c;
+ } else if (c) {
+ free(c);
+ } else {
+ sz_rans0 = b->uncomp_size*2+1000;
+ }
+ }
+
+ if (method & (1<<RANS1)) {
+ c = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &sz_rans1, RANS1, 0, 0);
+ if (c && sz_best > sz_rans1) {
+ sz_best = sz_rans1;
+ method_best = RANS1;
+ if (c_best)
+ free(c_best);
+ c_best = c;
+ } else if (c) {
+ free(c);
+ } else {
+ sz_rans1 = b->uncomp_size*2+1000;
+ }
+ }
+
+ if (method & (1<<BZIP2)) {
+ c = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &sz_bzip2, BZIP2, level, 0);
+ if (c && sz_best > sz_bzip2) {
+ sz_best = sz_bzip2;
+ method_best = BZIP2;
+ if (c_best)
+ free(c_best);
+ c_best = c;
+ } else if (c) {
+ free(c);
+ } else {
+ sz_bzip2 = b->uncomp_size*2+1000;
+ }
+ }
+
+ if (method & (1<<LZMA)) {
+ c = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &sz_lzma, LZMA, level, 0);
+ if (c && sz_best > sz_lzma) {
+ sz_best = sz_lzma;
+ method_best = LZMA;
+ if (c_best)
+ free(c_best);
+ c_best = c;
+ } else if (c) {
+ free(c);
+ } else {
+ sz_lzma = b->uncomp_size*2+1000;
+ }
+ }
+
+ //fprintf(stderr, "sz_best = %d\n", sz_best);
+
+ free(b->data);
+ b->data = (unsigned char *)c_best;
+ //printf("method_best = %s\n", cram_block_method2str(method_best));
+ b->method = method_best == GZIP_RLE ? GZIP : method_best;
+ b->comp_size = sz_best;
+
+ pthread_mutex_lock(&fd->metrics_lock);
+ metrics->sz_gz_rle += sz_gz_rle;
+ metrics->sz_gz_def += sz_gz_def;
+ metrics->sz_rans0 += sz_rans0;
+ metrics->sz_rans1 += sz_rans1;
+ metrics->sz_bzip2 += sz_bzip2;
+ metrics->sz_lzma += sz_lzma;
+ if (--metrics->trial == 0) {
+ int best_method = RAW;
+ int best_sz = INT_MAX;
+
+ // Scale methods by cost
+ if (fd->level <= 3) {
+ metrics->sz_rans1 *= 1.02;
+ metrics->sz_gz_def *= 1.04;
+ metrics->sz_bzip2 *= 1.08;
+ metrics->sz_lzma *= 1.10;
+ } else if (fd->level <= 6) {
+ metrics->sz_rans1 *= 1.01;
+ metrics->sz_gz_def *= 1.02;
+ metrics->sz_bzip2 *= 1.03;
+ metrics->sz_lzma *= 1.05;
+ }
+
+ if (method & (1<<GZIP_RLE) && best_sz > metrics->sz_gz_rle)
+ best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE;
+
+ if (method & (1<<GZIP) && best_sz > metrics->sz_gz_def)
+ best_sz = metrics->sz_gz_def, best_method = GZIP;
+
+ if (method & (1<<RANS0) && best_sz > metrics->sz_rans0)
+ best_sz = metrics->sz_rans0, best_method = RANS0;
+
+ if (method & (1<<RANS1) && best_sz > metrics->sz_rans1)
+ best_sz = metrics->sz_rans1, best_method = RANS1;
+
+ if (method & (1<<BZIP2) && best_sz > metrics->sz_bzip2)
+ best_sz = metrics->sz_bzip2, best_method = BZIP2;
+
+ if (method & (1<<LZMA) && best_sz > metrics->sz_lzma)
+ best_sz = metrics->sz_lzma, best_method = LZMA;
+
+ if (best_method == GZIP_RLE) {
+ metrics->method = GZIP;
+ metrics->strat = Z_RLE;
+ } else {
+ metrics->method = best_method;
+ metrics->strat = Z_FILTERED;
+ }
+
+ // If we see at least MAXFAIL trials in a row for a specific
+ // compression method with more than MAXDELTA aggregate
+ // size then we drop this from the list of methods used
+ // for this block type.
+#define MAXDELTA 0.20
+#define MAXFAILS 4
+ if (best_method == GZIP_RLE) {
+ metrics->gz_rle_cnt = 0;
+ metrics->gz_rle_extra = 0;
+ } else if (best_sz < metrics->sz_gz_rle) {
+ double r = (double)metrics->sz_gz_rle / best_sz - 1;
+ if (++metrics->gz_rle_cnt >= MAXFAILS &&
+ (metrics->gz_rle_extra += r) >= MAXDELTA)
+ method &= ~(1<<GZIP_RLE);
+ }
+
+ if (best_method == GZIP) {
+ metrics->gz_def_cnt = 0;
+ metrics->gz_def_extra = 0;
+ } else if (best_sz < metrics->sz_gz_def) {
+ double r = (double)metrics->sz_gz_def / best_sz - 1;
+ if (++metrics->gz_def_cnt >= MAXFAILS &&
+ (metrics->gz_def_extra += r) >= MAXDELTA)
+ method &= ~(1<<GZIP);
+ }
+
+ if (best_method == RANS0) {
+ metrics->rans0_cnt = 0;
+ metrics->rans0_extra = 0;
+ } else if (best_sz < metrics->sz_rans0) {
+ double r = (double)metrics->sz_rans0 / best_sz - 1;
+ if (++metrics->rans0_cnt >= MAXFAILS &&
+ (metrics->rans0_extra += r) >= MAXDELTA)
+ method &= ~(1<<RANS0);
+ }
+
+ if (best_method == RANS1) {
+ metrics->rans1_cnt = 0;
+ metrics->rans1_extra = 0;
+ } else if (best_sz < metrics->sz_rans1) {
+ double r = (double)metrics->sz_rans1 / best_sz - 1;
+ if (++metrics->rans1_cnt >= MAXFAILS &&
+ (metrics->rans1_extra += r) >= MAXDELTA)
+ method &= ~(1<<RANS1);
+ }
+
+ if (best_method == BZIP2) {
+ metrics->bzip2_cnt = 0;
+ metrics->bzip2_extra = 0;
+ } else if (best_sz < metrics->sz_bzip2) {
+ double r = (double)metrics->sz_bzip2 / best_sz - 1;
+ if (++metrics->bzip2_cnt >= MAXFAILS &&
+ (metrics->bzip2_extra += r) >= MAXDELTA)
+ method &= ~(1<<BZIP2);
+ }
+
+ if (best_method == LZMA) {
+ metrics->lzma_cnt = 0;
+ metrics->lzma_extra = 0;
+ } else if (best_sz < metrics->sz_lzma) {
+ double r = (double)metrics->sz_lzma / best_sz - 1;
+ if (++metrics->lzma_cnt >= MAXFAILS &&
+ (metrics->lzma_extra += r) >= MAXDELTA)
+ method &= ~(1<<LZMA);
+ }
+
+ //if (method != metrics->revised_method)
+ // fprintf(stderr, "%d: method from %x to %x\n",
+ // b->content_id, metrics->revised_method, method);
+ metrics->revised_method = method;
+ }
+ pthread_mutex_unlock(&fd->metrics_lock);
} else {
- if (fd->verbose > 1)
- fprintf(stderr, "M2 wins %d vs %d\n", (int)s1, (int)s2);
- comp = c2; comp_size = s2;
- free(c1);
- metrics->m2++;
- }
- metrics->trial--;
- pthread_mutex_unlock(&fd->metrics_lock);
- } else if (strat2 >= 0) {
- int xlevel = metrics->m1 > metrics->m2 ? level : level2;
- int xstrat = metrics->m1 > metrics->m2 ? strat : strat2;
- pthread_mutex_unlock(&fd->metrics_lock);
- comp = zlib_mem_deflate((char *)b->data, b->uncomp_size, &comp_size,
- xlevel, xstrat);
+ strat = metrics->strat;
+ method = metrics->method;
+
+ pthread_mutex_unlock(&fd->metrics_lock);
+ comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &comp_size, method,
+ level, strat);
+ if (!comp)
+ return -1;
+ free(b->data);
+ b->data = (unsigned char *)comp;
+ b->comp_size = comp_size;
+ b->method = method;
+ }
+
} else {
- pthread_mutex_unlock(&fd->metrics_lock);
- comp = zlib_mem_deflate((char *)b->data, b->uncomp_size, &comp_size,
- level, strat);
+ // no cached metrics, so just do zlib?
+ comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &comp_size, GZIP, level, Z_FILTERED);
+ if (!comp) {
+ fprintf(stderr, "Compression failed!\n");
+ return -1;
+ }
+ free(b->data);
+ b->data = (unsigned char *)comp;
+ b->comp_size = comp_size;
+ b->method = GZIP;
}
- if (!comp)
- return -1;
-
- free(b->data);
- b->data = (unsigned char *)comp;
- b->method = GZIP;
- b->comp_size = comp_size;
-
if (fd->verbose)
- fprintf(stderr, "Compressed block ID %d from %d to %d\n",
- b->content_id, b->uncomp_size, b->comp_size);
+ fprintf(stderr, "Compressed block ID %d from %d to %d by method %s\n",
+ b->content_id, b->uncomp_size, b->comp_size,
+ cram_block_method2str(b->method));
+
+ if (b->method == RANS1)
+ b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing
return 0;
}
cram_metrics *cram_new_metrics(void) {
- cram_metrics *m = malloc(sizeof(*m));
+ cram_metrics *m = calloc(1, sizeof(*m));
if (!m)
return NULL;
- m->m1 = m->m2 = 0;
- m->trial = 2;
- m->next_trial = 100;
+ m->trial = NTRIALS-1;
+ m->next_trial = TRIAL_SPAN;
+ m->method = RAW;
+ m->strat = 0;
+ m->revised_method = 0;
+
return m;
}
char *cram_block_method2str(enum cram_block_method m) {
switch(m) {
- case RAW: return "RAW";
- case GZIP: return "GZIP";
- case BZIP2: return "BZIP2";
- case BM_ERROR: break;
+ case RAW: return "RAW";
+ case GZIP: return "GZIP";
+ case BZIP2: return "BZIP2";
+ case LZMA: return "LZMA";
+ case RANS0: return "RANS0";
+ case RANS1: return "RANS1";
+ case GZIP_RLE: return "GZIP_RLE";
+ case ERROR: break;
}
return "?";
}
@@ -1069,7 +1495,7 @@ void refs_free(refs_t *r) {
free(r->ref_id);
if (r->fp)
- fclose(r->fp);
+ bgzf_close(r->fp);
pthread_mutex_destroy(&r->lock);
@@ -1105,6 +1531,37 @@ static refs_t *refs_create(void) {
}
/*
+ * Opens a reference fasta file as a BGZF stream, allowing for
+ * compressed files. It automatically builds a .fai file if
+ * required and if compressed a .gzi bgzf index too.
+ *
+ * Returns a BGZF handle on success;
+ * NULL on failure.
+ */
+static BGZF *bgzf_open_ref(char *fn, char *mode) {
+    char fai_path[PATH_MAX];
+    BGZF *ref;
+
+    /* Ensure "<fn>.fai" is readable, building it from 'fn' when it is not */
+    snprintf(fai_path, sizeof(fai_path), "%s.fai", fn);
+    if (access(fai_path, R_OK) != 0 && fai_build(fn) != 0)
+        return NULL;
+
+    ref = bgzf_open(fn, mode);
+    if (ref == NULL) {
+        perror(fn);
+        return NULL;
+    }
+
+    /* Streams not flagged as compressed need nothing further */
+    if (ref->is_compressed != 1)
+        return ref;
+
+    /* Compressed streams additionally require their ".gzi" index */
+    if (bgzf_index_load(ref, fn, ".gzi") >= 0)
+        return ref;
+
+    fprintf(stderr, "Unable to load .gzi index '%s.gzi'\n", fn);
+    bgzf_close(ref);
+    return NULL;
+}
+
+/*
* Loads a FAI file for a reference.fasta.
* "is_err" indicates whether failure to load is worthy of emitting an
* error message. In some cases (eg with embedded references) we
@@ -1120,6 +1577,7 @@ static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
char line[8192];
refs_t *r = r_orig;
size_t fn_l = strlen(fn);
+ int id = 0, id_alloc = 0;
RP("refs_load_fai %s\n", fn);
@@ -1135,7 +1593,8 @@ static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
}
if (r->fp)
- fclose(r->fp);
+ if (bgzf_close(r->fp) != 0)
+ goto err;
r->fp = NULL;
if (!(r->fn = string_dup(r->pool, fn)))
@@ -1144,11 +1603,8 @@ static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0)
r->fn[fn_l-4] = 0;
- if (!(r->fp = fopen(r->fn, "r"))) {
- if (is_err)
- perror(fn);
+ if (!(r->fp = bgzf_open_ref(r->fn, "r")))
goto err;
- }
/* Parse .fai file and load meta-data */
sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, r->fn);
@@ -1224,6 +1680,18 @@ static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
kh_val(r->h_meta, k) = e;
}
}
+
+ if (id >= id_alloc) {
+ int x;
+
+ id_alloc = id_alloc ?id_alloc*2 : 16;
+ r->ref_id = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
+
+ for (x = id; x < id_alloc; x++)
+ r->ref_id[x] = NULL;
+ }
+ r->ref_id[id] = e;
+ r->nref = ++id;
}
return r;
@@ -1277,7 +1745,7 @@ int refs2id(refs_t *r, SAM_hdr *h) {
* -1 on failure
*/
static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
- int i;
+ int i, j;
if (!h || h->nref == 0)
return 0;
@@ -1285,48 +1753,46 @@ static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
//fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
/* Existing refs are fine, as long as they're compatible with the hdr. */
- i = r->nref;
- if (r->nref < h->nref)
- r->nref = h->nref;
-
- if (!(r->ref_id = realloc(r->ref_id, r->nref * sizeof(*r->ref_id))))
+ if (!(r->ref_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id))))
return -1;
- for (; i < r->nref; i++)
- r->ref_id[i] = NULL;
-
/* Copy info from h->ref[i] over to r */
- for (i = 0; i < h->nref; i++) {
+ for (i = 0, j = r->nref; i < h->nref; i++) {
SAM_hdr_type *ty;
SAM_hdr_tag *tag;
khint_t k;
int n;
- if (r->ref_id[i] && 0 == strcmp(r->ref_id[i]->name, h->ref[i].name))
+ k = kh_get(refs, r->h_meta, h->ref[i].name);
+ if (k != kh_end(r->h_meta))
+ // Ref already known about
continue;
- if (!(r->ref_id[i] = calloc(1, sizeof(ref_entry))))
+ if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
return -1;
- if (!h->ref[i].name)
+ if (!h->ref[j].name)
return -1;
- r->ref_id[i]->name = string_dup(r->pool, h->ref[i].name);
- r->ref_id[i]->length = 0; // marker for not yet loaded
+ r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name);
+ r->ref_id[j]->length = 0; // marker for not yet loaded
/* Initialise likely filename if known */
if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) {
if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) {
- r->ref_id[i]->fn = string_dup(r->pool, tag->str+3);
- //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[i]->name, r->ref_id[i]->fn);
+ r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
+ //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
}
}
- k = kh_put(refs, r->h_meta, r->ref_id[i]->name, &n);
+ k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
if (n <= 0) // already exists or error
return -1;
- kh_val(r->h_meta, k) = r->ref_id[i];
+ kh_val(r->h_meta, k) = r->ref_id[j];
+
+ j++;
}
+ r->nref = j;
return 0;
}
@@ -1339,6 +1805,8 @@ static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
* in).
*/
int cram_set_header(cram_fd *fd, SAM_hdr *hdr) {
+ if (fd->header)
+ sam_hdr_free(fd->header);
fd->header = hdr;
return refs_from_header(fd->refs, fd, hdr);
}
@@ -1416,6 +1884,30 @@ void mkdir_prefix(char *path, int mode) {
}
/*
+ * Return the cache directory to use, based on the first of these
+ * environment variables to be set to a non-empty value.
+ */
+static const char *get_cache_basedir(const char **extra) {
+    /*
+     * Environment variables in priority order, each paired with the
+     * suffix reported through *extra when that variable is chosen.
+     */
+    static const struct {
+        const char *var;
+        const char *suffix;
+    } candidates[] = {
+        { "XDG_CACHE_HOME", ""        },
+        { "HOME",           "/.cache" },
+        { "TMPDIR",         ""        },
+        { "TEMP",           ""        },
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(candidates)/sizeof(candidates[0]); i++) {
+        const char *dir = getenv(candidates[i].var);
+
+        /* Unset and empty variables are both skipped */
+        if (dir != NULL && dir[0] != '\0') {
+            *extra = candidates[i].suffix;
+            return dir;
+        }
+    }
+
+    /* Nothing usable in the environment */
+    *extra = "";
+    return "/tmp";
+}
+
+/*
* Queries the M5 string from the header and attempts to populate the
* reference from this using the REF_PATH environment.
*
@@ -1426,15 +1918,28 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
char *ref_path = getenv("REF_PATH");
SAM_hdr_type *ty;
SAM_hdr_tag *tag;
- char path[PATH_MAX], path_tmp[PATH_MAX];
+ char path[PATH_MAX], path_tmp[PATH_MAX], cache[PATH_MAX];
char *local_cache = getenv("REF_CACHE");
mFILE *mf;
if (fd->verbose)
fprintf(stderr, "cram_populate_ref on fd %p, id %d\n", fd, id);
- if (!ref_path || *ref_path == 0)
+ if (!ref_path || *ref_path == '\0') {
+ /*
+ * If we have no ref path, we use the EBI server.
+ * However to avoid spamming it we require a local ref cache too.
+ */
ref_path = "http://www.ebi.ac.uk:80/ena/cram/md5/%s";
+ if (!local_cache || *local_cache == '\0') {
+ const char *extra;
+ const char *base = get_cache_basedir(&extra);
+ snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
+ local_cache = cache;
+ if (fd->verbose)
+ fprintf(stderr, "Populating local cache: %s\n", local_cache);
+ }
+ }
if (!r->name)
return -1;
@@ -1451,18 +1956,19 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
/* Use cache if available */
if (local_cache && *local_cache) {
struct stat sb;
- FILE *fp;
+ BGZF *fp;
expand_cache_path(path, local_cache, tag->str+3);
- if (0 == stat(path, &sb) && (fp = fopen(path, "r"))) {
+ if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) {
r->length = sb.st_size;
r->offset = r->line_length = r->bases_per_line = 0;
r->fn = string_dup(fd->refs->pool, path);
if (fd->refs->fp)
- fclose(fd->refs->fp);
+ if (bgzf_close(fd->refs->fp) != 0)
+ return -1;
fd->refs->fp = fp;
fd->refs->fn = r->fn;
@@ -1491,14 +1997,16 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
: tag->str+3;
if (fd->refs->fp) {
- fclose(fd->refs->fp);
+ if (bgzf_close(fd->refs->fp) != 0)
+ return -1;
fd->refs->fp = NULL;
}
if (!(refs = refs_load_fai(fd->refs, fn, 0)))
return -1;
fd->refs = refs;
if (fd->refs->fp) {
- fclose(fd->refs->fp);
+ if (bgzf_close(fd->refs->fp) != 0)
+ return -1;
fd->refs->fp = NULL;
}
@@ -1590,10 +2098,8 @@ static void cram_ref_decr_locked(refs_t *r, int id) {
r->ref_id[r->last_id]->seq = NULL;
r->ref_id[r->last_id]->length = 0;
}
- r->last_id = -1;
- } else {
- r->last_id = id;
}
+ r->last_id = id;
}
}
@@ -1612,7 +2118,7 @@ void cram_ref_decr(refs_t *r, int id) {
* Returns all or part of a reference sequence on success (malloced);
* NULL on failure.
*/
-static char *load_ref_portion(FILE *fp, ref_entry *e, int start, int end) {
+static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
off_t offset, len;
char *seq;
@@ -1633,8 +2139,8 @@ static char *load_ref_portion(FILE *fp, ref_entry *e, int start, int end) {
(end-1) % e->bases_per_line
: end-1) - offset + 1;
- if (0 != fseeko(fp, offset, SEEK_SET)) {
- perror("fseeko() on reference file");
+ if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
+ perror("bgzf_useek() on reference file");
return NULL;
}
@@ -1642,8 +2148,8 @@ static char *load_ref_portion(FILE *fp, ref_entry *e, int start, int end) {
return NULL;
}
- if (len != fread(seq, 1, len, fp)) {
- perror("fread() on reference file");
+ if (len != bgzf_read(fp, seq, len)) {
+ perror("bgzf_read() on reference file");
free(seq);
return NULL;
}
@@ -1714,12 +2220,11 @@ ref_entry *cram_ref_load(refs_t *r, int id) {
/* Open file if it's not already the current open reference */
if (strcmp(r->fn, e->fn) || r->fp == NULL) {
if (r->fp)
- fclose(r->fp);
+ if (bgzf_close(r->fp) != 0)
+ return NULL;
r->fn = e->fn;
- if (!(r->fp = fopen(r->fn, "r"))) {
- perror(r->fn);
+ if (!(r->fp = bgzf_open_ref(r->fn, "r")))
return NULL;
- }
}
RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
@@ -1834,6 +2339,8 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
return NULL;
}
r = fd->refs->ref_id[id];
+ if (fd->unsorted)
+ cram_ref_incr_locked(fd->refs, id);
}
@@ -1924,10 +2431,10 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
/* Open file if it's not already the current open reference */
if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
if (fd->refs->fp)
- fclose(fd->refs->fp);
+ if (bgzf_close(fd->refs->fp) != 0)
+ return NULL;
fd->refs->fn = r->fn;
- if (!(fd->refs->fp = fopen(fd->refs->fn, "r"))) {
- perror(fd->refs->fn);
+ if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r"))) {
pthread_mutex_unlock(&fd->refs->lock);
pthread_mutex_unlock(&fd->ref_lock);
return NULL;
@@ -1969,14 +2476,17 @@ int cram_load_reference(cram_fd *fd, char *fn) {
fd->ref_fn = fn;
if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
+ if (fd->refs)
+ refs_free(fd->refs);
if (!(fd->refs = refs_create()))
return -1;
if (-1 == refs_from_header(fd->refs, fd, fd->header))
return -1;
}
- if (-1 == refs2id(fd->refs, fd->header))
- return -1;
+ if (fd->header)
+ if (-1 == refs2id(fd->refs, fd->header))
+ return -1;
return fn ? 0 : -1;
}
@@ -1994,6 +2504,8 @@ int cram_load_reference(cram_fd *fd, char *fn) {
*/
cram_container *cram_new_container(int nrec, int nslice) {
cram_container *c = calloc(1, sizeof(*c));
+ enum cram_DS_ID id;
+
if (!c)
return NULL;
@@ -2023,32 +2535,8 @@ cram_container *cram_new_container(int nrec, int nslice) {
goto err;
c->comp_hdr_block = NULL;
- if (!(c->BF_stats = cram_stats_create())) goto err;
- if (!(c->CF_stats = cram_stats_create())) goto err;
- if (!(c->RN_stats = cram_stats_create())) goto err;
- if (!(c->AP_stats = cram_stats_create())) goto err;
- if (!(c->RG_stats = cram_stats_create())) goto err;
- if (!(c->MQ_stats = cram_stats_create())) goto err;
- if (!(c->NS_stats = cram_stats_create())) goto err;
- if (!(c->NP_stats = cram_stats_create())) goto err;
- if (!(c->TS_stats = cram_stats_create())) goto err;
- if (!(c->MF_stats = cram_stats_create())) goto err;
- if (!(c->NF_stats = cram_stats_create())) goto err;
- if (!(c->RL_stats = cram_stats_create())) goto err;
- if (!(c->FN_stats = cram_stats_create())) goto err;
- if (!(c->FC_stats = cram_stats_create())) goto err;
- if (!(c->FP_stats = cram_stats_create())) goto err;
- if (!(c->DL_stats = cram_stats_create())) goto err;
- if (!(c->BA_stats = cram_stats_create())) goto err;
- if (!(c->QS_stats = cram_stats_create())) goto err;
- if (!(c->BS_stats = cram_stats_create())) goto err;
- if (!(c->TC_stats = cram_stats_create())) goto err;
- if (!(c->TN_stats = cram_stats_create())) goto err;
- if (!(c->TL_stats = cram_stats_create())) goto err;
- if (!(c->RI_stats = cram_stats_create())) goto err;
- if (!(c->RS_stats = cram_stats_create())) goto err;
- if (!(c->PD_stats = cram_stats_create())) goto err;
- if (!(c->HC_stats = cram_stats_create())) goto err;
+ for (id = DS_RN; id < DS_TN; id++)
+ if (!(c->stats[id] = cram_stats_create())) goto err;
//c->aux_B_stats = cram_stats_create();
@@ -2068,6 +2556,7 @@ cram_container *cram_new_container(int nrec, int nslice) {
}
void cram_free_container(cram_container *c) {
+ enum cram_DS_ID id;
int i;
if (!c)
@@ -2092,34 +2581,8 @@ void cram_free_container(cram_container *c) {
free(c->slices);
}
- if (c->TS_stats) cram_stats_free(c->TS_stats);
- if (c->RG_stats) cram_stats_free(c->RG_stats);
- if (c->FP_stats) cram_stats_free(c->FP_stats);
- if (c->NS_stats) cram_stats_free(c->NS_stats);
- if (c->RN_stats) cram_stats_free(c->RN_stats);
- if (c->CF_stats) cram_stats_free(c->CF_stats);
- if (c->TN_stats) cram_stats_free(c->TN_stats);
- if (c->BA_stats) cram_stats_free(c->BA_stats);
- if (c->TV_stats) cram_stats_free(c->TV_stats);
- if (c->BS_stats) cram_stats_free(c->BS_stats);
- if (c->FC_stats) cram_stats_free(c->FC_stats);
- if (c->BF_stats) cram_stats_free(c->BF_stats);
- if (c->AP_stats) cram_stats_free(c->AP_stats);
- if (c->NF_stats) cram_stats_free(c->NF_stats);
- if (c->MF_stats) cram_stats_free(c->MF_stats);
- if (c->FN_stats) cram_stats_free(c->FN_stats);
- if (c->RL_stats) cram_stats_free(c->RL_stats);
- if (c->DL_stats) cram_stats_free(c->DL_stats);
- if (c->TC_stats) cram_stats_free(c->TC_stats);
- if (c->TL_stats) cram_stats_free(c->TL_stats);
- if (c->MQ_stats) cram_stats_free(c->MQ_stats);
- if (c->TM_stats) cram_stats_free(c->TM_stats);
- if (c->QS_stats) cram_stats_free(c->QS_stats);
- if (c->NP_stats) cram_stats_free(c->NP_stats);
- if (c->RI_stats) cram_stats_free(c->RI_stats);
- if (c->RS_stats) cram_stats_free(c->RS_stats);
- if (c->PD_stats) cram_stats_free(c->PD_stats);
- if (c->HC_stats) cram_stats_free(c->HC_stats);
+ for (id = DS_RN; id < DS_TN; id++)
+ if (c->stats[id]) cram_stats_free(c->stats[id]);
//if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
@@ -2140,9 +2603,10 @@ cram_container *cram_read_container(cram_fd *fd) {
size_t rd = 0;
fd->err = 0;
+ fd->eof = 0;
memset(&c2, 0, sizeof(c2));
- if (fd->version == CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
if ((s = itf8_decode(fd, &c2.length)) == -1) {
fd->eof = fd->empty_container ? 1 : 2;
return NULL;
@@ -2151,7 +2615,11 @@ cram_container *cram_read_container(cram_fd *fd) {
}
} else {
if ((s = int32_decode(fd, &c2.length)) == -1) {
- fd->eof = fd->empty_container ? 1 : 2;
+ if (CRAM_MAJOR_VERS(fd->version) == 2 &&
+ CRAM_MINOR_VERS(fd->version) == 0)
+ fd->eof = 1; // EOF blocks arrived in v2.1
+ else
+ fd->eof = fd->empty_container ? 1 : 2;
return NULL;
} else {
rd+=s;
@@ -2162,14 +2630,23 @@ cram_container *cram_read_container(cram_fd *fd) {
if ((s = itf8_decode(fd, &c2.ref_seq_span)) == -1) return NULL; else rd+=s;
if ((s = itf8_decode(fd, &c2.num_records)) == -1) return NULL; else rd+=s;
- if (fd->version == CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
c2.record_counter = 0;
c2.num_bases = 0;
} else {
- if ((s = itf8_decode(fd, &c2.record_counter)) == -1)
- return NULL;
- else
- rd += s;
+ if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+ if ((s = ltf8_decode(fd, &c2.record_counter)) == -1)
+ return NULL;
+ else
+ rd += s;
+ } else {
+ int32_t i32;
+ if ((s = itf8_decode(fd, &i32)) == -1)
+ return NULL;
+ else
+ rd += s;
+ c2.record_counter = i32;
+ }
if ((s = ltf8_decode(fd, &c2.num_bases))== -1)
return NULL;
@@ -2198,8 +2675,52 @@ cram_container *cram_read_container(cram_fd *fd) {
rd += s;
}
}
- c->offset = rd;
+ if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+ uint32_t crc, i;
+ unsigned char *dat = malloc(50 + 5*(c->num_landmarks)), *cp = dat;
+ if (!dat) {
+ cram_free_container(c);
+ return NULL;
+ }
+ if (-1 == int32_decode(fd, (int32_t *)&c->crc32))
+ return NULL;
+ else
+ rd+=4;
+
+ /* Reencode first as we can't easily access the original byte stream.
+ *
+ * FIXME: Technically this means this may not be fool proof. We could
+ * create a CRAM file using a 2 byte ITF8 value that can fit in a
+ * 1 byte field, meaning the encoding is different to the original
+ * form and so has a different CRC.
+ *
+ * The correct implementation would be to have an alternative form
+ * of itf8_decode which also squirrels away the raw byte stream
+ * during decoding so we can then CRC that.
+ */
+ *(unsigned int *)cp = le_int4(c->length); cp += 4;
+ cp += itf8_put(cp, c->ref_seq_id);
+ cp += itf8_put(cp, c->ref_seq_start);
+ cp += itf8_put(cp, c->ref_seq_span);
+ cp += itf8_put(cp, c->num_records);
+ cp += ltf8_put((char *)cp, c->record_counter);
+ cp += itf8_put(cp, c->num_bases);
+ cp += itf8_put(cp, c->num_blocks);
+ cp += itf8_put(cp, c->num_landmarks);
+ for (i = 0; i < c->num_landmarks; i++) {
+ cp += itf8_put(cp, c->landmark[i]);
+ }
+
+ crc = crc32(0L, dat, cp-dat);
+ if (crc != c->crc32) {
+ fprintf(stderr, "Container header CRC32 failure\n");
+ cram_free_container(c);
+ return NULL;
+ }
+ }
+
+ c->offset = rd;
c->slices = NULL;
c->curr_slice = 0;
c->max_slice = c->num_landmarks;
@@ -2230,11 +2751,11 @@ int cram_write_container(cram_fd *fd, cram_container *c) {
char buf_a[1024], *buf = buf_a, *cp;
int i;
- if (50 + c->num_landmarks * 5 >= 1024)
- buf = malloc(50 + c->num_landmarks * 5);
+ if (55 + c->num_landmarks * 5 >= 1024)
+ buf = malloc(55 + c->num_landmarks * 5);
cp = buf;
- if (fd->version == CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
cp += itf8_put(cp, c->length);
} else {
*(int32_t *)cp = le_int4(c->length);
@@ -2250,14 +2771,28 @@ int cram_write_container(cram_fd *fd, cram_container *c) {
cp += itf8_put(cp, c->ref_seq_span);
}
cp += itf8_put(cp, c->num_records);
- if (fd->version != CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) == 2) {
cp += itf8_put(cp, c->record_counter);
cp += ltf8_put(cp, c->num_bases);
+ } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+ cp += ltf8_put(cp, c->record_counter);
+ cp += ltf8_put(cp, c->num_bases);
}
+
cp += itf8_put(cp, c->num_blocks);
cp += itf8_put(cp, c->num_landmarks);
for (i = 0; i < c->num_landmarks; i++)
cp += itf8_put(cp, c->landmark[i]);
+
+ if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+ c->crc32 = crc32(0L, (uc *)buf, cp-buf);
+ cp[0] = c->crc32 & 0xff;
+ cp[1] = (c->crc32 >> 8) & 0xff;
+ cp[2] = (c->crc32 >> 16) & 0xff;
+ cp[3] = (c->crc32 >> 24) & 0xff;
+ cp += 4;
+ }
+
if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
if (buf != buf_a)
free(buf);
@@ -2452,35 +2987,10 @@ void cram_free_compression_header(cram_block_compression_hdr *hdr) {
}
}
- if (hdr->BF_codec) hdr->BF_codec->free(hdr->BF_codec);
- if (hdr->CF_codec) hdr->CF_codec->free(hdr->CF_codec);
- if (hdr->RL_codec) hdr->RL_codec->free(hdr->RL_codec);
- if (hdr->AP_codec) hdr->AP_codec->free(hdr->AP_codec);
- if (hdr->RG_codec) hdr->RG_codec->free(hdr->RG_codec);
- if (hdr->MF_codec) hdr->MF_codec->free(hdr->MF_codec);
- if (hdr->NS_codec) hdr->NS_codec->free(hdr->NS_codec);
- if (hdr->NP_codec) hdr->NP_codec->free(hdr->NP_codec);
- if (hdr->TS_codec) hdr->TS_codec->free(hdr->TS_codec);
- if (hdr->NF_codec) hdr->NF_codec->free(hdr->NF_codec);
- if (hdr->TC_codec) hdr->TC_codec->free(hdr->TC_codec);
- if (hdr->TN_codec) hdr->TN_codec->free(hdr->TN_codec);
- if (hdr->TL_codec) hdr->TL_codec->free(hdr->TL_codec);
- if (hdr->FN_codec) hdr->FN_codec->free(hdr->FN_codec);
- if (hdr->FC_codec) hdr->FC_codec->free(hdr->FC_codec);
- if (hdr->FP_codec) hdr->FP_codec->free(hdr->FP_codec);
- if (hdr->BS_codec) hdr->BS_codec->free(hdr->BS_codec);
- if (hdr->IN_codec) hdr->IN_codec->free(hdr->IN_codec);
- if (hdr->SC_codec) hdr->SC_codec->free(hdr->SC_codec);
- if (hdr->DL_codec) hdr->DL_codec->free(hdr->DL_codec);
- if (hdr->BA_codec) hdr->BA_codec->free(hdr->BA_codec);
- if (hdr->MQ_codec) hdr->MQ_codec->free(hdr->MQ_codec);
- if (hdr->RN_codec) hdr->RN_codec->free(hdr->RN_codec);
- if (hdr->QS_codec) hdr->QS_codec->free(hdr->QS_codec);
- if (hdr->Qs_codec) hdr->Qs_codec->free(hdr->Qs_codec);
- if (hdr->RI_codec) hdr->RI_codec->free(hdr->RI_codec);
- if (hdr->RS_codec) hdr->RS_codec->free(hdr->RS_codec);
- if (hdr->PD_codec) hdr->PD_codec->free(hdr->PD_codec);
- if (hdr->HC_codec) hdr->HC_codec->free(hdr->HC_codec);
+ for (i = 0; i < DS_END; i++) {
+ if (hdr->codecs[i])
+ hdr->codecs[i]->free(hdr->codecs[i]);
+ }
if (hdr->TL)
free(hdr->TL);
@@ -2547,17 +3057,30 @@ void cram_free_slice(cram_slice *s) {
if (s->aux_blk)
cram_free_block(s->aux_blk);
+ if (s->aux_OQ_blk)
+ cram_free_block(s->aux_OQ_blk);
+
+ if (s->aux_BQ_blk)
+ cram_free_block(s->aux_BQ_blk);
+
+ if (s->aux_FZ_blk)
+ cram_free_block(s->aux_FZ_blk);
+
+ if (s->aux_oq_blk)
+ cram_free_block(s->aux_oq_blk);
+
+ if (s->aux_os_blk)
+ cram_free_block(s->aux_os_blk);
+
+ if (s->aux_oz_blk)
+ cram_free_block(s->aux_oz_blk);
+
if (s->base_blk)
cram_free_block(s->base_blk);
if (s->soft_blk)
cram_free_block(s->soft_blk);
-#ifdef TN_external
- if (s->tn_blk)
- cram_free_block(s->tn_blk);
-#endif
-
if (s->cigar)
free(s->cigar);
@@ -2567,16 +3090,16 @@ void cram_free_slice(cram_slice *s) {
if (s->features)
free(s->features);
-#ifndef TN_external
if (s->TN)
free(s->TN);
-#endif
-
+
if (s->pair_keys)
string_pool_destroy(s->pair_keys);
- if (s->pair)
- kh_destroy(m_s2i, s->pair);
+ if (s->pair[0])
+ kh_destroy(m_s2i, s->pair[0]);
+ if (s->pair[1])
+ kh_destroy(m_s2i, s->pair[1]);
free(s);
}
@@ -2601,21 +3124,17 @@ cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
s->block = NULL;
s->block_by_id = NULL;
s->last_apos = 0;
- s->id = 0;
- if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err;
+ if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err;
s->cigar = NULL;
s->cigar_alloc = 0;
s->ncigar = 0;
- if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
- if (!(s->qual_blk = cram_new_block(EXTERNAL, CRAM_EXT_QUAL))) goto err;
- if (!(s->name_blk = cram_new_block(EXTERNAL, CRAM_EXT_NAME))) goto err;
- if (!(s->aux_blk = cram_new_block(EXTERNAL, CRAM_EXT_TAG))) goto err;
- if (!(s->base_blk = cram_new_block(EXTERNAL, CRAM_EXT_IN))) goto err;
- if (!(s->soft_blk = cram_new_block(EXTERNAL, CRAM_EXT_SC))) goto err;
-#ifdef TN_external
- if (!(s->tn_blk = cram_new_block(EXTERNAL, CRAM_EXT_TN))) goto err;
-#endif
+ if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
+ if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
+ if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
+ if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
+ if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
+ if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
s->features = NULL;
s->nfeatures = s->afeatures = 0;
@@ -2627,7 +3146,8 @@ cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
// Volatile keys as we do realloc in dstring
if (!(s->pair_keys = string_pool_create(8192))) goto err;
- if (!(s->pair = kh_init(m_s2i))) goto err;
+ if (!(s->pair[0] = kh_init(m_s2i))) goto err;
+ if (!(s->pair[1] = kh_init(m_s2i))) goto err;
#ifdef BA_external
s->BA_len = 0;
@@ -2706,23 +3226,17 @@ cram_slice *cram_read_slice(cram_fd *fd) {
s->cigar_alloc = 0;
s->ncigar = 0;
- if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
- if (!(s->qual_blk = cram_new_block(EXTERNAL, CRAM_EXT_QUAL))) goto err;
- if (!(s->name_blk = cram_new_block(EXTERNAL, CRAM_EXT_NAME))) goto err;
- if (!(s->aux_blk = cram_new_block(EXTERNAL, CRAM_EXT_TAG))) goto err;
- if (!(s->base_blk = cram_new_block(EXTERNAL, CRAM_EXT_IN))) goto err;
- if (!(s->soft_blk = cram_new_block(EXTERNAL, CRAM_EXT_SC))) goto err;
-#ifdef TN_external
- if (!(s->tn_blk = cram_new_block(EXTERNAL, CRAM_EXT_TN))) goto err;
-#endif
-
+ if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
+ if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
+ if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
+ if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
+ if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
+ if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
s->crecs = NULL;
s->last_apos = s->hdr->ref_seq_start;
- s->id = fd->slice_num++;
-
return s;
err:
@@ -2760,9 +3274,9 @@ cram_file_def *cram_read_file_def(cram_fd *fd) {
return NULL;
}
- if (def->major_version > 2) {
+ if (def->major_version > 3) {
fprintf(stderr, "CRAM version number mismatch\n"
- "Expected 1.x or 2.x, got %d.%d\n",
+ "Expected 1.x, 2.x or 3.x, got %d.%d\n",
def->major_version, def->minor_version);
free(def);
return NULL;
@@ -2806,7 +3320,7 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
SAM_hdr *hdr;
/* 1.1 onwards stores the header in the first block of a container */
- if (fd->version == CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
/* Length */
if (-1 == int32_decode(fd, &header_len))
return NULL;
@@ -2837,8 +3351,9 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
cram_free_container(c);
return NULL;
}
+ cram_uncompress_block(b);
- len = b->comp_size + 2 +
+ len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
itf8_size(b->content_id) +
itf8_size(b->uncomp_size) +
itf8_size(b->comp_size);
@@ -2850,12 +3365,13 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
cram_free_block(b);
return NULL;
}
- if (NULL == (header = malloc(header_len))) {
+ if (NULL == (header = malloc(header_len+1))) {
cram_free_container(c);
cram_free_block(b);
return NULL;
}
memcpy(header, BLOCK_END(b), header_len);
+ header[header_len]='\0';
cram_free_block(b);
/* Consume any remaining blocks */
@@ -2864,7 +3380,7 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
cram_free_container(c);
return NULL;
}
- len += b->comp_size + 2 +
+ len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
itf8_size(b->content_id) +
itf8_size(b->uncomp_size) +
itf8_size(b->comp_size);
@@ -2890,11 +3406,7 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
}
/* Parse */
-#ifdef SAMTOOLS
hdr = sam_hdr_parse_(header, header_len);
-#else
- hdr = sam_hdr_parse(header, header_len);
-#endif
free(header);
return hdr;
@@ -2930,14 +3442,20 @@ static void full_path(char *out, char *in) {
* Returns 0 on success
* -1 on failure
*/
-//#define BLANK_BLOCK
-//#define PADDED_CONTAINER
-#define PADDED_BLOCK
int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
int header_len;
+ int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
+
+ /* Write CRAM MAGIC if not yet written. */
+ if (fd->file_def->major_version == 0) {
+ fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
+ fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
+ if (0 != cram_write_file_def(fd, fd->file_def))
+ return -1;
+ }
/* 1.0 requires and UNKNOWN read-group */
- if (fd->version == CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
if (!sam_hdr_find_rg(hdr, "UNKNOWN"))
if (sam_hdr_add(hdr, "RG",
"ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
@@ -2996,7 +3514,7 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
/* Length */
header_len = sam_hdr_length(hdr);
- if (fd->version == CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
if (-1 == int32_encode(fd, header_len))
return -1;
@@ -3004,11 +3522,12 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
return -1;
} else {
- /* Create a block inside a container */
+ /* Create block(s) inside a container */
cram_block *b = cram_new_block(FILE_HEADER, 0);
cram_container *c = cram_new_container(0, 0);
int padded_length;
char *pads;
+ int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
if (!b || !c) {
if (b) cram_free_block(b);
@@ -3020,53 +3539,62 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
BLOCK_UPLEN(b);
-#ifndef BLANK_BLOCK
- c->num_blocks = 1;
- c->num_landmarks = 1;
- if (!(c->landmark = malloc(sizeof(*c->landmark)))) {
- cram_free_block(b);
- cram_free_container(c);
- return -1;
- }
- c->landmark[0] = 0;
+ // Compress header block if V3.0 and above
+ if (CRAM_MAJOR_VERS(fd->version) >= 3 && fd->level > 0) {
+ int method = 1<<GZIP;
+ if (fd->use_bz2)
+ method |= 1<<BZIP2;
+ if (fd->use_lzma)
+ method |= 1<<LZMA;
+ cram_compress_block(fd, b, NULL, method, fd->level);
+ }
+
+ if (blank_block) {
+ c->length = b->comp_size + 2 + 4*is_cram_3 +
+ itf8_size(b->content_id) +
+ itf8_size(b->uncomp_size) +
+ itf8_size(b->comp_size);
- c->length = b->uncomp_size + 2 +
- itf8_size(b->content_id) +
- itf8_size(b->uncomp_size) +
- itf8_size(b->comp_size);
-#else
- c->length = b->uncomp_size + 2 +
- itf8_size(b->content_id) +
- itf8_size(b->uncomp_size) +
- itf8_size(b->comp_size);
+ c->num_blocks = 2;
+ c->num_landmarks = 2;
+ if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
+ cram_free_block(b);
+ cram_free_container(c);
+ return -1;
+ }
+ c->landmark[0] = 0;
+ c->landmark[1] = c->length;
- c->num_blocks = 2;
- c->num_landmarks = 2;
- if (!(c->landmark = malloc(2*sizeof(*c->landmark))))
- return -1;
- c->landmark[0] = 0;
- c->landmark[1] = c->length;
+ // Plus extra storage for uncompressed secondary blank block
+ padded_length = MIN(c->length*.5, 10000);
+ c->length += padded_length + 2 + 4*is_cram_3 +
+ itf8_size(b->content_id) +
+ itf8_size(padded_length)*2;
+ } else {
+ // Pad the block instead.
+ c->num_blocks = 1;
+ c->num_landmarks = 1;
+ if (!(c->landmark = malloc(sizeof(*c->landmark))))
+ return -1;
+ c->landmark[0] = 0;
- c->length *= 2;
-#endif
+ padded_length = MAX(c->length*1.5, 10000) - c->length;
-#ifdef PADDED_BLOCK
- padded_length = MAX(c->length*1.5, 10000) - c->length;
- c->length += padded_length;
- if (NULL == (pads = calloc(1, padded_length))) {
- cram_free_block(b);
- cram_free_container(c);
- return -1;
- }
- BLOCK_APPEND(b, pads, padded_length);
- BLOCK_UPLEN(b);
- free(pads);
-#endif
+ c->length = b->comp_size + padded_length +
+ 2 + 4*is_cram_3 +
+ itf8_size(b->content_id) +
+ itf8_size(b->uncomp_size) +
+ itf8_size(b->comp_size);
-#ifdef PADDED_CONTAINER
- padded_length = MAX(c->length*2, 10000) - c->length;
- c->length += padded_length;
-#endif
+ if (NULL == (pads = calloc(1, padded_length))) {
+ cram_free_block(b);
+ cram_free_container(c);
+ return -1;
+ }
+ BLOCK_APPEND(b, pads, padded_length);
+ BLOCK_UPLEN(b);
+ free(pads);
+ }
if (-1 == cram_write_container(fd, c)) {
cram_free_block(b);
@@ -3074,32 +3602,27 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
return -1;
}
- // Keep it uncompressed
if (-1 == cram_write_block(fd, b)) {
cram_free_block(b);
cram_free_container(c);
return -1;
}
-#ifdef BLANK_BLOCK
- if (-1 == cram_write_block(fd, b)) {
- cram_free_block(b);
- cram_free_container(c);
- return -1;
+ if (blank_block) {
+ BLOCK_RESIZE(b, padded_length);
+ memset(BLOCK_DATA(b), 0, padded_length);
+ BLOCK_SIZE(b) = padded_length;
+ BLOCK_UPLEN(b);
+ b->method = RAW;
+ if (-1 == cram_write_block(fd, b)) {
+ cram_free_block(b);
+ cram_free_container(c);
+ return -1;
+ }
}
-#endif
cram_free_block(b);
cram_free_container(c);
-
-#ifdef PADDED_CONTAINER
- // Write out padding to allow for in-line SAM header editing
- if (NULL == (pads = calloc(1, padded_length)))
- return -1;
- if (padded_length != hwrite(fd->fp, pads, padded_length))
- return -1;
- free(pads);
-#endif
}
if (-1 == refs_from_header(fd->refs, fd, fd->header))
@@ -3140,7 +3663,7 @@ static void cram_init_tables(cram_fd *fd) {
fd->L2['T'] = 3; fd->L2['t'] = 3;
fd->L2['N'] = 4; fd->L2['n'] = 4;
- if (fd->version == CRAM_1_VERS) {
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
for (i = 0; i < 0x200; i++) {
int f = 0;
@@ -3215,7 +3738,7 @@ static int minor_version = 1;
* NULL on failure.
*/
cram_fd *cram_open(const char *filename, const char *mode) {
- cram_FILE *fp;
+ hFILE *fp;
cram_fd *fd;
char fmode[3]= { mode[0], '\0', '\0' };
@@ -3223,15 +3746,7 @@ cram_fd *cram_open(const char *filename, const char *mode) {
fmode[1] = 'b';
}
-#ifdef SAMTOOLS
fp = hopen(filename, fmode);
-#else
- if (strcmp(filename, "-") == 0) {
- fp = (*fmode == 'r') ? stdin : stdout;
- } else {
- fp = fopen(filename, fmode);
- }
-#endif
if (!fp)
return NULL;
@@ -3246,11 +3761,8 @@ cram_fd *cram_open(const char *filename, const char *mode) {
*
* Returns file handle on success;
* NULL on failure.
- *
- * cram_FILE is either htslib's hFILE or stdio's FILE, depending on how
- * cram_structs.h has been configured.
*/
-cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) {
+cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
int i;
char *cp;
cram_fd *fd = calloc(1, sizeof(*fd));
@@ -3258,8 +3770,12 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) {
return NULL;
fd->level = 5;
- if (strlen(mode) > 2 && mode[2] >= '0' && mode[2] <= '9')
- fd->level = mode[2] - '0';
+ for (i = 0; mode[i]; i++) {
+ if (mode[i] >= '0' && mode[i] <= '9') {
+ fd->level = mode[i] - '0';
+ break;
+ }
+ }
fd->fp = fp;
fd->mode = *mode;
@@ -3271,7 +3787,7 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) {
if (!(fd->file_def = cram_read_file_def(fd)))
goto err;
- fd->version = fd->file_def->major_version * 100 +
+ fd->version = fd->file_def->major_version * 256 +
fd->file_def->minor_version;
if (!(fd->header = cram_read_SAM_hdr(fd)))
@@ -3279,22 +3795,24 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) {
} else {
/* Writer */
- cram_file_def def;
-
- def.magic[0] = 'C';
- def.magic[1] = 'R';
- def.magic[2] = 'A';
- def.magic[3] = 'M';
- def.major_version = major_version;
- def.minor_version = minor_version;
- memset(def.file_id, 0, 20);
- strncpy(def.file_id, filename, 20);
- if (0 != cram_write_file_def(fd, &def))
- goto err;
+ cram_file_def *def = calloc(1, sizeof(*def));
+ if (!def)
+ return NULL;
- fd->version = def.major_version * 100 + def.minor_version;
+ fd->file_def = def;
- /* SAM header written later */
+ def->magic[0] = 'C';
+ def->magic[1] = 'R';
+ def->magic[2] = 'A';
+ def->magic[3] = 'M';
+ def->major_version = 0; // Indicator to write file def later.
+ def->minor_version = 0;
+ memset(def->file_id, 0, 20);
+ strncpy(def->file_id, filename, 20);
+
+ fd->version = major_version * 256 + minor_version;
+
+ /* SAM header written later along with this file_def */
}
cram_init_tables(fd);
@@ -3302,7 +3820,6 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) {
fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
if (!fd->prefix)
goto err;
- fd->slice_num = 0;
fd->first_base = fd->last_base = -1;
fd->record_counter = 0;
@@ -3321,7 +3838,9 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) {
fd->no_ref = 0;
fd->ignore_md5 = 0;
fd->use_bz2 = 0;
- fd->multi_seq = 0;
+ fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
+ fd->use_lzma = 0;
+ fd->multi_seq = -1;
fd->unsorted = 0;
fd->shared_ref = 0;
@@ -3331,8 +3850,9 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) {
fd->rqueue = NULL;
fd->job_pending = NULL;
fd->ooc = 0;
+ fd->required_fields = INT_MAX;
- for (i = 0; i < 7; i++)
+ for (i = 0; i < DS_END; i++)
fd->m[i] = cram_new_metrics();
fd->range.refid = -2; // no ref.
@@ -3363,6 +3883,8 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) {
int cram_seek(cram_fd *fd, off_t offset, int whence) {
char buf[65536];
+ fd->ooc = 0;
+
if (hseek(fd->fp, offset, whence) >= 0)
return 0;
@@ -3439,16 +3961,25 @@ int cram_close(cram_fd *fd) {
if (fd->mode == 'w') {
/* Write EOF block */
- if (30 != hwrite(fd->fp, "\x0b\x00\x00\x00\xff\xff\xff\xff"
- "\xff\xe0\x45\x4f\x46\x00\x00\x00"
- "\x00\x01\x00\x00\x01\x00\x06\x06"
- "\x01\x00\x01\x00\x01\x00", 30))
- return -1;
-
-// if (1 != fwrite("\x00\x00\x00\x00\xff\xff\xff\xff"
-// "\xff\xe0\x45\x4f\x46\x00\x00\x00"
-// "\x00\x00\x00", 19, 1, fd->fp))
-// return -1;
+ if (CRAM_MAJOR_VERS(fd->version) == 3) {
+ if (38 != hwrite(fd->fp,
+ "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR
+ "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR
+ "\x00\x01\x00" // Cont HDR
+ "\x05\xbd\xd9\x4f" // CRC32
+ "\x00\x01\x00\x06\x06" // Comp.HDR blk
+ "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk
+ "\xee\x63\x01\x4b", // CRC32
+ 38))
+ return -1;
+ } else {
+ if (30 != hwrite(fd->fp,
+ "\x0b\x00\x00\x00\xff\xff\xff\xff"
+ "\x0f\xe0\x45\x4f\x46\x00\x00\x00"
+ "\x00\x01\x00\x00\x01\x00\x06\x06"
+ "\x01\x00\x01\x00\x01\x00", 30))
+ return -1;
+ }
}
for (bl = fd->bl; bl; bl = next) {
@@ -3463,7 +3994,7 @@ int cram_close(cram_fd *fd) {
free(bl);
}
- if (paranoid_hclose(fd->fp) != 0)
+ if (hclose(fd->fp) != 0)
return -1;
if (fd->file_def)
@@ -3482,7 +4013,7 @@ int cram_close(cram_fd *fd) {
if (fd->ref_free)
free(fd->ref_free);
- for (i = 0; i < 7; i++)
+ for (i = 0; i < DS_END; i++)
if (fd->m[i])
free(fd->m[i]);
@@ -3532,6 +4063,9 @@ int cram_set_option(cram_fd *fd, enum cram_option opt, ...) {
int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) {
refs_t *refs;
+ if (!fd)
+ return -1;
+
switch (opt) {
case CRAM_OPT_DECODE_MD:
fd->decode_md = va_arg(args, int);
@@ -3572,6 +4106,14 @@ int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) {
fd->use_bz2 = va_arg(args, int);
break;
+ case CRAM_OPT_USE_RANS:
+ fd->use_rans = va_arg(args, int);
+ break;
+
+ case CRAM_OPT_USE_LZMA:
+ fd->use_lzma = va_arg(args, int);
+ break;
+
case CRAM_OPT_SHARED_REF:
fd->shared_ref = 1;
refs = va_arg(args, refs_t *);
@@ -3604,6 +4146,10 @@ int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) {
"use 1.0, 2.0, 2.1 or 3.0\n");
return -1;
}
+ fd->version = major*256 + minor;
+
+ if (CRAM_MAJOR_VERS(fd->version) >= 3)
+ fd->use_rans = 1;
break;
}
@@ -3643,6 +4189,10 @@ int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) {
//t_pool_dispatch(fd->pool, cram_decoder_thread, fd);
break;
+ case CRAM_OPT_REQUIRED_FIELDS:
+ fd->required_fields = va_arg(args, int);
+ break;
+
default:
fprintf(stderr, "Unknown CRAM option code %d\n", opt);
return -1;
diff --git a/htslib/cram/cram_io.h b/htslib/cram/cram_io.h
index 49073f7..43344c2 100644
--- a/htslib/cram/cram_io.h
+++ b/htslib/cram/cram_io.h
@@ -100,6 +100,9 @@ int itf8_put(char *cp, int32_t val);
#endif
+int ltf8_get(char *cp, int64_t *val_p);
+int ltf8_put(char *cp, int64_t val);
+
/*! Pushes a value in ITF8 format onto the end of a block.
*
* This shouldn't be used for high-volume data as it is not the fastest
@@ -179,8 +182,7 @@ int cram_uncompress_block(cram_block *b);
* -1 on failure
*/
int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
- int level, int strat,
- int level2, int strat2);
+ int method, int level);
cram_metrics *cram_new_metrics(void);
char *cram_block_method2str(enum cram_block_method m);
@@ -222,19 +224,84 @@ char *cram_content_type2str(enum cram_content_type t);
(b)->data[(b)->byte++] = (c); \
} while (0)
-/* Append via sprintf with 1 arg */
-#define BLOCK_APPENDF_1(b,buf,fmt, a1) \
- do { \
- int l = sprintf((buf), (fmt), (a1)); \
- BLOCK_APPEND((b), (buf), l); \
+/* Append a single unsigned integer */
+#define BLOCK_APPEND_UINT(b,i) \
+ do { \
+ unsigned char *cp; \
+ BLOCK_GROW((b),11); \
+ cp = &(b)->data[(b)->byte]; \
+ (b)->byte += append_uint32(cp, (i)) - cp; \
} while (0)
-/* Append via sprintf with 2 args */
-#define BLOCK_APPENDF_2(b,buf,fmt, a1,a2) \
- do { \
- int l = sprintf((buf), (fmt), (a1), (a2)); \
- BLOCK_APPEND((b), (buf), l); \
- } while (0)
+static inline unsigned char *append_uint32(unsigned char *cp, uint32_t i) {
+ uint32_t j;
+
+ if (i == 0) {
+ *cp++ = '0';
+ return cp;
+ }
+
+ if (i < 100) goto b1;
+ if (i < 10000) goto b3;
+ if (i < 1000000) goto b5;
+ if (i < 100000000) goto b7;
+
+ if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;}
+ if ((j = i / 100000000)) {*cp++ = j + '0'; i -= j*100000000; goto x7;}
+ b7:if ((j = i / 10000000)) {*cp++ = j + '0'; i -= j*10000000; goto x6;}
+ if ((j = i / 1000000)) {*cp++ = j + '0', i -= j*1000000; goto x5;}
+ b5:if ((j = i / 100000)) {*cp++ = j + '0', i -= j*100000; goto x4;}
+ if ((j = i / 10000)) {*cp++ = j + '0', i -= j*10000; goto x3;}
+ b3:if ((j = i / 1000)) {*cp++ = j + '0', i -= j*1000; goto x2;}
+ if ((j = i / 100)) {*cp++ = j + '0', i -= j*100; goto x1;}
+ b1:if ((j = i / 10)) {*cp++ = j + '0', i -= j*10; goto x0;}
+ if (i) *cp++ = i + '0';
+ return cp;
+
+ x8: *cp++ = i / 100000000 + '0', i %= 100000000;
+ x7: *cp++ = i / 10000000 + '0', i %= 10000000;
+ x6: *cp++ = i / 1000000 + '0', i %= 1000000;
+ x5: *cp++ = i / 100000 + '0', i %= 100000;
+ x4: *cp++ = i / 10000 + '0', i %= 10000;
+ x3: *cp++ = i / 1000 + '0', i %= 1000;
+ x2: *cp++ = i / 100 + '0', i %= 100;
+ x1: *cp++ = i / 10 + '0', i %= 10;
+ x0: *cp++ = i + '0';
+
+ return cp;
+}
+
+static inline unsigned char *append_sub32(unsigned char *cp, uint32_t i) { // write exactly 9 zero-padded decimal digits of i at cp
+ *cp++ = i / 100000000 + '0', i %= 100000000; // leading digit: only a valid '0'..'9' when i < 1000000000
+ *cp++ = i / 10000000 + '0', i %= 10000000;
+ *cp++ = i / 1000000 + '0', i %= 1000000;
+ *cp++ = i / 100000 + '0', i %= 100000;
+ *cp++ = i / 10000 + '0', i %= 10000;
+ *cp++ = i / 1000 + '0', i %= 1000;
+ *cp++ = i / 100 + '0', i %= 100;
+ *cp++ = i / 10 + '0', i %= 10;
+ *cp++ = i + '0';
+
+ return cp; // one past the last digit written (no NUL terminator is added)
+}
+
+static inline unsigned char *append_uint64(unsigned char *cp, uint64_t i) { // write i in decimal at cp; returns one past the last digit
+ uint64_t j;
+
+ if (i <= 0xffffffff)
+ return append_uint32(cp, i); // fits in 32 bits: delegate directly
+
+ if ((j = i/1000000000) > 1000000000) { // j = everything above the low 9 digits
+ cp = append_uint32(cp, j/1000000000); // three groups: unpadded leading digits...
+ j %= 1000000000;
+ cp = append_sub32(cp, j); // ...then the middle 9 digits, zero-padded
+ } else {
+ cp = append_uint32(cp, i / 1000000000); // two groups: leading part is <= 1000000000, so it fits in uint32_t
+ }
+ cp = append_sub32(cp, i % 1000000000); // low 9 digits, zero-padded
+
+ return cp;
+}
#define BLOCK_UPLEN(b) \
(b)->comp_size = (b)->uncomp_size = BLOCK_SIZE((b))
@@ -449,11 +516,8 @@ cram_fd *cram_open(const char *filename, const char *mode);
* @return
* Returns file handle on success;
* NULL on failure.
- *
- * cram_FILE is either htslib's hFILE or stdio's FILE, depending on how
- * cram_structs.h has been configured.
*/
-cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode);
+cram_fd *cram_dopen(struct hFILE *fp, const char *filename, const char *mode);
/*! Closes a CRAM file.
*
diff --git a/htslib/cram/cram_samtools.c b/htslib/cram/cram_samtools.c
index 66f2efa..27c54e5 100644
--- a/htslib/cram/cram_samtools.c
+++ b/htslib/cram/cram_samtools.c
@@ -112,7 +112,10 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len,
if (i < len)
*cp++ = L[(uc)seq[i]]<<4;
- memcpy(cp, qual, len);
+ if (qual)
+ memcpy(cp, qual, len);
+ else
+ memset(cp, '\xff', len);
return 0;
}
diff --git a/htslib/cram/cram_stats.c b/htslib/cram/cram_stats.c
index 18d0605..9551f00 100644
--- a/htslib/cram/cram_stats.c
+++ b/htslib/cram/cram_stats.c
@@ -209,31 +209,124 @@ enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) {
return E_HUFFMAN;
}
+ if (fd->verbose > 1)
+ fprintf(stderr, "Range = %d..%d, nvals=%d, ntot=%d\n",
+ min_val, max_val, nvals, ntot);
+
+ /* Theoretical entropy */
+// if (fd->verbose > 1) {
+// double dbits = 0;
+// for (i = 0; i < nvals; i++) {
+// dbits += freqs[i] * log((double)freqs[i]/ntot);
+// }
+// dbits /= -log(2);
+// if (fd->verbose > 1)
+// fprintf(stderr, "Entropy = %f\n", dbits);
+// }
+
+ if (nvals > 1 && ntot > 256) {
+#if 0
+ /*
+ * CRUDE huffman estimator. Round to closest and round up from 0
+ * to 1 bit.
+ *
+ * With and without ITF8 in case we have a few discrete values but with
+ * large magnitude.
+ *
+ * Note rans0/arith0 and Z_HUFFMAN_ONLY vs internal huffman can be
+ * compared in this way, but order-1 (eg rans1) or maybe LZ77 modes
+ * may detect the correlation of high bytes to low bytes in multi-
+ * byte values. So this predictor breaks down.
+ */
+ double dbits = 0; // entropy + ~huffman
+ double dbitsH = 0;
+ double dbitsE = 0; // external entropy + ~huffman
+ double dbitsEH = 0;
+ int F[256] = {0}, n = 0;
+ double e = 0; // accumulated error bits
+ for (i = 0; i < nvals; i++) {
+ double x; int X;
+ unsigned int v = vals[i];
+
+ //Better encoding would cope with sign.
+ //v = ABS(vals[i])*2+(vals[i]<0);
+
+ if (!(v & ~0x7f)) {
+ F[v] += freqs[i], n+=freqs[i];
+ } else if (!(v & ~0x3fff)) {
+ F[(v>>8) |0x80] += freqs[i];
+ F[ v &0xff] += freqs[i], n+=2*freqs[i];
+ } else if (!(v & ~0x1fffff)) {
+ F[(v>>16)|0xc0] += freqs[i];
+ F[(v>>8 )&0xff] += freqs[i];
+ F[ v &0xff] += freqs[i], n+=3*freqs[i];
+ } else if (!(v & ~0x0fffffff)) {
+ F[(v>>24)|0xe0] += freqs[i];
+ F[(v>>16)&0xff] += freqs[i];
+ F[(v>>8 )&0xff] += freqs[i];
+ F[ v &0xff] += freqs[i], n+=4*freqs[i];
+ } else {
+ F[(v>>28)|0xf0] += freqs[i];
+ F[(v>>20)&0xff] += freqs[i];
+ F[(v>>12)&0xff] += freqs[i];
+ F[(v>>4 )&0xff] += freqs[i];
+ F[ v &0x0f] += freqs[i], n+=5*freqs[i];
+ }
+
+ x = -log((double)freqs[i]/ntot)/.69314718055994530941;
+ X = x+0.5;
+ if ((int)(x+((double)e/freqs[i])+.5)>X) {
+ X++;
+ } else if ((int)(x+((double)e/freqs[i])+.5)<X) {
+ X--;
+ }
+ e-=freqs[i]*(X-x);
+ X += (X==0);
+
+ //fprintf(stderr, "Val %d = %d x %d (ent %f, %d) e %f\n", i, v, freqs[i], x, X, e);
+
+ dbits += freqs[i] * x;
+ dbitsH += freqs[i] * X;
+ }
+
+ for (i = 0; i < 256; i++) {
+ if (F[i]) {
+ double x = -log((double)F[i]/n)/.69314718055994530941;
+ int X = x+0.5;
+ X += (X==0);
+ dbitsE += F[i] * x;
+ dbitsEH += F[i] * X;
+
+ //fprintf(stderr, "Val %d = %d x %d (e %f, %d)\n", i, i, F[i], x, X);
+ }
+ }
+
+ //fprintf(stderr, "CORE Entropy = %f, %f\n", dbits/8, dbitsH/8);
+ //fprintf(stderr, "Ext. Entropy = %f, %f\n", dbitsE/8, dbitsEH/8);
+
+ if (dbitsE < 1000 || dbitsE / dbits > 1.1) {
+ //fprintf(stderr, "=> %d < 200 ? E_HUFFMAN : E_BETA\n", nvals);
+ free(vals); free(freqs);
+ return nvals < 200 ? E_HUFFMAN : E_BETA;
+ }
+#endif
+ free(vals); free(freqs);
+ return E_EXTERNAL;
+ }
+
/*
* Avoid complex stats for now, just do heuristic of HUFFMAN for small
* alphabets and BETA for anything large.
*/
free(vals); free(freqs);
return nvals < 200 ? E_HUFFMAN : E_BETA;
+ //return E_HUFFMAN;
+ //return E_EXTERNAL;
+
/* We only support huffman now anyway... */
//free(vals); free(freqs); return E_HUFFMAN;
- if (fd->verbose > 1)
- fprintf(stderr, "Range = %d..%d, nvals=%d, ntot=%d\n",
- min_val, max_val, nvals, ntot);
-
- /* Theoretical entropy */
- {
- double dbits = 0;
- for (i = 0; i < nvals; i++) {
- dbits += freqs[i] * log((double)freqs[i]/ntot);
- }
- dbits /= -log(2);
- if (fd->verbose > 1)
- fprintf(stderr, "Entropy = %f\n", dbits);
- }
-
/* Beta */
bits = nbits(max_val - min_val) * ntot;
if (fd->verbose > 1)
diff --git a/htslib/cram/cram_structs.h b/htslib/cram/cram_structs.h
index 6d3f1a1..ab9f5bf 100644
--- a/htslib/cram/cram_structs.h
+++ b/htslib/cram/cram_structs.h
@@ -53,11 +53,8 @@ extern "C" {
#include <stdint.h>
#include "cram/thread_pool.h"
-
-#ifdef SAMTOOLS
-// From within samtools/HTSlib
-# include "cram/string_alloc.h"
-# include "htslib/khash.h"
+#include "cram/string_alloc.h"
+#include "htslib/khash.h"
// Generic hash-map integer -> integer
KHASH_MAP_INIT_INT(m_i2i, int)
@@ -82,24 +79,12 @@ typedef union {
KHASH_MAP_INIT_STR(map, pmap_t)
struct hFILE;
-typedef struct hFILE cram_FILE;
-
-#else
-// From within io_lib
-# include "cram/bam.h" // For BAM header parsing
-typedef FILE cram_FILE;
-#endif
#define SEQS_PER_SLICE 10000
#define SLICE_PER_CNT 1
#define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"
-#define TN_external
-//#define NS_external
-#define TS_external
-//#define BA_external
-
#define MAX_STAT_VAL 1024
//#define MAX_STAT_VAL 16
typedef struct {
@@ -131,6 +116,63 @@ enum cram_external_type {
E_BYTE_ARRAY_BLOCK = 5,
};
+/* External IDs used by this implementation (only assumed during writing) */
+enum cram_DS_ID {
+ DS_CORE = 0,
+ DS_aux = 1, // aux_blk
+ DS_aux_OQ = 2,
+ DS_aux_BQ = 3,
+ DS_aux_BD = 4,
+ DS_aux_BI = 5,
+ DS_aux_FZ = 6, // also ZM:B
+ DS_aux_oq = 7, // other qualities
+ DS_aux_os = 8, // other sequences
+ DS_aux_oz = 9, // other strings
+ DS_ref,
+ DS_RN, // name_blk
+ DS_QS, // qual_blk
+ DS_IN, // base_blk
+ DS_SC, // soft_blk
+
+ DS_BF, // start loop
+ DS_CF,
+ DS_AP,
+ DS_RG,
+ DS_MQ,
+ DS_NS,
+ DS_MF,
+ DS_TS,
+ DS_NP,
+ DS_NF,
+ DS_RL,
+ DS_FN,
+ DS_FC,
+ DS_FP,
+ DS_DL,
+ DS_BA,
+ DS_BS,
+ DS_TL,
+ DS_RI,
+ DS_RS,
+ DS_PD,
+ DS_HC,
+ DS_BB,
+ DS_QQ,
+
+ DS_TN, // end loop
+
+ DS_RN_len,
+ DS_SC_len,
+ DS_BB_len,
+ DS_QQ_len,
+
+ DS_TC, // CRAM v1.0 tags
+ DS_TM, // test
+ DS_TV, // test
+
+ DS_END,
+};
+
/* "File Definition Structure" */
typedef struct {
char magic[4];
@@ -139,16 +181,21 @@ typedef struct {
char file_id[20]; // Filename or SHA1 checksum
} cram_file_def;
-#define CRAM_1_VERS 100 // 1.0
-#define CRAM_2_VERS 200 // 1.1, or 2.0?
+#define CRAM_MAJOR_VERS(v) ((v) >> 8)
+#define CRAM_MINOR_VERS(v) ((v) & 0xff)
struct cram_slice;
enum cram_block_method {
- BM_ERROR = -1,
- RAW = 0,
- GZIP = 1,
- BZIP2 = 2,
+ ERROR = -1,
+ RAW = 0,
+ GZIP = 1,
+ BZIP2 = 2,
+ LZMA = 3,
+ RANS = 4, // Generic; either order
+ RANS0 = 4,
+ RANS1 = 10, // Not externalised; stored as RANS (generic)
+ GZIP_RLE = 11, // NB: not externalised in CRAM
};
enum cram_content_type {
@@ -156,17 +203,44 @@ enum cram_content_type {
FILE_HEADER = 0,
COMPRESSION_HEADER = 1,
MAPPED_SLICE = 2,
- UNMAPPED_SLICE = 3, // CRAM_1_VERS only
+ UNMAPPED_SLICE = 3, // CRAM V1.0 only
EXTERNAL = 4,
CORE = 5,
};
/* Compression metrics */
typedef struct {
- int m1;
- int m2;
+ // number of trials and time to next trial
int trial;
int next_trial;
+
+ // aggregate sizes during trials
+ int sz_gz_rle;
+ int sz_gz_def;
+ int sz_rans0;
+ int sz_rans1;
+ int sz_bzip2;
+ int sz_lzma;
+
+ // resultant method from trials
+ int method;
+ int strat;
+
+ // Revisions of method, to allow culling of continually failing ones.
+ int gz_rle_cnt;
+ int gz_def_cnt;
+ int rans0_cnt;
+ int rans1_cnt;
+ int bzip2_cnt;
+ int lzma_cnt;
+ int revised_method;
+
+ double gz_rle_extra;
+ double gz_def_extra;
+ double rans0_extra;
+ double rans1_extra;
+ double bzip2_extra;
+ double lzma_extra;
} cram_metrics;
/* Block */
@@ -176,6 +250,7 @@ typedef struct {
int32_t content_id;
int32_t comp_size;
int32_t uncomp_size;
+ uint32_t crc32;
int32_t idx; /* offset into data */
unsigned char *data;
@@ -221,40 +296,12 @@ typedef struct {
struct cram_map *rec_encoding_map[CRAM_MAP_HASH];
struct cram_map *tag_encoding_map[CRAM_MAP_HASH];
- struct cram_codec *BF_codec; // bam bit flags
- struct cram_codec *CF_codec; // compression flags
- struct cram_codec *RL_codec; // read length
- struct cram_codec *AP_codec; // alignment pos
- struct cram_codec *RG_codec; // read group
- struct cram_codec *MF_codec; // mate flags
- struct cram_codec *NS_codec; // next frag ref ID
- struct cram_codec *NP_codec; // next frag pos
- struct cram_codec *TS_codec; // template size
- struct cram_codec *NF_codec; // next frag distance
- struct cram_codec *TC_codec; // tag count CRAM_1_VERS
- struct cram_codec *TN_codec; // tag name/type CRAM_1_VERS
- struct cram_codec *TL_codec; // tag line CRAM_2_VERS
- struct cram_codec *FN_codec; // no. features
- struct cram_codec *FC_codec; // feature code
- struct cram_codec *FP_codec; // feature pos
- struct cram_codec *BS_codec; // base subst feature
- struct cram_codec *IN_codec; // insertion feature
- struct cram_codec *SC_codec; // soft-clip feature
- struct cram_codec *DL_codec; // deletion len feature
- struct cram_codec *BA_codec; // base feature
- struct cram_codec *RS_codec; // ref skip length feature
- struct cram_codec *PD_codec; // padding length feature
- struct cram_codec *HC_codec; // hard clip length feature
- struct cram_codec *MQ_codec; // mapping quality
- struct cram_codec *RN_codec; // read names
- struct cram_codec *QS_codec; // quality value (single)
- struct cram_codec *Qs_codec; // quality values (string)
- struct cram_codec *RI_codec; // ref ID
- struct cram_codec *TM_codec; // ?
- struct cram_codec *TV_codec; // ?
+ struct cram_codec *codecs[DS_END];
char *uncomp; // A single block of uncompressed data
size_t uncomp_size, uncomp_alloc;
+
+ unsigned int data_series; // See cram_fields enum below
} cram_block_compression_hdr;
typedef struct cram_map {
@@ -273,7 +320,7 @@ typedef struct {
int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */
int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */
int32_t num_records;
- int32_t record_counter;
+ int64_t record_counter;
int32_t num_blocks;
int32_t num_content_ids;
int32_t *block_content_ids;
@@ -297,7 +344,7 @@ typedef struct {
int32_t ref_seq_id;
int32_t ref_seq_start;
int32_t ref_seq_span;
- int32_t record_counter;
+ int64_t record_counter;
int64_t num_bases;
int32_t num_records;
int32_t num_blocks;
@@ -334,37 +381,12 @@ typedef struct {
bam_seq_t **bams;
/* Statistics for encoding */
- cram_stats *TS_stats;
- cram_stats *RG_stats;
- cram_stats *FP_stats;
- cram_stats *NS_stats;
- cram_stats *RN_stats;
- cram_stats *CF_stats;
- cram_stats *TN_stats;
- cram_stats *BA_stats;
- cram_stats *TV_stats;
- cram_stats *BS_stats;
- cram_stats *FC_stats;
- cram_stats *BF_stats;
- cram_stats *AP_stats;
- cram_stats *NF_stats;
- cram_stats *MF_stats;
- cram_stats *FN_stats;
- cram_stats *RL_stats;
- cram_stats *DL_stats;
- cram_stats *TC_stats;
- cram_stats *TL_stats;
- cram_stats *MQ_stats;
- cram_stats *TM_stats;
- cram_stats *QS_stats;
- cram_stats *NP_stats;
- cram_stats *RI_stats;
- cram_stats *RS_stats;
- cram_stats *PD_stats;
- cram_stats *HC_stats;
+ cram_stats *stats[DS_END];
khash_t(s_i2i) *tags_used; // set of tag types in use, for tag encoding map
int *refs_used; // array of frequency of ref seq IDs
+
+ uint32_t crc32; // CRC32
} cram_container;
/*
@@ -440,6 +462,12 @@ typedef struct {
struct {
int pos;
int code;
+ int seq_idx; // index to s->seqs_blk
+ int len;
+ } b;
+ struct {
+ int pos;
+ int code;
int qual;
} Q;
struct {
@@ -496,9 +524,6 @@ typedef struct cram_slice {
/* State used during encoding/decoding */
int last_apos, max_apos;
- /* Identifier used for auto-assigning read names */
- uint64_t id;
-
/* Array of decoded cram records */
cram_record *crecs;
@@ -508,12 +533,6 @@ typedef struct cram_slice {
uint32_t *cigar;
uint32_t cigar_alloc;
uint32_t ncigar;
- cram_block *name_blk;
- cram_block *seqs_blk;
- cram_block *qual_blk;
- cram_block *aux_blk;
- cram_block *base_blk; // substitutions (soft-clips for 1.0)
- cram_block *soft_blk; // soft-clips
cram_feature *features;
int nfeatures;
@@ -528,17 +547,28 @@ typedef struct cram_slice {
int tn_id;
#endif
+ // For variable sized elements which are always external blocks.
+ cram_block *name_blk;
+ cram_block *seqs_blk;
+ cram_block *qual_blk;
+ cram_block *base_blk;
+ cram_block *soft_blk;
+ cram_block *aux_blk;
+ cram_block *aux_OQ_blk;
+ cram_block *aux_BQ_blk;
+ cram_block *aux_BD_blk;
+ cram_block *aux_BI_blk;
+ cram_block *aux_FZ_blk;
+ cram_block *aux_oq_blk;
+ cram_block *aux_os_blk;
+ cram_block *aux_oz_blk;
+
string_alloc_t *pair_keys; // Pooled keys for pair hash.
- khash_t(m_s2i) *pair; // for identifying read-pairs in this slice.
+ khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice.
char *ref; // slice of current reference
int ref_start; // start position of current reference;
int ref_end; // end position of current reference;
-
-#ifdef BA_external
- int BA_len;
- int ba_id;
-#endif
int ref_id;
} cram_slice;
@@ -568,7 +598,7 @@ typedef struct {
int nref; // number of ref_entry
char *fn; // current file opened
- FILE *fp; // and the FILE* to go with it.
+ BGZF *fp; // and the BGZF* to go with it.
int count; // how many cram_fd sharing this refs struct
@@ -620,15 +650,14 @@ typedef struct spare_bams {
} spare_bams;
typedef struct cram_fd {
- cram_FILE *fp;
+ struct hFILE *fp;
int mode; // 'r' or 'w'
int version;
cram_file_def *file_def;
SAM_hdr *header;
char *prefix;
- int record_counter;
- int slice_num;
+ int64_t record_counter;
int err;
// Most recent compression header decoded
@@ -651,7 +680,7 @@ typedef struct cram_fd {
// compression level and metrics
int level;
- cram_metrics *m[7];
+ cram_metrics *m[DS_END];
// options
int decode_md; // Whether to export MD and NM tags
@@ -662,7 +691,10 @@ typedef struct cram_fd {
int no_ref;
int ignore_md5;
int use_bz2;
+ int use_rans;
+ int use_lzma;
int shared_ref;
+ unsigned int required_fields;
cram_range range;
// lookup tables, stored here so we can be trivially multi-threaded
@@ -693,25 +725,52 @@ typedef struct cram_fd {
int ooc; // out of containers.
} cram_fd;
-enum cram_option {
- CRAM_OPT_DECODE_MD,
- CRAM_OPT_PREFIX,
- CRAM_OPT_VERBOSITY,
- CRAM_OPT_SEQS_PER_SLICE,
- CRAM_OPT_SLICES_PER_CONTAINER,
- CRAM_OPT_RANGE,
- CRAM_OPT_VERSION,
- CRAM_OPT_EMBED_REF,
- CRAM_OPT_IGNORE_MD5,
- CRAM_OPT_REFERENCE,
- CRAM_OPT_MULTI_SEQ_PER_SLICE,
- CRAM_OPT_NO_REF,
- CRAM_OPT_USE_BZIP2,
- CRAM_OPT_SHARED_REF,
- CRAM_OPT_NTHREADS,
- CRAM_OPT_THREAD_POOL,
+// Translation of required fields to cram data series
+enum cram_fields {
+ CRAM_BF = 0x00000001,
+ CRAM_AP = 0x00000002,
+ CRAM_FP = 0x00000004,
+ CRAM_RL = 0x00000008,
+ CRAM_DL = 0x00000010,
+ CRAM_NF = 0x00000020,
+ CRAM_BA = 0x00000040,
+ CRAM_QS = 0x00000080,
+ CRAM_FC = 0x00000100,
+ CRAM_FN = 0x00000200,
+ CRAM_BS = 0x00000400,
+ CRAM_IN = 0x00000800,
+ CRAM_RG = 0x00001000,
+ CRAM_MQ = 0x00002000,
+ CRAM_TL = 0x00004000,
+ CRAM_RN = 0x00008000,
+ CRAM_NS = 0x00010000,
+ CRAM_NP = 0x00020000,
+ CRAM_TS = 0x00040000,
+ CRAM_MF = 0x00080000,
+ CRAM_CF = 0x00100000,
+ CRAM_RI = 0x00200000,
+ CRAM_RS = 0x00400000,
+ CRAM_PD = 0x00800000,
+ CRAM_HC = 0x01000000,
+ CRAM_SC = 0x02000000,
+ CRAM_BB = 0x04000000,
+ CRAM_BB_len = 0x08000000,
+ CRAM_QQ = 0x10000000,
+ CRAM_QQ_len = 0x20000000,
+ CRAM_aux= 0x40000000,
+ CRAM_ALL= 0x7fffffff,
};
+// A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may
+// encode a base difference, but we don't need to know what it is for CIGAR.
+// If we have a soft-clip or insertion, we do need SC/IN though to know how
+// long that array is.
+#define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \
+ CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF)
+
+#define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_QS | CRAM_BS | \
+ CRAM_RL | CRAM_AP | CRAM_BB | CRAM_QQ)
+
/* BF bitfields */
/* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */
#define CRAM_FPAIRED 256
@@ -724,6 +783,16 @@ enum cram_option {
#define CRAM_FQCFAIL 2
#define CRAM_FDUP 1
+#define DS_aux_S "\001"
+#define DS_aux_OQ_S "\002"
+#define DS_aux_BQ_S "\003"
+#define DS_aux_BD_S "\004"
+#define DS_aux_BI_S "\005"
+#define DS_aux_FZ_S "\006"
+#define DS_aux_oq_S "\007"
+#define DS_aux_os_S "\010"
+#define DS_aux_oz_S "\011"
+
#define CRAM_M_REVERSE 1
#define CRAM_M_UNMAP 2
@@ -733,18 +802,6 @@ enum cram_option {
#define CRAM_FLAG_DETACHED (1<<1)
#define CRAM_FLAG_MATE_DOWNSTREAM (1<<2)
-/* External IDs used by this implementation (only assumed during writing) */
-#define CRAM_EXT_IN 0
-#define CRAM_EXT_QUAL 1
-#define CRAM_EXT_NAME 2
-#define CRAM_EXT_TS_NP 3
-#define CRAM_EXT_TAG 4
-#define CRAM_EXT_TAG_S "\004"
-#define CRAM_EXT_BA 5
-#define CRAM_EXT_TN 6
-#define CRAM_EXT_SC 7
-#define CRAM_EXT_REF 8
-
#ifdef __cplusplus
}
#endif
diff --git a/htslib/cram/os.h b/htslib/cram/os.h
index b2affe0..22d8096 100644
--- a/htslib/cram/os.h
+++ b/htslib/cram/os.h
@@ -225,10 +225,12 @@ extern "C" {
*/
#ifdef SP_BIG_ENDIAN
#define le_int4(x) iswap_int4((x))
+#define le_int2(x) iswap_int2((x))
#endif
#ifdef SP_LITTLE_ENDIAN
#define le_int4(x) (x)
+#define le_int2(x) (x)
#endif
/*-----------------------------------------------------------------------------
diff --git a/htslib/cram/rANS_byte.h b/htslib/cram/rANS_byte.h
new file mode 100644
index 0000000..c61ed9d
--- /dev/null
+++ b/htslib/cram/rANS_byte.h
@@ -0,0 +1,336 @@
+/* rans_byte.h originally from https://github.com/rygorous/ryg_rans
+ *
+ * This is a public-domain implementation of several rANS variants. rANS is an
+ * entropy coder from the ANS family, as described in Jarek Duda's paper
+ * "Asymmetric numeral systems" (http://arxiv.org/abs/1311.2540).
+ */
+
+/*-------------------------------------------------------------------------- */
+
+// Simple byte-aligned rANS encoder/decoder - public domain - Fabian 'ryg' Giesen 2014
+//
+// Not intended to be "industrial strength"; just meant to illustrate the general
+// idea.
+
+#ifndef RANS_BYTE_HEADER
+#define RANS_BYTE_HEADER
+
+#include <stdint.h>
+
+#ifdef assert
+#define RansAssert assert
+#else
+#define RansAssert(x)
+#endif
+
+// READ ME FIRST:
+//
+// This is designed like a typical arithmetic coder API, but there are three
+// twists you absolutely should be aware of before you start hacking:
+//
+// 1. You need to encode data in *reverse* - last symbol first. rANS works
+// like a stack: last in, first out.
+// 2. Likewise, the encoder outputs bytes *in reverse* - that is, you give
+// it a pointer to the *end* of your buffer (exclusive), and it will
+// slowly move towards the beginning as more bytes are emitted.
+// 3. Unlike basically any other entropy coder implementation you might
+// have used, you can interleave data from multiple independent rANS
+// encoders into the same bytestream without any extra signaling;
+// you can also just write some bytes by yourself in the middle if
+// you want to. This is in addition to the usual arithmetic encoder
+// property of being able to switch models on the fly. Writing raw
+// bytes can be useful when you have some data that you know is
+// incompressible, and is cheaper than going through the rANS encode
+// function. Using multiple rANS coders on the same byte stream wastes
+// a few bytes compared to using just one, but execution of two
+// independent encoders can happen in parallel on superscalar and
+// Out-of-Order CPUs, so this can be *much* faster in tight decoding
+// loops.
+//
+// This is why all the rANS functions take the write pointer as an
+// argument instead of just storing it in some context struct.
+
+// --------------------------------------------------------------------------
+
+// L ('l' in the paper) is the lower bound of our normalization interval.
+// Between this and our byte-aligned emission, we use 31 (not 32!) bits.
+// This is done intentionally because exact reciprocals for 31-bit uints
+// fit in 32-bit uints: this permits some optimizations during encoding.
+#define RANS_BYTE_L (1u << 23) // lower bound of our normalization interval
+
+// State for a rANS encoder. Yep, that's all there is to it.
+typedef uint32_t RansState;
+
+// Initialize a rANS encoder.
+static inline void RansEncInit(RansState* r)
+{
+ *r = RANS_BYTE_L;
+}
+
+// Renormalize the encoder. Internal function.
+static inline RansState RansEncRenorm(RansState x, uint8_t** pptr, uint32_t freq, uint32_t scale_bits)
+{
+ uint32_t x_max = ((RANS_BYTE_L >> scale_bits) << 8) * freq; // this turns into a shift.
+ if (x >= x_max) {
+ uint8_t* ptr = *pptr;
+ do {
+ *--ptr = (uint8_t) (x & 0xff);
+ x >>= 8;
+ } while (x >= x_max);
+ *pptr = ptr;
+ }
+ return x;
+}
+
+// Encodes a single symbol with range start "start" and frequency "freq".
+// All frequencies are assumed to sum to "1 << scale_bits", and the
+// resulting bytes get written to ptr (which is updated).
+//
+// NOTE: With rANS, you need to encode symbols in *reverse order*, i.e. from
+// end to beginning! Likewise, the output bytestream is written *backwards*:
+// ptr starts pointing at the end of the output buffer and keeps decrementing.
+static inline void RansEncPut(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits)
+{
+ // renormalize
+ RansState x = RansEncRenorm(*r, pptr, freq, scale_bits);
+
+ // x = C(s,x)
+ *r = ((x / freq) << scale_bits) + (x % freq) + start;
+}
+
+// Flushes the rANS encoder.
+static inline void RansEncFlush(RansState* r, uint8_t** pptr)
+{
+ uint32_t x = *r;
+ uint8_t* ptr = *pptr;
+
+ ptr -= 4;
+ ptr[0] = (uint8_t) (x >> 0);
+ ptr[1] = (uint8_t) (x >> 8);
+ ptr[2] = (uint8_t) (x >> 16);
+ ptr[3] = (uint8_t) (x >> 24);
+
+ *pptr = ptr;
+}
+
+// Initializes a rANS decoder: loads the 4-byte little-endian state at *pptr and advances *pptr past it.
+// Unlike the encoder, the decoder works forwards as you'd expect.
+static inline void RansDecInit(RansState* r, uint8_t** pptr)
+{
+ uint32_t x;
+ uint8_t* ptr = *pptr;
+
+ x = (uint32_t) ptr[0] << 0; // Cast before shifting: a uint8_t promotes to
+ x |= (uint32_t) ptr[1] << 8; // signed int, so shifting a byte >= 0x80 left
+ x |= (uint32_t) ptr[2] << 16; // by 24 would overflow int, which is undefined
+ x |= (uint32_t) ptr[3] << 24; // behaviour. Unsigned shifts are well defined.
+ ptr += 4;
+
+ *pptr = ptr;
+ *r = x;
+}
+
+// Returns the current cumulative frequency (map it to a symbol yourself!)
+static inline uint32_t RansDecGet(RansState* r, uint32_t scale_bits)
+{
+ return *r & ((1u << scale_bits) - 1);
+}
+
+// Advances in the bit stream by "popping" a single symbol with range start
+// "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits",
+// and any bytes needed for renormalization are read from ptr (which is updated).
+static inline void RansDecAdvance(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits)
+{
+ uint32_t mask = (1u << scale_bits) - 1;
+
+ // s, x = D(x)
+ uint32_t x = *r;
+ x = freq * (x >> scale_bits) + (x & mask) - start;
+
+ // renormalize
+ if (x < RANS_BYTE_L) {
+ uint8_t* ptr = *pptr;
+ do x = (x << 8) | *ptr++; while (x < RANS_BYTE_L);
+ *pptr = ptr;
+ }
+
+ *r = x;
+}
+
+// --------------------------------------------------------------------------
+
+// That's all you need for a full encoder; below here are some utility
+// functions with extra convenience or optimizations.
+
+// Encoder symbol description
+// This (admittedly odd) selection of parameters was chosen to make
+// RansEncPutSymbol as cheap as possible.
+typedef struct {
+ uint32_t x_max; // (Exclusive) upper bound of pre-normalization interval
+ uint32_t rcp_freq; // Fixed-point reciprocal frequency
+ uint32_t bias; // Bias
+ uint16_t cmpl_freq; // Complement of frequency: (1 << scale_bits) - freq
+ uint16_t rcp_shift; // Reciprocal shift
+} RansEncSymbol;
+
+// Decoder symbols are straightforward.
+typedef struct {
+ uint16_t start; // Start of range.
+ uint16_t freq; // Symbol frequency.
+} RansDecSymbol;
+
+// Initializes an encoder symbol to start "start" and frequency "freq"
+static inline void RansEncSymbolInit(RansEncSymbol* s, uint32_t start, uint32_t freq, uint32_t scale_bits)
+{
+ RansAssert(scale_bits <= 16);
+ RansAssert(start <= (1u << scale_bits));
+ RansAssert(freq <= (1u << scale_bits) - start);
+
+ // Say M := 1 << scale_bits.
+ //
+ // The original encoder does:
+ // x_new = (x/freq)*M + start + (x%freq)
+ //
+ // The fast encoder does (schematically):
+ // q = mul_hi(x, rcp_freq) >> rcp_shift (division)
+ // r = x - q*freq (remainder)
+ // x_new = q*M + bias + r (new x)
+ // plugging in r into x_new yields:
+ // x_new = bias + x + q*(M - freq)
+ // =: bias + x + q*cmpl_freq (*)
+ //
+ // and we can just precompute cmpl_freq. Now we just need to
+ // set up our parameters such that the original encoder and
+ // the fast encoder agree.
+
+ s->x_max = ((RANS_BYTE_L >> scale_bits) << 8) * freq;
+ s->cmpl_freq = (uint16_t) ((1 << scale_bits) - freq);
+ if (freq < 2) {
+ // freq=0 symbols are never valid to encode, so it doesn't matter what
+ // we set our values to.
+ //
+ // freq=1 is tricky, since the reciprocal of 1 is 1; unfortunately,
+ // our fixed-point reciprocal approximation can only multiply by values
+ // smaller than 1.
+ //
+ // So we use the "next best thing": rcp_freq=0xffffffff, rcp_shift=0.
+ // This gives:
+ // q = mul_hi(x, rcp_freq) >> rcp_shift
+ // = mul_hi(x, (1<<32) - 1) >> 0
+ // = floor(x - x/(2^32))
+ // = x - 1 if 1 <= x < 2^32
+ // and we know that x>0 (x=0 is never in a valid normalization interval).
+ //
+ // So we now need to choose the other parameters such that
+ // x_new = x*M + start
+ // plug it in:
+ // x*M + start (desired result)
+ // = bias + x + q*cmpl_freq (*)
+ // = bias + x + (x - 1)*(M - 1) (plug in q=x-1, cmpl_freq)
+ // = bias + 1 + (x - 1)*M
+ // = x*M + (bias + 1 - M)
+ //
+ // so we have start = bias + 1 - M, or equivalently
+ // bias = start + M - 1.
+ s->rcp_freq = ~0u;
+ s->rcp_shift = 0;
+ s->bias = start + (1 << scale_bits) - 1;
+ } else {
+ // Alverson, "Integer Division using reciprocals"
+ // shift=ceil(log2(freq))
+ uint32_t shift = 0;
+ while (freq > (1u << shift))
+ shift++;
+
+ s->rcp_freq = (uint32_t) (((1ull << (shift + 31)) + freq-1) / freq);
+ s->rcp_shift = shift - 1;
+
+ // With these values, 'q' is the correct quotient, so we
+ // have bias=start.
+ s->bias = start;
+ }
+
+ s->rcp_shift += 32; // Avoid the extra >>32 in RansEncPutSymbol
+}
+
+// Initialize a decoder symbol to start "start" and frequency "freq"
+static inline void RansDecSymbolInit(RansDecSymbol* s, uint32_t start, uint32_t freq)
+{
+ RansAssert(start <= (1 << 16));
+ RansAssert(freq <= (1 << 16) - start);
+ s->start = (uint16_t) start;
+ s->freq = (uint16_t) freq;
+}
+
+// Encodes a given symbol. This is faster than straight RansEnc since we can do
+// multiplications instead of a divide.
+//
+// See RansEncSymbolInit for a description of how this works.
+static inline void RansEncPutSymbol(RansState* r, uint8_t** pptr, RansEncSymbol const* sym)
+{
+ RansAssert(sym->x_max != 0); // can't encode symbol with freq=0
+
+ // renormalize
+ uint32_t x = *r;
+ uint32_t x_max = sym->x_max;
+
+ if (x >= x_max) {
+ uint8_t* ptr = *pptr;
+ do {
+ *--ptr = (uint8_t) (x & 0xff);
+ x >>= 8;
+ } while (x >= x_max);
+ *pptr = ptr;
+ }
+
+ // x = C(s,x)
+ // NOTE: written this way so we get a 32-bit "multiply high" when
+ // available. If you're on a 64-bit platform with cheap multiplies
+ // (e.g. x64), just bake the +32 into rcp_shift.
+ //uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> 32) >> sym->rcp_shift;
+
+ // The extra >>32 has already been added to RansEncSymbolInit
+ uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> sym->rcp_shift);
+ *r = x + sym->bias + q * sym->cmpl_freq;
+}
+
+// Equivalent to RansDecAdvance that takes a symbol.
+static inline void RansDecAdvanceSymbol(RansState* r, uint8_t** pptr, RansDecSymbol const* sym, uint32_t scale_bits)
+{
+ RansDecAdvance(r, pptr, sym->start, sym->freq, scale_bits);
+}
+
+// Advances in the bit stream by "popping" a single symbol with range start
+// "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits".
+// No renormalization or output happens.
+static inline void RansDecAdvanceStep(RansState* r, uint32_t start, uint32_t freq, uint32_t scale_bits)
+{
+ uint32_t mask = (1u << scale_bits) - 1;
+
+ // s, x = D(x)
+ uint32_t x = *r;
+ *r = freq * (x >> scale_bits) + (x & mask) - start;
+}
+
+// Equivalent to RansDecAdvanceStep that takes a symbol.
+static inline void RansDecAdvanceSymbolStep(RansState* r, RansDecSymbol const* sym, uint32_t scale_bits)
+{
+ RansDecAdvanceStep(r, sym->start, sym->freq, scale_bits);
+}
+
+// Renormalize the decoder state: shift in bytes from *pptr until the state is >= RANS_BYTE_L.
+static inline void RansDecRenorm(RansState* r, uint8_t** pptr)
+{
+ // work on a local copy of the state; it is stored back to *r at the end
+ uint32_t x = *r;
+
+ if (x < RANS_BYTE_L) {
+ uint8_t* ptr = *pptr;
+ do x = (x << 8) | *ptr++; while (x < RANS_BYTE_L);
+ *pptr = ptr;
+ }
+
+ *r = x;
+}
+
+#endif // RANS_BYTE_HEADER
diff --git a/htslib/cram/rANS_static.c b/htslib/cram/rANS_static.c
new file mode 100644
index 0000000..19c26f3
--- /dev/null
+++ b/htslib/cram/rANS_static.c
@@ -0,0 +1,841 @@
+/*
+ * Copyright (c) 2014 Genome Research Ltd.
+ * Author(s): James Bonfield
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+ * Institute nor the names of its contributors may be used to endorse
+ * or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
+ * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Author: James Bonfield, Wellcome Trust Sanger Institute. 2014
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <assert.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "cram/rANS_static.h"
+#include "cram/rANS_byte.h"
+
+#define TF_SHIFT 12
+#define TOTFREQ (1<<TF_SHIFT)
+
+#define ABS(a) ((a)>0?(a):-(a))
+#ifndef BLK_SIZE
+# define BLK_SIZE 1024*1024
+#endif
+
+// Room to allow for expanded BLK_SIZE on worst case compression.
+#define BLK_SIZE2 ((int)(1.05*BLK_SIZE))
+
+/*-----------------------------------------------------------------------------
+ * Memory to memory compression functions.
+ *
+ * These are original versions without any manual loop unrolling. They
+ * are easier to understand, but can be up to 2x slower.
+ */
+
+/*
+ * Order-0 rANS encoder.
+ *
+ * Compresses in[0..in_size-1] into a newly malloc()ed buffer laid out as:
+ *   byte 0      order (0)
+ *   bytes 1-4   little-endian compressed size, excluding this 9 byte header
+ *   bytes 5-8   little-endian uncompressed size (in_size)
+ *   ...         frequency table, terminated by a 0 byte
+ *   ...         the flushed states and payload of four interleaved rANS streams
+ *
+ * The total number of bytes written is stored in *out_size.
+ *
+ * Returns the buffer (caller frees) on success;
+ *         NULL if memory could not be allocated.
+ */
+unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size,
+                                unsigned int *out_size) {
+    unsigned char *out_buf = malloc(1.05*in_size + 257*257*3 + 9);
+    unsigned char *cp, *out_end;
+    RansEncSymbol syms[256];
+    RansState rans0, rans1, rans2, rans3;
+    uint8_t* ptr;
+    int F[256] = {0}, i, j, tab_size, rle, x, fsum = 0;
+    int m = 0, M = 0;
+    uint64_t tr;
+
+    if (!out_buf)
+        return NULL;
+
+    // The rANS payload is written backwards from the end of the buffer.
+    ptr = out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9;
+
+    // Compute statistics
+    for (i = 0; i < in_size; i++) {
+        F[in[i]]++;
+    }
+
+    // Fixed point (31 bit) scale factor mapping raw counts onto TOTFREQ.
+    // An empty input has no counts to scale; guard against dividing by
+    // zero so that it produces a valid, empty stream instead of crashing.
+    tr = in_size
+        ? ((uint64_t)TOTFREQ<<31)/in_size + (1<<30)/in_size
+        : 0;
+
+    // Normalise so the frequencies sum to TOTFREQ-1, never letting a
+    // symbol that is present drop to zero.  M tracks the most frequent
+    // symbol (by raw count), which absorbs the rounding error below.
+    for (m = M = j = 0; j < 256; j++) {
+        if (!F[j])
+            continue;
+
+        if (m < F[j])
+            m = F[j], M = j;
+
+        if ((F[j] = (F[j]*tr)>>31) == 0)
+            F[j] = 1;
+        fsum += F[j];
+    }
+
+    fsum++;
+    if (fsum < TOTFREQ)
+        F[M] += TOTFREQ-fsum;
+    else
+        F[M] -= fsum-TOTFREQ;
+
+    //printf("F[%d]=%d\n", M, F[M]);
+    assert(F[M]>0);
+
+    // Encode statistics.
+    cp = out_buf+9;
+
+    for (x = rle = j = 0; j < 256; j++) {
+        if (F[j]) {
+            // Symbol index j.  When the previous symbol was also present,
+            // the index is followed by a count of the further consecutive
+            // symbols present, whose indices are then omitted.
+            if (rle) {
+                rle--;
+            } else {
+                *cp++ = j;
+                if (!rle && j && F[j-1]) {
+                    for(rle=j+1; rle<256 && F[rle]; rle++)
+                        ;
+                    rle -= j+1;
+                    *cp++ = rle;
+                }
+                //fprintf(stderr, "%d: %d %d\n", j, rle, N[j]);
+            }
+
+            // F[j]: one byte if below 128, otherwise two bytes with the
+            // top bit of the first byte set.
+            if (F[j]<128) {
+                *cp++ = F[j];
+            } else {
+                *cp++ = 128 | (F[j]>>8);
+                *cp++ = F[j]&0xff;
+            }
+            RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
+            x += F[j];
+        }
+    }
+    *cp++ = 0;
+
+    //write(1, out_buf+4, cp-(out_buf+4));
+    tab_size = cp-out_buf;
+
+    RansEncInit(&rans0);
+    RansEncInit(&rans1);
+    RansEncInit(&rans2);
+    RansEncInit(&rans3);
+
+    // Encoding runs from the last input byte to the first.  The trailing
+    // in_size%4 bytes go to states 0..2; the cases fall through on purpose.
+    switch (i=(in_size&3)) {
+    case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]);
+    case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]);
+    case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]);
+    case 0:
+        break;
+    }
+
+    // Remaining bytes are interleaved across the four states, 4 at a time.
+    for (i=(in_size &~3); i>0; i-=4) {
+        RansEncSymbol *s3 = &syms[in[i-1]];
+        RansEncSymbol *s2 = &syms[in[i-2]];
+        RansEncSymbol *s1 = &syms[in[i-3]];
+        RansEncSymbol *s0 = &syms[in[i-4]];
+
+        RansEncPutSymbol(&rans3, &ptr, s3);
+        RansEncPutSymbol(&rans2, &ptr, s2);
+        RansEncPutSymbol(&rans1, &ptr, s1);
+        RansEncPutSymbol(&rans0, &ptr, s0);
+    }
+
+    RansEncFlush(&rans3, &ptr);
+    RansEncFlush(&rans2, &ptr);
+    RansEncFlush(&rans1, &ptr);
+    RansEncFlush(&rans0, &ptr);
+
+    // Finalise block size and return it
+    *out_size = (out_end - ptr) + tab_size;
+
+    cp = out_buf;
+
+    *cp++ = 0; // order
+    *cp++ = ((*out_size-9)>> 0) & 0xff;
+    *cp++ = ((*out_size-9)>> 8) & 0xff;
+    *cp++ = ((*out_size-9)>>16) & 0xff;
+    *cp++ = ((*out_size-9)>>24) & 0xff;
+
+    *cp++ = (in_size>> 0) & 0xff;
+    *cp++ = (in_size>> 8) & 0xff;
+    *cp++ = (in_size>>16) & 0xff;
+    *cp++ = (in_size>>24) & 0xff;
+
+    // Close the gap between the table and the payload, which was written
+    // at the far end of the buffer.
+    memmove(out_buf + tab_size, ptr, out_end-ptr);
+
+    return out_buf;
+}
+
+// Decoder tables for one frequency model (one per context in the order-1
+// decoder, a single instance in the order-0 decoder).
+typedef struct {
+ struct {
+ int F; // normalised frequency of the symbol, as read from the table
+ int C; // cumulative frequency: sum of F for the symbols read before it
+ } fc[256];
+ // Reverse lookup, malloc()ed with TOTFREQ entries: maps a cumulative
+ // frequency slot back to the symbol owning it. NULL until allocated.
+ unsigned char *R;
+} ari_decoder;
+
+/*
+ * Order-0 rANS decoder for buffers laid out as:
+ *   byte 0      order (must be 0)
+ *   bytes 1-4   little-endian compressed size, excluding the 9 byte header
+ *   bytes 5-8   little-endian uncompressed size
+ *   ...         frequency table, terminated by a 0 byte
+ *   ...         four interleaved rANS streams
+ *
+ * On success *out_size is set to the uncompressed size.
+ *
+ * Returns a malloc()ed buffer holding the decoded data (caller frees);
+ *         NULL if the header or frequency table is invalid, or on
+ *         memory allocation failure.
+ *
+ * NOTE(review): reads from the compressed payload itself are still not
+ * checked against in_size; only the header and the frequency table
+ * contents are validated here.
+ */
+unsigned char *rans_uncompress_O0(unsigned char *in, unsigned int in_size,
+                                  unsigned int *out_size) {
+    unsigned char *cp;
+    int i, j, x, out_sz, in_sz, rle;
+    char *out_buf;
+    ari_decoder D;
+    RansDecSymbol syms[256];
+
+    // The fixed header alone is 9 bytes.  Without this check the
+    // "in_size-9" comparison below wraps around for short buffers.
+    if (in_size < 9)
+        return NULL;
+
+    /* Load in the static tables */
+    cp = in + 9;
+
+    memset(&D, 0, sizeof(D));
+
+    if (*in++ != 0) // Order-0 check
+        return NULL;
+
+    in_sz  = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | ((in[3])<<24);
+    out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | ((in[7])<<24);
+    if (in_sz != in_size-9)
+        return NULL;
+
+    // Sizes with the top bit set cannot be represented in the int
+    // indices used below.
+    if (out_sz < 0)
+        return NULL;
+
+    out_buf = malloc(out_sz);
+    if (!out_buf)
+        return NULL;
+
+    /* Reverse lookup table: cumulative frequency slot -> symbol */
+    D.R = (unsigned char *)malloc(TOTFREQ);
+    if (!D.R) {
+        free(out_buf);
+        return NULL;
+    }
+
+    //fprintf(stderr, "out_sz=%d\n", out_sz);
+
+    // Precompute reverse lookup of frequency.
+    rle = x = 0;
+    j = *cp++;
+    do {
+        // Frequency: one byte, or two bytes when the top bit is set.
+        if ((D.fc[j].F = *cp++) >= 128) {
+            D.fc[j].F &= ~128;
+            D.fc[j].F = ((D.fc[j].F & 127) << 8) | *cp++;
+        }
+        D.fc[j].C = x;
+
+        // A two byte frequency can be as large as 32767, so a corrupt
+        // table would otherwise write beyond the TOTFREQ entries of D.R.
+        if (x + D.fc[j].F > TOTFREQ) {
+            free(D.R);
+            free(out_buf);
+            return NULL;
+        }
+
+        RansDecSymbolInit(&syms[j], D.fc[j].C, D.fc[j].F);
+
+        /* Build reverse lookup table */
+        memset(&D.R[x], j, D.fc[j].F);
+
+        x += D.fc[j].F;
+
+        // Next symbol index: either explicit, or implied by a run of
+        // consecutive symbols.
+        if (!rle && j+1 == *cp) {
+            j = *cp++;
+            rle = *cp++;
+        } else if (rle) {
+            rle--;
+            j++;
+            // A corrupt run length must not walk off the 256 entry tables.
+            if (j > 255) {
+                free(D.R);
+                free(out_buf);
+                return NULL;
+            }
+        } else {
+            j = *cp++;
+        }
+    } while(j);
+
+    assert(x < TOTFREQ);
+
+    RansState rans0, rans1, rans2, rans3;
+    uint8_t *ptr = cp;
+    RansDecInit(&rans0, &ptr);
+    RansDecInit(&rans1, &ptr);
+    RansDecInit(&rans2, &ptr);
+    RansDecInit(&rans3, &ptr);
+
+    // Number of output bytes handled by the 4-way interleaved loop.
+    int out_end = (out_sz&~3);
+
+    RansState R[4];
+    R[0] = rans0;
+    R[1] = rans1;
+    R[2] = rans2;
+    R[3] = rans3;
+    uint32_t mask = (1u << TF_SHIFT)-1;
+
+    for (i=0; i < out_end; i+=4) {
+        uint32_t m[4] = {R[0] & mask,
+                         R[1] & mask,
+                         R[2] & mask,
+                         R[3] & mask};
+        uint8_t c[4] = {D.R[m[0]],
+                        D.R[m[1]],
+                        D.R[m[2]],
+                        D.R[m[3]]};
+        out_buf[i+0] = c[0];
+        out_buf[i+1] = c[1];
+        out_buf[i+2] = c[2];
+        out_buf[i+3] = c[3];
+
+        // Inlined form of the calls below, using the already computed m[].
+        // RansDecAdvanceSymbolStep(&R[0], &syms[c[0]], TF_SHIFT);
+        // RansDecAdvanceSymbolStep(&R[1], &syms[c[1]], TF_SHIFT);
+        // RansDecAdvanceSymbolStep(&R[2], &syms[c[2]], TF_SHIFT);
+        // RansDecAdvanceSymbolStep(&R[3], &syms[c[3]], TF_SHIFT);
+        R[0] = syms[c[0]].freq * (R[0]>>TF_SHIFT);
+        R[1] = syms[c[1]].freq * (R[1]>>TF_SHIFT);
+        R[2] = syms[c[2]].freq * (R[2]>>TF_SHIFT);
+        R[3] = syms[c[3]].freq * (R[3]>>TF_SHIFT);
+
+        R[0] += m[0] - syms[c[0]].start;
+        R[1] += m[1] - syms[c[1]].start;
+        R[2] += m[2] - syms[c[2]].start;
+        R[3] += m[3] - syms[c[3]].start;
+
+        RansDecRenorm(&R[0], &ptr);
+        RansDecRenorm(&R[1], &ptr);
+        RansDecRenorm(&R[2], &ptr);
+        RansDecRenorm(&R[3], &ptr);
+    }
+
+    rans0 = R[0];
+    rans1 = R[1];
+    rans2 = R[2];
+    rans3 = R[3];
+
+    // The final out_sz%4 bytes come from states 0, 1 and 2 in that order.
+    switch(out_sz&3) {
+        unsigned char c;
+    case 0:
+        break;
+    case 1:
+        c = D.R[RansDecGet(&rans0, TF_SHIFT)];
+        RansDecAdvanceSymbol(&rans0, &ptr, &syms[c], TF_SHIFT);
+        out_buf[out_end] = c;
+        break;
+
+    case 2:
+        c = D.R[RansDecGet(&rans0, TF_SHIFT)];
+        RansDecAdvanceSymbol(&rans0, &ptr, &syms[c], TF_SHIFT);
+        out_buf[out_end] = c;
+
+        c = D.R[RansDecGet(&rans1, TF_SHIFT)];
+        RansDecAdvanceSymbol(&rans1, &ptr, &syms[c], TF_SHIFT);
+        out_buf[out_end+1] = c;
+        break;
+
+    case 3:
+        c = D.R[RansDecGet(&rans0, TF_SHIFT)];
+        RansDecAdvanceSymbol(&rans0, &ptr, &syms[c], TF_SHIFT);
+        out_buf[out_end] = c;
+
+        c = D.R[RansDecGet(&rans1, TF_SHIFT)];
+        RansDecAdvanceSymbol(&rans1, &ptr, &syms[c], TF_SHIFT);
+        out_buf[out_end+1] = c;
+
+        c = D.R[RansDecGet(&rans2, TF_SHIFT)];
+        RansDecAdvanceSymbol(&rans2, &ptr, &syms[c], TF_SHIFT);
+        out_buf[out_end+2] = c;
+        break;
+    }
+
+    *out_size = out_sz;
+
+    free(D.R);
+
+    return (unsigned char *)out_buf;
+}
+
+/*
+ * Order-1 rANS encoder: each byte is coded with a frequency table selected
+ * by the byte preceding it. Inputs shorter than 4 bytes are handed to
+ * rans_compress_O0() instead.
+ *
+ * The returned malloc()ed buffer starts with a 9 byte header (order byte 1,
+ * little-endian compressed size excluding the header, little-endian
+ * in_size), followed by the per-context frequency tables and the rANS
+ * payload. The total size is stored in *out_size.
+ *
+ * Returns the buffer (caller frees) on success;
+ *         NULL if memory could not be allocated.
+ */
+unsigned char *rans_compress_O1(unsigned char *in, unsigned int in_size,
+ unsigned int *out_size) {
+ unsigned char *out_buf, *out_end, *cp;
+ unsigned int last_i, tab_size, rle_i, rle_j;
+ RansEncSymbol syms[256][256];
+
+ if (in_size < 4)
+ return rans_compress_O0(in, in_size, out_size);
+
+ out_buf = malloc(1.05*in_size + 257*257*3 + 9);
+ if (!out_buf)
+ return NULL;
+
+ // Tables are written forwards from out_buf+9; the payload is later
+ // written backwards from out_end.
+ out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9;
+ cp = out_buf+9;
+
+ // F[a][b] counts byte b following byte a; T[a] is the total for context a.
+ int F[256][256], T[256], i, j;
+ unsigned char c;
+
+ memset(F, 0, 256*256*sizeof(int));
+ memset(T, 0, 256*sizeof(int));
+ //for (last = 0, i=in_size-1; i>=0; i--) {
+ // F[last][c = in[i]]++;
+ // T[last]++;
+ // last = c;
+ //}
+
+ // The first byte is counted in context 0.
+ for (last_i=i=0; i<in_size; i++) {
+ F[last_i][c = in[i]]++;
+ T[last_i]++;
+ last_i = c;
+ }
+ // The first byte of each of the other three quarters is additionally
+ // counted in context 0, matching how the encoder below finishes each
+ // stream with a symbol from syms[0].
+ F[0][in[1*(in_size>>2)]]++;
+ F[0][in[2*(in_size>>2)]]++;
+ F[0][in[3*(in_size>>2)]]++;
+ T[0]+=3;
+
+ // Normalise each used context so its frequencies sum to TOTFREQ-1
+ for (rle_i = i = 0; i < 256; i++) {
+ int t2, m, M;
+ unsigned int x;
+
+ if (T[i] == 0)
+ continue;
+
+ //uint64_t p = (TOTFREQ * TOTFREQ) / t;
+ double p = ((double)TOTFREQ)/T[i];
+ // M tracks the most frequent symbol (by raw count) of this context.
+ for (t2 = m = M = j = 0; j < 256; j++) {
+ if (!F[i][j])
+ continue;
+
+ if (m < F[i][j])
+ m = F[i][j], M = j;
+
+ // Symbols that are present never scale down to zero.
+ //if ((F[i][j] = (F[i][j] * p) / TOTFREQ) == 0)
+ if ((F[i][j] *= p) == 0)
+ F[i][j] = 1;
+ t2 += F[i][j];
+ }
+
+ // The rounding error is absorbed by the most frequent symbol.
+ t2++;
+ if (t2 < TOTFREQ)
+ F[i][M] += TOTFREQ-t2;
+ else
+ F[i][M] -= t2-TOTFREQ;
+
+ // Store frequency table
+ // i: context index, followed by a run length of further consecutive
+ // used contexts when the previous context was also used.
+ if (rle_i) {
+ rle_i--;
+ } else {
+ *cp++ = i;
+ // FIXME: could use order-0 statistics to observe which alphabet
+ // symbols are present and base RLE on that ordering instead.
+ if (i && T[i-1]) {
+ for(rle_i=i+1; rle_i<256 && T[rle_i]; rle_i++)
+ ;
+ rle_i -= i+1;
+ *cp++ = rle_i;
+ }
+ }
+
+ int *F_i_ = F[i];
+ x = 0;
+ rle_j = 0;
+ for (j = 0; j < 256; j++) {
+ if (F_i_[j]) {
+ //fprintf(stderr, "F[%d][%d]=%d, x=%d\n", i, j, F_i_[j], x);
+
+ // j: symbol index, run-length coded the same way as i above
+ if (rle_j) {
+ rle_j--;
+ } else {
+ *cp++ = j;
+ if (!rle_j && j && F_i_[j-1]) {
+ for(rle_j=j+1; rle_j<256 && F_i_[rle_j]; rle_j++)
+ ;
+ rle_j -= j+1;
+ *cp++ = rle_j;
+ }
+ }
+
+ // F_i_[j]: one byte if below 128, otherwise two bytes with the
+ // top bit of the first byte set
+ if (F_i_[j]<128) {
+ *cp++ = F_i_[j];
+ } else {
+ *cp++ = 128 | (F_i_[j]>>8);
+ *cp++ = F_i_[j]&0xff;
+ }
+
+ // x is the cumulative frequency of the symbols before j
+ RansEncSymbolInit(&syms[i][j], x, F_i_[j], TF_SHIFT);
+ x += F_i_[j];
+ }
+ }
+ // End of this context's symbol list
+ *cp++ = 0;
+ }
+ // End of the context list
+ *cp++ = 0;
+
+ //write(1, out_buf+4, cp-(out_buf+4));
+ tab_size = cp - out_buf;
+ assert(tab_size < 257*257*3);
+
+ RansState rans0, rans1, rans2, rans3;
+ RansEncInit(&rans0);
+ RansEncInit(&rans1);
+ RansEncInit(&rans2);
+ RansEncInit(&rans3);
+
+ uint8_t* ptr = out_end;
+
+ // The input is split into four quarters of isz4 bytes, one per rANS
+ // state. Each iN starts at the second-to-last byte of its quarter and
+ // lN holds the byte after it, i.e. the symbol to be encoded next.
+ int isz4 = in_size>>2;
+ int i0 = 1*isz4-2;
+ int i1 = 2*isz4-2;
+ int i2 = 3*isz4-2;
+ int i3 = 4*isz4-2;
+
+ unsigned char l0 = in[i0+1];
+ unsigned char l1 = in[i1+1];
+ unsigned char l2 = in[i2+1];
+ unsigned char l3 = in[i3+1];
+
+ // Deal with the remainder: the in_size%4 bytes beyond the four quarters
+ // are encoded by state 3, last byte first.
+ l3 = in[in_size-1];
+ for (i3 = in_size-2; i3 > 4*isz4-2; i3--) {
+ unsigned char c3 = in[i3];
+ RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]);
+ l3 = c3;
+ }
+
+ // Walk all four quarters backwards in lock step, encoding symbol lN in
+ // the context of the byte that precedes it (cN).
+ for (; i0 >= 0; i0--, i1--, i2--, i3--) {
+ unsigned char c0, c1, c2, c3;
+ RansEncSymbol *s3 = &syms[c3 = in[i3]][l3];
+ RansEncSymbol *s2 = &syms[c2 = in[i2]][l2];
+ RansEncSymbol *s1 = &syms[c1 = in[i1]][l1];
+ RansEncSymbol *s0 = &syms[c0 = in[i0]][l0];
+
+ RansEncPutSymbol(&rans3, &ptr, s3);
+ RansEncPutSymbol(&rans2, &ptr, s2);
+ RansEncPutSymbol(&rans1, &ptr, s1);
+ RansEncPutSymbol(&rans0, &ptr, s0);
+
+ l0 = c0;
+ l1 = c1;
+ l2 = c2;
+ l3 = c3;
+ }
+
+ // The first byte of each quarter has no predecessor within its stream
+ // and is encoded in context 0.
+ RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]);
+ RansEncPutSymbol(&rans2, &ptr, &syms[0][l2]);
+ RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]);
+ RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]);
+
+ RansEncFlush(&rans3, &ptr);
+ RansEncFlush(&rans2, &ptr);
+ RansEncFlush(&rans1, &ptr);
+ RansEncFlush(&rans0, &ptr);
+
+ // Total size: header + tables + payload
+ *out_size = (out_end - ptr) + tab_size;
+
+ cp = out_buf;
+ *cp++ = 1; // order
+
+ // Compressed size excluding the 9 byte header, little-endian
+ *cp++ = ((*out_size-9)>> 0) & 0xff;
+ *cp++ = ((*out_size-9)>> 8) & 0xff;
+ *cp++ = ((*out_size-9)>>16) & 0xff;
+ *cp++ = ((*out_size-9)>>24) & 0xff;
+
+ // Uncompressed size, little-endian
+ *cp++ = (in_size>> 0) & 0xff;
+ *cp++ = (in_size>> 8) & 0xff;
+ *cp++ = (in_size>>16) & 0xff;
+ *cp++ = (in_size>>24) & 0xff;
+
+ // Close the gap between the tables and the payload written at the far
+ // end of the buffer.
+ memmove(out_buf + tab_size, ptr, out_end-ptr);
+
+ return out_buf;
+}
+
+/*
+ * Order-1 rANS decoder for buffers laid out as:
+ *   byte 0      order (must be 1)
+ *   bytes 1-4   little-endian compressed size, excluding the 9 byte header
+ *   bytes 5-8   little-endian uncompressed size
+ *   ...         per-context frequency tables, each terminated by a 0 byte,
+ *               with a further 0 byte ending the list of contexts
+ *   ...         four interleaved rANS streams
+ *
+ * On success *out_size is set to the uncompressed size.
+ *
+ * Returns a malloc()ed buffer holding the decoded data (caller frees);
+ *         NULL if the header or frequency tables are invalid, or on
+ *         memory allocation failure.
+ *
+ * NOTE(review): reads from the compressed payload are still not checked
+ * against in_size, and a payload referring to a context that has no table
+ * is not detected; only the header and table contents are validated here.
+ */
+unsigned char *rans_uncompress_O1(unsigned char *in, unsigned int in_size,
+                                  unsigned int *out_size) {
+    unsigned char *cp;
+    unsigned char *result = NULL;
+    int i, j = -999, x, out_sz, in_sz, rle_i, rle_j;
+    char *out_buf;
+    ari_decoder D[256];
+    RansDecSymbol syms[256][256];
+
+    // The fixed header alone is 9 bytes.  Without this check the
+    // "in_size-9" comparison below wraps around for short buffers.
+    if (in_size < 9)
+        return NULL;
+
+    /* Load in the static tables */
+    cp = in + 9;
+
+    memset(D, 0, 256*sizeof(*D));
+
+    if (*in++ != 1) // Order-1 check
+        return NULL;
+
+    in_sz  = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | ((in[3])<<24);
+    out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | ((in[7])<<24);
+    if (in_sz != in_size-9)
+        return NULL;
+
+    // Sizes with the top bit set cannot be represented in the int
+    // indices used below.
+    if (out_sz < 0)
+        return NULL;
+
+    out_buf = malloc(out_sz);
+    if (!out_buf)
+        return NULL;
+
+    //fprintf(stderr, "out_sz=%d\n", out_sz);
+
+    // Outer loop: one frequency table per context i.
+    rle_i = 0;
+    i = *cp++;
+    do {
+        rle_j = x = 0;
+        j = *cp++;
+        // Inner loop: one frequency per symbol j within context i.
+        do {
+            // Frequency: one byte, or two bytes when the top bit is set.
+            if ((D[i].fc[j].F = *cp++) >= 128) {
+                D[i].fc[j].F &= ~128;
+                D[i].fc[j].F = ((D[i].fc[j].F & 127) << 8) | *cp++;
+            }
+            D[i].fc[j].C = x;
+
+            //fprintf(stderr, "i=%d j=%d F=%d C=%d\n", i, j, D[i].fc[j].F, D[i].fc[j].C);
+
+            // A stored frequency of zero stands for TOTFREQ.
+            if (!D[i].fc[j].F)
+                D[i].fc[j].F = TOTFREQ;
+
+            // A two byte frequency can be as large as 32767, so a corrupt
+            // table would otherwise write beyond the TOTFREQ entries of
+            // the reverse lookup table.
+            if (x + D[i].fc[j].F > TOTFREQ)
+                goto cleanup;
+
+            RansDecSymbolInit(&syms[i][j], D[i].fc[j].C, D[i].fc[j].F);
+
+            /* Build reverse lookup table */
+            if (!D[i].R) {
+                D[i].R = (unsigned char *)malloc(TOTFREQ);
+                if (!D[i].R)
+                    goto cleanup;
+            }
+            memset(&D[i].R[x], j, D[i].fc[j].F);
+
+            x += D[i].fc[j].F;
+            assert(x <= TOTFREQ);
+
+            // Next symbol index: explicit, or implied by a run.
+            if (!rle_j && j+1 == *cp) {
+                j = *cp++;
+                rle_j = *cp++;
+            } else if (rle_j) {
+                rle_j--;
+                j++;
+                // A corrupt run length must not walk off the tables.
+                if (j > 255)
+                    goto cleanup;
+            } else {
+                j = *cp++;
+            }
+        } while(j);
+
+        // Next context index: explicit, or implied by a run.
+        if (!rle_i && i+1 == *cp) {
+            i = *cp++;
+            rle_i = *cp++;
+        } else if (rle_i) {
+            rle_i--;
+            i++;
+            if (i > 255)
+                goto cleanup;
+        } else {
+            i = *cp++;
+        }
+    } while (i);
+
+    RansState rans0, rans1, rans2, rans3;
+    uint8_t *ptr = cp;
+    RansDecInit(&rans0, &ptr);
+    RansDecInit(&rans1, &ptr);
+    RansDecInit(&rans2, &ptr);
+    RansDecInit(&rans3, &ptr);
+
+    // Each state decodes one quarter of the output; lN is the previously
+    // decoded byte of stream N and selects its context (initially 0).
+    int isz4 = out_sz>>2;
+    int l0 = 0;
+    int l1 = 0;
+    int l2 = 0;
+    int l3 = 0;
+    int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4};
+
+    RansState R[4];
+    R[0] = rans0;
+    R[1] = rans1;
+    R[2] = rans2;
+    R[3] = rans3;
+
+    for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
+        uint32_t m[4] = {R[0] & ((1u << TF_SHIFT)-1),
+                         R[1] & ((1u << TF_SHIFT)-1),
+                         R[2] & ((1u << TF_SHIFT)-1),
+                         R[3] & ((1u << TF_SHIFT)-1)};
+
+        uint8_t c[4] = {D[l0].R[m[0]],
+                        D[l1].R[m[1]],
+                        D[l2].R[m[2]],
+                        D[l3].R[m[3]]};
+
+        out_buf[i4[0]] = c[0];
+        out_buf[i4[1]] = c[1];
+        out_buf[i4[2]] = c[2];
+        out_buf[i4[3]] = c[3];
+
+        // Inlined form of the calls below, using the already computed m[].
+        //RansDecAdvanceSymbolStep(&R[0], &syms[l0][c[0]], TF_SHIFT);
+        //RansDecAdvanceSymbolStep(&R[1], &syms[l1][c[1]], TF_SHIFT);
+        //RansDecAdvanceSymbolStep(&R[2], &syms[l2][c[2]], TF_SHIFT);
+        //RansDecAdvanceSymbolStep(&R[3], &syms[l3][c[3]], TF_SHIFT);
+
+        R[0] = syms[l0][c[0]].freq * (R[0]>>TF_SHIFT);
+        R[1] = syms[l1][c[1]].freq * (R[1]>>TF_SHIFT);
+        R[2] = syms[l2][c[2]].freq * (R[2]>>TF_SHIFT);
+        R[3] = syms[l3][c[3]].freq * (R[3]>>TF_SHIFT);
+
+        R[0] += m[0] - syms[l0][c[0]].start;
+        R[1] += m[1] - syms[l1][c[1]].start;
+        R[2] += m[2] - syms[l2][c[2]].start;
+        R[3] += m[3] - syms[l3][c[3]].start;
+
+        RansDecRenorm(&R[0], &ptr);
+        RansDecRenorm(&R[1], &ptr);
+        RansDecRenorm(&R[2], &ptr);
+        RansDecRenorm(&R[3], &ptr);
+
+        l0 = c[0];
+        l1 = c[1];
+        l2 = c[2];
+        l3 = c[3];
+    }
+
+    rans0 = R[0];
+    rans1 = R[1];
+    rans2 = R[2];
+    rans3 = R[3];
+
+    // Remainder: the out_sz%4 bytes beyond the four quarters all belong
+    // to the last stream.
+    for (; i4[3] < out_sz; i4[3]++) {
+        unsigned char c3 = D[l3].R[RansDecGet(&rans3, TF_SHIFT)];
+        out_buf[i4[3]] = c3;
+        RansDecAdvanceSymbol(&rans3, &ptr, &syms[l3][c3], TF_SHIFT);
+        l3 = c3;
+    }
+
+    *out_size = out_sz;
+    result = (unsigned char *)out_buf;
+
+ cleanup:
+    // Shared by the success and failure paths; result is still NULL when
+    // arriving here from a failed table check.
+    for (i = 0; i < 256; i++)
+        free(D[i].R);
+
+    if (!result)
+        free(out_buf);
+
+    return result;
+}
+
+/*-----------------------------------------------------------------------------
+ * Simple interface to the order-0 vs order-1 encoders and decoders.
+ */
+/*
+ * Compresses "in" with the order-0 encoder when order is 0 and with the
+ * order-1 encoder for any other value. The encoder's result is returned
+ * unchanged.
+ */
+unsigned char *rans_compress(unsigned char *in, unsigned int in_size,
+                             unsigned int *out_size, int order) {
+    if (order == 0)
+        return rans_compress_O0(in, in_size, out_size);
+
+    return rans_compress_O1(in, in_size, out_size);
+}
+
+/*
+ * Decompresses a buffer produced by rans_compress(), selecting the decoder
+ * from the order byte at in[0] (0 => order-0, anything else => order-1).
+ *
+ * Returns the decoder's result, or NULL if the buffer is too short to hold
+ * the 9 byte header (order byte plus two 32-bit sizes) both decoders read.
+ */
+unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size,
+                               unsigned int *out_size) {
+    // Never look at in[0] or the size fields of a truncated buffer.
+    if (in_size < 9)
+        return NULL;
+
+    return in[0]
+        ? rans_uncompress_O1(in, in_size, out_size)
+        : rans_uncompress_O0(in, in_size, out_size);
+}
+
+
+#ifdef TEST_MAIN
+/*-----------------------------------------------------------------------------
+ * Main.
+ *
+ * This is a simple command line tool for testing order-0 and order-1
+ * compression using the rANS codec. Simply compile with
+ *
+ * gcc -DTEST_MAIN -O3 -I. cram/rANS_static.c -o cram/rANS_static
+ *
+ * Usage: cram/rANS_static -o0 < file > file.o0
+ * cram/rANS_static -d < file.o0 > file2
+ *
+ * cram/rANS_static -o1 < file > file.o1
+ * cram/rANS_static -d < file.o1 > file2
+ */
+/*
+ * Test driver.  Reads from the first file argument (default stdin) and
+ * writes to the second (default stdout).
+ *
+ * Compression (default) splits the input into blocks of up to BLK_SIZE
+ * bytes and writes each as a 4 byte native-endian length followed by the
+ * compressed block.  Decompression (-d) reverses this.  "-o N" selects the
+ * order; any non-zero N means order-1.
+ *
+ * Elapsed time and throughput are reported on stderr.
+ */
+int main(int argc, char **argv) {
+    int opt, order = 0;
+    unsigned char in_buf[BLK_SIZE2+257*257*3];
+    int decode = 0;
+    FILE *infp = stdin, *outfp = stdout;
+    struct timeval tv1, tv2;
+    size_t bytes = 0;
+    long usec;
+
+    extern char *optarg;
+    extern int optind;
+
+    while ((opt = getopt(argc, argv, "o:d")) != -1) {
+        switch (opt) {
+        case 'o':
+            order = atoi(optarg);
+            break;
+
+        case 'd':
+            decode = 1;
+            break;
+        }
+    }
+
+    order = order ? 1 : 0; // Only support O(0) and O(1)
+
+    if (optind < argc) {
+        if (!(infp = fopen(argv[optind], "rb"))) {
+            perror(argv[optind]);
+            return 1;
+        }
+        optind++;
+    }
+
+    if (optind < argc) {
+        if (!(outfp = fopen(argv[optind], "wb"))) {
+            perror(argv[optind]);
+            return 1;
+        }
+        optind++;
+    }
+
+    gettimeofday(&tv1, NULL);
+
+    if (decode) {
+        // Only used in some test implementations of RC_GetFreq()
+        //RC_init();
+        //RC_init2();
+
+        for (;;) {
+            uint32_t in_size, out_size;
+            unsigned char *out;
+
+            if (4 != fread(&in_size, 1, 4, infp))
+                break;
+
+            // The block length comes from the file; never let it exceed
+            // the fixed size input buffer.
+            if (in_size > sizeof(in_buf)) {
+                fprintf(stderr, "Block too large\n");
+                exit(1);
+            }
+
+            if (in_size != fread(in_buf, 1, in_size, infp)) {
+                fprintf(stderr, "Truncated input\n");
+                exit(1);
+            }
+            out = rans_uncompress(in_buf, in_size, &out_size);
+            if (!out)
+                abort();
+
+            fwrite(out, 1, out_size, outfp);
+            free(out);
+
+            bytes += out_size;
+        }
+    } else {
+        for (;;) {
+            uint32_t in_size, out_size;
+            unsigned char *out;
+
+            in_size = fread(in_buf, 1, BLK_SIZE, infp);
+            if (in_size == 0)
+                break;
+
+            // The encoders return NULL when they cannot allocate memory.
+            out = rans_compress(in_buf, in_size, &out_size, order);
+            if (!out)
+                abort();
+
+            fwrite(&out_size, 1, 4, outfp);
+            fwrite(out, 1, out_size, outfp);
+            free(out);
+
+            bytes += in_size;
+        }
+    }
+
+    gettimeofday(&tv2, NULL);
+
+    usec = (long)(tv2.tv_sec - tv1.tv_sec)*1000000 +
+        tv2.tv_usec - tv1.tv_usec;
+
+    // bytes per microsecond is reported as MB/s
+    fprintf(stderr, "Took %ld microseconds, %5.1f MB/s\n",
+            usec, (double)bytes / usec);
+    return 0;
+}
+#endif
diff --git a/htslib/cram/rANS_static.h b/htslib/cram/rANS_static.h
new file mode 100644
index 0000000..971099c
--- /dev/null
+++ b/htslib/cram/rANS_static.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2014 Genome Research Ltd.
+ * Author(s): James Bonfield
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+ * Institute nor the names of its contributors may be used to endorse
+ * or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
+ * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef RANS_STATIC_H
+#define RANS_STATIC_H
+
+/*
+ * Compresses in_size bytes from "in" using the order-0 rANS encoder when
+ * order is 0, or the order-1 encoder for any other value.
+ * On success the compressed length is stored in *out_size.
+ *
+ * Returns a malloc()ed buffer (caller frees) on success;
+ *         NULL on failure.
+ */
+unsigned char *rans_compress(unsigned char *in, unsigned int in_size,
+ unsigned int *out_size, int order);
+/*
+ * Decompresses a buffer produced by rans_compress(); the order is taken
+ * from the first byte of "in".
+ * On success the uncompressed length is stored in *out_size.
+ *
+ * Returns a malloc()ed buffer (caller frees) on success;
+ *         NULL on failure.
+ */
+unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size,
+ unsigned int *out_size);
+
+
+#endif /* RANS_STATIC_H */
diff --git a/htslib/cram/sam_header.c b/htslib/cram/sam_header.c
index 2a8110c..3367f19 100644
--- a/htslib/cram/sam_header.c
+++ b/htslib/cram/sam_header.c
@@ -38,10 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cram/sam_header.h"
#include "cram/string_alloc.h"
-#ifdef SAMTOOLS
-#define sam_hdr_parse sam_hdr_parse_
-#endif
-
static void sam_hdr_error(char *msg, char *line, int len, int lno) {
int j;
@@ -888,7 +884,7 @@ SAM_hdr *sam_hdr_new() {
* Returns a SAM_hdr struct on success (free with sam_hdr_free())
* NULL on failure
*/
-SAM_hdr *sam_hdr_parse(const char *hdr, int len) {
+SAM_hdr *sam_hdr_parse_(const char *hdr, int len) {
/* Make an empty SAM_hdr */
SAM_hdr *sh;
@@ -925,7 +921,7 @@ SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) {
if (-1 == sam_hdr_rebuild(hdr))
return NULL;
- return sam_hdr_parse(sam_hdr_str(hdr), sam_hdr_length(hdr));
+ return sam_hdr_parse_(sam_hdr_str(hdr), sam_hdr_length(hdr));
}
/*! Increments a reference count on hdr.
diff --git a/htslib/cram/sam_header.h b/htslib/cram/sam_header.h
index b9ea298..8e0929e 100644
--- a/htslib/cram/sam_header.h
+++ b/htslib/cram/sam_header.h
@@ -34,10 +34,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* These functions can be shared between SAM, BAM and CRAM file
* formats as all three internally use the same string encoding for
* header fields.
- *
- * Consider using the scram() generic API and calling
- * scram_get_header() to obtain the format-specific pointer to the
- * SAM_hdr struct.
*/
/*
@@ -228,11 +224,7 @@ SAM_hdr *sam_hdr_new(void);
* Returns a SAM_hdr struct on success (free with sam_hdr_free());
* NULL on failure
*/
-#ifdef SAMTOOLS
SAM_hdr *sam_hdr_parse_(const char *hdr, int len);
-#else
-SAM_hdr *sam_hdr_parse(const char *hdr, int len);
-#endif
/*! Produces a duplicate copy of hdr and returns it.
diff --git a/htslib/cram/thread_pool.c b/htslib/cram/thread_pool.c
index 90652a7..dea9e90 100644
--- a/htslib/cram/thread_pool.c
+++ b/htslib/cram/thread_pool.c
@@ -35,18 +35,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
+#include <assert.h>
#include "cram/thread_pool.h"
//#define DEBUG
-#define DEBUG_TIME
+//#define DEBUG_TIME
+
+#define IN_ORDER
#ifdef DEBUG
static int worker_id(t_pool *p) {
int i;
pthread_t s = pthread_self();
for (i = 0; i < p->tsize; i++) {
- if (pthread_equal(s, p->t[i]))
+ if (pthread_equal(s, p->t[i].tid))
return i;
}
return -1;
@@ -103,7 +106,7 @@ static int t_pool_add_result(t_pool_job *j, void *data) {
fprintf(stderr, "%d: Broadcasting result_avail (id %d)\n",
worker_id(j->p), r->serial);
#endif
- pthread_cond_broadcast(&q->result_avail_c);
+ pthread_cond_signal(&q->result_avail_c);
#ifdef DEBUG
fprintf(stderr, "%d: Broadcast complete\n", worker_id(j->p));
#endif
@@ -301,7 +304,8 @@ void t_results_queue_destroy(t_results_queue *q) {
* and then executes the job.
*/
static void *t_pool_worker(void *arg) {
- t_pool *p = (t_pool *)arg;
+ t_pool_worker_t *w = (t_pool_worker_t *)arg;
+ t_pool *p = w->p;
t_pool_job *j;
#ifdef DEBUG_TIME
struct timeval t1, t2, t3;
@@ -318,28 +322,62 @@ static void *t_pool_worker(void *arg) {
#ifdef DEBUG_TIME
gettimeofday(&t2, NULL);
p->wait_time += TDIFF(t2,t1);
+ w->wait_time += TDIFF(t2,t1);
#endif
- p->nwaiting++;
+ // If there is something on the job list and a higher priority
+ // thread waiting, let it handle this instead.
+// while (p->head && p->t_stack_top != -1 && p->t_stack_top < w->idx) {
+// pthread_mutex_unlock(&p->pool_m);
+// pthread_cond_signal(&p->t[p->t_stack_top].pending_c);
+// pthread_mutex_lock(&p->pool_m);
+// }
+
while (!p->head && !p->shutdown) {
+ p->nwaiting++;
+
if (p->njobs == 0)
pthread_cond_signal(&p->empty_c);
#ifdef DEBUG_TIME
gettimeofday(&t2, NULL);
#endif
+#ifdef IN_ORDER
+ // Push this thread to the top of the waiting stack
+ if (p->t_stack_top == -1 || p->t_stack_top > w->idx)
+ p->t_stack_top = w->idx;
+
+ p->t_stack[w->idx] = 1;
+ pthread_cond_wait(&w->pending_c, &p->pool_m);
+ p->t_stack[w->idx] = 0;
+
+ /* Find new t_stack_top */
+ {
+ int i;
+ p->t_stack_top = -1;
+ for (i = 0; i < p->tsize; i++) {
+ if (p->t_stack[i]) {
+ p->t_stack_top = i;
+ break;
+ }
+ }
+ }
+#else
pthread_cond_wait(&p->pending_c, &p->pool_m);
+#endif
#ifdef DEBUG_TIME
gettimeofday(&t3, NULL);
p->wait_time += TDIFF(t3,t2);
+ w->wait_time += TDIFF(t3,t2);
#endif
+ p->nwaiting--;
}
- p->nwaiting--;
-
if (p->shutdown) {
+#ifdef DEBUG_TIME
p->total_time += TDIFF(t3,t1);
+#endif
#ifdef DEBUG
fprintf(stderr, "%d: Shutting down\n", worker_id(p));
#endif
@@ -351,7 +389,7 @@ static void *t_pool_worker(void *arg) {
if (!(p->head = j->next))
p->tail = NULL;
- if (p->njobs-- == p->qsize)
+ if (p->njobs-- >= p->qsize)
pthread_cond_signal(&p->full_c);
if (p->njobs == 0)
@@ -389,6 +427,7 @@ t_pool *t_pool_init(int qsize, int tsize) {
p->nwaiting = 0;
p->shutdown = 0;
p->head = p->tail = NULL;
+ p->t_stack = NULL;
#ifdef DEBUG_TIME
p->total_time = p->wait_time = 0;
#endif
@@ -397,14 +436,40 @@ t_pool *t_pool_init(int qsize, int tsize) {
pthread_mutex_init(&p->pool_m, NULL);
pthread_cond_init(&p->empty_c, NULL);
- pthread_cond_init(&p->pending_c, NULL);
pthread_cond_init(&p->full_c, NULL);
+ pthread_mutex_lock(&p->pool_m);
+
+#ifdef IN_ORDER
+ if (!(p->t_stack = malloc(tsize * sizeof(*p->t_stack))))
+ return NULL;
+ p->t_stack_top = -1;
+
for (i = 0; i < tsize; i++) {
- if (0 != pthread_create(&p->t[i], NULL, t_pool_worker, p))
+ t_pool_worker_t *w = &p->t[i];
+ p->t_stack[i] = 0;
+ w->p = p;
+ w->idx = i;
+ w->wait_time = 0;
+ pthread_cond_init(&w->pending_c, NULL);
+ if (0 != pthread_create(&w->tid, NULL, t_pool_worker, w))
return NULL;
}
-
+#else
+ pthread_cond_init(&p->pending_c, NULL);
+
+ for (i = 0; i < tsize; i++) {
+ t_pool_worker_t *w = &p->t[i];
+ w->p = p;
+ w->idx = i;
+ pthread_cond_init(&w->pending_c, NULL);
+ if (0 != pthread_create(&w->tid, NULL, t_pool_worker, w))
+ return NULL;
+ }
+#endif
+
+ pthread_mutex_unlock(&p->pool_m);
+
return p;
}
@@ -447,7 +512,7 @@ int t_pool_dispatch(t_pool *p, t_results_queue *q,
pthread_mutex_lock(&p->pool_m);
// Check if queue is full
- while (p->njobs == p->qsize)
+ while (p->njobs >= p->qsize)
pthread_cond_wait(&p->full_c, &p->pool_m);
p->njobs++;
@@ -459,11 +524,13 @@ int t_pool_dispatch(t_pool *p, t_results_queue *q,
p->head = p->tail = j;
}
- if (p->njobs == 1) {
- // First job => tell all worker threads to start up
- pthread_cond_broadcast(&p->pending_c);
- }
-
+ // Let a worker know we have data.
+#ifdef IN_ORDER
+ if (p->t_stack_top >= 0 && p->njobs > p->tsize - p->nwaiting)
+ pthread_cond_signal(&p->t[p->t_stack_top].pending_c);
+#else
+ pthread_cond_signal(&p->pending_c);
+#endif
pthread_mutex_unlock(&p->pool_m);
#ifdef DEBUG
@@ -482,9 +549,21 @@ int t_pool_dispatch(t_pool *p, t_results_queue *q,
*/
int t_pool_dispatch2(t_pool *p, t_results_queue *q,
void *(*func)(void *arg), void *arg, int nonblock) {
- t_pool_job *j = malloc(sizeof(*j));
+ t_pool_job *j;
- if (!j)
+#ifdef DEBUG
+ fprintf(stderr, "Dispatching job for queue %p, serial %d\n", q, q->curr_serial);
+#endif
+
+ pthread_mutex_lock(&p->pool_m);
+
+ if (p->njobs >= p->qsize && nonblock == 1) {
+ pthread_mutex_unlock(&p->pool_m);
+ errno = EAGAIN;
+ return -1;
+ }
+
+ if (!(j = malloc(sizeof(*j))))
return -1;
j->func = func;
j->arg = arg;
@@ -499,19 +578,6 @@ int t_pool_dispatch2(t_pool *p, t_results_queue *q,
j->serial = 0;
}
-#ifdef DEBUG
- fprintf(stderr, "Dispatching job for queue %p, serial %d\n", q, j->serial);
-#endif
-
- pthread_mutex_lock(&p->pool_m);
-
- if (p->njobs == p->qsize && nonblock == 1) {
- pthread_mutex_unlock(&p->pool_m);
- errno = EAGAIN;
- free(j);
- return -1;
- }
-
if (q) {
pthread_mutex_lock(&q->result_m);
q->curr_serial++;
@@ -521,7 +587,7 @@ int t_pool_dispatch2(t_pool *p, t_results_queue *q,
// Check if queue is full
if (nonblock == 0)
- while (p->njobs == p->qsize)
+ while (p->njobs >= p->qsize)
pthread_cond_wait(&p->full_c, &p->pool_m);
p->njobs++;
@@ -540,10 +606,18 @@ int t_pool_dispatch2(t_pool *p, t_results_queue *q,
fprintf(stderr, "Dispatched (serial %d)\n", j->serial);
#endif
- if (p->njobs == 1) {
- // First job => tell all worker threads to start up
- pthread_cond_broadcast(&p->pending_c);
- }
+ // Let a worker know we have data.
+#ifdef IN_ORDER
+ // Keep incoming queue at 1 per running thread, so there is always
+ // something waiting when they end their current task. If we go above
+ // this signal to start more threads (if available). This has the effect
+ // of concentrating jobs to fewer cores when we are I/O bound, which in
+ // turn benefits systems with auto CPU frequency scaling.
+ if (p->t_stack_top >= 0 && p->njobs > p->tsize - p->nwaiting)
+ pthread_cond_signal(&p->t[p->t_stack_top].pending_c);
+#else
+ pthread_cond_signal(&p->pending_c);
+#endif
pthread_mutex_unlock(&p->pool_m);
@@ -558,12 +632,20 @@ int t_pool_dispatch2(t_pool *p, t_results_queue *q,
* -1 on failure
*/
int t_pool_flush(t_pool *p) {
+ int i;
+
#ifdef DEBUG
fprintf(stderr, "Flushing pool %p\n", p);
#endif
// Drains the queue
pthread_mutex_lock(&p->pool_m);
+
+ // Wake up everything for the final sprint!
+ for (i = 0; i < p->tsize; i++)
+ if (p->t_stack[i])
+ pthread_cond_signal(&p->t[i].pending_c);
+
while (p->njobs || p->nwaiting != p->tsize)
pthread_cond_wait(&p->empty_c, &p->pool_m);
@@ -601,31 +683,47 @@ void t_pool_destroy(t_pool *p, int kill) {
fprintf(stderr, "Sending shutdown request\n");
#endif
+#ifdef IN_ORDER
+ for (i = 0; i < p->tsize; i++)
+ pthread_cond_signal(&p->t[i].pending_c);
+#else
pthread_cond_broadcast(&p->pending_c);
+#endif
pthread_mutex_unlock(&p->pool_m);
#ifdef DEBUG
fprintf(stderr, "Shutdown complete\n");
#endif
for (i = 0; i < p->tsize; i++)
- pthread_join(p->t[i], NULL);
+ pthread_join(p->t[i].tid, NULL);
} else {
for (i = 0; i < p->tsize; i++)
- pthread_kill(p->t[i], SIGINT);
+ pthread_kill(p->t[i].tid, SIGINT);
}
pthread_mutex_destroy(&p->pool_m);
pthread_cond_destroy(&p->empty_c);
- pthread_cond_destroy(&p->pending_c);
pthread_cond_destroy(&p->full_c);
+#ifdef IN_ORDER
+ for (i = 0; i < p->tsize; i++)
+ pthread_cond_destroy(&p->t[i].pending_c);
+#else
+ pthread_cond_destroy(&p->pending_c);
+#endif
#ifdef DEBUG_TIME
fprintf(stderr, "Total time=%f\n", p->total_time / 1000000.0);
fprintf(stderr, "Wait time=%f\n", p->wait_time / 1000000.0);
fprintf(stderr, "%d%% utilisation\n",
(int)(100 - ((100.0 * p->wait_time) / p->total_time + 0.5)));
+ for (i = 0; i < p->tsize; i++)
+ fprintf(stderr, "%d: Wait time=%f\n", i,
+ p->t[i].wait_time / 1000000.0);
#endif
+ if (p->t_stack)
+ free(p->t_stack);
+
free(p->t);
free(p);
diff --git a/htslib/cram/thread_pool.h b/htslib/cram/thread_pool.h
index 18e8b42..d26c5d9 100644
--- a/htslib/cram/thread_pool.h
+++ b/htslib/cram/thread_pool.h
@@ -68,6 +68,16 @@ typedef struct t_res {
void *data; // result itself
} t_pool_result;
+struct t_pool;
+
+typedef struct {
+ struct t_pool *p;
+ int idx;
+ pthread_t tid;
+ pthread_cond_t pending_c;
+ long long wait_time;
+} t_pool_worker_t;
+
typedef struct t_pool {
int qsize; // size of queue
int njobs; // pending job count
@@ -79,7 +89,7 @@ typedef struct t_pool {
// threads
int tsize; // maximum number of jobs
- pthread_t *t;
+ t_pool_worker_t *t;
// Mutexes
pthread_mutex_t pool_m; // used when updating head/tail
@@ -88,6 +98,9 @@ typedef struct t_pool {
pthread_cond_t pending_c; // not empty
pthread_cond_t full_c;
+ // array of worker IDs free
+ int *t_stack, t_stack_top;
+
// Debugging to check wait time
long long total_time, wait_time;
} t_pool;
diff --git a/htslib/cram/vlen.c b/htslib/cram/vlen.c
index bc7e7d4..e451bbd 100644
--- a/htslib/cram/vlen.c
+++ b/htslib/cram/vlen.c
@@ -238,7 +238,7 @@ int vflen(char *fmt, va_list ap)
* Note that %10c and %.10c act differently.
* Besides, I think precision is not really allowed for %c.
*/
- len += MAX(conv_len1, 1);
+ len += MAX(conv_len1, i>=0x80 ?MB_CUR_MAX :1);
break;
case 'f':
diff --git a/htslib/faidx.c b/htslib/faidx.c
index 75ec84c..b48fce9 100644
--- a/htslib/faidx.c
+++ b/htslib/faidx.c
@@ -1,6 +1,6 @@
/* faidx.c -- FASTA random access.
- Copyright (C) 2008, 2009, 2013, 2014 Genome Research Ltd.
+ Copyright (C) 2008, 2009, 2013-2015 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -23,8 +23,6 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
-#include "config.h"
-
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
@@ -33,10 +31,8 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/bgzf.h"
#include "htslib/faidx.h"
+#include "htslib/hfile.h"
#include "htslib/khash.h"
-#ifdef _USE_KNETFILE
-#include "htslib/knetfile.h"
-#endif
typedef struct {
int32_t line_len, line_blen;
@@ -74,7 +70,8 @@ static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int
faidx_t *fai_build_core(BGZF *bgzf)
{
- char c, *name;
+ char *name;
+ int c;
int l_name, m_name;
int line_len, line_blen, state;
int l1, l2;
@@ -221,6 +218,7 @@ int fai_build(const char *fn)
if ( !fai )
{
if ( bgzf->is_compressed && bgzf->is_gzip ) fprintf(stderr,"Cannot index files compressed with gzip, please use bgzip\n");
+ free(str);
return -1;
}
if ( bgzf->is_compressed ) bgzf_index_dump(bgzf, fn, ".gzi");
@@ -238,13 +236,12 @@ int fai_build(const char *fn)
return 0;
}
-#ifdef _USE_KNETFILE
-FILE *download_and_open(const char *fn)
+static FILE *download_and_open(const char *fn)
{
const int buf_size = 1 * 1024 * 1024;
uint8_t *buf;
FILE *fp;
- knetFile *fp_remote;
+ hFILE *fp_remote;
const char *url = fn;
const char *p;
int l = strlen(fn);
@@ -258,26 +255,26 @@ FILE *download_and_open(const char *fn)
return fp;
// If failed, download from remote and open
- fp_remote = knet_open(url, "rb");
+ fp_remote = hopen(url, "rb");
if (fp_remote == 0) {
fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
return NULL;
}
if ((fp = fopen(fn, "wb")) == 0) {
fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
- knet_close(fp_remote);
+ hclose_abruptly(fp_remote);
return NULL;
}
buf = (uint8_t*)calloc(buf_size, 1);
- while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
+ while ((l = hread(fp_remote, buf, buf_size)) > 0)
fwrite(buf, 1, l, fp);
free(buf);
fclose(fp);
- knet_close(fp_remote);
+ if (hclose(fp_remote) != 0)
+ fprintf(stderr, "[download_from_remote] fail to close remote file %s\n", url);
return fopen(fn, "r");
}
-#endif
faidx_t *fai_load(const char *fn)
{
@@ -287,8 +284,7 @@ faidx_t *fai_load(const char *fn)
str = (char*)calloc(strlen(fn) + 5, 1);
sprintf(str, "%s.fai", fn);
-#ifdef _USE_KNETFILE
- if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)
+ if (hisremote(str))
{
fp = download_and_open(str);
if ( !fp )
@@ -299,8 +295,8 @@ faidx_t *fai_load(const char *fn)
}
}
else
-#endif
fp = fopen(str, "rb");
+
if (fp == 0) {
fprintf(stderr, "[fai_load] build FASTA index.\n");
fai_build(fn);
@@ -335,8 +331,8 @@ faidx_t *fai_load(const char *fn)
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
{
- char *s, c;
- int i, l, k, name_end;
+ char *s;
+ int c, i, l, k, name_end;
khiter_t iter;
faidx1_t val;
khash_t(s) *h;
@@ -409,14 +405,21 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len)
return s;
}
+int faidx_fetch_nseq(const faidx_t *fai)
+{
+ return fai->n;
+}
+
int faidx_nseq(const faidx_t *fai)
{
return fai->n;
}
+
const char *faidx_iseq(const faidx_t *fai, int i)
{
return fai->name[i];
}
+
int faidx_seq_len(const faidx_t *fai, const char *seq)
{
khint_t k = kh_get(s, fai->hash, seq);
@@ -426,8 +429,7 @@ int faidx_seq_len(const faidx_t *fai, const char *seq)
char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
{
- int l;
- char c;
+ int l, c;
khiter_t iter;
faidx1_t val;
char *seq=NULL;
diff --git a/htslib/hfile.c b/htslib/hfile.c
index 3f33bf6..d722c13 100644
--- a/htslib/hfile.c
+++ b/htslib/hfile.c
@@ -1,6 +1,6 @@
/* hfile.c -- buffered low-level input/output streams.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2015 Genome Research Ltd.
Author: John Marshall <jm18 at sanger.ac.uk>
@@ -254,6 +254,12 @@ off_t hseek(hFILE *fp, off_t offset, int whence)
int ret = flush_buffer(fp);
if (ret < 0) return ret;
}
+ else {
+ // Convert relative offsets from being relative to the hFILE's stream
+ // position (at begin) to being relative to the backend's physical
+ // stream position (at end, due to the buffering read-ahead).
+ if (whence == SEEK_CUR) offset -= fp->end - fp->begin;
+ }
pos = fp->backend->seek(fp, offset, whence);
if (pos < 0) { fp->has_errno = errno; return pos; }
@@ -520,7 +526,22 @@ hFILE *hopen(const char *fname, const char *mode)
{
if (strncmp(fname, "http://", 7) == 0 ||
strncmp(fname, "ftp://", 6) == 0) return hopen_net(fname, mode);
+#ifdef HAVE_IRODS
+ else if (strncmp(fname, "irods:", 6) == 0) return hopen_irods(fname, mode);
+#endif
else if (strncmp(fname, "data:", 5) == 0) return hopen_mem(fname + 5, mode);
else if (strcmp(fname, "-") == 0) return hopen_fd_stdinout(mode);
else return hopen_fd(fname, mode);
}
+
+int hisremote(const char *fname)
+{
+ // FIXME Make a new backend entry to return this
+ if (strncmp(fname, "http://", 7) == 0 ||
+ strncmp(fname, "https://", 8) == 0 ||
+ strncmp(fname, "ftp://", 6) == 0) return 1;
+#ifdef HAVE_IRODS
+ else if (strncmp(fname, "irods:", 6) == 0) return 1;
+#endif
+ else return 0;
+}
diff --git a/htslib/hfile_internal.h b/htslib/hfile_internal.h
index 7ac06ba..bfce2f6 100644
--- a/htslib/hfile_internal.h
+++ b/htslib/hfile_internal.h
@@ -1,6 +1,6 @@
/* hfile_internal.h -- internal parts of low-level input/output streams.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2015 Genome Research Ltd.
Author: John Marshall <jm18 at sanger.ac.uk>
@@ -56,6 +56,7 @@ struct hFILE_backend {
/* These are called from the hopen() dispatcher, and should call hfile_init()
to malloc a struct "derived" from hFILE and initialise it appropriately,
including setting base.backend to their own backend vector. */
+hFILE *hopen_irods(const char *filename, const char *mode);
hFILE *hopen_net(const char *filename, const char *mode);
/* May be called by hopen_*() functions to decode a fopen()-style mode into
diff --git a/htslib/hfile_irods.c b/htslib/hfile_irods.c
new file mode 100644
index 0000000..6bdbf21
--- /dev/null
+++ b/htslib/hfile_irods.c
@@ -0,0 +1,243 @@
+/* hfile_irods.c -- iRODS backend for low-level file streams.
+
+ Copyright (C) 2013, 2015 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "hfile_internal.h"
+
+#include <rcConnect.h>
+#include <dataObjOpen.h>
+#include <dataObjRead.h>
+#include <dataObjWrite.h>
+#include <dataObjFsync.h>
+#include <dataObjLseek.h>
+#include <dataObjClose.h>
+
+typedef struct {
+ hFILE base;
+ int descriptor;
+} hFILE_irods;
+
+static int status_errno(int status)
+{
+ switch (status) {
+ case SYS_NO_API_PRIV: return EACCES;
+ case SYS_MALLOC_ERR: return ENOMEM;
+ case SYS_OUT_OF_FILE_DESC: return ENFILE;
+ case SYS_BAD_FILE_DESCRIPTOR: return EBADF;
+ case CAT_NO_ROWS_FOUND: return ENOENT;
+ case CATALOG_ALREADY_HAS_ITEM_BY_THAT_NAME: return EEXIST;
+ default: return EIO;
+ }
+}
+
+static void set_errno(int status)
+{
+ int err = abs(status) % 1000;
+ errno = err? err : status_errno(status);
+}
+
+static struct {
+ rcComm_t *conn;
+ rodsEnv env;
+} irods = { NULL };
+
+static void irods_exit()
+{
+ (void) rcDisconnect(irods.conn);
+ irods.conn = NULL;
+}
+
+static int irods_init()
+{
+ rErrMsg_t err;
+ int ret;
+
+ ret = getRodsEnv(&irods.env);
+ if (ret < 0) goto error;
+
+ irods.conn = rcConnect(irods.env.rodsHost, irods.env.rodsPort,
+ irods.env.rodsUserName, irods.env.rodsZone,
+ NO_RECONN, &err);
+ if (irods.conn == NULL) { ret = err.status; goto error; }
+
+ if (strcmp(irods.env.rodsUserName, PUBLIC_USER_NAME) != 0) {
+ ret = clientLogin(irods.conn);
+ if (ret != 0) goto error;
+ }
+
+ // In the unlikely event atexit() fails, it's better to succeed here and
+ // carry on and do the I/O; then eventually when the program exits, we'll
+ // merely disconnect from the server uncleanly, as if we had aborted.
+ (void) atexit(irods_exit);
+
+ return 0;
+
+error:
+ if (irods.conn) { (void) rcDisconnect(irods.conn); }
+ irods.conn = NULL;
+ set_errno(ret);
+ return -1;
+}
+
+static ssize_t irods_read(hFILE *fpv, void *buffer, size_t nbytes)
+{
+ hFILE_irods *fp = (hFILE_irods *) fpv;
+ openedDataObjInp_t args;
+ bytesBuf_t buf;
+ int ret;
+
+ memset(&args, 0, sizeof args);
+ args.l1descInx = fp->descriptor;
+ args.len = nbytes;
+
+ buf.buf = buffer;
+ buf.len = nbytes;
+
+ ret = rcDataObjRead(irods.conn, &args, &buf);
+ if (ret < 0) set_errno(ret);
+ return ret;
+}
+
+static ssize_t irods_write(hFILE *fpv, const void *buffer, size_t nbytes)
+{
+ hFILE_irods *fp = (hFILE_irods *) fpv;
+ openedDataObjInp_t args;
+ bytesBuf_t buf;
+ int ret;
+
+ memset(&args, 0, sizeof args);
+ args.l1descInx = fp->descriptor;
+ args.len = nbytes;
+
+ buf.buf = (void *) buffer; // ...the iRODS API is not const-correct here
+ buf.len = nbytes;
+
+ ret = rcDataObjWrite(irods.conn, &args, &buf);
+ if (ret < 0) set_errno(ret);
+ return ret;
+}
+
+static off_t irods_seek(hFILE *fpv, off_t offset, int whence)
+{
+ hFILE_irods *fp = (hFILE_irods *) fpv;
+ openedDataObjInp_t args;
+ fileLseekOut_t *out = NULL;
+ int ret;
+
+ memset(&args, 0, sizeof args);
+ args.l1descInx = fp->descriptor;
+ args.offset = offset;
+ args.whence = whence;
+
+ ret = rcDataObjLseek(irods.conn, &args, &out);
+
+ if (out) { offset = out->offset; free(out); }
+ else offset = -1;
+ if (ret < 0) { set_errno(ret); return -1; }
+ return offset;
+}
+
+static int irods_flush(hFILE *fpv)
+{
+// FIXME rcDataObjFsync() doesn't seem to function as expected.
+// For now, flush is a no-op: see https://github.com/samtools/htslib/issues/168
+#if 0
+ hFILE_irods *fp = (hFILE_irods *) fpv;
+ openedDataObjInp_t args;
+ int ret;
+
+ memset(&args, 0, sizeof args);
+ args.l1descInx = fp->descriptor;
+
+ ret = rcDataObjFsync(irods.conn, &args);
+ if (ret < 0) set_errno(ret);
+ return ret;
+#endif
+ return 0;
+}
+
+static int irods_close(hFILE *fpv)
+{
+ hFILE_irods *fp = (hFILE_irods *) fpv;
+ openedDataObjInp_t args;
+ int ret;
+
+ memset(&args, 0, sizeof args);
+ args.l1descInx = fp->descriptor;
+
+ ret = rcDataObjClose(irods.conn, &args);
+ if (ret < 0) set_errno(ret);
+ return ret;
+}
+
+static const struct hFILE_backend irods_backend =
+{
+ irods_read, irods_write, irods_seek, irods_flush, irods_close
+};
+
+hFILE *hopen_irods(const char *filename, const char *mode)
+{
+ hFILE_irods *fp;
+ rodsPath_t path;
+ dataObjInp_t args;
+ int ret;
+
+ // Initialise the iRODS connection if this is the first use.
+ if (irods.conn == NULL) { if (irods_init() < 0) return NULL; }
+
+ if (strncmp(filename, "irods:", 6) == 0) filename += 6;
+ else { errno = EINVAL; return NULL; }
+
+ fp = (hFILE_irods *) hfile_init(sizeof (hFILE_irods), mode, 0);
+ if (fp == NULL) return NULL;
+
+ strncpy(path.inPath, filename, MAX_NAME_LEN-1);
+ path.inPath[MAX_NAME_LEN-1] = '\0';
+
+ ret = parseRodsPath(&path, &irods.env);
+ if (ret < 0) goto error;
+
+ memset(&args, 0, sizeof args);
+ strcpy(args.objPath, path.outPath);
+ args.openFlags = hfile_oflags(mode);
+ if (args.openFlags & O_CREAT) {
+ args.createMode = 0666;
+ addKeyVal(&args.condInput, DEST_RESC_NAME_KW,irods.env.rodsDefResource);
+ }
+
+ ret = rcDataObjOpen(irods.conn, &args);
+ if (ret < 0) goto error;
+ fp->descriptor = ret;
+
+ fp->base.backend = &irods_backend;
+ return &fp->base;
+
+error:
+ hfile_destroy((hFILE *) fp);
+ set_errno(ret);
+ return NULL;
+}
diff --git a/htslib/hts.c b/htslib/hts.c
index 5fab4ba..5f4d677 100644
--- a/htslib/hts.c
+++ b/htslib/hts.c
@@ -1,6 +1,6 @@
/* hts.c -- format-neutral I/O, indexing, and iterator API functions.
- Copyright (C) 2008, 2009, 2012-2014 Genome Research Ltd.
+ Copyright (C) 2008, 2009, 2012-2015 Genome Research Ltd.
Copyright (C) 2012, 2013 Broad Institute.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -28,6 +28,7 @@ DEALINGS IN THE SOFTWARE. */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
+#include <limits.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/stat.h>
@@ -40,7 +41,7 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/kseq.h"
#define KS_BGZF 1
#if KS_BGZF
- // bgzf now supports gzip-compressed files
+ // bgzf now supports gzip-compressed files, the gzFile branch can be removed
KSTREAM_INIT2(, BGZF*, bgzf_read, 65536)
#else
KSTREAM_INIT2(, gzFile, gzread, 16384)
@@ -78,10 +79,44 @@ const unsigned char seq_nt16_table[256] = {
const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
+const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+
/**********************
*** Basic file I/O ***
**********************/
+static enum htsFormatCategory format_category(enum htsExactFormat fmt)
+{
+ switch (fmt) {
+ case bam:
+ case sam:
+ case cram:
+ return sequence_data;
+
+ case vcf:
+ case bcf:
+ return variant_data;
+
+ case bai:
+ case crai:
+ case csi:
+ case gzi:
+ case tbi:
+ return index_file;
+
+ case bed:
+ return region_list;
+
+ case unknown_format:
+ case binary_format:
+ case text_format:
+ case format_maximum:
+ break;
+ }
+
+ return unknown_category;
+}
+
// Decompress up to ten or so bytes by peeking at the file, which must be
// positioned at the start of a GZIP block.
static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
@@ -112,91 +147,322 @@ static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
return destsize;
}
-// Returns whether the block contains any control characters, i.e.,
-// characters less than SPACE other than whitespace etc (ASCII BEL..CR).
-static int is_binary(unsigned char *s, size_t n)
+// Parse "x.y" text, taking care because the string is not NUL-terminated
+// and filling in major/minor only when the digits are followed by a delimiter,
+// so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
+static void
+parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
{
- size_t i;
- for (i = 0; i < n; i++)
- if (s[i] < 0x07 || (s[i] >= 0x0e && s[i] < 0x20)) return 1;
+ const char *str = (const char *) u;
+ const char *slim = (const char *) ulim;
+ const char *s;
+
+ fmt->version.major = fmt->version.minor = -1;
+
+ for (s = str; s < slim; s++) if (!isdigit(*s)) break;
+ if (s < slim) {
+ fmt->version.major = atoi(str);
+ if (*s == '.') {
+ str = &s[1];
+ for (s = str; s < slim; s++) if (!isdigit(*s)) break;
+ if (s < slim)
+ fmt->version.minor = atoi(str);
+ }
+ else
+ fmt->version.minor = 0;
+ }
+}
+
+int hts_detect_format(hFILE *hfile, htsFormat *fmt)
+{
+ unsigned char s[21];
+ ssize_t len = hpeek(hfile, s, 18);
+ if (len < 0) return -1;
+
+ if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
+ // The stream is either gzip-compressed or BGZF-compressed.
+ // Determine which, and decompress the first few bytes.
+ fmt->compression = (len >= 18 && (s[3] & 4) &&
+ memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip;
+ len = decompress_peek(hfile, s, sizeof s);
+ }
+ else {
+ fmt->compression = no_compression;
+ len = hpeek(hfile, s, sizeof s);
+ }
+ if (len < 0) return -1;
+
+ fmt->compression_level = -1;
+ fmt->specific = NULL;
+
+ if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=3 && s[5]<=1) {
+ fmt->category = sequence_data;
+ fmt->format = cram;
+ fmt->version.major = s[4], fmt->version.minor = s[5];
+ fmt->compression = custom;
+ return 0;
+ }
+ else if (len >= 4 && s[3] <= '\4') {
+ if (memcmp(s, "BAM\1", 4) == 0) {
+ fmt->category = sequence_data;
+ fmt->format = bam;
+ // TODO Decompress enough to pick version from @HD-VN header
+ fmt->version.major = 1, fmt->version.minor = -1;
+ return 0;
+ }
+ else if (memcmp(s, "BAI\1", 4) == 0) {
+ fmt->category = index_file;
+ fmt->format = bai;
+ fmt->version.major = -1, fmt->version.minor = -1;
+ return 0;
+ }
+ else if (memcmp(s, "BCF\4", 4) == 0) {
+ fmt->category = variant_data;
+ fmt->format = bcf;
+ fmt->version.major = 1, fmt->version.minor = -1;
+ return 0;
+ }
+ else if (memcmp(s, "BCF\2", 4) == 0) {
+ fmt->category = variant_data;
+ fmt->format = bcf;
+ fmt->version.major = s[3];
+ fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
+ return 0;
+ }
+ else if (memcmp(s, "CSI\1", 4) == 0) {
+ fmt->category = index_file;
+ fmt->format = csi;
+ fmt->version.major = 1, fmt->version.minor = -1;
+ return 0;
+ }
+ else if (memcmp(s, "TBI\1", 4) == 0) {
+ fmt->category = index_file;
+ fmt->format = tbi;
+ fmt->version.major = -1, fmt->version.minor = -1;
+ return 0;
+ }
+ }
+ else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
+ fmt->category = variant_data;
+ fmt->format = vcf;
+ if (len >= 21 && s[16] == 'v')
+ parse_version(fmt, &s[17], &s[len]);
+ else
+ fmt->version.major = fmt->version.minor = -1;
+ return 0;
+ }
+ else if (len >= 4 && s[0] == '@' &&
+ (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
+ memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0)) {
+ fmt->category = sequence_data;
+ fmt->format = sam;
+ // @HD-VN is not guaranteed to be the first tag, but then @HD is
+ // not guaranteed to be present at all...
+ if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
+ parse_version(fmt, &s[7], &s[len]);
+ else
+ fmt->version.major = 1, fmt->version.minor = -1;
+ return 0;
+ }
+ else {
+ // Various possibilities for tab-delimited text:
+ // .crai (gzipped tab-delimited six columns: seqid 5*number)
+ // .bed ([3..12] tab-delimited columns)
+ // .bedpe (>= 10 tab-delimited columns)
+ // .sam (tab-delimited >= 11 columns: seqid number seqid...)
+ // FIXME For now, assume it's SAM
+ fmt->category = sequence_data;
+ fmt->format = sam;
+ fmt->version.major = 1, fmt->version.minor = -1;
+ return 0;
+ }
+
+ fmt->category = unknown_category;
+ fmt->format = unknown_format;
+ fmt->version.major = fmt->version.minor = -1;
+ fmt->compression = no_compression;
return 0;
}
+char *hts_format_description(const htsFormat *format)
+{
+ kstring_t str = { 0, 0, NULL };
+
+ switch (format->format) {
+ case sam: kputs("SAM", &str); break;
+ case bam: kputs("BAM", &str); break;
+ case cram: kputs("CRAM", &str); break;
+ case vcf: kputs("VCF", &str); break;
+ case bcf:
+ if (format->version.major == 1) kputs("Legacy BCF", &str);
+ else kputs("BCF", &str);
+ break;
+ case bai: kputs("BAI", &str); break;
+ case crai: kputs("CRAI", &str); break;
+ case csi: kputs("CSI", &str); break;
+ case tbi: kputs("Tabix", &str); break;
+ default: kputs("unknown", &str); break;
+ }
+
+ if (format->version.major >= 0) {
+ kputs(" version ", &str);
+ kputw(format->version.major, &str);
+ if (format->version.minor >= 0) {
+ kputc('.', &str);
+ kputw(format->version.minor, &str);
+ }
+ }
+
+ switch (format->compression) {
+ case custom: kputs(" compressed", &str); break;
+ case gzip: kputs(" gzip-compressed", &str); break;
+ case bgzf:
+ switch (format->format) {
+ case bam:
+ case bcf:
+ case csi:
+ case tbi:
+ // These are by definition BGZF, so just use the generic term
+ kputs(" compressed", &str);
+ break;
+ default:
+ kputs(" BGZF-compressed", &str);
+ break;
+ }
+ break;
+ default: break;
+ }
+
+ switch (format->category) {
+ case sequence_data: kputs(" sequence", &str); break;
+ case variant_data: kputs(" variant calling", &str); break;
+ case index_file: kputs(" index", &str); break;
+ case region_list: kputs(" genomic region", &str); break;
+ default: break;
+ }
+
+ if (format->compression == no_compression)
+ switch (format->format) {
+ case sam:
+ case crai:
+ case vcf:
+ case bed:
+ kputs(" text", &str);
+ break;
+
+ default:
+ kputs(" data", &str);
+ break;
+ }
+ else
+ kputs(" data", &str);
+
+ return ks_release(&str);
+}
+
htsFile *hts_open(const char *fn, const char *mode)
{
htsFile *fp = NULL;
hFILE *hfile = hopen(fn, mode);
if (hfile == NULL) goto error;
- fp = (htsFile*)calloc(1, sizeof(htsFile));
+ fp = hts_hopen(hfile, fn, mode);
+ if (fp == NULL) goto error;
+
+ return fp;
+
+error:
+ if (hts_verbose >= 2)
+ fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
+
+ if (hfile)
+ hclose_abruptly(hfile);
+
+ return NULL;
+}
+
+htsFile *hts_hopen(struct hFILE *hfile, const char *fn, const char *mode)
+{
+ htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
if (fp == NULL) goto error;
fp->fn = strdup(fn);
fp->is_be = ed_is_big();
if (strchr(mode, 'r')) {
- unsigned char s[18];
- if (hpeek(hfile, s, 6) == 6 && memcmp(s, "CRAM", 4) == 0 &&
- s[4] >= 1 && s[4] <= 2 && s[5] <= 1) {
- fp->is_cram = 1;
- }
- else if (hpeek(hfile, s, 18) == 18 && s[0] == 0x1f && s[1] == 0x8b &&
- (s[3] & 4) && memcmp(&s[12], "BC\2\0", 4) == 0) {
- // The stream is BGZF-compressed. Decompress a few bytes to see
- // whether it's in a binary format (e.g., BAM or BCF, starting
- // with four bytes of magic including a control character) or is
- // a bgzipped SAM or VCF text file.
- fp->is_compressed = 1;
- if (is_binary(s, decompress_peek(hfile, s, 4))) fp->is_bin = 1;
- else fp->is_kstream = 1;
- }
- else if (hpeek(hfile, s, 2) == 2 && s[0] == 0x1f && s[1] == 0x8b) {
- // Plain GZIP header... so a gzipped text file.
- fp->is_compressed = 1;
- fp->is_kstream = 1;
- }
- else if (hpeek(hfile, s, 4) == 4 && is_binary(s, 4)) {
- // Binary format, but in a raw non-compressed form.
- fp->is_bin = 1;
- }
- else {
- fp->is_kstream = 1;
- }
+ if (hts_detect_format(hfile, &fp->format) < 0) goto error;
}
else if (strchr(mode, 'w') || strchr(mode, 'a')) {
+ htsFormat *fmt = &fp->format;
fp->is_write = 1;
- if (strchr(mode, 'b')) fp->is_bin = 1;
- if (strchr(mode, 'c')) fp->is_cram = 1;
- if (strchr(mode, 'z')) fp->is_compressed = 1;
- else if (strchr(mode, 'u')) fp->is_compressed = 0;
- else fp->is_compressed = 2; // not set, default behaviour
+
+ if (strchr(mode, 'b')) fmt->format = binary_format;
+ else if (strchr(mode, 'c')) fmt->format = cram;
+ else fmt->format = text_format;
+
+ if (strchr(mode, 'z')) fmt->compression = bgzf;
+ else if (strchr(mode, 'g')) fmt->compression = gzip;
+ else if (strchr(mode, 'u')) fmt->compression = no_compression;
+ else {
+ // No compression mode specified, set to the default for the format
+ switch (fmt->format) {
+ case binary_format: fmt->compression = bgzf; break;
+ case cram: fmt->compression = custom; break;
+ case text_format: fmt->compression = no_compression; break;
+ default: abort();
+ }
+ }
+
+ // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
+ fmt->category = format_category(fmt->format);
+
+ fmt->version.major = fmt->version.minor = -1;
+ fmt->compression_level = -1;
+ fmt->specific = NULL;
}
else goto error;
- if (fp->is_bin || (fp->is_write && fp->is_compressed==1)) {
+ switch (fp->format.format) {
+ case binary_format:
+ case bam:
+ case bcf:
fp->fp.bgzf = bgzf_hopen(hfile, mode);
if (fp->fp.bgzf == NULL) goto error;
- }
- else if (fp->is_cram) {
+ fp->is_bin = 1;
+ break;
+
+ case cram:
fp->fp.cram = cram_dopen(hfile, fn, mode);
if (fp->fp.cram == NULL) goto error;
if (!fp->is_write)
cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1);
+ fp->is_cram = 1;
+ break;
- }
- else if (fp->is_kstream) {
- #if KS_BGZF
- BGZF *gzfp = bgzf_hopen(hfile, mode);
- #else
- // TODO Implement gzip hFILE adaptor
- hclose(hfile); // This won't work, especially for stdin
- gzFile gzfp = strcmp(fn, "-")? gzopen(fn, "rb") : gzdopen(fileno(stdin), "rb");
- #endif
- if (gzfp) fp->fp.voidp = ks_init(gzfp);
- else goto error;
- }
- else {
- fp->fp.hfile = hfile;
+ case text_format:
+ case sam:
+ case vcf:
+ if (!fp->is_write) {
+ #if KS_BGZF
+ BGZF *gzfp = bgzf_hopen(hfile, mode);
+ #else
+ // TODO Implement gzip hFILE adaptor
+ hclose(hfile); // This won't work, especially for stdin
+ gzFile gzfp = strcmp(fn, "-")? gzopen(fn, "rb") : gzdopen(fileno(stdin), "rb");
+ #endif
+ if (gzfp) fp->fp.voidp = ks_init(gzfp);
+ else goto error;
+ }
+ else if (fp->format.compression != no_compression) {
+ fp->fp.bgzf = bgzf_hopen(hfile, mode);
+ if (fp->fp.bgzf == NULL) goto error;
+ }
+ else
+ fp->fp.hfile = hfile;
+ break;
+
+ default:
+ goto error;
}
return fp;
@@ -205,9 +471,6 @@ error:
if (hts_verbose >= 2)
fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
- if (hfile)
- hclose_abruptly(hfile);
-
if (fp) {
free(fp->fn);
free(fp->fn_aux);
@@ -220,9 +483,14 @@ int hts_close(htsFile *fp)
{
int ret, save;
- if (fp->is_bin || (fp->is_write && fp->is_compressed==1)) {
+ switch (fp->format.format) {
+ case binary_format:
+ case bam:
+ case bcf:
ret = bgzf_close(fp->fp.bgzf);
- } else if (fp->is_cram) {
+ break;
+
+ case cram:
if (!fp->is_write) {
switch (cram_eof(fp->fp.cram)) {
case 0:
@@ -236,17 +504,30 @@ int hts_close(htsFile *fp)
}
}
ret = cram_close(fp->fp.cram);
- } else if (fp->is_kstream) {
- #if KS_BGZF
- BGZF *gzfp = ((kstream_t*)fp->fp.voidp)->f;
- ret = bgzf_close(gzfp);
- #else
- gzFile gzfp = ((kstream_t*)fp->fp.voidp)->f;
- ret = gzclose(gzfp);
- #endif
- ks_destroy((kstream_t*)fp->fp.voidp);
- } else {
- ret = hclose(fp->fp.hfile);
+ break;
+
+ case text_format:
+ case sam:
+ case vcf:
+ if (!fp->is_write) {
+ #if KS_BGZF
+ BGZF *gzfp = ((kstream_t*)fp->fp.voidp)->f;
+ ret = bgzf_close(gzfp);
+ #else
+ gzFile gzfp = ((kstream_t*)fp->fp.voidp)->f;
+ ret = gzclose(gzfp);
+ #endif
+ ks_destroy((kstream_t*)fp->fp.voidp);
+ }
+ else if (fp->format.compression != no_compression)
+ ret = bgzf_close(fp->fp.bgzf);
+ else
+ ret = hclose(fp->fp.hfile);
+ break;
+
+ default:
+ ret = -1;
+ break;
}
save = errno;
@@ -258,11 +539,31 @@ int hts_close(htsFile *fp)
return ret;
}
+const htsFormat *hts_get_format(htsFile *fp)
+{
+ return fp? &fp->format : NULL;
+}
+
+int hts_set_opt(htsFile *fp, enum cram_option opt, ...) {
+ int r;
+ va_list args;
+
+ if (fp->format.format != cram)
+ return 0;
+
+ va_start(args, opt);
+ r = cram_set_voption(fp->fp.cram, opt, args);
+ va_end(args);
+
+ return r;
+}
+
int hts_set_threads(htsFile *fp, int n)
{
- // TODO Plug in CRAM and other threading
- if (fp->is_bin) {
+ if (fp->format.compression == bgzf) {
return bgzf_mt(fp->fp.bgzf, n, 256);
+ } else if (fp->format.format == cram) {
+ return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
}
else return 0;
}
@@ -276,6 +577,9 @@ int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
}
else fp->fn_aux = NULL;
+ if (fp->format.format == cram)
+ cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux);
+
return 0;
}
@@ -418,6 +722,7 @@ char **hts_readlines(const char *fn, int *_n)
return s;
}
+// DEPRECATED: To be removed in a future HTSlib release
int hts_file_type(const char *fname)
{
int len = strlen(fname);
@@ -425,27 +730,19 @@ int hts_file_type(const char *fname)
if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
if ( !strcmp("-",fname) ) return FT_STDIN;
- // ... etc
- int fd = open(fname, O_RDONLY);
- if ( !fd ) return 0;
+ hFILE *f = hopen(fname, "r");
+ if (f == NULL) return 0;
- uint8_t magic[5];
- if ( read(fd,magic,2)!=2 ) { close(fd); return 0; }
- if ( !strncmp((char*)magic,"##",2) ) { close(fd); return FT_VCF; }
- if ( !strncmp((char*)magic,"BCF",3) ) { close(fd); return FT_BCF; }
- close(fd);
+ htsFormat fmt;
+ if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; }
+ if (hclose(f) < 0) return 0;
- if ( magic[0]==0x1f && magic[1]==0x8b ) // compressed
- {
- BGZF *fp = bgzf_open(fname, "r");
- if ( !fp ) return 0;
- if ( bgzf_read(fp, magic, 3)!=3 ) { bgzf_close(fp); return 0; }
- bgzf_close(fp);
- if ( !strncmp((char*)magic,"##",2) ) return FT_VCF_GZ;
- if ( !strncmp((char*)magic,"BCF",3) ) return FT_BCF_GZ;
+ switch (fmt.format) {
+ case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
+ case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
+ default: return 0;
}
- return 0;
}
/****************
@@ -504,11 +801,11 @@ static inline void insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
l = &kh_value(b, k);
if (absent) {
l->m = 1; l->n = 0;
- l->list = (hts_pair64_t*)calloc(l->m, 16);
+ l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
}
if (l->n == l->m) {
l->m <<= 1;
- l->list = (hts_pair64_t*)realloc(l->list, l->m * 16);
+ l->list = (hts_pair64_t*)realloc(l->list, l->m * sizeof(hts_pair64_t));
}
l->list[l->n].u = beg;
l->list[l->n++].v = end;
@@ -523,7 +820,7 @@ static inline void insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t o
int old_m = l->m;
l->m = end + 1;
kroundup32(l->m);
- l->offset = (uint64_t*)realloc(l->offset, l->m * 8);
+ l->offset = (uint64_t*)realloc(l->offset, l->m * sizeof(uint64_t));
memset(l->offset + old_m, 0xff, 8 * (l->m - old_m)); // fill l->offset with (uint64_t)-1
}
if (beg == end) { // to save a loop in this case
@@ -616,9 +913,9 @@ static void compress_binning(hts_idx_t *idx, int i)
if (q->n + p->n > q->m) {
q->m = q->n + p->n;
kroundup32(q->m);
- q->list = (hts_pair64_t*)realloc(q->list, q->m * 16);
+ q->list = (hts_pair64_t*)realloc(q->list, q->m * sizeof(hts_pair64_t));
}
- memcpy(q->list + q->n, p->list, p->n * 16);
+ memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
q->n += p->n;
free(p->list);
kh_del(bin, bidx, k);
@@ -660,6 +957,7 @@ void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
{
int bin;
+ if (tid<0) beg = -1, end = 0;
if (tid >= idx->m) { // enlarge the index
int32_t oldm = idx->m;
idx->m = idx->m? idx->m<<1 : 2;
@@ -887,7 +1185,7 @@ static int hts_idx_load_core(hts_idx_t *idx, void *fp, int fmt)
if (idx_read(is_bgzf, fp, &p->n, 4) != 4) return -1;
if (is_be) ed_swap_4p(&p->n);
p->m = p->n;
- p->list = (hts_pair64_t*)malloc(p->m * 16);
+ p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
if (p->list == NULL) return -2;
if (idx_read(is_bgzf, fp, p->list, p->n<<4) != p->n<<4) return -1;
if (is_be) swap_bins(p);
@@ -897,7 +1195,7 @@ static int hts_idx_load_core(hts_idx_t *idx, void *fp, int fmt)
if (idx_read(is_bgzf, fp, &l->n, 4) != 4) return -1;
if (is_be) ed_swap_4p(&l->n);
l->m = l->n;
- l->offset = (uint64_t*)malloc(l->n << 3);
+ l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
if (l->offset == NULL) return -2;
if (idx_read(is_bgzf, fp, l->offset, l->n << 3) != l->n << 3) return -1;
if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
@@ -1130,7 +1428,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_re
if (beg < 0) beg = 0;
if (end < beg) return 0;
- if ((bidx = idx->bidx[tid]) == 0) return 0;
+ if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0;
iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
@@ -1154,7 +1452,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_re
if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
n_off += kh_value(bidx, k).n;
if (n_off == 0) return iter;
- off = (hts_pair64_t*)calloc(n_off, 16);
+ off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t));
for (i = n_off = 0; i < iter->bins.n; ++i) {
if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
int j;
@@ -1213,10 +1511,10 @@ const char *hts_parse_reg(const char *s, int *beg, int *end)
if (s[i] != ',') tmp[k++] = s[i];
tmp[k] = 0;
if ((*beg = strtol(tmp, &tmp, 10) - 1) < 0) *beg = 0;
- *end = *tmp? strtol(tmp + 1, &tmp, 10) : 1<<29;
+ *end = *tmp? strtol(tmp + 1, &tmp, 10) : INT_MAX;
if (*beg > *end) name_end = l;
}
- if (name_end == l) *beg = 0, *end = 1<<29;
+ if (name_end == l) *beg = 0, *end = INT_MAX;
return s + name_end;
}
@@ -1225,7 +1523,7 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g
int tid, beg, end;
char *q, *tmp;
if (strcmp(reg, ".") == 0)
- return itr_query(idx, HTS_IDX_START, 0, 1<<29, readrec);
+ return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
else if (strcmp(reg, "*") != 0) {
q = (char*)hts_parse_reg(reg, &beg, &end);
tmp = (char*)alloca(q - reg + 1);
@@ -1249,6 +1547,9 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
}
ret = iter->readrec(fp, data, r, &tid, &beg, &end);
if (ret < 0) iter->finished = 1;
+ iter->curr_tid = tid;
+ iter->curr_beg = beg;
+ iter->curr_end = end;
return ret;
}
if (iter->off == 0) return -1;
@@ -1265,7 +1566,12 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
iter->curr_off = bgzf_tell(fp);
if (tid != iter->tid || beg >= iter->end) { // no need to proceed
ret = -1; break;
- } else if (end > iter->beg && iter->end > beg) return ret;
+ } else if (end > iter->beg && iter->end > beg) {
+ iter->curr_tid = tid;
+ iter->curr_beg = beg;
+ iter->curr_end = end;
+ return ret;
+ }
} else break; // end of file or error
}
iter->finished = 1;
@@ -1279,8 +1585,7 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
static char *test_and_fetch(const char *fn)
{
FILE *fp;
- // FIXME Use is_remote_scheme() helper that's true for ftp/http/irods/etc
- if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) {
+ if (hisremote(fn)) {
const int buf_size = 1 * 1024 * 1024;
hFILE *fp_remote;
uint8_t *buf;
@@ -1289,10 +1594,14 @@ static char *test_and_fetch(const char *fn)
for (p = fn + strlen(fn) - 1; p >= fn; --p)
if (*p == '/') break;
++p; // p now points to the local file name
- if ((fp_remote = hopen(fn, "r")) == 0) {
- if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to open remote file '%s'\n", __func__, fn);
- return 0;
+ // Attempt to open local file first
+ if ((fp = fopen((char*)p, "rb")) != 0)
+ {
+ fclose(fp);
+ return (char*)p;
}
+ // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index.
+ if ((fp_remote = hopen(fn, "r")) == 0) return 0;
if ((fp = fopen(p, "w")) == 0) {
if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to create file '%s' in the working directory\n", __func__, p);
hclose_abruptly(fp_remote);
diff --git a/htslib/htsfile.1 b/htslib/htsfile.1
new file mode 100644
index 0000000..b55cafa
--- /dev/null
+++ b/htslib/htsfile.1
@@ -0,0 +1,71 @@
+.TH htsfile 1 "3 February 2015" "htslib-1.2.1" "Bioinformatics tools"
+.SH NAME
+htsfile \- identify high-throughput sequencing data files
+.\"
+.\" Copyright (C) 2015 Genome Research Ltd.
+.\"
+.\" Author: John Marshall <jm18 at sanger.ac.uk>
+.\"
+.\" Permission is hereby granted, free of charge, to any person obtaining a
+.\" copy of this software and associated documentation files (the "Software"),
+.\" to deal in the Software without restriction, including without limitation
+.\" the rights to use, copy, modify, merge, publish, distribute, sublicense,
+.\" and/or sell copies of the Software, and to permit persons to whom the
+.\" Software is furnished to do so, subject to the following conditions:
+.\"
+.\" The above copyright notice and this permission notice shall be included in
+.\" all copies or substantial portions of the Software.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+.\" IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+.\" FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+.\" THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+.\" LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+.\" FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+.\" DEALINGS IN THE SOFTWARE.
+.\"
+.SH SYNOPSIS
+.B htsfile
+.RB [ -chH ]
+.IR FILE ...
+.SH DESCRIPTION
+The \fBhtsfile\fR utility attempts to identify what kind of high-throughput
+sequencing data files the specified files are, and provides minimal viewing
+capabilities for some kinds of data file.
+.P
+It can identify sequencing data files such as SAM, BAM, and CRAM;
+variant calling data files such as VCF and BCF;
+index files used to index these data files;
+and compressed versions of many of them.
+.P
+For each \fIFILE\fR given, \fBhtsfile\fP prints a description of the file
+format determined, using similar keyword conventions to \fBfile\fP(1):
+"text" indicates a textual file that can probably be viewed on a terminal;
+"data" indicates binary data;
+"sequence", "variant calling", and "index" indicate different categories of
+data file.
+When it can be identified, the name of the particular file format (such as
+"BAM" or "VCF") is printed at the start of the description.
+.P
+When used to view file contents as text, \fBhtsfile\fP can optionally show
+only headers or only data records, but has no other filtering capabilities.
+Use \fBsamtools\fR or \fBbcftools\fR if you need more extensive viewing or
+filtering capabilities.
+.P
+The following options are accepted:
+.TP 4n
+.BR -c ", " --view
+Instead of identifying the specified files, display a textual representation
+of their contents on standard output.
+.TP
+.BR -h ", " --header-only
+Display data file headers only.
+Implies \fB--view\fR.
+.TP
+.BR -H ", " --no-header
+When viewing files, display data records only.
+.PP
+.SH SEE ALSO
+.IR bcftools (1),
+.IR file (1),
+.IR samtools (1)
diff --git a/htslib/htsfile.c b/htslib/htsfile.c
new file mode 100644
index 0000000..fac943b
--- /dev/null
+++ b/htslib/htsfile.c
@@ -0,0 +1,168 @@
+/* htsfile.c -- file identifier and minimal viewer.
+
+ Copyright (C) 2014-2015 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <unistd.h>
+
+#include "htslib/hfile.h"
+#include "htslib/hts.h"
+#include "htslib/sam.h"
+#include "htslib/vcf.h"
+
+enum { identify, view_headers, view_all } mode = identify;
+int show_headers = 1;
+
+static htsFile *dup_stdout(const char *mode)
+{
+ int fd = dup(STDOUT_FILENO);
+ hFILE *hfp = (fd >= 0)? hdopen(fd, mode) : NULL;
+ return hfp? hts_hopen(hfp, "-", mode) : NULL;
+}
+
+static int view_sam(hFILE *hfp, const char *filename)
+{
+ samFile *in = hts_hopen(hfp, filename, "r");
+ if (in == NULL) return 0;
+ samFile *out = dup_stdout("w");
+ bam_hdr_t *hdr = sam_hdr_read(in);
+
+ if (show_headers) sam_hdr_write(out, hdr);
+ if (mode == view_all) {
+ bam1_t *b = bam_init1();
+ while (sam_read1(in, hdr, b) >= 0)
+ sam_write1(out, hdr, b);
+ bam_destroy1(b);
+ }
+
+ bam_hdr_destroy(hdr);
+ hts_close(out);
+ hts_close(in);
+ return 1;
+}
+
+static int view_vcf(hFILE *hfp, const char *filename)
+{
+ vcfFile *in = hts_hopen(hfp, filename, "r");
+ if (in == NULL) return 0;
+ vcfFile *out = dup_stdout("w");
+ bcf_hdr_t *hdr = bcf_hdr_read(in);
+
+ if (show_headers) bcf_hdr_write(out, hdr);
+ if (mode == view_all) {
+ bcf1_t *rec = bcf_init();
+ while (bcf_read(in, hdr, rec) >= 0)
+ bcf_write(out, hdr, rec);
+ bcf_destroy(rec);
+ }
+
+ bcf_hdr_destroy(hdr);
+ hts_close(out);
+ hts_close(in);
+ return 1;
+}
+
+static void usage(FILE *fp, int status)
+{
+ fprintf(fp,
+"Usage: htsfile [-chH] FILE...\n"
+"Options:\n"
+" -c, --view Write textual form of FILEs to standard output\n"
+" -h, --header-only Display only headers in view mode, not records\n"
+" -H, --no-header Suppress header display in view mode\n");
+ exit(status);
+}
+
+int main(int argc, char **argv)
+{
+ static const struct option options[] = {
+ { "header-only", no_argument, NULL, 'h' },
+ { "no-header", no_argument, NULL, 'H' },
+ { "view", no_argument, NULL, 'c' },
+ { "help", no_argument, NULL, '?' },
+ { "version", no_argument, NULL, 1 },
+ { NULL, 0, NULL, 0 }
+ };
+
+ int status = EXIT_SUCCESS;
+ int c, i;
+ while ((c = getopt_long(argc, argv, "chH?", options, NULL)) >= 0)
+ switch (c) {
+ case 'c': mode = view_all; break;
+ case 'h': mode = view_headers; show_headers = 1; break;
+ case 'H': show_headers = 0; break;
+ case 1:
+ printf(
+"htsfile (htslib) %s\n"
+"Copyright (C) 2015 Genome Research Ltd.\n",
+ hts_version());
+ exit(EXIT_SUCCESS);
+ break;
+ case '?': usage(stdout, EXIT_SUCCESS); break;
+ default: usage(stderr, EXIT_FAILURE); break;
+ }
+
+ if (optind == argc) usage(stderr, EXIT_FAILURE);
+
+ for (i = optind; i < argc; i++) {
+ htsFormat fmt;
+ hFILE *fp = hopen(argv[i], "r");
+ if (fp == NULL) {
+ fprintf(stderr, "htsfile: can't open \"%s\": %s\n", argv[i], strerror(errno));
+ status = EXIT_FAILURE;
+ continue;
+ }
+
+ if (hts_detect_format(fp, &fmt) < 0) {
+ fprintf(stderr, "htsfile: detecting \"%s\" format failed: %s\n", argv[i], strerror(errno));
+ hclose_abruptly(fp);
+ status = EXIT_FAILURE;
+ continue;
+ }
+
+ if (mode == identify) {
+ char *description = hts_format_description(&fmt);
+ printf("%s:\t%s\n", argv[i], description);
+ free(description);
+ }
+ else
+ switch (fmt.category) {
+ case sequence_data: if (view_sam(fp, argv[i])) fp = NULL; break;
+ case variant_data: if (view_vcf(fp, argv[i])) fp = NULL; break;
+ default:
+ fprintf(stderr, "htsfile: can't view %s: unknown format\n", argv[i]);
+ status = EXIT_FAILURE;
+ break;
+ }
+
+ if (fp && hclose(fp) < 0) {
+ fprintf(stderr, "htsfile: closing %s failed\n", argv[i]);
+ status = EXIT_FAILURE;
+ }
+ }
+
+ return status;
+}
diff --git a/htslib/htslib.mk b/htslib/htslib.mk
index 6c203ca..14baea2 100644
--- a/htslib/htslib.mk
+++ b/htslib/htslib.mk
@@ -1,6 +1,6 @@
# Makefile rules useful for third-party code using htslib's public API.
#
-# Copyright (C) 2013-2014 Genome Research Ltd.
+# Copyright (C) 2013-2015 Genome Research Ltd.
#
# Author: John Marshall <jm18 at sanger.ac.uk>
#
@@ -60,6 +60,7 @@ HTSLIB_PUBLIC_HEADERS = \
$(HTSDIR)/htslib/kseq.h \
$(HTSDIR)/htslib/ksort.h \
$(HTSDIR)/htslib/kstring.h \
+ $(HTSDIR)/htslib/regidx.h \
$(HTSDIR)/htslib/sam.h \
$(HTSDIR)/htslib/synced_bcf_reader.h \
$(HTSDIR)/htslib/tbx.h \
@@ -73,10 +74,12 @@ HTSLIB_ALL = \
$(HTSDIR)/faidx.c \
$(HTSDIR)/hfile_internal.h \
$(HTSDIR)/hfile.c \
+ $(HTSDIR)/hfile_irods.c \
$(HTSDIR)/hfile_net.c \
$(HTSDIR)/hts.c \
$(HTSDIR)/knetfile.c \
$(HTSDIR)/kstring.c \
+ $(HTSDIR)/regidx.c \
$(HTSDIR)/sam.c \
$(HTSDIR)/synced_bcf_reader.c \
$(HTSDIR)/tbx.c \
@@ -130,6 +133,9 @@ $(HTSDIR)/libhts.so $(HTSDIR)/libhts.dylib: $(HTSLIB_ALL)
$(HTSDIR)/bgzip: $(HTSDIR)/bgzip.c $(HTSLIB_PUBLIC_HEADERS)
+cd $(HTSDIR) && $(MAKE) bgzip
+$(HTSDIR)/htsfile: $(HTSDIR)/htsfile.c $(HTSLIB_PUBLIC_HEADERS)
+ +cd $(HTSDIR) && $(MAKE) htsfile
+
$(HTSDIR)/tabix: $(HTSDIR)/tabix.c $(HTSLIB_PUBLIC_HEADERS)
+cd $(HTSDIR) && $(MAKE) tabix
diff --git a/htslib/htslib/bgzf.h b/htslib/htslib/bgzf.h
index 31b8d5e..cb8d4b9 100644
--- a/htslib/htslib/bgzf.h
+++ b/htslib/htslib/bgzf.h
@@ -84,9 +84,10 @@ extern "C" {
* Open an existing file descriptor for reading or writing.
*
* @param fd file descriptor
- * @param mode mode matching /[rwa][u0-9]+/: 'r' for reading, 'w' for
- * writing, or 'a' for appending, while a digit specifies
- * the zlib compression level.
+ * @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
+ * writing, 'a' for appending, 'g' for gzip rather than BGZF
+ * compression (with 'w' only), and a digit specifies the zlib
+ * compression level.
* Note that there is a distinction between 'u' and '0': the
* first yields plain uncompressed output whereas the latter
* outputs uncompressed data wrapped in the zlib format.
diff --git a/htslib/htslib/faidx.h b/htslib/htslib/faidx.h
index 24a30e2..a32d3a9 100644
--- a/htslib/htslib/faidx.h
+++ b/htslib/htslib/faidx.h
@@ -61,7 +61,7 @@ extern "C" {
int fai_build(const char *fn);
/*!
- @abstract Distroy a faidx_t struct.
+ @abstract Destroy a faidx_t struct.
@param fai Pointer to the struct to be destroyed
*/
void fai_destroy(faidx_t *fai);
diff --git a/htslib/htslib/hfile.h b/htslib/htslib/hfile.h
index 1b1a8a9..ea49c45 100644
--- a/htslib/htslib/hfile.h
+++ b/htslib/htslib/hfile.h
@@ -1,6 +1,6 @@
/* hfile.h -- buffered low-level input/output streams.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2015 Genome Research Ltd.
Author: John Marshall <jm18 at sanger.ac.uk>
@@ -61,6 +61,14 @@ hFILE *hopen(const char *filename, const char *mode) HTS_RESULT_USED;
hFILE *hdopen(int fd, const char *mode) HTS_RESULT_USED;
/*!
+ @abstract Report whether the file name or URL denotes remote storage
+ @return 0 if local, 1 if remote.
+ @notes "Remote" means involving e.g. explicit network access, with the
+ implication that callers may wish to cache such files' contents locally.
+*/
+int hisremote(const char *filename) HTS_RESULT_USED;
+
+/*!
@abstract Flush (for output streams) and close the stream
@return 0 if successful, or EOF (with errno set) if an error occurred.
*/
diff --git a/htslib/htslib/hts.h b/htslib/htslib/hts.h
index d020751..084c162 100644
--- a/htslib/htslib/hts.h
+++ b/htslib/htslib/hts.h
@@ -69,8 +69,48 @@ typedef struct __kstring_t {
* File I/O *
************/
+// Add new entries only at the end (but before the *_maximum entry)
+// of these enums, as their numbering is part of the htslib ABI.
+
+enum htsFormatCategory {
+ unknown_category,
+ sequence_data, // Sequence data -- SAM, BAM, CRAM, etc
+ variant_data, // Variant calling data -- VCF, BCF, etc
+ index_file, // Index file associated with some data file
+ region_list, // Coordinate intervals or regions -- BED, etc
+ category_maximum = 32767
+};
+
+enum htsExactFormat {
+ unknown_format,
+ binary_format, text_format,
+ sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed,
+ format_maximum = 32767
+};
+
+enum htsCompression {
+ no_compression, gzip, bgzf, custom,
+ compression_maximum = 32767
+};
+
+typedef struct htsFormat {
+ enum htsFormatCategory category;
+ enum htsExactFormat format;
+ struct { short major, minor; } version;
+ enum htsCompression compression;
+ short compression_level; // currently unused
+ void *specific; // currently unused
+} htsFormat;
+
+// Maintainers note: htsFile cannot be an opaque structure because some of its
+// fields are part of libhts.so's ABI (hence these fields must not be moved):
+// - fp is used in the public sam_itr_next()/etc macros
+// - is_bin is used directly in samtools <= 1.1 and bcftools <= 1.1
+// - is_write and is_cram are used directly in samtools <= 1.1
+// - fp is used directly in samtools (up to and including current develop)
+// - line is used directly in bcftools (up to and including current develop)
typedef struct {
- uint32_t is_bin:1, is_write:1, is_be:1, is_cram:1, is_compressed:2, is_kstream:1, dummy:25;
+ uint32_t is_bin:1, is_write:1, is_be:1, is_cram:1, dummy:28;
int64_t lineno;
kstring_t line;
char *fn, *fn_aux;
@@ -80,20 +120,71 @@ typedef struct {
struct hFILE *hfile;
void *voidp;
} fp;
+ htsFormat format;
} htsFile;
+// REQUIRED_FIELDS
+enum sam_fields {
+ SAM_QNAME = 0x00000001,
+ SAM_FLAG = 0x00000002,
+ SAM_RNAME = 0x00000004,
+ SAM_POS = 0x00000008,
+ SAM_MAPQ = 0x00000010,
+ SAM_CIGAR = 0x00000020,
+ SAM_RNEXT = 0x00000040,
+ SAM_PNEXT = 0x00000080,
+ SAM_TLEN = 0x00000100,
+ SAM_SEQ = 0x00000200,
+ SAM_QUAL = 0x00000400,
+ SAM_AUX = 0x00000800,
+ SAM_RGAUX = 0x00001000,
+};
+
+enum cram_option {
+ CRAM_OPT_DECODE_MD,
+ CRAM_OPT_PREFIX,
+ CRAM_OPT_VERBOSITY,
+ CRAM_OPT_SEQS_PER_SLICE,
+ CRAM_OPT_SLICES_PER_CONTAINER,
+ CRAM_OPT_RANGE,
+ CRAM_OPT_VERSION,
+ CRAM_OPT_EMBED_REF,
+ CRAM_OPT_IGNORE_MD5,
+ CRAM_OPT_REFERENCE,
+ CRAM_OPT_MULTI_SEQ_PER_SLICE,
+ CRAM_OPT_NO_REF,
+ CRAM_OPT_USE_BZIP2,
+ CRAM_OPT_SHARED_REF,
+ CRAM_OPT_NTHREADS,
+ CRAM_OPT_THREAD_POOL,
+ CRAM_OPT_USE_LZMA,
+ CRAM_OPT_USE_RANS,
+ CRAM_OPT_REQUIRED_FIELDS,
+};
+
/**********************
* Exported functions *
**********************/
extern int hts_verbose;
-/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
+/*! @abstract Table for converting a nucleotide character to 4-bit encoding.
+The input character may be either an IUPAC ambiguity code, '=' for 0, or
+'0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8
+for A/C/G/T or combinations of these bits for ambiguous bases.
+*/
extern const unsigned char seq_nt16_table[256];
-/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
+/*! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
+ambiguity code letter (or '=' when given 0).
+*/
extern const char seq_nt16_str[];
+/*! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
+Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
+*/
+extern const int seq_nt16_int[];
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -106,6 +197,20 @@ extern "C" {
const char *hts_version(void);
/*!
+ @abstract Determine format by peeking at the start of a file
+ @param fp File opened for reading, positioned at the beginning
+ @param fmt Format structure that will be filled out on return
+ @return 0 for success, or negative if an error occurred.
+*/
+int hts_detect_format(struct hFILE *fp, htsFormat *fmt);
+
+/*!
+ @abstract Get a human-readable description of the file format
+ @return Description string, to be freed by the caller after use.
+*/
+char *hts_format_description(const htsFormat *format);
+
+/*!
@abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
@param fn The file name or "-" for stdin/stdout
@param mode Mode matching /[rwa][bcuz0-9]+/
@@ -116,8 +221,9 @@ const char *hts_version(void);
specifier letters:
b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
c CRAM format
+ g gzip compressed
u uncompressed
- z compressed
+ z bgzf compressed
[0-9] zlib compression level
Note that there is a distinction between 'u' and '0': the first yields
plain uncompressed output whereas the latter outputs uncompressed data
@@ -131,12 +237,35 @@ const char *hts_version(void);
htsFile *hts_open(const char *fn, const char *mode);
/*!
+ @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
+ @param fp The already-open file handle
+ @param mode Open mode, as per hts_open()
+*/
+htsFile *hts_hopen(struct hFILE *fp, const char *fn, const char *mode);
+
+/*!
@abstract Close a file handle, flushing buffered data for output streams
@param fp The file handle to be closed
@return 0 for success, or negative if an error occurred.
*/
int hts_close(htsFile *fp);
+/*!
+ @abstract Returns the file's format information
+ @param fp The file handle
+ @return Read-only pointer to the file's htsFormat.
+*/
+const htsFormat *hts_get_format(htsFile *fp);
+
+/*!
+ @abstract Sets a specified CRAM option on the open file handle.
+ @param fp The file handle of the open file.
+ @param opt The CRAM_OPT_* option.
+ @param ... Optional arguments, dependent on the option used.
+ @return 0 for success, or negative if an error occurred.
+*/
+int hts_set_opt(htsFile *fp, enum cram_option opt, ...);
+
int hts_getline(htsFile *fp, int delimiter, kstring_t *str);
char **hts_readlines(const char *fn, int *_n);
/*!
@@ -207,6 +336,7 @@ typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg,
typedef struct {
uint32_t read_rest:1, finished:1, dummy:29;
int tid, beg, end, n_off, i;
+ int curr_tid, curr_beg, curr_end;
uint64_t curr_off;
hts_pair64_t *off;
hts_readrec_func *readrec;
@@ -251,12 +381,8 @@ extern "C" {
/**
* hts_file_type() - Convenience function to determine file type
- * @fname: the file name
- *
- * Returns one of the FT_* defines.
- *
- * This function was added in order to avoid the need for excessive command
- * line switches.
+ * DEPRECATED: This function has been replaced by hts_detect_format().
+ * It and these FT_* macros will be removed in a future HTSlib release.
*/
#define FT_UNKN 0
#define FT_GZ 1
diff --git a/htslib/htslib/khash.h b/htslib/htslib/khash.h
index 2d910de..5e55088 100644
--- a/htslib/htslib/khash.h
+++ b/htslib/htslib/khash.h
@@ -143,11 +143,13 @@ typedef unsigned long khint64_t;
typedef unsigned long long khint64_t;
#endif
+#ifndef kh_inline
#ifdef _MSC_VER
#define kh_inline __inline
#else
#define kh_inline inline
#endif
+#endif /* kh_inline */
typedef khint32_t khint_t;
typedef khint_t khiter_t;
@@ -182,7 +184,7 @@ typedef khint_t khiter_t;
static const double __ac_HASH_UPPER = 0.77;
#define __KHASH_TYPE(name, khkey_t, khval_t) \
- typedef struct { \
+ typedef struct kh_##name##_s { \
khint_t n_buckets, size, n_occupied, upper_bound; \
khint32_t *flags; \
khkey_t *keys; \
@@ -245,11 +247,11 @@ static const double __ac_HASH_UPPER = 0.77;
memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
if (h->n_buckets < new_n_buckets) { /* expand */ \
khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
- if (!new_keys) return -1; \
+ if (!new_keys) { kfree(new_flags); return -1; } \
h->keys = new_keys; \
if (kh_is_map) { \
khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
- if (!new_vals) return -1; \
+ if (!new_vals) { kfree(new_flags); return -1; } \
h->vals = new_vals; \
} \
} /* otherwise shrink */ \
diff --git a/htslib/htslib/khash_str2int.h b/htslib/htslib/khash_str2int.h
index 8c4f5a6..4bbc100 100644
--- a/htslib/htslib/khash_str2int.h
+++ b/htslib/htslib/khash_str2int.h
@@ -121,4 +121,13 @@ static inline int khash_str2int_set(void *_hash, const char *str, int value)
return k;
}
+/*
+ * Return the number of keys in the hash table.
+ */
+static inline int khash_str2int_size(void *_hash)
+{
+ khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+ return kh_size(hash);
+}
+
#endif
diff --git a/htslib/htslib/kseq.h b/htslib/htslib/kseq.h
index 577cdc4..e1a3eaa 100644
--- a/htslib/htslib/kseq.h
+++ b/htslib/htslib/kseq.h
@@ -71,8 +71,7 @@
if (ks->begin >= ks->end) { \
ks->begin = 0; \
ks->end = __read(ks->f, ks->buf, ks->bufsize); \
- if (ks->end < ks->bufsize) ks->is_eof = 1; \
- if (ks->end == 0) return -1; \
+ if (ks->end == 0) { ks->is_eof = 1; return -1; } \
} \
ks->seek_pos++; \
return (int)ks->buf[ks->begin++]; \
@@ -95,18 +94,17 @@ typedef struct __kstring_t {
#define __KS_GETUNTIL(SCOPE, __read) \
SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
{ \
+ int gotany = 0; \
if (dret) *dret = 0; \
str->l = append? str->l : 0; \
uint64_t seek_pos = str->l; \
- if (ks->begin >= ks->end && ks->is_eof) return -1; \
for (;;) { \
int i; \
if (ks->begin >= ks->end) { \
if (!ks->is_eof) { \
ks->begin = 0; \
ks->end = __read(ks->f, ks->buf, ks->bufsize); \
- if (ks->end < ks->bufsize) ks->is_eof = 1; \
- if (ks->end == 0) break; \
+ if (ks->end == 0) { ks->is_eof = 1; break; } \
} else break; \
} \
if (delimiter == KS_SEP_LINE) { \
@@ -128,6 +126,7 @@ typedef struct __kstring_t {
str->s = (char*)realloc(str->s, str->m); \
} \
seek_pos += i - ks->begin; if ( i < ks->end ) seek_pos++; \
+ gotany = 1; \
memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
str->l = str->l + (i - ks->begin); \
ks->begin = i + 1; \
@@ -136,6 +135,7 @@ typedef struct __kstring_t {
break; \
} \
} \
+ if (!gotany && ks_eof(ks)) return -1; \
ks->seek_pos += seek_pos; \
if (str->s == 0) { \
str->m = 1; \
diff --git a/htslib/htslib/regidx.h b/htslib/htslib/regidx.h
new file mode 100644
index 0000000..39a795e
--- /dev/null
+++ b/htslib/htslib/regidx.h
@@ -0,0 +1,147 @@
+/*
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+/*
+ Regions indexing with an optional payload. Inspired by samtools/bedidx.c.
+ This code is intended as future replacement of bcf_sr_regions_t.
+
+ Example of usage:
+
+ // Init the parser and print regions. In this example the payload is a
+ // pointer to a string. For the description of parse_custom and
+ // free_custom functions, see regidx_parse_f and regidx_free_f below,
+ // and for working example see test/test-regidx.c.
+ regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL);
+
+ // Query overlap with chr:from-to
+ regitr_t itr;
+ if ( regidx_overlap(idx, chr,from,to, &itr) ) printf("There is an overlap!\n");
+
+ while ( REGITR_OVERLAP(itr,from,to) )
+ {
+ printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", from,to,
+ REGITR_START(itr), REGITR_END(itr), REGITR_PAYLOAD(itr,char*));
+ itr.i++;
+ }
+
+ regidx_destroy(idx);
+*/
+
+#ifndef HTSLIB_REGIDX_H
+#define HTSLIB_REGIDX_H
+
+#include <stdio.h>
+#include <inttypes.h>
+
+typedef struct _regidx_t regidx_t;
+typedef struct
+{
+ uint32_t start, end;
+}
+reg_t;
+typedef struct
+{
+ int i, n;
+ reg_t *reg;
+ void *payload;
+}
+regitr_t;
+
+#define REGITR_START(itr) (itr).reg[(itr).i].start
+#define REGITR_END(itr) (itr).reg[(itr).i].end
+#define REGITR_PAYLOAD(itr,type_t) ((type_t*)(itr).payload)[(itr).i]
+#define REGITR_OVERLAP(itr,from,to) (itr.i < itr.n && REGITR_START(itr)<=to && REGITR_END(itr)>=from )
+
+/*
+ * regidx_parse_f - Function to parse one input line, such as regidx_parse_bed
+ * or regidx_parse_tab below. The function is expected to set `chr_beg` and
+ * `chr_end` to point to first and last character of chromosome name and set
+ * coordinates `reg->start` and `reg->end` (0-based, inclusive). If
+ * regidx_init() was called with non-zero payload_size, the `payload` points
+ * to a memory location of the payload_size and `usr` is data passed to
+ * regidx_init(). Any memory allocated by the function will be freed by
+ * regidx_free_f on regidx_destroy().
+ *
+ * Return value: 0 on success, -1 to skip a record, -2 on fatal error.
+ */
+typedef int (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr);
+typedef void (*regidx_free_f)(void *payload);
+
+int regidx_parse_bed(const char*,char**,char**,reg_t*,void*,void*); // CHROM,FROM,TO (0-based,right-open)
+int regidx_parse_tab(const char*,char**,char**,reg_t*,void*,void*); // CHROM,POS (1-based, inclusive)
+
+/*
+ * regidx_init() - creates new index
+ * @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert()
+ * @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL,
+ * the format will be autodetected, currently either regidx_parse_tab (the default) or
+ * regidx_parse_bed (file must be named 'bed' or 'bed.gz') will be used. Note that
+ * the exact autodetection algorithm will change.
+ * @param freef: NULL or see description of regidx_parse_f
+ * @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f
+ * @param usr: optional user data passed to regidx_parse_f
+ *
+ * Returns index on success or NULL on error.
+ */
+regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr);
+
+/*
+ * regidx_destroy() - free memory allocated by regidx_init
+ */
+void regidx_destroy(regidx_t *idx);
+
+/*
+ * regidx_overlap() - check overlap of the location chr:from-to with regions
+ * @param start,end: 0-based start, end coordinate (inclusive)
+ * @param itr: pointer to iterator, can be NULL if not needed
+ *
+ * Returns 0 if there is no overlap or 1 if overlap is found. The overlapping
+ * regions can be iterated as shown in the example above.
+ */
+int regidx_overlap(regidx_t *idx, const char *chr, uint32_t start, uint32_t end, regitr_t *itr);
+
+/*
+ * regidx_insert() - add a new region.
+ *
+ * After last region has been added, call regidx_insert(idx,NULL) to
+ * build the index.
+ *
+ * Returns 0 on success or -1 on error.
+ */
+int regidx_insert(regidx_t *idx, char *line);
+
+/*
+ * regidx_seq_names() - return list of all sequence names
+ */
+char **regidx_seq_names(regidx_t *idx, int *n);
+
+/*
+ * regidx_seq_nregs() - number of regions
+ * regidx_nregs() - total number of regions
+ */
+int regidx_seq_nregs(regidx_t *idx, const char *seq);
+int regidx_nregs(regidx_t *idx);
+
+#endif
+
diff --git a/htslib/htslib/sam.h b/htslib/htslib/sam.h
index 94c18f7..9e6d6a3 100644
--- a/htslib/htslib/sam.h
+++ b/htslib/htslib/sam.h
@@ -235,7 +235,7 @@ typedef struct {
#define bam_get_l_aux(b) ((b)->l_data - ((b)->core.n_cigar<<2) - (b)->core.l_qname - (b)->core.l_qseq - (((b)->core.l_qseq + 1)>>1))
/*! @function
@abstract Get a base on read
- @param s Query sequence returned by bam1_seq()
+ @param s Query sequence returned by bam_get_seq()
@param i The i-th position, 0-based
@return 4-bit integer representing the base.
*/
diff --git a/htslib/htslib/synced_bcf_reader.h b/htslib/htslib/synced_bcf_reader.h
index 76d79d0..888fa1e 100644
--- a/htslib/htslib/synced_bcf_reader.h
+++ b/htslib/htslib/synced_bcf_reader.h
@@ -106,11 +106,17 @@ typedef struct
bcf1_t **buffer; // cached VCF records. First is the current record synced across the reader
int nbuffer, mbuffer; // number of cached records (including the current record); number of allocated records
int nfilter_ids, *filter_ids; // -1 for ".", otherwise filter id as returned by bcf_id2int
- int type;
int *samples, n_smpl; // list of columns in the order consistent with bcf_srs_t.samples
}
bcf_sr_t;
+typedef enum
+{
+ open_failed, not_bgzf, idx_load_failed, file_type_error, api_usage_error,
+ header_error
+}
+bcf_sr_error;
+
typedef struct
{
// Parameters controlling the logic
@@ -123,6 +129,7 @@ typedef struct
int require_index; // Some tools do not need random access
int max_unpack; // When reading VCFs and knowing some fields will not be needed, boost performance of vcf_parse1
int *has_line; // Corresponds to return value of bcf_sr_next_line but is not limited by sizeof(int). Use bcf_sr_has_line macro to query.
+ bcf_sr_error errnum;
// Auxiliary data
bcf_sr_t *readers;
@@ -148,6 +155,9 @@ bcf_srs_t *bcf_sr_init(void);
/** Destroy bcf_srs_t struct */
void bcf_sr_destroy(bcf_srs_t *readers);
+char *bcf_sr_strerror(int errnum);
+
+
/**
* bcf_sr_add_reader() - open new reader
* @readers: holder of the open readers
@@ -161,7 +171,6 @@ void bcf_sr_destroy(bcf_srs_t *readers);
int bcf_sr_add_reader(bcf_srs_t *readers, const char *fname);
void bcf_sr_remove_reader(bcf_srs_t *files, int i);
-
/**
* bcf_sr_next_line() - the iterator
* @readers: holder of the open readers
@@ -174,6 +183,8 @@ int bcf_sr_next_line(bcf_srs_t *readers);
#define bcf_sr_has_line(readers, i) (readers)->has_line[i]
#define bcf_sr_get_line(_readers, i) ((_readers)->has_line[i] ? ((_readers)->readers[i].buffer[0]) : NULL)
#define bcf_sr_region_done(_readers,i) (!(_readers)->has_line[i] && !(_readers)->readers[i].nbuffer ? 1 : 0)
+#define bcf_sr_get_header(_readers, i) (_readers)->readers[i].header
+#define bcf_sr_get_reader(_readers, i) &((_readers)->readers[i])
/**
* bcf_sr_seek() - set all readers to selected position
diff --git a/htslib/htslib/vcf.h b/htslib/htslib/vcf.h
index 38d418c..fde9394 100644
--- a/htslib/htslib/vcf.h
+++ b/htslib/htslib/vcf.h
@@ -87,7 +87,8 @@ typedef struct {
} bcf_hrec_t;
typedef struct {
- uint32_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 for BCF_HL_FLT,INFO,FMT
+ uint32_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 in info[0..2]
+ // for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG
bcf_hrec_t *hrec[3];
int id;
} bcf_idinfo_t;
@@ -103,7 +104,7 @@ typedef struct {
void *dict[3]; // ID dictionary, contig dict and sample dict
char **samples;
bcf_hrec_t **hrec;
- int nhrec;
+ int nhrec, dirty;
int ntransl, *transl[2]; // for bcf_translate()
int nsamples_ori; // for bcf_hdr_set_samples()
uint8_t *keep_samples;
@@ -306,7 +307,7 @@ extern "C" {
/** Writes VCF or BCF header */
- int bcf_hdr_write(htsFile *fp, const bcf_hdr_t *h);
+ int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h);
/** Parse VCF line contained in kstring and populate the bcf1_t struct */
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v);
@@ -348,6 +349,7 @@ extern "C" {
* internally to reflect any changes made by bcf_update_* functions.
*/
bcf1_t *bcf_dup(bcf1_t *src);
+ bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src);
/**
* bcf_write() - write one VCF or BCF record. The type is determined at the open() call.
@@ -385,8 +387,7 @@ extern "C" {
/**
* bcf_hdr_add_sample() - add a new sample.
- * @param sample: Sample name to be added. After all samples have been added, NULL
- * must be passed to update internal header structures.
+ * @param sample: sample name to be added
*/
int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample);
@@ -403,6 +404,7 @@ extern "C" {
int bcf_hdr_append(bcf_hdr_t *h, const char *line);
int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...);
+ /** VCF version, e.g. VCFv4.2 */
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr);
void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version);
@@ -565,7 +567,8 @@ extern "C" {
// from bcf_get_genotypes() below.
#define bcf_gt_phased(idx) ((idx+1)<<1|1)
#define bcf_gt_unphased(idx) ((idx+1)<<1)
- #define bcf_gt_missing 0
+ #define bcf_gt_missing 0
+ #define bcf_gt_is_missing(val) ((val)>>1 ? 0 : 1)
#define bcf_gt_is_phased(idx) ((idx)&1)
#define bcf_gt_allele(val) (((val)>>1)-1)
diff --git a/htslib/htslib_vars.mk b/htslib/htslib_vars.mk
index 725e9ee..08f9a57 100644
--- a/htslib/htslib_vars.mk
+++ b/htslib/htslib_vars.mk
@@ -30,6 +30,7 @@ htslib_faidx_h = $(HTSPREFIX)htslib/faidx.h
htslib_hfile_h = $(HTSPREFIX)htslib/hfile.h $(htslib_hts_defs_h)
htslib_hts_h = $(HTSPREFIX)htslib/hts.h
htslib_hts_defs_h = $(HTSPREFIX)htslib/hts_defs.h
+htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h
htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h)
htslib_synced_bcf_reader_h = $(HTSPREFIX)htslib/synced_bcf_reader.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_tbx_h)
htslib_tbx_h = $(HTSPREFIX)htslib/tbx.h $(htslib_hts_h)
diff --git a/htslib/knetfile.c b/htslib/knetfile.c
index 400da4f..28fe629 100644
--- a/htslib/knetfile.c
+++ b/htslib/knetfile.c
@@ -327,11 +327,9 @@ int kftp_connect_file(knetFile *fp)
kftp_pasv_prep(fp);
kftp_send_cmd(fp, fp->size_cmd, 1);
#ifndef _WIN32
- if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
- {
- fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
- return -1;
- }
+ // If the file does not exist, the response will be "550 Could not get file
+ // size". Be silent on failure, hts_idx_load can be trying the existence of .csi or .tbi.
+ if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) return -1;
#else
const char *p = fp->response;
while (*p != ' ') ++p;
@@ -413,7 +411,7 @@ int khttp_connect_file(knetFile *fp)
l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
l += sprintf(buf + l, "\r\n");
- if ( netwrite(fp->fd, buf, l) != l ) return -1;
+ if ( netwrite(fp->fd, buf, l) != l ) { free(buf); return -1; }
l = 0;
while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
if (buf[l] == '\n' && l >= 3)
@@ -422,6 +420,7 @@ int khttp_connect_file(knetFile *fp)
}
buf[l] = 0;
if (l < 14) { // prematured header
+ free(buf);
netclose(fp->fd);
fp->fd = -1;
return -1;
diff --git a/htslib/regidx.c b/htslib/regidx.c
new file mode 100644
index 0000000..291ba79
--- /dev/null
+++ b/htslib/regidx.c
@@ -0,0 +1,338 @@
+/*
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include "htslib/hts.h"
+#include "htslib/kstring.h"
+#include "htslib/kseq.h"
+#include "htslib/khash_str2int.h"
+#include "htslib/regidx.h"
+
+#define LIDX_SHIFT 13 // number of insignificant index bits
+
+// List of regions for one chromosome
+typedef struct
+{
+ // Linear index filled by _regidx_build_index(): idx[b] is the index of the
+ // first region touching bin b (coordinate >> LIDX_SHIFT) or -1 if no region
+ // touches it; nidx is the number of bins regidx_overlap() may look at.
+ int *idx, nidx;
+ int nregs, mregs; // n:used, m:alloced
+ // The regions, in insertion order.
+ reg_t *regs;
+ // Payload records parallel to regs, payload_size bytes each; only
+ // allocated when the index was created with a non-zero payload_size.
+ void *payload;
+}
+reglist_t;
+
+// Container of all sequences
+struct _regidx_t
+{
+ int nseq, mseq; // n:used, m:alloced
+ reglist_t *seq; // regions for each sequence
+ void *seq2regs; // hash for fast lookup from chr name to regions
+ char **seq_names; // sequence names indexed like seq[]; the same strings are the keys of seq2regs
+ regidx_free_f free; // function to free any data allocated by regidx_parse_f
+ regidx_parse_f parse; // parse one input line
+ void *usr; // user data to pass to regidx_parse_f
+
+ // temporary data for index initialization
+ kstring_t str; // scratch copy of the sequence name of the line being inserted
+ int rid_prev, start_prev, end_prev; // last inserted region, used by regidx_insert() to check sort order
+ int payload_size; // size in bytes of one payload record, 0 for no payload
+ void *payload; // scratch record handed to the parser, then copied into the region list
+};
+
+/* Number of regions stored for sequence `seq`; 0 when the name is unknown. */
+int regidx_seq_nregs(regidx_t *idx, const char *seq)
+{
+    int rid;
+    int known = khash_str2int_get(idx->seq2regs, seq, &rid) == 0;
+    return known ? idx->seq[rid].nregs : 0;
+}
+
+/* Total number of regions, summed over every sequence in the index. */
+int regidx_nregs(regidx_t *idx)
+{
+    int total = 0;
+    int iseq = idx->nseq;
+    while ( iseq-- > 0 )
+        total += idx->seq[iseq].nregs;
+    return total;
+}
+
+/*
+ * Return the index's internal array of sequence names and store its length
+ * in *n. The array is the one kept inside idx, not a copy.
+ */
+char **regidx_seq_names(regidx_t *idx, int *n)
+{
+    char **names = idx->seq_names;
+    *n = idx->nseq;
+    return names;
+}
+
+/*
+ * Build the linear index of every sequence.
+ *
+ * Coordinates are divided into bins of 2^LIDX_SHIFT bases. For each bin the
+ * index records the first region (lowest position in the region list) that
+ * touches it, or -1 when no region does; regidx_overlap() uses this as the
+ * starting point of its scan.
+ *
+ * Returns 0 on success or -1 if memory could not be allocated.
+ */
+int _regidx_build_index(regidx_t *idx)
+{
+    int iseq;
+    for (iseq=0; iseq<idx->nseq; iseq++)
+    {
+        reglist_t *list = &idx->seq[iseq];
+        int j, k, imax = 0;     // number of bins initialised so far in this pass
+
+        // Start from scratch so that a rebuild does not inherit a stale count
+        list->nidx = 0;
+
+        for (j=0; j<list->nregs; j++)
+        {
+            int ibeg = list->regs[j].start >> LIDX_SHIFT;
+            int iend = list->regs[j].end >> LIDX_SHIFT;
+            if ( imax < iend + 1 )
+            {
+                int old_imax = imax;
+                imax = iend + 1;
+                kroundup32(imax);
+                // Keep the old buffer reachable if the reallocation fails
+                int *tmp = (int*) realloc(list->idx, imax*sizeof(int));
+                if ( !tmp ) return -1;
+                list->idx = tmp;
+                for (k=old_imax; k<imax; k++) list->idx[k] = -1;
+            }
+
+            // Only the first region seen for a bin is recorded
+            for (k=ibeg; k<=iend; k++)
+                if ( list->idx[k]<0 ) list->idx[k] = j;
+
+            // Regions are sorted by start, not by end, so a later region can
+            // finish in an earlier bin; never let the bin count shrink.
+            if ( list->nidx < iend + 1 ) list->nidx = iend + 1;
+        }
+    }
+    return 0;
+}
+
+/*
+ * regidx_insert() - parse one line with the index's parser and append the
+ * resulting region to the list of its sequence.
+ *
+ * Passing line==NULL builds the linear index instead; this must be done once
+ * the last region has been added.
+ *
+ * Lines the parser asks to skip (parser returns -1) are ignored and reported
+ * as success. Regions of one sequence must arrive sorted by start and then by
+ * end; a violation is reported on stderr.
+ *
+ * Returns 0 on success or -1 on error (parse failure or unsorted input).
+ */
+int regidx_insert(regidx_t *idx, char *line)
+{
+    if ( !line )
+        return _regidx_build_index(idx);
+
+    char *chr_from, *chr_to;
+    reg_t reg;
+    int ret = idx->parse(line,&chr_from,&chr_to,&reg,idx->payload,idx->usr);
+    if ( ret==-2 ) return -1;   // error
+    if ( ret==-1 ) return 0;    // skip the line
+
+    // chr_to points at the last character of the name, hence the +1
+    int rid;
+    idx->str.l = 0;
+    kputsn(chr_from, chr_to-chr_from+1, &idx->str);
+    if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 )
+    {
+        // First region on this sequence: grow both per-sequence arrays in
+        // step and register the name in the hash
+        idx->nseq++;
+        int m_prev = idx->mseq;
+        hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq);
+        hts_expand0(char*,idx->nseq,m_prev,idx->seq_names);
+        idx->seq_names[idx->nseq-1] = strdup(idx->str.s);
+        rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]);
+    }
+
+    reglist_t *list = &idx->seq[rid];
+    list->nregs++;
+    int m_prev = list->mregs;
+    hts_expand(reg_t,list->nregs,list->mregs,list->regs);
+    list->regs[list->nregs-1] = reg;
+    if ( idx->payload_size )
+    {
+        // The payload array grows whenever the region array did
+        if ( m_prev < list->mregs ) list->payload = realloc(list->payload,idx->payload_size*list->mregs);
+        memcpy(list->payload + idx->payload_size*(list->nregs-1), idx->payload, idx->payload_size);
+    }
+
+    if ( idx->rid_prev==rid )
+    {
+        if ( idx->start_prev > reg.start || (idx->start_prev==reg.start && idx->end_prev>reg.end) )
+        {
+            fprintf(stderr,"The regions are not sorted: %s:%d-%d is before %s:%d-%d\n",
+                idx->str.s,idx->start_prev+1,idx->end_prev+1,idx->str.s,reg.start+1,reg.end+1);
+            return -1;
+        }
+    }
+    idx->rid_prev = rid;
+    idx->start_prev = reg.start;
+    idx->end_prev = reg.end;
+    return 0;
+}
+
+/*
+ * regidx_init() - create a region index, optionally filling it from a file.
+ *
+ * @param fname:        file to read regions from, or NULL to create an empty
+ *                      index that is filled later with regidx_insert()
+ * @param parser:       line parser; if NULL, regidx_parse_bed is chosen for
+ *                      names ending in .bed, .bed.gz or .bed.bgz (case
+ *                      insensitive) and regidx_parse_tab otherwise
+ * @param free_f:       NULL or a function releasing data held by a payload
+ * @param payload_size: size of one payload record, 0 for none
+ * @param usr_dat:      user data passed through to the parser
+ *
+ * Returns the index, or NULL if memory could not be allocated, the file
+ * could not be opened, a line failed to parse or the index failed to build.
+ */
+regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
+{
+    if ( !parser )
+    {
+        if ( !fname ) parser = regidx_parse_tab;
+        else
+        {
+            int len = strlen(fname);
+            if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) )
+                parser = regidx_parse_bed;
+            else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) )
+                parser = regidx_parse_bed;
+            else if ( len>=4 && !strcasecmp(".bed",fname+len-4) )
+                parser = regidx_parse_bed;
+            else
+                parser = regidx_parse_tab;
+        }
+    }
+
+    regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
+    if ( !idx ) return NULL;
+    idx->free = free_f;
+    idx->parse = parser;
+    idx->usr = usr_dat;
+    idx->seq2regs = khash_str2int_init();
+    // -1 marks "no previous region" for the sort-order check in regidx_insert()
+    idx->rid_prev = -1;
+    idx->start_prev = -1;
+    idx->end_prev = -1;
+    idx->payload_size = payload_size;
+    if ( payload_size )
+    {
+        idx->payload = malloc(payload_size);
+        if ( !idx->payload )
+        {
+            regidx_destroy(idx);
+            return NULL;
+        }
+    }
+
+    if ( !fname ) return idx;
+
+    kstring_t str = {0,0,0};
+
+    htsFile *fp = hts_open(fname,"r");
+    if ( !fp ) goto error;
+
+    while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+    {
+        if ( regidx_insert(idx, str.s) ) goto error;
+    }
+    // A NULL line finalises the index; do not hand out a half-built one
+    if ( regidx_insert(idx, NULL) ) goto error;
+
+    free(str.s);
+    hts_close(fp);
+    return idx;
+
+error:
+    free(str.s);
+    if ( fp ) hts_close(fp);
+    regidx_destroy(idx);
+    return NULL;
+}
+
+/*
+ * regidx_destroy() - release an index and everything it owns.
+ *
+ * When a free callback was registered it is invoked once per stored payload
+ * record before the payload array itself is released.
+ */
+void regidx_destroy(regidx_t *idx)
+{
+    int iseq;
+    for (iseq=0; iseq<idx->nseq; iseq++)
+    {
+        reglist_t *list = &idx->seq[iseq];
+        int ireg = 0;
+        while ( idx->free && ireg < list->nregs )
+        {
+            idx->free(list->payload + idx->payload_size*ireg);
+            ireg++;
+        }
+        free(list->payload);
+        free(list->regs);
+        free(list->idx);
+    }
+
+    // The name strings themselves are the hash keys and are released by
+    // khash_str2int_destroy_free(); only the pointer array is freed here.
+    free(idx->seq_names);
+    free(idx->seq);
+    free(idx->str.s);
+    free(idx->payload);
+    khash_str2int_destroy_free(idx->seq2regs);
+    free(idx);
+}
+
+/*
+ * regidx_overlap() - test whether chr:from-to overlaps any stored region.
+ *
+ * from and to are compared directly against the stored start/end values,
+ * with both ends treated as inclusive.
+ *
+ * If itr is not NULL it is reset first and, on a hit, set to the first
+ * overlapping region: itr->reg and itr->payload point at that region and
+ * itr->n is the number of regions from there to the end of the sequence's
+ * list (not only the overlapping ones).
+ *
+ * Returns 1 if an overlap was found and 0 otherwise, including when chr is
+ * not in the index.
+ *
+ * NOTE(review): list->idx and list->nidx are only filled by
+ * _regidx_build_index(); this looks like it requires the index to have been
+ * built (regidx_insert(idx,NULL)) before the first query - confirm.
+ */
+int regidx_overlap(regidx_t *idx, const char *chr, uint32_t from, uint32_t to, regitr_t *itr)
+{
+ if ( itr ) itr->i = itr->n = 0;
+
+ int iseq;
+ if ( khash_str2int_get(idx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence
+
+ reglist_t *list = &idx->seq[iseq];
+ if ( !list->nregs ) return 0;
+
+ // Pick the scan start from the bin holding `from`; queries beyond the
+ // indexed range fall back to the last indexed bin
+ int i, ibeg = from>>LIDX_SHIFT;
+ int ireg = ibeg < list->nidx ? list->idx[ibeg] : list->idx[ list->nidx - 1 ];
+ if ( ireg < 0 )
+ {
+ // The bin is empty: walk back to the nearest earlier bin that has a
+ // region, or start from the first region if there is none
+ // linear search; if slow, replace with binary search
+ if ( ibeg > list->nidx ) ibeg = list->nidx;
+ for (i=ibeg - 1; i>=0; i--)
+ if ( list->idx[i] >=0 ) break;
+ ireg = i>=0 ? list->idx[i] : 0;
+ }
+ // Scan forward; the scan stops at the first region starting after `to`
+ for (i=ireg; i<list->nregs; i++)
+ {
+ if ( list->regs[i].start > to ) return 0; // no match
+ if ( list->regs[i].end >= from && list->regs[i].start <= to ) break; // found
+ }
+
+ if ( i>=list->nregs ) return 0; // no match
+
+ if ( !itr ) return 1;
+
+ itr->i = 0;
+ itr->n = list->nregs - i;
+ itr->reg = &idx->seq[iseq].regs[i];
+ if ( idx->payload_size )
+ itr->payload = idx->seq[iseq].payload + i*idx->payload_size;
+ else
+ itr->payload = NULL;
+
+ return 1;
+}
+
+/*
+ * regidx_parse_bed() - parse "chr <ws> start <ws> end" with a 0-based start
+ * and an exclusive end, as in BED.
+ *
+ * On success *chr_beg and *chr_end point at the first and last character of
+ * the sequence name inside line, reg->start is the start as given and
+ * reg->end is the end minus one (i.e. inclusive). payload and usr are unused.
+ *
+ * Returns 0 on success, -1 if the line is blank or a '#' comment and should
+ * be skipped, -2 on a malformed line (a message is printed to stderr).
+ */
+int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+{
+    // isspace() requires a value representable as unsigned char
+    char *ss = (char*) line;
+    while ( *ss && isspace((unsigned char)*ss) ) ss++;
+    if ( !*ss ) return -1;      // skip blank lines
+    if ( *ss=='#' ) return -1;  // skip comments
+
+    char *se = ss;
+    while ( *se && !isspace((unsigned char)*se) ) se++;
+    if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+    *chr_beg = ss;
+    *chr_end = se-1;
+
+    ss = se+1;
+    reg->start = strtol(ss, &se, 10);
+    // If the line ends right after the start column there is no end column,
+    // and stepping over the terminator below would read past the string
+    if ( ss==se || !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+    ss = se+1;
+    reg->end = strtol(ss, &se, 10) - 1;
+    if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+    return 0;
+}
+
+/*
+ * regidx_parse_tab() - parse "chr <ws> start [<ws> end]" with 1-based,
+ * inclusive coordinates.
+ *
+ * On success *chr_beg and *chr_end point at the first and last character of
+ * the sequence name inside line and reg->start / reg->end hold the
+ * coordinates minus one. When the end column is missing or not a number the
+ * region is the single position given by start. payload and usr are unused.
+ *
+ * Returns 0 on success, -1 if the line is blank or a '#' comment and should
+ * be skipped, -2 on a malformed line (a message is printed to stderr).
+ */
+int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+{
+    // isspace() requires a value representable as unsigned char
+    char *ss = (char*) line;
+    while ( *ss && isspace((unsigned char)*ss) ) ss++;
+    if ( !*ss ) return -1;      // skip blank lines
+    if ( *ss=='#' ) return -1;  // skip comments
+
+    char *se = ss;
+    while ( *se && !isspace((unsigned char)*se) ) se++;
+    if ( !*se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; }
+
+    *chr_beg = ss;
+    *chr_end = se-1;
+
+    ss = se+1;
+    reg->start = strtol(ss, &se, 10) - 1;
+    if ( ss==se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; }
+
+    // Nothing (or only a single separator) after the start: no end column
+    if ( !se[0] || !se[1] )
+        reg->end = reg->start;
+    else
+    {
+        ss = se+1;
+        reg->end = strtol(ss, &se, 10);
+        if ( ss==se ) reg->end = reg->start;
+        else reg->end--;
+    }
+
+    return 0;
+}
+
diff --git a/htslib/sam.c b/htslib/sam.c
index d85b85b..460cf33 100644
--- a/htslib/sam.c
+++ b/htslib/sam.c
@@ -77,7 +77,7 @@ bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0)
h->sdict = NULL;
h->text = (char*)calloc(h->l_text + 1, 1);
memcpy(h->text, h0->text, h->l_text);
- h->target_len = (uint32_t*)calloc(h->n_targets, 4);
+ h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
int i;
for (i = 0; i < h->n_targets; ++i) {
@@ -95,7 +95,7 @@ static bam_hdr_t *hdr_from_dict(sdict_t *d)
h = bam_hdr_init();
h->sdict = d;
h->n_targets = kh_size(d);
- h->target_len = (uint32_t*)malloc(4 * h->n_targets);
+ h->target_len = (uint32_t*)malloc(sizeof(uint32_t) * h->n_targets);
h->target_name = (char**)malloc(sizeof(char*) * h->n_targets);
for (k = kh_begin(d); k != kh_end(d); ++k) {
if (!kh_exist(d, k)) continue;
@@ -135,7 +135,7 @@ bam_hdr_t *bam_hdr_read(BGZF *fp)
if (fp->is_be) ed_swap_4p(&h->n_targets);
// read reference sequence names and lengths
h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
- h->target_len = (uint32_t*)calloc(h->n_targets, 4);
+ h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
for (i = 0; i != h->n_targets; ++i) {
bgzf_read(fp, &name_len, 4);
if (fp->is_be) ed_swap_4p(&name_len);
@@ -432,18 +432,23 @@ int bam_index_build(const char *fn, int min_shift)
int ret = 0;
if ((fp = hts_open(fn, "r")) == 0) return -1;
- if (fp->is_cram) {
- ret = cram_index_build(fp->fp.cram, fn);
- } else {
- idx = bam_index(fp->fp.bgzf, min_shift);
- if ( !idx )
- {
- hts_close(fp);
- return -1;
- }
- hts_idx_save(idx, fn, min_shift > 0
- ? HTS_FMT_CSI : HTS_FMT_BAI);
- hts_idx_destroy(idx);
+ switch (fp->format.format) {
+ case cram:
+ ret = cram_index_build(fp->fp.cram, fn);
+ break;
+
+ case bam:
+ idx = bam_index(fp->fp.bgzf, min_shift);
+ if (idx) {
+ hts_idx_save(idx, fn, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
+ hts_idx_destroy(idx);
+ }
+ else ret = -1;
+ break;
+
+ default:
+ ret = -1;
+ break;
}
hts_close(fp);
@@ -474,9 +479,10 @@ static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int
{
htsFile *fp = fpv;
bam1_t *b = bv;
- if (fp->is_bin) return bam_read1(bgzfp, b);
- else if (fp->is_cram) return cram_get_bam_seq(fp->fp.cram, &b);
- else {
+ switch (fp->format.format) {
+ case bam: return bam_read1(bgzfp, b);
+ case cram: return cram_get_bam_seq(fp->fp.cram, &b);
+ default:
// TODO Need headers available to implement this for SAM files
fprintf(stderr, "[sam_bam_cram_readrec] Not implemented for SAM files -- Exiting\n");
abort();
@@ -494,8 +500,11 @@ typedef struct hts_cram_idx_t {
hts_idx_t *sam_index_load(samFile *fp, const char *fn)
{
- if (fp->is_bin) return bam_index_load(fn);
- else if (fp->is_cram) {
+ switch (fp->format.format) {
+ case bam:
+ return bam_index_load(fn);
+
+ case cram: {
if (cram_index_load(fp->fp.cram, fn) < 0) return NULL;
// Cons up a fake "index" just pointing at the associated cram_fd:
hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
@@ -503,8 +512,11 @@ hts_idx_t *sam_index_load(samFile *fp, const char *fn)
idx->fmt = HTS_FMT_CRAI;
idx->cram = fp->fp.cram;
return (hts_idx_t *) idx;
+ }
+
+ default:
+ return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
}
- else return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
}
static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
@@ -620,11 +632,14 @@ bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
bam_hdr_t *sam_hdr_read(htsFile *fp)
{
- if (fp->is_bin) {
+ switch (fp->format.format) {
+ case bam:
return bam_hdr_read(fp->fp.bgzf);
- } else if (fp->is_cram) {
+
+ case cram:
return cram_header_to_bam(fp->fp.cram->header);
- } else {
+
+ case sam: {
kstring_t str;
bam_hdr_t *h;
int has_SQ = 0;
@@ -650,20 +665,38 @@ bam_hdr_t *sam_hdr_read(htsFile *fp)
h = sam_hdr_parse(str.l, str.s);
h->l_text = str.l; h->text = str.s;
return h;
+ }
+
+ default:
+ abort();
}
}
int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
{
- if (fp->is_bin) {
+ switch (fp->format.format) {
+ case binary_format:
+ fp->format.category = sequence_data;
+ fp->format.format = bam;
+ /* fall-through */
+ case bam:
bam_hdr_write(fp->fp.bgzf, h);
- } else if (fp->is_cram) {
+ break;
+
+ case cram: {
cram_fd *fd = fp->fp.cram;
if (cram_set_header(fd, bam_header_to_cram((bam_hdr_t *)h)) < 0) return -1;
if (fp->fn_aux)
cram_load_reference(fd, fp->fn_aux);
if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
- } else {
+ }
+ break;
+
+ case text_format:
+ fp->format.category = sequence_data;
+ fp->format.format = sam;
+ /* fall-through */
+ case sam: {
char *p;
hputs(h->text, fp->fp.hfile);
p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
@@ -677,6 +710,11 @@ int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
}
}
if ( hflush(fp->fp.hfile) != 0 ) return -1;
+ }
+ break;
+
+ default:
+ abort();
}
return 0;
}
@@ -806,9 +844,8 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
kputc_('A', &str);
kputc_(*q, &str);
} else if (type == 'i' || type == 'I') {
- long x;
- x = strtol(q, &q, 10);
- if (x < 0) {
+ if (*q == '-') {
+ long x = strtol(q, &q, 10);
if (x >= INT8_MIN) {
kputc_('c', &str); kputc_(x, &str);
} else if (x >= INT16_MIN) {
@@ -819,6 +856,7 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
kputc_('i', &str); kputsn_(&y, 4, &str);
}
} else {
+ unsigned long x = strtoul(q, &q, 10);
if (x <= UINT8_MAX) {
kputc_('C', &str); kputc_(x, &str);
} else if (x <= UINT16_MAX) {
@@ -873,7 +911,8 @@ err_ret:
int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
{
- if (fp->is_bin) {
+ switch (fp->format.format) {
+ case bam: {
int r = bam_read1(fp->fp.bgzf, b);
if (r >= 0) {
if (b->core.tid >= h->n_targets || b->core.tid < -1 ||
@@ -881,9 +920,12 @@ int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
return -3;
}
return r;
- } else if (fp->is_cram) {
+ }
+
+ case cram:
return cram_get_bam_seq(fp->fp.cram, &b);
- } else {
+
+ case sam: {
int ret;
err_recover:
if (fp->line.l == 0) {
@@ -898,6 +940,10 @@ err_recover:
if (h->ignore_sam_err) goto err_recover;
}
return ret;
+ }
+
+ default:
+ abort();
}
}
@@ -1024,15 +1070,29 @@ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
{
- if (fp->is_bin) {
+ switch (fp->format.format) {
+ case binary_format:
+ fp->format.category = sequence_data;
+ fp->format.format = bam;
+ /* fall-through */
+ case bam:
return bam_write1(fp->fp.bgzf, b);
- } else if (fp->is_cram) {
+
+ case cram:
return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
- } else {
+
+ case text_format:
+ fp->format.category = sequence_data;
+ fp->format.format = sam;
+ /* fall-through */
+ case sam:
if (sam_format1(h, b, &fp->line) < 0) return -1;
kputc('\n', &fp->line);
if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
return fp->line.l;
+
+ default:
+ abort();
}
}
@@ -1759,7 +1819,7 @@ bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
int i;
bam_mplp_t iter;
iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t));
- iter->pos = (uint64_t*)calloc(n, 8);
+ iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t));
iter->n_plp = (int*)calloc(n, sizeof(int));
iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
diff --git a/htslib/synced_bcf_reader.c b/htslib/synced_bcf_reader.c
index 19fa703..3747c0e 100644
--- a/htslib/synced_bcf_reader.c
+++ b/htslib/synced_bcf_reader.c
@@ -52,6 +52,26 @@ static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int
static bcf_sr_regions_t *_regions_init_string(const char *str);
static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec);
+/*
+ * bcf_sr_strerror() - human-readable text for a bcf_sr_error code.
+ *
+ * For open_failed the text comes from strerror(errno), so it reflects the
+ * value of errno at the time of this call. Unknown codes give "".
+ */
+char *bcf_sr_strerror(int errnum)
+{
+    char *msg = "";
+    switch (errnum)
+    {
+        case open_failed:     msg = strerror(errno); break;
+        case not_bgzf:        msg = "not compressed with bgzip"; break;
+        case idx_load_failed: msg = "could not load index"; break;
+        case file_type_error: msg = "unknown file type"; break;
+        case api_usage_error: msg = "API usage error"; break;
+        case header_error:    msg = "could not parse header"; break;
+        default: break;
+    }
+    return msg;
+}
+
static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters)
{
kstring_t str = {0,0,0};
@@ -61,7 +81,7 @@ static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters)
{
if ( *tmp==',' || !*tmp )
{
- out = (int*) realloc(out, sizeof(int));
+ out = (int*) realloc(out, (nout+1)*sizeof(int));
if ( tmp-prev==1 && *prev=='.' )
out[nout] = -1;
else
@@ -111,76 +131,94 @@ int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int
int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
{
+ htsFile* file_ptr = hts_open(fname, "r");
+ if ( ! file_ptr ) {
+ files->errnum = open_failed;
+ return 0;
+ }
+
files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
files->has_line[files->nreaders] = 0;
files->readers = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
bcf_sr_t *reader = &files->readers[files->nreaders++];
memset(reader,0,sizeof(bcf_sr_t));
- reader->file = hts_open(fname, "r");
- if ( !reader->file ) return 0;
+ reader->file = file_ptr;
- reader->type = reader->file->is_bin? FT_BCF : FT_VCF;
- if (reader->file->is_compressed) reader->type |= FT_GZ;
+ files->errnum = 0;
if ( files->require_index )
{
- if ( reader->type==FT_VCF_GZ )
+ if ( reader->file->format.format==vcf )
{
+ if ( reader->file->format.compression!=bgzf )
+ {
+ files->errnum = not_bgzf;
+ return 0;
+ }
+
reader->tbx_idx = tbx_index_load(fname);
if ( !reader->tbx_idx )
{
- fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
+ files->errnum = idx_load_failed;
return 0;
}
reader->header = bcf_hdr_read(reader->file);
}
- else if ( reader->type==FT_BCF_GZ )
+ else if ( reader->file->format.format==bcf )
{
+ if ( reader->file->format.compression!=bgzf )
+ {
+ files->errnum = not_bgzf;
+ return 0;
+ }
+
reader->header = bcf_hdr_read(reader->file);
reader->bcf_idx = bcf_index_load(fname);
if ( !reader->bcf_idx )
{
- fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
- return 0; // not indexed..?
+ files->errnum = idx_load_failed;
+ return 0;
}
}
else
{
- fprintf(stderr,"Index required, expected .vcf.gz or .bcf file: %s\n", fname);
+ files->errnum = file_type_error;
return 0;
}
}
else
{
- if ( reader->type & FT_BCF )
- {
- reader->header = bcf_hdr_read(reader->file);
- }
- else if ( reader->type & FT_VCF )
+ if ( reader->file->format.format==bcf || reader->file->format.format==vcf )
{
reader->header = bcf_hdr_read(reader->file);
}
else
{
- fprintf(stderr,"File type not recognised: %s\n", fname);
+ files->errnum = file_type_error;
return 0;
}
files->streaming = 1;
}
if ( files->streaming && files->nreaders>1 )
{
+ files->errnum = api_usage_error;
fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
return 0;
}
if ( files->streaming && files->regions )
{
+ files->errnum = api_usage_error;
fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
return 0;
}
- if ( !reader->header ) return 0;
+ if ( !reader->header )
+ {
+ files->errnum = header_error;
+ return 0;
+ }
reader->fname = fname;
if ( files->apply_filters )
@@ -423,13 +461,13 @@ static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
}
if ( files->streaming )
{
- if ( reader->type & FT_VCF )
+ if ( reader->file->format.format==vcf )
{
if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break; // no more lines
int ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
if ( ret<0 ) break;
}
- else if ( reader->type & FT_BCF )
+ else if ( reader->file->format.format==bcf )
{
if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
}
@@ -959,8 +997,8 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr
int len = strlen(regions);
int is_bed = strcasecmp(".bed",regions+len-4) ? 0 : 1;
if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1;
- int ft_type = hts_file_type(regions);
- if ( ft_type & FT_VCF ) ito = 1;
+
+ if ( reg->file->format.format==vcf ) ito = 1;
// read the whole file, tabix index is not present
while ( hts_getline(reg->file, KS_SEP_LINE, ®->line) > 0 )
@@ -1034,7 +1072,11 @@ int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq)
if ( khash_str2int_get(reg->seq_hash, seq, ®->iseq) < 0 ) return -1; // sequence seq not in regions
// using in-memory regions
- if ( reg->regs ) return 0;
+ if ( reg->regs )
+ {
+ reg->regs[reg->iseq].creg = -1;
+ return 0;
+ }
// reading regions from tabix
if ( reg->itr ) tbx_itr_destroy(reg->itr);
diff --git a/htslib/tabix.1 b/htslib/tabix.1
index 8fd1fe5..55c0ebb 100644
--- a/htslib/tabix.1
+++ b/htslib/tabix.1
@@ -1,9 +1,9 @@
-.TH tabix 1 "23 September 2014" "htslib-1.1" "Bioinformatics tools"
+.TH tabix 1 "3 February 2015" "htslib-1.2.1" "Bioinformatics tools"
.SH NAME
.PP
-bgzip - Block compression/decompression utility
+bgzip \- Block compression/decompression utility
.PP
-tabix - Generic indexer for TAB-delimited genome position files
+tabix \- Generic indexer for TAB-delimited genome position files
.\"
.\" Copyright (C) 2009-2011 Broad Institute.
.\"
@@ -30,26 +30,26 @@ tabix - Generic indexer for TAB-delimited genome position files
.SH SYNOPSIS
.PP
.B bgzip
-.RB [ \-cdhB ]
-.RB [ \-b
+.RB [ -cdhB ]
+.RB [ -b
.IR virtualOffset ]
-.RB [ \-s
+.RB [ -s
.IR size ]
.RI [ file ]
.PP
.B tabix
-.RB [ \-0lf ]
-.RB [ \-p
-.R gff|bed|sam|vcf]
-.RB [ \-s
+.RB [ -0lf ]
+.RB [ -p
+gff|bed|sam|vcf]
+.RB [ -s
.IR seqCol ]
-.RB [ \-b
+.RB [ -b
.IR begCol ]
-.RB [ \-e
+.RB [ -e
.IR endCol ]
-.RB [ \-S
+.RB [ -S
.IR lineSkip ]
-.RB [ \-c
+.RB [ -c
.IR metaChar ]
.I in.tab.bgz
.RI [ "region1 " [ "region2 " [ ... "]]]"
@@ -58,9 +58,11 @@ tabix - Generic indexer for TAB-delimited genome position files
.PP
Tabix indexes a TAB-delimited genome position file
.I in.tab.bgz
-and creates an index file
+and creates an index file (
.I in.tab.bgz.tbi
-when
+or
+.I in.tab.bgz.csi
+) when
.I region
is absent from the command-line. The input data file must be position
sorted and compressed by
@@ -74,52 +76,75 @@ specified in the format "chr:beginPos-endPos". Fast data retrieval also
works over network if URI is given as a file name and in this case the
index file will be downloaded if it is not present locally.
-.SH OPTIONS OF TABIX
+.SH INDEXING OPTIONS
.TP 10
-.BI "-p " STR
-Input format for indexing. Valid values are: gff, bed, sam, vcf and
-psltab. This option should not be applied together with any of
-.BR \-s ", " \-b ", " \-e ", " \-c " and " \-0 ;
-it is not used for data retrieval because this setting is stored in
-the index file. [gff]
-.TP
-.BI "-s " INT
-Column of sequence name. Option
-.BR \-s ", " \-b ", " \-e ", " \-S ", " \-c " and " \-0
-are all stored in the index file and thus not used in data retrieval. [1]
+.B -0, --zero-based
+Specify that the position in the data file is 0-based (e.g. UCSC files)
+rather than 1-based.
.TP
-.BI "-b " INT
+.BI "-b, --begin " INT
Column of start chromosomal position. [4]
.TP
-.BI "-e " INT
+.BI "-c, --comment " CHAR
+Skip lines started with character CHAR. [#]
+.TP
+.BI "-C, --csi"
+Generate a CSI index instead of the default TBI index.
+.TP
+.BI "-e, --end " INT
Column of end chromosomal position. The end column can be the same as the
start column. [5]
.TP
-.BI "-S " INT
-Skip first INT lines in the data file. [0]
+.B "-f, --force "
+Force to overwrite the index file if it is present.
.TP
-.BI "-c " CHAR
-Skip lines started with character CHAR. [#]
+.BI "-m, --min-shift " INT
+Set minimal interval size for CSI indices to 2^INT. [14]
.TP
-.B -0
-Specify that the position in the data file is 0-based (e.g. UCSC files)
-rather than 1-based.
+.BI "-p, --preset " STR
+Input format for indexing. Valid values are: gff, bed, sam, vcf.
+This option should not be applied together with any of
+.BR -s ", " -b ", " -e ", " -c " and " -0 ;
+it is not used for data retrieval because this setting is stored in
+the index file. [gff]
.TP
-.B -h
-Print the header/meta lines.
+.BI "-s, --sequence " INT
+Column of sequence name. Option
+.BR -s ", " -b ", " -e ", " -S ", " -c " and " -0
+are all stored in the index file and thus not used in data retrieval. [1]
.TP
-.B -B
-The second argument is a BED file. When this option is in use, the input
-file may not be sorted or indexed. The entire input will be read sequentially. Nonetheless,
-with this option, the format of the input must be specificed correctly on the command line.
+.BI "-S, --skip-lines " INT
+Skip first INT lines in the data file. [0]
+
+.SH QUERYING AND OTHER OPTIONS
.TP
-.B -f
-Force to overwrite the index file if it is present.
+.B "-h, --print-header "
+Print also the header/meta lines.
.TP
-.B -l
+.B "-H, --only-header "
+Print only the header/meta lines.
+.TP
+.B "-i, --file-info "
+Print file format info.
+.TP
+.B "-l, --list-chroms "
List the sequence names stored in the index file.
-.RE
-
+.TP
+.B "-r, --reheader " FILE
+Replace the header with the content of FILE
+.TP
+.B "-R, --regions " FILE
+Restrict to regions listed in the FILE. The FILE can be BED file (requires .bed, .bed.gz, .bed.bgz
+file name extension) or a TAB-delimited file with CHROM, POS, and, optionally,
+POS_TO columns, where positions are 1-based and inclusive.
+When this option is in use, the input
+file may not be sorted.
+.TP
+.B "-T, --targets" FILE
+Similar to
+.B -R
+but the entire input will be read sequentially and regions not listed in FILE will be skipped.
+.PP
.SH EXAMPLE
(grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) | bgzip > sorted.gff.gz;
diff --git a/htslib/tabix.c b/htslib/tabix.c
index b0af21d..2f6cfea 100644
--- a/htslib/tabix.c
+++ b/htslib/tabix.c
@@ -37,10 +37,12 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/kseq.h"
#include "htslib/bgzf.h"
#include "htslib/hts.h"
+#include "htslib/regidx.h"
typedef struct
{
- int min_shift;
+ char *regions_fname, *targets_fname;
+ int print_header, header_only;
}
args_t;
@@ -53,14 +55,14 @@ static void error(const char *format, ...)
exit(EXIT_FAILURE);
}
-
-#define IS_GFF (1<<0)
-#define IS_BED (1<<1)
-#define IS_SAM (1<<2)
-#define IS_VCF (1<<3)
-#define IS_BCF (1<<4)
-#define IS_BAM (1<<5)
-#define IS_TXT (IS_GFF|IS_BED|IS_SAM|IS_VCF)
+#define IS_GFF (1<<0)
+#define IS_BED (1<<1)
+#define IS_SAM (1<<2)
+#define IS_VCF (1<<3)
+#define IS_BCF (1<<4)
+#define IS_BAM (1<<5)
+#define IS_CRAM (1<<6)
+#define IS_TXT (IS_GFF|IS_BED|IS_SAM|IS_VCF)
int file_type(const char *fname)
{
@@ -72,78 +74,154 @@ int file_type(const char *fname)
else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF;
else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF;
else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM;
+ else if (l>=4 && strcasecmp(fname+l-5, ".cram") == 0) return IS_CRAM;
+
+ htsFile *fp = hts_open(fname,"r");
+ enum htsExactFormat format = fp->format.format;
+ hts_close(fp);
+ if ( format == bcf ) return IS_BCF;
+ if ( format == bam ) return IS_BAM;
+ if ( format == cram ) return IS_CRAM;
+ if ( format == vcf ) return IS_VCF;
+
return 0;
}
-#define PRINT_HEADER 1
-#define HEADER_ONLY 2
-static int query_regions(char **argv, int argc, int mode)
+static char **parse_regions(char *regions_fname, char **argv, int argc, int *nregs)
{
- char *fname = argv[0];
- int i, ftype = file_type(fname);
+ kstring_t str = {0,0,0};
+ int iseq = 0, ireg = 0;
+ char **regs = NULL;
+ *nregs = argc;
- if ( ftype & IS_TXT || !ftype )
+ if ( regions_fname )
{
- htsFile *fp = hts_open(fname,"r");
- if ( !fp ) error("Could not read %s\n", fname);
- tbx_t *tbx = tbx_index_load(fname);
- if ( !tbx ) error("Could not load .tbi index of %s\n", fname);
- kstring_t str = {0,0,0};
- if ( mode )
+ // improve me: this is a too heavy machinery for parsing regions...
+
+ regidx_t *idx = regidx_init(regions_fname, NULL, NULL, 0, NULL);
+ if ( !idx ) error("Could not read %s\n", regions_fname);
+
+ (*nregs) += regidx_nregs(idx);
+ regs = (char**) malloc(sizeof(char*)*(*nregs));
+
+ int nseq;
+ char **seqs = regidx_seq_names(idx, &nseq);
+ for (iseq=0; iseq<nseq; iseq++)
{
- while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
+ regitr_t itr;
+ regidx_overlap(idx, seqs[iseq], 0, UINT32_MAX, &itr);
+ while ( itr.i < itr.n )
{
- if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
- puts(str.s);
+ str.l = 0;
+ ksprintf(&str, "%s:%d-%d", seqs[iseq], REGITR_START(itr)+1, REGITR_END(itr)+1);
+ regs[ireg++] = strdup(str.s);
+ itr.i++;
}
}
- if ( mode!=HEADER_ONLY )
+ regidx_destroy(idx);
+ }
+ free(str.s);
+
+ if ( !ireg )
+ {
+ if ( argc )
+ regs = (char**) malloc(sizeof(char*)*argc);
+ else
{
- for (i=1; i<argc; i++)
- {
- hts_itr_t *itr = tbx_itr_querys(tbx, argv[i]);
- if ( !itr ) continue;
- while (tbx_itr_next(fp, tbx, itr, &str) >= 0) puts(str.s);
- tbx_itr_destroy(itr);
- }
+ regs = (char**) malloc(sizeof(char*));
+ regs[0] = strdup(".");
+ *nregs = 1;
}
- free(str.s);
- if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
- tbx_destroy(tbx);
}
- else if ( ftype==IS_BCF ) // output uncompressed VCF
+
+ for (iseq=0; iseq<argc; iseq++) regs[ireg++] = strdup(argv[iseq]);
+ return regs;
+}
+static int query_regions(args_t *args, char *fname, char **regs, int nregs)
+{
+ int i;
+ htsFile *fp = hts_open(fname,"r");
+ if ( !fp ) error("Could not read %s\n", fname);
+ enum htsExactFormat format = hts_get_format(fp)->format;
+
+ regidx_t *reg_idx = NULL;
+ if ( args->targets_fname )
+ {
+ reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
+ if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
+ }
+
+ if ( format == bcf )
{
- htsFile *fp = hts_open(fname,"r");
- if ( !fp ) error("Could not read %s\n", fname);
htsFile *out = hts_open("-","w");
if ( !out ) error("Could not open stdout\n", fname);
hts_idx_t *idx = bcf_index_load(fname);
if ( !idx ) error("Could not load .csi index of %s\n", fname);
bcf_hdr_t *hdr = bcf_hdr_read(fp);
if ( !hdr ) error("Could not read the header: %s\n", fname);
- if ( mode )
- {
+ if ( args->print_header )
bcf_hdr_write(out,hdr);
- }
- if ( mode!=HEADER_ONLY )
+ if ( !args->header_only )
{
bcf1_t *rec = bcf_init();
- for (i=1; i<argc; i++)
+ for (i=0; i<nregs; i++)
{
- hts_itr_t *itr = bcf_itr_querys(idx,hdr,argv[i]);
- if ( !itr ) continue;
- while ( bcf_itr_next(fp, itr, rec) >=0 ) bcf_write(out,hdr,rec);
+ hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
+ while ( bcf_itr_next(fp, itr, rec) >=0 )
+ {
+ if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
+ bcf_write(out,hdr,rec);
+ }
tbx_itr_destroy(itr);
}
bcf_destroy(rec);
}
- if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
bcf_hdr_destroy(hdr);
hts_idx_destroy(idx);
}
- else if ( ftype==IS_BAM ) // todo: BAM
+ else if ( format==vcf || format==sam || format==unknown_format )
+ {
+ tbx_t *tbx = tbx_index_load(fname);
+ if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
+ kstring_t str = {0,0,0};
+ if ( args->print_header )
+ {
+ while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
+ {
+ if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
+ puts(str.s);
+ }
+ }
+ if ( !args->header_only )
+ {
+ int nseq;
+ const char **seq = NULL;
+ if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
+ for (i=0; i<nregs; i++)
+ {
+ hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
+ if ( !itr ) continue;
+ while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
+ {
+ if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
+ puts(str.s);
+ }
+ tbx_itr_destroy(itr);
+ }
+ free(seq);
+ }
+ free(str.s);
+ tbx_destroy(tbx);
+ }
+ else if ( format==bam )
error("Please use \"samtools view\" for querying BAM files.\n");
+
+ if ( reg_idx ) regidx_destroy(reg_idx);
+ if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
+
+ for (i=0; i<nregs; i++) free(regs[i]);
+ free(regs);
return 0;
}
static int query_chroms(char *fname)
@@ -259,33 +337,44 @@ static int usage(void)
fprintf(stderr, "\n");
fprintf(stderr, "Version: %s\n", hts_version());
fprintf(stderr, "Usage: tabix [OPTIONS] [FILE] [REGION [...]]\n");
- fprintf(stderr, "Options:\n");
- fprintf(stderr, " -0, --zero-based coordinates are zero-based\n");
- fprintf(stderr, " -b, --begin INT column number for region start [4]\n");
- fprintf(stderr, " -c, --comment CHAR skip comment lines starting with CHAR [null]\n");
- fprintf(stderr, " -e, --end INT column number for region end (if no end, set INT to -b) [5]\n");
- fprintf(stderr, " -f, --force overwrite existing index without asking\n");
- fprintf(stderr, " -h, --print-header print also the header lines\n");
- fprintf(stderr, " -H, --only-header print only the header lines\n");
- fprintf(stderr, " -l, --list-chroms list chromosome names\n");
- fprintf(stderr, " -m, --min-shift INT set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n");
- fprintf(stderr, " -p, --preset STR gff, bed, sam, vcf, bcf, bam\n");
- fprintf(stderr, " -r, --reheader FILE replace the header with the content of FILE\n");
- fprintf(stderr, " -s, --sequence INT column number for sequence names (suppressed by -p) [1]\n");
- fprintf(stderr, " -S, --skip-lines INT skip first INT lines [0]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Indexing Options:\n");
+ fprintf(stderr, " -0, --zero-based coordinates are zero-based\n");
+ fprintf(stderr, " -b, --begin INT column number for region start [4]\n");
+ fprintf(stderr, " -c, --comment CHAR skip comment lines starting with CHAR [null]\n");
+ fprintf(stderr, " -C, --csi generate CSI index for VCF (default is TBI)\n");
+ fprintf(stderr, " -e, --end INT column number for region end (if no end, set INT to -b) [5]\n");
+ fprintf(stderr, " -f, --force overwrite existing index without asking\n");
+ fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
+ fprintf(stderr, " -p, --preset STR gff, bed, sam, vcf\n");
+ fprintf(stderr, " -s, --sequence INT column number for sequence names (suppressed by -p) [1]\n");
+ fprintf(stderr, " -S, --skip-lines INT skip first INT lines [0]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Querying and other options:\n");
+ fprintf(stderr, " -h, --print-header print also the header lines\n");
+ fprintf(stderr, " -H, --only-header print only the header lines\n");
+ fprintf(stderr, " -l, --list-chroms list chromosome names\n");
+ fprintf(stderr, " -r, --reheader FILE replace the header with the content of FILE\n");
+ fprintf(stderr, " -R, --regions FILE restrict to regions listed in the file\n");
+ fprintf(stderr, " -T, --targets FILE similar to -R but streams rather than index-jumps\n");
fprintf(stderr, "\n");
return 1;
}
int main(int argc, char *argv[])
{
- int c, min_shift = -1, is_force = 0, list_chroms = 0, mode = 0;
+ int c, min_shift = 0, is_force = 0, list_chroms = 0, do_csi = 0;
tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
char *reheader = NULL;
+ args_t args;
+ memset(&args,0,sizeof(args_t));
static struct option loptions[] =
{
{"help",0,0,'h'},
+ {"regions",1,0,'R'},
+ {"targets",1,0,'T'},
+ {"csi",0,0,'C'},
{"zero-based",0,0,'0'},
{"print-header",0,0,'h'},
{"only-header",0,0,'H'},
@@ -301,13 +390,16 @@ int main(int argc, char *argv[])
{0,0,0,0}
};
- while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:", loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:CR:T:", loptions,NULL)) >= 0)
{
switch (c)
{
+ case 'R': args.regions_fname = optarg; break;
+ case 'T': args.targets_fname = optarg; break;
+ case 'C': do_csi = 1; break;
case 'r': reheader = optarg; break;
- case 'h': mode = PRINT_HEADER; break;
- case 'H': mode = HEADER_ONLY; break;
+ case 'h': args.print_header = 1; break;
+ case 'H': args.header_only = 1; break;
case 'l': list_chroms = 1; break;
case '0': conf.preset |= TBX_UCSC; break;
case 'b': conf.bc = atoi(optarg); break;
@@ -320,6 +412,8 @@ int main(int argc, char *argv[])
else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
+ else if (strcmp(optarg, "bcf") == 0) ; // bcf is autodetected, preset is not needed
+ else if (strcmp(optarg, "bam") == 0) ; // same as bcf
else error("The preset string not recognised: '%s'\n", optarg);
break;
case 's': conf.sc = atoi(optarg); break;
@@ -333,8 +427,14 @@ int main(int argc, char *argv[])
if ( list_chroms )
return query_chroms(argv[optind]);
- if ( argc > optind+1 || mode==HEADER_ONLY )
- return query_regions(&argv[optind], argc-optind, mode);
+ if ( argc > optind+1 || args.header_only || args.regions_fname || args.targets_fname )
+ {
+ int nregs = 0;
+ char **regs = NULL;
+ if ( !args.header_only )
+ regs = parse_regions(args.regions_fname, argv+optind+1, argc-optind-1, &nregs);
+ return query_regions(&args, argv[optind], regs, nregs);
+ }
char *fname = argv[optind];
int ftype = file_type(fname);
@@ -343,23 +443,38 @@ int main(int argc, char *argv[])
if ( ftype==IS_GFF ) conf_ptr = &tbx_conf_gff;
else if ( ftype==IS_BED ) conf_ptr = &tbx_conf_bed;
else if ( ftype==IS_SAM ) conf_ptr = &tbx_conf_sam;
- else if ( ftype==IS_VCF ) conf_ptr = &tbx_conf_vcf;
+ else if ( ftype==IS_VCF )
+ {
+ conf_ptr = &tbx_conf_vcf;
+ if ( !min_shift && do_csi ) min_shift = 14;
+ }
else if ( ftype==IS_BCF )
{
- if ( min_shift <= 0 ) min_shift = 14;
+ if ( !min_shift ) min_shift = 14;
}
else if ( ftype==IS_BAM )
{
- if ( min_shift <= 0 ) min_shift = 14;
+ if ( !min_shift ) min_shift = 14;
}
}
+ if ( do_csi )
+ {
+ if ( !min_shift ) min_shift = 14;
+ min_shift *= do_csi; // positive for CSIv2, negative for CSIv1
+ }
+ if ( min_shift!=0 && !do_csi ) do_csi = 1;
+
if ( reheader )
return reheader_file(fname, reheader, ftype, conf_ptr);
if ( conf_ptr )
conf = *conf_ptr;
- char *suffix = min_shift <= 0 ? ".tbi" : (ftype==IS_BAM ? ".bai" : ".csi");
+ char *suffix = ".tbi";
+ if ( do_csi ) suffix = ".csi";
+ else if ( ftype==IS_BAM ) suffix = ".bai";
+ else if ( ftype==IS_CRAM ) suffix = ".crai";
+
char *idx_fname = calloc(strlen(fname) + 5, 1);
strcat(strcpy(idx_fname, fname), suffix);
@@ -375,7 +490,12 @@ int main(int argc, char *argv[])
}
free(idx_fname);
- if ( min_shift > 0 ) // CSI index
+ if ( ftype==IS_CRAM )
+ {
+ if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
+ return 0;
+ }
+ else if ( do_csi )
{
if ( ftype==IS_BCF )
{
@@ -390,7 +510,7 @@ int main(int argc, char *argv[])
if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname);
return 0;
}
- else
+ else // TBI index
{
if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname);
return 0;
diff --git a/htslib/tbx.c b/htslib/tbx.c
index 4a5bdd2..a82f195 100644
--- a/htslib/tbx.c
+++ b/htslib/tbx.c
@@ -280,6 +280,11 @@ tbx_t *tbx_index_load(const char *fn)
return NULL;
}
meta = hts_idx_get_meta(tbx->idx, &l_meta);
+ if ( !meta )
+ {
+ free(tbx);
+ return NULL;
+ }
memcpy(x, meta, 28);
memcpy(&tbx->conf, x, 24);
p = nm = (char*)meta + 28;
diff --git a/htslib/test/aux#aux.sam b/htslib/test/auxf#values.sam
similarity index 100%
rename from htslib/test/aux#aux.sam
rename to htslib/test/auxf#values.sam
diff --git a/htslib/test/aux.fa b/htslib/test/auxf.fa
similarity index 100%
rename from htslib/test/aux.fa
rename to htslib/test/auxf.fa
diff --git a/htslib/test/aux.fa.fai b/htslib/test/auxf.fa.fai
similarity index 100%
rename from htslib/test/aux.fa.fai
rename to htslib/test/auxf.fa.fai
diff --git a/htslib/test/hfile.c b/htslib/test/hfile.c
index 987c8e0..c4ba91c 100644
--- a/htslib/test/hfile.c
+++ b/htslib/test/hfile.c
@@ -141,7 +141,7 @@ int main(void)
check_offset(fin, 200, "input/first200");
check_offset(fout, 1000, "output/first200");
- if (hseek(fin, 1000, SEEK_SET) < 0) fail("hseek");
+ if (hseek(fin, 800, SEEK_CUR) < 0) fail("hseek/cur");
check_offset(fin, 1000, "input/seek");
for (off = 1000; (n = hread(fin, buffer, sizeof buffer)) > 0; off += n)
if (hwrite(fout, buffer, n) != n) fail("hwrite");
@@ -149,7 +149,7 @@ int main(void)
check_offset(fin, off, "input/eof");
check_offset(fout, off, "output/eof");
- if (hseek(fin, 200, SEEK_SET) < 0) fail("hseek");
+ if (hseek(fin, 200, SEEK_SET) < 0) fail("hseek/set");
if (hseek(fout, 200, SEEK_SET) < 0) fail("hseek(output)");
check_offset(fin, 200, "input/backto200");
check_offset(fout, 200, "output/backto200");
diff --git a/htslib/test/sam.c b/htslib/test/sam.c
index 22f06dc..5539840 100644
--- a/htslib/test/sam.c
+++ b/htslib/test/sam.c
@@ -1,6 +1,6 @@
/* test/sam.c -- SAM/BAM/CRAM API test cases.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014-2015 Genome Research Ltd.
Author: John Marshall <jm18 at sanger.ac.uk>
@@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */
#include <math.h>
#include "htslib/sam.h"
+#include "htslib/faidx.h"
#include "htslib/kstring.h"
int status;
@@ -71,10 +72,10 @@ static int aux_fields1(void)
static const char sam[] = "data:"
"@SQ\tSN:one\tLN:1000\n"
"@SQ\tSN:two\tLN:500\n"
-"r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tZZ:i:1000000\n";
+"r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tZZ:i:1000000\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\n";
// Canonical form of the alignment record above, as output by sam_format1()
- static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tZZ:i:1000000";
+ static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tZZ:i:1000000\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295";
samFile *in = sam_open(sam, "r");
bam_hdr_t *header = sam_hdr_read(in);
@@ -109,6 +110,33 @@ static int aux_fields1(void)
if ((p = check_bam_aux_get(aln, "ZZ", 'I')) && bam_aux2i(p) != 1000000)
fail("ZZ field is %d, expected 1000000", bam_aux2i(p));
+ if ((p = bam_aux_get(aln, "Y1")) && bam_aux2i(p) != -2147483647-1)
+ fail("Y1 field is %d, expected -2^31", bam_aux2i(p));
+
+ if ((p = bam_aux_get(aln, "Y2")) && bam_aux2i(p) != -2147483647)
+ fail("Y2 field is %d, expected -2^31+1", bam_aux2i(p));
+
+ if ((p = bam_aux_get(aln, "Y3")) && bam_aux2i(p) != -1)
+ fail("Y3 field is %d, expected -1", bam_aux2i(p));
+
+ if ((p = bam_aux_get(aln, "Y4")) && bam_aux2i(p) != 0)
+ fail("Y4 field is %d, expected 0", bam_aux2i(p));
+
+ if ((p = bam_aux_get(aln, "Y5")) && bam_aux2i(p) != 1)
+ fail("Y5 field is %d, expected 1", bam_aux2i(p));
+
+ if ((p = bam_aux_get(aln, "Y6")) && bam_aux2i(p) != 2147483647)
+ fail("Y6 field is %d, expected 2^31-1", bam_aux2i(p));
+
+ // TODO Checking these perhaps requires inventing bam_aux2u() or so
+#if 0
+ if ((p = bam_aux_get(aln, "Y7")) && bam_aux2i(p) != 2147483648)
+ fail("Y7 field is %d, expected 2^31", bam_aux2i(p));
+
+ if ((p = bam_aux_get(aln, "Y8")) && bam_aux2i(p) != 4294967295)
+ fail("Y8 field is %d, expected 2^32-1", bam_aux2i(p));
+#endif
+
if (sam_format1(header, aln, &ks) < 0)
fail("can't format record");
@@ -132,12 +160,28 @@ static void iterators1(void)
hts_itr_destroy(sam_itr_queryi(NULL, HTS_IDX_NONE, 0, 0));
}
-int main(void)
+static void faidx1(const char *filename)
+{
+ int n;
+ faidx_t *fai = fai_load(filename);
+ if (fai == NULL) fail("can't load faidx file");
+
+ n = faidx_fetch_nseq(fai);
+ if (n != 7) fail("faidx_fetch_nseq returned %d, expected 7", n);
+
+ n = faidx_nseq(fai);
+ if (n != 7) fail("faidx_nseq returned %d, expected 7", n);
+
+ fai_destroy(fai);
+}
+
+int main(int argc, char **argv)
{
status = EXIT_SUCCESS;
aux_fields1();
iterators1();
+ if (argc >= 2) faidx1(argv[1]);
return status;
}
diff --git a/htslib/test/test-regidx.c b/htslib/test/test-regidx.c
new file mode 100644
index 0000000..0aea6b8
--- /dev/null
+++ b/htslib/test/test-regidx.c
@@ -0,0 +1,116 @@
+/* test/test-regidx.c -- Regions index test harness.
+
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <htslib/regidx.h>
+
+void error(const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ vfprintf(stderr, format, ap);
+ va_end(ap);
+ exit(-1);
+}
+
+int custom_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+{
+ // Use the standard parser for CHROM,FROM,TO
+ int i, ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL);
+ if ( ret!=0 ) return ret;
+
+ // Skip the fields that were parsed above
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ for (i=0; i<3; i++)
+ {
+ while ( *ss && !isspace(*ss) ) ss++;
+ if ( !*ss ) return -2; // wrong number of fields
+ while ( *ss && isspace(*ss) ) ss++;
+ }
+ if ( !*ss ) return -2;
+
+ // Parse the payload
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ char **dat = (char**) payload;
+ *dat = (char*) malloc(se-ss+1);
+ memcpy(*dat,ss,se-ss+1);
+ (*dat)[se-ss] = 0;
+ return 0;
+}
+void custom_free(void *payload)
+{
+ char **dat = (char**)payload;
+ free(*dat);
+}
+
+int main(int argc, char **argv)
+{
+ // Init index with no file name, we will insert the regions manually
+ regidx_t *idx = regidx_init(NULL,custom_parse,custom_free,sizeof(char*),NULL);
+ if ( !idx ) error("init failed\n");
+
+ // Insert regions
+ char *line;
+ line = "1 10000000 10000000 1:10000000-10000000"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line);
+ line = "1 20000000 20000001 1:20000000-20000001"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line);
+ line = "1 20000002 20000002 1:20000002-20000002"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line);
+ line = "1 30000000 30000000 1:30000000-30000000"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line);
+
+ // Finish initialization
+ regidx_insert(idx,NULL);
+
+ // Test
+ regitr_t itr;
+ int from, to;
+
+ from = to = 10000000;
+ if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to);
+ if ( strcmp("1:10000000-10000000",REGITR_PAYLOAD(itr,char*)) ) error("query failed: 1:%d-%d vs %s\n", from,to,REGITR_PAYLOAD(itr,char*));
+ if ( !regidx_overlap(idx,"1",from-2,to-1,&itr) ) error("query failed: 1:%d-%d\n",from-1,to);
+ if ( !regidx_overlap(idx,"1",from-2,to+3,&itr) ) error("query failed: 1:%d-%d\n",from-1,to+2);
+ if ( regidx_overlap(idx,"1",from-2,to-2,&itr) ) error("query failed: 1:%d-%d\n",from-1,to-1);
+
+ from = to = 20000000;
+ if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to);
+
+ from = to = 20000002;
+ if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to);
+
+ from = to = 30000000;
+ if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to);
+
+ // Clean up
+ regidx_destroy(idx);
+
+ return 0;
+}
+
+
diff --git a/htslib/test/test-vcf-api.c b/htslib/test/test-vcf-api.c
index 77a8fec..3e7623a 100644
--- a/htslib/test/test-vcf-api.c
+++ b/htslib/test/test-vcf-api.c
@@ -26,6 +26,7 @@ DEALINGS IN THE SOFTWARE. */
#include <htslib/hts.h>
#include <htslib/vcf.h>
#include <htslib/kstring.h>
+#include <htslib/kseq.h>
void write_bcf(char *fname)
{
@@ -153,7 +154,12 @@ void write_bcf(char *fname)
free(str.s);
bcf_destroy1(rec);
bcf_hdr_destroy(hdr);
- hts_close(fp);
+ int ret;
+ if ( (ret=hts_close(fp)) )
+ {
+ fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
+ exit(ret);
+ }
}
void bcf_to_vcf(char *fname)
@@ -161,7 +167,10 @@ void bcf_to_vcf(char *fname)
htsFile *fp = hts_open(fname,"rb");
bcf_hdr_t *hdr = bcf_hdr_read(fp);
bcf1_t *rec = bcf_init1();
- htsFile *out = hts_open("-","w");
+
+ char *gz_fname = (char*) malloc(strlen(fname)+4);
+ snprintf(gz_fname,strlen(fname)+4,"%s.gz",fname);
+ htsFile *out = hts_open(gz_fname,"wg");
bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr);
bcf_hdr_remove(hdr_out,BCF_HL_STR,"unused");
@@ -199,8 +208,41 @@ void bcf_to_vcf(char *fname)
bcf_destroy1(rec);
bcf_hdr_destroy(hdr);
bcf_hdr_destroy(hdr_out);
- hts_close(fp);
- hts_close(out);
+ int ret;
+ if ( (ret=hts_close(fp)) )
+ {
+ fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
+ exit(ret);
+ }
+ if ( (ret=hts_close(out)) )
+ {
+ fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret);
+ exit(ret);
+ }
+
+
+ // read gzip, write stdout
+ htsFile *gz_in = hts_open(gz_fname, "r");
+ if ( !gz_in )
+ {
+ fprintf(stderr,"Could not read: %s\n", gz_fname);
+ exit(1);
+ }
+
+ kstring_t line = {0,0,0};
+ while ( hts_getline(gz_in, KS_SEP_LINE, &line)>0 )
+ {
+ kputc('\n',&line);
+ fwrite(line.s,1,line.l,stdout);
+ }
+
+ if ( (ret=hts_close(gz_in)) )
+ {
+ fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret);
+ exit(ret);
+ }
+ free(line.s);
+ free(gz_fname);
}
void iterator(const char *fname)
@@ -221,7 +263,12 @@ void iterator(const char *fname)
hts_idx_destroy(idx);
bcf_hdr_destroy(hdr);
- hts_close(fp);
+ int ret;
+ if ( (ret=hts_close(fp)) )
+ {
+ fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
+ exit(ret);
+ }
}
int main(int argc, char **argv)
diff --git a/htslib/test/test_view.c b/htslib/test/test_view.c
index 7f02708..1f96cea 100644
--- a/htslib/test/test_view.c
+++ b/htslib/test/test_view.c
@@ -32,6 +32,82 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/sam.h"
+typedef struct hts_opt {
+ enum cram_option opt;
+ union {
+ int i;
+ char *s;
+ } val;
+ struct hts_opt *next;
+} hts_opt;
+
+/*
+ * Parses arg and appends it to the option list.
+ * Returns 0 on success;
+ * -1 on failure.
+ */
+int add_option(hts_opt **opts, char *arg) {
+ hts_opt *o, *t;
+ char *cp;
+
+ if (!(cp = strchr(arg, '=')))
+ cp = "1"; // assume boolean
+ else
+ *cp++ = 0;
+
+ if (!(o = malloc(sizeof(*o))))
+ return -1;
+
+ if (strcmp(arg, "DECODE_MD") == 0)
+ o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(cp);
+ else if (strcmp(arg, "VERBOSITY") == 0)
+ o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(cp);
+ else if (strcmp(arg, "SEQS_PER_SLICE") == 0)
+ o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(cp);
+ else if (strcmp(arg, "SLICES_PER_CONTAINER") == 0)
+ o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(cp);
+ else if (strcmp(arg, "EMBED_REF") == 0)
+ o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(cp);
+ else if (strcmp(arg, "NO_REF") == 0)
+ o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(cp);
+ else if (strcmp(arg, "IGNORE_MD5") == 0)
+ o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(cp);
+ else if (strcmp(arg, "USE_BZIP2") == 0)
+ o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(cp);
+ else if (strcmp(arg, "USE_RANS") == 0)
+ o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(cp);
+ else if (strcmp(arg, "USE_LZMA") == 0)
+ o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(cp);
+ else if (strcmp(arg, "REFERENCE") == 0)
+ o->opt = CRAM_OPT_REFERENCE, o->val.s = cp;
+ else if (strcmp(arg, "VERSION") == 0)
+ o->opt = CRAM_OPT_VERSION, o->val.s =cp;
+ else if (strcmp(arg, "MULTI_SEQ_PER_SLICE") == 0)
+ o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(cp);
+ else if (strcmp(arg, "NTHREADS") == 0)
+ o->opt = CRAM_OPT_NTHREADS, o->val.i = atoi(cp);
+ else if (strcmp(arg, "REQUIRED_FIELDS") == 0)
+ o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(cp, NULL, 0);
+ else {
+ fprintf(stderr, "Unknown option '%s'\n", arg);
+ free(o);
+ return -1;
+ }
+
+ o->next = NULL;
+
+ if (*opts) {
+ t = *opts;
+ while (t->next)
+ t = t->next;
+ t->next = o;
+ } else {
+ *opts = o;
+ }
+
+ return 0;
+}
+
int main(int argc, char *argv[])
{
samFile *in;
@@ -43,8 +119,9 @@ int main(int argc, char *argv[])
htsFile *out;
char modew[8];
int r = 0, exit_code = 0;
+ hts_opt *in_opts = NULL, *out_opts = NULL, *last = NULL;
- while ((c = getopt(argc, argv, "IbDCSl:t:")) >= 0) {
+ while ((c = getopt(argc, argv, "IbDCSl:t:i:o:")) >= 0) {
switch (c) {
case 'S': flag |= 1; break;
case 'b': flag |= 2; break;
@@ -53,10 +130,12 @@ int main(int argc, char *argv[])
case 'l': clevel = atoi(optarg); flag |= 2; break;
case 't': fn_ref = optarg; break;
case 'I': ignore_sam_err = 1; break;
+ case 'i': if (add_option(&in_opts, optarg)) return 1; break;
+ case 'o': if (add_option(&out_opts, optarg)) return 1; break;
}
}
if (argc == optind) {
- fprintf(stderr, "Usage: samview [-bSCSI] [-l level] <in.bam>|<in.sam>|<in.cram> [region]\n");
+ fprintf(stderr, "Usage: samview [-bSCSI] [-l level] [-o option=value] <in.bam>|<in.sam>|<in.cram> [region]\n");
return 1;
}
strcpy(moder, "r");
@@ -95,6 +174,15 @@ int main(int argc, char *argv[])
cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, NULL);
}
+ // Process any options; currently cram only.
+ for (; in_opts; in_opts = (last=in_opts)->next, free(last)) {
+ hts_set_opt(in, in_opts->opt, in_opts->val);
+ if (in_opts->opt == CRAM_OPT_REFERENCE)
+ hts_set_opt(out, in_opts->opt, in_opts->val);
+ }
+ for (; out_opts; out_opts = (last=out_opts)->next, free(last))
+ hts_set_opt(out, out_opts->opt, out_opts->val);
+
sam_hdr_write(out, h);
if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region
int i;
diff --git a/htslib/vcf.c b/htslib/vcf.c
index fb44980..0901ce1 100644
--- a/htslib/vcf.c
+++ b/htslib/vcf.c
@@ -36,17 +36,18 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/vcf.h"
#include "htslib/tbx.h"
#include "htslib/hfile.h"
+#include "htslib/khash_str2int.h"
#include "htslib/khash.h"
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
- typedef khash_t(vdict) vdict_t;
+typedef khash_t(vdict) vdict_t;
#include "htslib/kseq.h"
KSTREAM_DECLARE(gzFile, gzread)
- uint32_t bcf_float_missing = 0x7F800001;
- uint32_t bcf_float_vector_end = 0x7F800002;
- uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+uint32_t bcf_float_missing = 0x7F800001;
+uint32_t bcf_float_vector_end = 0x7F800002;
+uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
/*************************
@@ -57,17 +58,13 @@ int bcf_hdr_sync(bcf_hdr_t *h);
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
{
- if ( !s )
- {
- bcf_hdr_sync(h);
- return 0;
- }
+ if ( !s ) return 0;
const char *ss = s;
while ( !*ss && isspace(*ss) ) ss++;
if ( !*ss )
{
- fprintf(stderr,"[W::%s] Empty sample name: trailing spaces/tabs in the header line?\n", __func__);
+ fprintf(stderr,"[E::%s] Empty sample name: trailing spaces/tabs in the header line?\n", __func__);
abort();
}
@@ -80,18 +77,23 @@ int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
kh_val(d, k).id = kh_size(d) - 1;
} else {
if (hts_verbose >= 2)
- fprintf(stderr, "[W::%s] Duplicated sample name '%s'. Skipped.\n", __func__, s);
+ {
+ fprintf(stderr, "[E::%s] Duplicated sample name '%s'\n", __func__, s);
+ abort();
+ }
free(sdup);
return -1;
}
int n = kh_size(d);
h->samples = (char**) realloc(h->samples,sizeof(char*)*n);
h->samples[n-1] = sdup;
+ h->dirty = 1;
return 0;
}
-void bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str)
+int bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str)
{
+ int ret = 0;
int i = 0;
const char *p, *q;
// add samples
@@ -101,13 +103,14 @@ void bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str)
char *s = (char*)malloc(q - p + 1);
strncpy(s, p, q - p);
s[q - p] = 0;
- bcf_hdr_add_sample(h,s);
+ if ( bcf_hdr_add_sample(h,s) < 0 ) ret = -1;
free(s);
}
if (*q == 0 || *q == '\n') break;
p = q + 1;
}
bcf_hdr_add_sample(h,NULL);
+ return ret;
}
int bcf_hdr_sync(bcf_hdr_t *h)
@@ -142,6 +145,7 @@ int bcf_hdr_sync(bcf_hdr_t *h)
h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
}
}
+ h->dirty = 0;
return 0;
}
@@ -178,7 +182,7 @@ bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
if ( hrec->vals[i] ) out->vals[j] = strdup(hrec->vals[i]);
j++;
}
- if ( i!=j ) out->nkeys--; // IDX was omitted
+ if ( i!=j ) out->nkeys -= i-j; // IDX was omitted
return out;
}
@@ -350,8 +354,8 @@ int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
// Get the contig ID ($str) and length ($j)
i = bcf_hrec_find_key(hrec,"length");
- if ( i<0 ) return 0;
- if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0;
+ if ( i<0 ) j = 0;
+ else if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0;
i = bcf_hrec_find_key(hrec,"ID");
if ( i<0 ) return 0;
@@ -381,7 +385,7 @@ int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
kh_val(d, k) = bcf_idinfo_def;
kh_val(d, k).id = idx;
- kh_val(d, k).info[0] = i;
+ kh_val(d, k).info[0] = j;
kh_val(d, k).hrec[0] = hrec;
return 1;
@@ -414,6 +418,7 @@ int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
+ else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
else
{
@@ -449,6 +454,7 @@ int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
if ( kh_val(d, k).hrec[info&0xf] ) return 0;
kh_val(d, k).info[info&0xf] = info;
kh_val(d, k).hrec[info&0xf] = hrec;
+ if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id);
return 1;
}
kh_val(d, k) = bcf_idinfo_def;
@@ -494,6 +500,7 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
int n = ++hdr->nhrec;
hdr->hrec = (bcf_hrec_t**) realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
hdr->hrec[n-1] = hrec;
+ hdr->dirty = 1;
return hrec->type==BCF_HL_GEN ? 0 : 1;
}
@@ -579,9 +586,10 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
needs_sync += bcf_hdr_add_hrec(hdr, hrec);
p += len;
}
- bcf_hdr_parse_sample_line(hdr,p); // calls hdr_sync
+ int ret = bcf_hdr_parse_sample_line(hdr,p);
+ bcf_hdr_sync(hdr);
bcf_hdr_check_sanity(hdr);
- return 0;
+ return ret;
}
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
@@ -589,8 +597,7 @@ int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
int len;
bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
if ( !hrec ) return -1;
- if ( bcf_hdr_add_hrec(hdr, hrec) )
- bcf_hdr_sync(hdr);
+ bcf_hdr_add_hrec(hdr, hrec);
return 0;
}
@@ -637,8 +644,7 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
if ( i < hdr->nhrec )
memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
bcf_hrec_destroy(hrec);
-
- bcf_hdr_sync(hdr);
+ hdr->dirty = 1;
}
}
@@ -692,7 +698,7 @@ void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
free(hrec->value);
hrec->value = strdup(version);
}
- bcf_hdr_sync(hdr);
+ hdr->dirty = 1;
}
bcf_hdr_t *bcf_hdr_init(const char *mode)
@@ -735,7 +741,7 @@ void bcf_hdr_destroy(bcf_hdr_t *h)
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
{
- if (!hfp->is_bin)
+ if (hfp->format.format == vcf)
return vcf_hdr_read(hfp);
BGZF *fp = hfp->fp.bgzf;
@@ -766,9 +772,11 @@ bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
return h;
}
-int bcf_hdr_write(htsFile *hfp, const bcf_hdr_t *h)
+int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
{
- if (!hfp->is_bin) return vcf_hdr_write(hfp, h);
+ if ( h->dirty ) bcf_hdr_sync(h);
+ if (hfp->format.format == vcf || hfp->format.format == text_format)
+ return vcf_hdr_write(hfp, h);
int hlen;
char *htxt = bcf_hdr_fmt_text(h, 1, &hlen);
@@ -916,7 +924,7 @@ int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
{
- if (!fp->is_bin) return vcf_read(fp,h,v);
+ if (fp->format.format == vcf) return vcf_read(fp,h,v);
int ret = bcf_read1_core(fp->fp.bgzf, v);
if ( ret!=0 || !h->keep_samples ) return ret;
return bcf_subset_format(h,v);
@@ -1116,32 +1124,42 @@ static int bcf1_sync(bcf1_t *line)
return 0;
}
-bcf1_t *bcf_dup(bcf1_t *src)
+bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
{
bcf1_sync(src);
- bcf1_t *out = bcf_init1();
-
- out->rid = src->rid;
- out->pos = src->pos;
- out->rlen = src->rlen;
- out->qual = src->qual;
- out->n_info = src->n_info; out->n_allele = src->n_allele;
- out->n_fmt = src->n_fmt; out->n_sample = src->n_sample;
+ bcf_clear(dst);
+ dst->rid = src->rid;
+ dst->pos = src->pos;
+ dst->rlen = src->rlen;
+ dst->qual = src->qual;
+ dst->n_info = src->n_info; dst->n_allele = src->n_allele;
+ dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
- out->shared.m = out->shared.l = src->shared.l;
- out->shared.s = (char*) malloc(out->shared.l);
- memcpy(out->shared.s,src->shared.s,out->shared.l);
+ dst->shared.m = dst->shared.l = src->shared.l;
+ dst->shared.s = (char*) malloc(dst->shared.l);
+ memcpy(dst->shared.s,src->shared.s,dst->shared.l);
- out->indiv.m = out->indiv.l = src->indiv.l;
- out->indiv.s = (char*) malloc(out->indiv.l);
- memcpy(out->indiv.s,src->indiv.s,out->indiv.l);
+ dst->indiv.m = dst->indiv.l = src->indiv.l;
+ dst->indiv.s = (char*) malloc(dst->indiv.l);
+ memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
- return out;
+ return dst;
+}
+bcf1_t *bcf_dup(bcf1_t *src)
+{
+ bcf1_t *out = bcf_init1();
+ return bcf_copy(out, src);
}
int bcf_write(htsFile *hfp, const bcf_hdr_t *h, bcf1_t *v)
{
+ if ( h->dirty )
+ {
+ // we could as well call bcf_hdr_sync here, not sure
+ fprintf(stderr,"FIXME: dirty header not synced\n");
+ exit(1);
+ }
if ( bcf_hdr_nsamples(h)!=v->n_sample )
{
fprintf(stderr,"[%s:%d %s] Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
@@ -1149,7 +1167,8 @@ int bcf_write(htsFile *hfp, const bcf_hdr_t *h, bcf1_t *v)
return -1;
}
- if ( !hfp->is_bin ) return vcf_write(hfp,h,v);
+ if ( hfp->format.format == vcf || hfp->format.format == text_format )
+ return vcf_write(hfp,h,v);
if ( v->errcode )
{
@@ -1240,8 +1259,6 @@ bcf_hdr_t *vcf_hdr_read(htsFile *fp)
hrec->key = strdup("contig");
bcf_hrec_add_key(hrec, "ID", strlen("ID"));
bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
- bcf_hrec_add_key(hrec, "length", strlen("length"));
- bcf_hrec_set_val(hrec, hrec->nkeys-1, "2147483647", strlen("2147483647"), 0);
bcf_hdr_add_hrec(h, hrec);
need_sync = 1;
}
@@ -1343,7 +1360,7 @@ int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
char *htxt = bcf_hdr_fmt_text(h, 0, &hlen);
while (hlen && htxt[hlen-1] == 0) --hlen; // kill trailing zeros
int ret;
- if ( fp->is_compressed==1 )
+ if ( fp->format.compression!=no_compression )
ret = bgzf_write(fp->fp.bgzf, htxt, hlen);
else
ret = hwrite(fp->fp.hfile, htxt, hlen);
@@ -1546,7 +1563,15 @@ int _vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char
if (fmt[j].max_l < l - 1) fmt[j].max_l = l - 1;
if (fmt[j].is_gt && fmt[j].max_g < g) fmt[j].max_g = g;
l = 0, m = g = 1;
- if ( *r==':' ) j++;
+ if ( *r==':' )
+ {
+ j++;
+ if ( j>=v->n_fmt )
+ {
+ fprintf(stderr,"Incorrect number of FORMAT fields at %s:%d\n", h->id[BCF_DT_CTG][v->rid].key,v->pos+1);
+ exit(1);
+ }
+ }
else break;
}
else if ( *r== ',' ) m++;
@@ -1727,7 +1752,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
fprintf(stderr, "[W::%s] contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)\n", __func__, p);
kstring_t tmp = {0,0,0};
int l;
- ksprintf(&tmp, "##contig=<ID=%s,length=2147483647>", p);
+ ksprintf(&tmp, "##contig=<ID=%s>", p);
bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
free(tmp.s);
if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
@@ -1768,7 +1793,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
if (*(q-1) == ';') *(q-1) = 0;
for (r = p; *r; ++r)
if (*r == ';') ++n_flt;
- a = (int32_t*)alloca(n_flt * 4);
+ a = (int32_t*)alloca(n_flt * sizeof(int32_t));
// add filters
for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
*(char*)aux1.p = 0;
@@ -1810,6 +1835,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
for (end = val; *end != ';' && *end != 0; ++end);
c = *end; *end = 0;
} else end = r;
+ if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO
k = kh_get(vdict, d, key);
if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
{
@@ -1837,7 +1863,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
if (*t == ',') ++n_val;
if ((y>>4&0xf) == BCF_HT_INT) {
int32_t *z;
- z = (int32_t*)alloca(n_val<<2);
+ z = (int32_t*)alloca(n_val * sizeof(int32_t));
for (i = 0, t = val; i < n_val; ++i, ++t)
{
z[i] = strtol(t, &te, 10);
@@ -1852,7 +1878,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
if (strcmp(key, "END") == 0) v->rlen = z[0] - v->pos;
} else if ((y>>4&0xf) == BCF_HT_REAL) {
float *z;
- z = (float*)alloca(n_val<<2);
+ z = (float*)alloca(n_val * sizeof(float));
for (i = 0, t = val; i < n_val; ++i, ++t)
{
z[i] = strtod(t, &te);
@@ -2090,7 +2116,7 @@ int vcf_write_line(htsFile *fp, kstring_t *line)
{
int ret;
if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
- if ( fp->is_compressed==1 )
+ if ( fp->format.compression!=no_compression )
ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
else
ret = hwrite(fp->fp.hfile, line->s, line->l);
@@ -2102,7 +2128,7 @@ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
int ret;
fp->line.l = 0;
vcf_format1(h, v, &fp->line);
- if ( fp->is_compressed==1 )
+ if ( fp->format.compression!=no_compression )
ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
else
ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
@@ -2168,7 +2194,7 @@ int bcf_index_build(const char *fn, int min_shift)
htsFile *fp;
hts_idx_t *idx;
if ((fp = hts_open(fn, "rb")) == 0) return -1;
- if ( !fp->fp.bgzf->is_compressed ) { hts_close(fp); return -1; }
+ if ( fp->format.compression!=bgzf ) { hts_close(fp); return -1; }
idx = bcf_index(fp, min_shift);
hts_close(fp);
if ( !idx ) return -1;
@@ -2233,6 +2259,11 @@ int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different lengths\n", src->hrec[i]->vals[0]);
ret |= 1;
}
+ if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
+ {
+ fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different types\n", src->hrec[i]->vals[0]);
+ ret |= 1;
+ }
}
}
}
@@ -2256,7 +2287,9 @@ int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
for (i=0; i<src_hdr->n[dict]; i++)
{
- if ( i>=dst_hdr->n[dict] || strcmp(src_hdr->id[dict][i].key,dst_hdr->id[dict][i].key) )
+ if ( !src_hdr->id[dict][i].key || !dst_hdr->id[dict][i].key ) // gap left after removed BCF header lines
+ src_hdr->transl[dict][i] = -1;
+ else if ( i>=dst_hdr->n[dict] || strcmp(src_hdr->id[dict][i].key,dst_hdr->id[dict][i].key) )
{
src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
src_hdr->ntransl++;
@@ -2365,6 +2398,7 @@ bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
{
int hlen;
+ void *names_hash = khash_str2int_init();
char *htxt = bcf_hdr_fmt_text(h0, 1, &hlen);
kstring_t str;
bcf_hdr_t *h;
@@ -2385,10 +2419,20 @@ bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int
}
kputsn(htxt, p - htxt, &str);
for (i = 0; i < n; ++i) {
+ if ( khash_str2int_has_key(names_hash,samples[i]) )
+ {
+ fprintf(stderr,"[E::bcf_hdr_subset] Duplicate sample name \"%s\".\n", samples[i]);
+ free(str.s);
+ free(htxt);
+ khash_str2int_destroy(names_hash);
+ bcf_hdr_destroy(h);
+ return NULL;
+ }
imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
if (imap[i] < 0) continue;
kputc('\t', &str);
kputs(samples[i], &str);
+ khash_str2int_inc(names_hash,samples[i]);
}
} else kputsn(htxt, hlen, &str);
while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
@@ -2396,6 +2440,7 @@ bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int
bcf_hdr_parse(h, str.s);
free(str.s);
free(htxt);
+ khash_str2int_destroy(names_hash);
return h;
}
@@ -2849,7 +2894,7 @@ int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
if ( flt_id==line->d.flt[i] ) break;
if ( i==line->d.n_flt ) return 0; // the filter is not present
line->d.shared_dirty |= BCF1_DIRTY_FLT;
- if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,line->d.n_flt-i);
+ if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
line->d.n_flt--;
if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
return 0;
@@ -3138,30 +3183,30 @@ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, v
if ( !dst ) return -4; // could not alloc
}
-#define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end, out_type_t) { \
- out_type_t *tmp = (out_type_t *) *dst; \
- type_t *p = (type_t*) fmt->p; \
- for (i=0; i<nsmpl; i++) \
- { \
- for (j=0; j<fmt->n; j++) \
+ #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end, out_type_t) { \
+ out_type_t *tmp = (out_type_t *) *dst; \
+ type_t *p = (type_t*) fmt->p; \
+ for (i=0; i<nsmpl; i++) \
{ \
- if ( is_missing ) set_missing; \
- else if ( is_vector_end ) { set_vector_end; break; } \
- else *tmp = p[j]; \
- tmp++; \
+ for (j=0; j<fmt->n; j++) \
+ { \
+ if ( is_missing ) set_missing; \
+ else if ( is_vector_end ) { set_vector_end; break; } \
+ else *tmp = p[j]; \
+ tmp++; \
+ } \
+ for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
+ p = (type_t *)((char *)p + fmt->size); \
} \
- for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
- p = (type_t *)((char *)p + fmt->size); \
- } \
-}
-switch (fmt->type) {
- case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
- case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
- case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
- case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), float); break;
- default: fprintf(stderr,"TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->type); exit(1);
-}
-#undef BRANCH
-return nsmpl*fmt->n;
+ }
+ switch (fmt->type) {
+ case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), float); break;
+ default: fprintf(stderr,"TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->type); exit(1);
+ }
+ #undef BRANCH
+ return nsmpl*fmt->n;
}
diff --git a/htslib/vcfutils.c b/htslib/vcfutils.c
index 3f64836..91118e4 100644
--- a/htslib/vcfutils.c
+++ b/htslib/vcfutils.c
@@ -64,7 +64,11 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
}
#undef BRANCH_INT
- assert( an>=nac ); // sanity check for missing values
+ if ( an<nac )
+ {
+ fprintf(stderr,"[E::%s] Incorrect AN/AC counts at %s:%d\n", __func__,header->id[BCF_DT_CTG][line->rid].key, line->pos+1);
+ exit(1);
+ }
ac[0] = an - nac;
return 1;
}
@@ -80,7 +84,7 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
for (i=0; i<(int)line->n_fmt; i++)
if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
if ( !fmt_gt ) return 0;
- #define BRANCH_INT(type_t,missing,vector_end) { \
+ #define BRANCH_INT(type_t,vector_end) { \
for (i=0; i<line->n_sample; i++) \
{ \
type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
@@ -88,15 +92,20 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
for (ial=0; ial<fmt_gt->n; ial++) \
{ \
if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
- if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \
+ if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
+ if ( p[ial]>>1 > line->n_allele ) \
+ { \
+ fprintf(stderr,"[E::%s] Incorrect allele (\"%d\") in %s at %s:%d\n", __func__,(p[ial]>>1)-1, header->samples[i],header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \
+ exit(1); \
+ } \
ac[(p[ial]>>1)-1]++; \
} \
} \
}
switch (fmt_gt->type) {
- case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
- case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
- case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
}
#undef BRANCH_INT
@@ -108,12 +117,12 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal)
{
int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0;
- #define BRANCH_INT(type_t,missing,vector_end) { \
+ #define BRANCH_INT(type_t,vector_end) { \
type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
for (i=0; i<fmt_ptr->n; i++) \
{ \
if ( p[i] == vector_end ) break; /* smaller ploidy */ \
- if ( !p[i] || p[i] == missing ) continue; /* missing allele */ \
+ if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \
int tmp = p[i]>>1; \
if ( tmp>1 ) \
{ \
@@ -137,9 +146,9 @@ int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal)
} \
}
switch (fmt_ptr->type) {
- case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
- case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
- case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
}
#undef BRANCH_INT
@@ -165,7 +174,7 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
int *ac = (int*) calloc(line->n_allele,sizeof(int));
// check if all alleles are populated
- #define BRANCH(type_t,missing,vector_end) { \
+ #define BRANCH(type_t,vector_end) { \
for (i=0; i<line->n_sample; i++) \
{ \
type_t *p = (type_t*) (gt->p + i*gt->size); \
@@ -173,16 +182,16 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
for (ial=0; ial<gt->n; ial++) \
{ \
if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
- if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \
- if ( (p[ial]>>1)-1 >= line->n_allele ) return -1; \
+ if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
+ if ( (p[ial]>>1)-1 >= line->n_allele ) { free(ac); return -1; } \
ac[(p[ial]>>1)-1]++; \
} \
} \
}
switch (gt->type) {
- case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
- case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
- case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+ case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
}
#undef BRANCH
@@ -416,7 +425,7 @@ void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
{
for (j=0; j<nret; j++)
{
- if ( ptr[j]==bcf_gt_missing ) continue;
+ if ( bcf_gt_is_missing(ptr[j]) ) continue;
if ( ptr[j]==bcf_int32_vector_end ) break;
int al = bcf_gt_allele(ptr[j]);
assert( al<nR_ori && map[al]>=0 );
@@ -563,19 +572,19 @@ void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
int nori = nret / line->n_sample;
if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G
{
- int ntop, inc = 0;
+ int inc = 0, nnew;
if ( vlen==BCF_VL_A )
{
assert( nori==nA_ori ); // todo: will fail if all values are missing
- ntop = nA_ori;
ndat = nA_new*line->n_sample;
+ nnew = nA_new;
inc = 1;
}
else
{
assert( nori==nR_ori ); // todo: will fail if all values are missing
- ntop = nR_ori;
ndat = nR_new*line->n_sample;
+ nnew = nR_new;
}
#define BRANCH(type_t,is_vector_end) \
@@ -583,14 +592,14 @@ void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
for (j=0; j<line->n_sample; j++) \
{ \
type_t *ptr_src = ((type_t*)dat) + j*nori; \
- type_t *ptr_dst = ((type_t*)dat) + j*nA_new; \
+ type_t *ptr_dst = ((type_t*)dat) + j*nnew; \
int size = sizeof(type_t); \
int k_src, k_dst = 0; \
- for (k_src=0; k_src<ntop; k_src++) \
+ for (k_src=0; k_src<nori; k_src++) \
{ \
if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
if ( rm_mask & 1<<(k_src+inc) ) continue; \
- if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
+ memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
k_dst++; \
} \
} \
@@ -622,7 +631,7 @@ void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
for (k_src=0; k_src<nR_ori; k_src++) \
{ \
if ( rm_mask & 1<<k_src ) continue; \
- if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
+ memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
k_dst++; \
} \
memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
@@ -635,9 +644,9 @@ void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
for (ib=0; ib<=ia; ib++) \
{ \
k_src++; \
- if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \
+ if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \
if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \
- if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
+ memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
k_dst++; \
} \
} \
diff --git a/htslib/version.h b/htslib/version.h
index 6a43f98..0185c52 100644
--- a/htslib/version.h
+++ b/htslib/version.h
@@ -1 +1 @@
-#define HTS_VERSION "1.1"
+#define HTS_VERSION "1.2.1"
diff --git a/install-CGAT-tools.sh b/install-CGAT-tools.sh
index ae9a6cb..e8055f9 100755
--- a/install-CGAT-tools.sh
+++ b/install-CGAT-tools.sh
@@ -155,11 +155,11 @@ mkdir -p $HOME/CGAT/external-tools
cd $HOME/CGAT/external-tools
# install samtools
-curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.1/samtools-1.1.tar.bz2 > samtools-1.1.tar.bz2
-tar xjvf samtools-1.1.tar.bz2
-cd samtools-1.1
+curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.2/samtools-1.2.tar.bz2 > samtools-1.2.tar.bz2
+tar xjvf samtools-1.2.tar.bz2
+cd samtools-1.2
make
-PATH=$PATH:$HOME/CGAT/external-tools/samtools-1.1
+PATH=$PATH:$HOME/CGAT/external-tools/samtools-1.2
popd
diff --git a/pysam/TabProxies.pyx b/pysam/TabProxies.pyx
index bc2742d..0add831 100644
--- a/pysam/TabProxies.pyx
+++ b/pysam/TabProxies.pyx
@@ -390,84 +390,99 @@ cdef class GTFProxy(TupleProxy):
cdef int getMinFields(self):
'''return minimum number of fields.'''
- return 3
+ return 9
cdef int getMaxFields(self):
'''return max number of fields.'''
return 9
property contig:
- '''contig of feature.'''
- def __get__(self):
- return self._getindex(0)
- def __set__(self, value):
- self._setindex(0, value)
+ '''contig of feature.'''
+ def __get__(self):
+ return self._getindex(0)
+ def __set__(self, value):
+ self._setindex(0, value)
property source:
- '''feature source.'''
- def __get__(self):
- return self._getindex(1)
- def __set__(self, value):
- self._setindex(1, value)
+ '''feature source.'''
+ def __get__(self):
+ return self._getindex(1)
+ def __set__(self, value):
+ self._setindex(1, value)
property feature:
- '''feature name.'''
- def __get__( self ): return self._getindex( 2 )
- def __set__( self, value ): self._setindex( 2, value )
+ '''feature name.'''
+ def __get__(self):
+ return self._getindex(2)
+ def __set__(self, value):
+ self._setindex(2, value)
property start:
- '''feature start (in 0-based open/closed coordinates).'''
- def __get__( self ): return int( self._getindex( 3 )) - 1
- def __set__( self, value ): self._setindex( 3, str(value+1) )
+ '''feature start (in 0-based open/closed coordinates).'''
+ def __get__(self ):
+ return int( self._getindex(3)) - 1
+ def __set__(self, value ):
+ self._setindex(3, str(value+1))
property end:
- '''feature end (in 0-based open/closed coordinates).'''
- def __get__( self ): return int( self._getindex( 4 ) )
- def __set__( self, value ): self._setindex( 4, str(value) )
+ '''feature end (in 0-based open/closed coordinates).'''
+ def __get__(self):
+ return int(self._getindex(4))
+ def __set__(self, value):
+ self._setindex(4, str(value))
property score:
- '''feature score.'''
- def __get__( self ):
- v = self._getindex(5)
- if v == "" or v[0] == '.':
- return None
- else:
- return float(v)
+ '''feature score.'''
+ def __get__(self):
+ v = self._getindex(5)
+ if v == "" or v[0] == '.':
+ return None
+ else:
+ return float(v)
- def __set__( self, value ): self._setindex( 5, value )
+ def __set__(self, value):
+ self._setindex(5, value)
property strand:
- '''feature strand.'''
- def __get__( self ): return self._getindex( 6 )
- def __set__( self, value ): self._setindex( 6, value )
+ '''feature strand.'''
+ def __get__(self ):
+ return self._getindex(6)
+ def __set__(self, value ):
+ self._setindex(6, value)
property frame:
'''feature frame.'''
- def __get__( self ): return self._getindex( 7 )
- def __set__( self, value ): self._setindex( 7, value )
+ def __get__(self):
+ return self._getindex(7)
+ def __set__(self, value):
+ self._setindex(7, value)
property attributes:
- '''feature attributes (as a string).'''
- def __get__( self ):
- if self.hasOwnAttributes:
- return self._attributes
- else:
- return self._getindex(8)
- def __set__( self, value ):
- if self.hasOwnAttributes:
- free(self._attributes)
- self._attributes = NULL
- self.hasOwnAttributes = False
- self._setindex(8, value )
+ '''feature attributes (as a string).'''
+ def __get__(self):
+ if self.hasOwnAttributes:
+ return self._attributes
+ else:
+ return self._getindex(8)
+ def __set__( self, value):
+ if self.hasOwnAttributes:
+ free(self._attributes)
+ self._attributes = NULL
+ self.hasOwnAttributes = False
+ self._setindex(8, value)
cdef char * getAttributes(self):
- '''return pointer to attributes.'''
- if self.hasOwnAttributes:
- return self._attributes
- else:
- return self.fields[8]
+ '''return pointer to attributes.'''
+ cdef char * attributes
+ if self.hasOwnAttributes:
+ attributes = self._attributes
+ else:
+ attributes = self.fields[8]
+ if attributes == NULL:
+ raise KeyError("no attributes defined GTF entry")
+ return attributes
- def asDict( self ):
+ def asDict(self):
"""parse attributes - return as dict
"""
@@ -475,11 +490,21 @@ cdef class GTFProxy(TupleProxy):
attributes = self.attributes
# separate into fields
- fields = [x.strip() for x in attributes.split(";")[:-1]]
+ # Fields might contain a ";", for example in ENSEMBL GTF file
+ # for mouse, v78:
+ # ...; transcript_name "TXNRD2;-001"; ....
+ # The current heuristic is to split on a semicolon followed by a
+ # space, see also http://mblab.wustl.edu/GTF22.html
+ fields = [x.strip() for x in attributes.split("; ")]
result = {}
for f in fields:
+
+ # strip semicolon (GTF files without a space after the last semicolon)
+ if f.endswith(";"):
+ f = f[:-1]
+
# split at most once in order to avoid separating
# multi-word values
d = [x.strip() for x in string.split(f, " ", maxsplit=1)]
@@ -523,10 +548,10 @@ cdef class GTFProxy(TupleProxy):
a = "; ".join( aa ) + ";"
p = a
l = len(a)
- self._attributes = <char *>calloc( l + 1, sizeof(char) )
+ self._attributes = <char *>calloc(l + 1, sizeof(char))
if self._attributes == NULL:
- raise ValueError("out of memory" )
- memcpy( self._attributes, p, l )
+ raise ValueError("out of memory")
+ memcpy(self._attributes, p, l)
self.hasOwnAttributes = True
self.is_modified = True
@@ -560,11 +585,12 @@ cdef class GTFProxy(TupleProxy):
end = max(self.start, self.end)
self.start, self.end = lcontig - end, lcontig - start
- def keys( self ):
+ def keys(self):
'''return a list of attributes defined in this entry.'''
r = self.attributes
return [x.strip().split(" ")[0]
- for x in r.split(";") if x.strip() != '']
+ # separator is ';' followed by space
+ for x in r.split("; ") if x.strip() != '']
def __getitem__(self, key):
return self.__getattr__(key)
@@ -593,6 +619,8 @@ cdef class GTFProxy(TupleProxy):
# disappeard after accessing the C data structures
# directly and so did the bug.
cdef char * attributes = self.getAttributes()
+ if attributes == NULL:
+ raise KeyError("key %s not found, no attributes" % item)
# add space in order to make sure
# to not pick up a field that is a prefix of another field
diff --git a/pysam/__init__.py b/pysam/__init__.py
index e38427a..efe39fd 100644
--- a/pysam/__init__.py
+++ b/pysam/__init__.py
@@ -10,6 +10,8 @@ import pysam.cfaidx as cfaidx
from pysam.cfaidx import *
import pysam.cvcf as cvcf
from pysam.cvcf import *
+import pysam.cbcf as cbcf
+from pysam.cbcf import *
import pysam.csamtools as csamtools
import pysam.Pileup as Pileup
@@ -61,7 +63,8 @@ class SamtoolsDispatcher(object):
'''execute a samtools command
'''
retval, stderr, stdout = csamtools._samtools_dispatch(
- self.dispatch, args)
+ self.dispatch, args, catch_stdout=kwargs.get("catch_stdout", True))
+
if retval:
raise SamtoolsError(
'csamtools returned with error %i: %s' %
@@ -76,8 +79,7 @@ class SamtoolsDispatcher(object):
if not (x.startswith("[sam_header_read2]") or
x.startswith("[bam_index_load]") or
x.startswith("[bam_sort_core]") or
- x.startswith("[samopen] SAM header is present"))
- ]
+ x.startswith("[samopen] SAM header is present"))]
if stderr:
raise SamtoolsError("\n".join(stderr))
@@ -145,6 +147,7 @@ __all__ = \
libchtslib.__all__ + \
ctabix.__all__ + \
cvcf.__all__ +\
+ cbcf.__all__ +\
cfaidx.__all__ +\
calignmentfile.__all__ +\
csamfile.__all__ +\
@@ -180,4 +183,5 @@ def get_libraries():
'cfaidx.so',
'csamfile.so',
'cvcf.so',
+ 'cbcf.so',
'ctabix.so')]
diff --git a/pysam/calignmentfile.pxd b/pysam/calignmentfile.pxd
index 1eb925e..b75c1fd 100644
--- a/pysam/calignmentfile.pxd
+++ b/pysam/calignmentfile.pxd
@@ -12,6 +12,9 @@ cdef extern from *:
cdef extern from "htslib_util.h":
+ int hts_set_verbosity(int verbosity)
+ int hts_get_verbosity()
+
# add *nbytes* into the variable length data of *src* at *pos*
bam1_t * pysam_bam_update(bam1_t * b,
size_t nbytes_old,
@@ -69,7 +72,14 @@ cdef class AlignedSegment:
# add an alignment tag with value to the AlignedSegment
# an existing tag of the same name will be replaced.
- cpdef setTag( self, tag, value, value_type = ?, replace = ? )
+ cpdef set_tag(self, tag, value, value_type=?, replace=?)
+
+ # add an alignment tag with value to the AlignedSegment
+ # an existing tag of the same name will be replaced.
+ cpdef get_tag(self, tag)
+
+ # return true if tag exists
+ cpdef has_tag(self, tag)
cdef class AlignmentFile:
@@ -78,19 +88,18 @@ cdef class AlignmentFile:
# pointer to htsFile structure
cdef htsFile * htsfile
- # pointer to compressed file
- cdef BGZF * fp
-
# pointer to index
cdef hts_idx_t *index
# header structure
cdef bam_hdr_t * header
- # true if file is a bam file
- cdef int isbam
+ # true if file is bam format
+ cdef readonly bint is_bam
+ # true if file is bam format
+ cdef readonly bint is_cram
# true if not a file but a stream
- cdef int isstream
+ cdef readonly bint is_stream
# true if file is not on the local filesystem
- cdef int isremote
+ cdef readonly bint is_remote
# current read within iteration
cdef bam1_t * b
# file opening mode
@@ -104,7 +113,7 @@ cdef class AlignmentFile:
cdef int cnext(self)
# write an aligned read
- cpdef int write(self, AlignedSegment read)
+ cpdef int write(self, AlignedSegment read) except -1
cdef char * _getrname(self, int tid)
@@ -122,6 +131,7 @@ cdef class PileupRead:
cdef uint32_t _is_del
cdef uint32_t _is_head
cdef uint32_t _is_tail
+ cdef uint32_t _is_refskip
cdef class IteratorRow:
cdef int retval
@@ -155,7 +165,6 @@ cdef class IteratorRowSelection(IteratorRow):
cdef positions
cdef bam1_t * getCurrent( self )
cdef int cnext(self)
- cdef BGZF * fp
cdef class IteratorColumn:
@@ -198,5 +207,4 @@ cdef class IndexedReads:
cdef htsFile * htsfile
cdef index
cdef int owns_samfile
- cdef BGZF * fp
cdef bam_hdr_t * header
diff --git a/pysam/calignmentfile.pyx b/pysam/calignmentfile.pyx
index c9b01a8..533b0ff 100644
--- a/pysam/calignmentfile.pyx
+++ b/pysam/calignmentfile.pyx
@@ -23,6 +23,8 @@ from cpython cimport array
from cpython.version cimport PY_MAJOR_VERSION
+cimport cython
+
########################################################################
########################################################################
########################################################################
@@ -151,6 +153,7 @@ cdef makePileupRead(bam_pileup1_t * src):
dest._is_del = src.is_del
dest._is_head = src.is_head
dest._is_tail = src.is_tail
+ dest._is_refskip = src.is_refskip
return dest
cdef convertBinaryTagToList( uint8_t * s ):
@@ -234,7 +237,7 @@ VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"),
cdef class AlignmentFile:
'''*(filename, mode=None, template = None,
- referencenames=None, referencelengths = None,
+ reference_names=None, reference_lengths = None,
text=NULL, header=None,
add_sq_text=False, check_header=True,
check_sq=True)*
@@ -282,12 +285,15 @@ cdef class AlignmentFile:
3. If *text* is given, new header text is copied from raw
text.
- 4. The names (*referencenames*) and lengths
- (*referencelengths*) are supplied directly as lists. By
+ 4. The names (*reference_names*) and lengths
+ (*reference_lengths*) are supplied directly as lists. By
default, 'SQ' and 'LN' tags will be added to the header
text. This option can be changed by unsetting the flag
*add_sq_text*.
+ When writing a CRAM file, a fasta formatted file containing the
+ reference sequences can be supplied via *reference_filename*.
+
By default, if a file is opened in mode 'r', it is checked
for a valid header (*check_header* = True) and a definition of
chromosome names (*check_sq* = True).
@@ -297,18 +303,21 @@ cdef class AlignmentFile:
def __cinit__(self, *args, **kwargs ):
self.htsfile = NULL
self._filename = None
- self.isbam = False
- self.isstream = False
+ self.is_bam = False
+ self.is_stream = False
+ self.is_cram = False
+ self.is_remote = False
+
self._open(*args, **kwargs)
# allocate memory for iterator
self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
- def _isOpen( self ):
+ def _isOpen(self):
'''return true if htsfile has been opened.'''
return self.htsfile != NULL
- def _hasIndex( self ):
+ def _hasIndex(self):
'''return true if htsfile has an existing (and opened) index.'''
return self.index != NULL
@@ -316,27 +325,36 @@ cdef class AlignmentFile:
filename,
mode=None,
AlignmentFile template=None,
- referencenames=None,
- referencelengths=None,
+ reference_names=None,
+ reference_lengths=None,
+ reference_filename=None,
text=None,
header=None,
port=None,
add_sq_text=True,
check_header=True,
- check_sq=True):
- '''open a sam/bam file.
+ check_sq=True,
+ referencenames=None,
+ referencelengths=None):
+ '''open a sam, bam or cram formatted file.
- If _open is called on an existing bamfile, the current file will be
- closed and a new file will be opened.
+ If _open is called on an existing file, the current file
+ will be closed and a new file will be opened.
'''
+ # for backwards compatibility:
+ if referencenames is not None:
+ reference_names = referencenames
+ if referencelengths is not None:
+ reference_lengths = referencelengths
# read mode autodetection
if mode is None:
try:
self._open(filename, 'rb',
template=template,
- referencenames=referencenames,
- referencelengths=referencelengths,
+ reference_names=reference_names,
+ reference_lengths=reference_lengths,
+ reference_filename=reference_filename,
text=text,
header=header,
port=port,
@@ -348,8 +366,9 @@ cdef class AlignmentFile:
self._open(filename, 'r',
template=template,
- referencenames=referencenames,
- referencelengths=referencelengths,
+ reference_names=reference_names,
+ reference_lengths=reference_lengths,
+ reference_filename=reference_filename,
text=text,
header=header,
port=port,
@@ -357,7 +376,9 @@ cdef class AlignmentFile:
check_sq=check_sq)
return
- assert mode in ("r","w","rb","wb", "wh", "wbu", "rU", "wb0"), \
+ assert mode in ("r","w","rb","wb", "wh",
+ "wbu", "rU", "wb0",
+ "rc", "wc"), \
"invalid file opening mode `%s`" % mode
# close a previously opened file
@@ -370,12 +391,13 @@ cdef class AlignmentFile:
cdef bytes bmode = mode.encode('ascii')
self._filename = filename = _encodeFilename(filename)
- self.isstream = filename == b"-"
-
- self.isbam = len(mode) > 1 and mode[1] == 'b'
- self.isremote = filename.startswith(b"http:") or \
- filename.startswith(b"ftp:")
+ # FIXME: Use htsFormat when it is available
+ self.is_bam = len(mode) > 1 and mode[1] == 'b'
+ self.is_cram = len(mode) > 1 and mode[1] == 'c'
+ self.is_stream = filename == b"-"
+ self.is_remote = filename.startswith(b"http:") or \
+ filename.startswith(b"ftp:")
cdef char * ctext
ctext = NULL
@@ -390,27 +412,27 @@ cdef class AlignmentFile:
self.header = self._buildHeader(header)
else:
# build header from a target names and lengths
- assert referencenames and referencelengths, \
+ assert reference_names and reference_lengths, \
("either supply options `template`, `header` "
- "or both `referencenames` and `referencelengths` "
+ "or both `reference_names` and `reference_lengths` "
"for writing")
- assert len(referencenames) == len(referencelengths), \
+ assert len(reference_names) == len(reference_lengths), \
"unequal names and lengths of reference sequences"
# allocate and fill header
- referencenames = [_forceBytes(ref) for ref in referencenames]
+ reference_names = [_forceBytes(ref) for ref in reference_names]
self.header = bam_hdr_init()
- self.header.n_targets = len(referencenames)
+ self.header.n_targets = len(reference_names)
n = 0
- for x in referencenames:
+ for x in reference_names:
n += len(x) + 1
self.header.target_name = <char**>calloc(
n, sizeof(char*))
self.header.target_len = <uint32_t*>calloc(
n, sizeof(uint32_t))
for x from 0 <= x < self.header.n_targets:
- self.header.target_len[x] = referencelengths[x]
- name = referencenames[x]
+ self.header.target_len[x] = reference_lengths[x]
+ name = reference_names[x]
self.header.target_name[x] = <char*>calloc(
len(name) + 1, sizeof(char))
strncpy(self.header.target_name[x], name, len(name))
@@ -421,8 +443,8 @@ cdef class AlignmentFile:
text = []
for x from 0 <= x < self.header.n_targets:
text.append("@SQ\tSN:%s\tLN:%s\n" % \
- (_forceStr(referencenames[x]),
- referencelengths[x]))
+ (_forceStr(reference_names[x]),
+ reference_lengths[x]))
text = ''.join(text)
if text is not None:
@@ -434,43 +456,46 @@ cdef class AlignmentFile:
strlen(ctext), sizeof(char))
memcpy(self.header.text, ctext, strlen(ctext))
- # open file. Header gets written to file at the same time for bam files
- # and sam files (in the latter case, the mode needs to be wh)
+ # open file (hts_open is synonym with sam_open)
self.htsfile = hts_open(filename, bmode)
-
- # for compatibility - "w" writes sam file without header
- if self.isbam or "h" in mode:
- # write header to htsfile
+
+ # set filename with reference sequences. If no filename
+ # is given, the CRAM reference arrays will be built from
+ # the @SQ lines in the header
+ if self.is_cram and reference_filename:
+ # note that fn_aux takes ownership, so create
+ # a copy
+ fn = _encodeFilename(reference_filename)
+ self.htsfile.fn_aux = strdup(fn)
+
+ # write header to htsfile
+ if self.is_bam or self.is_cram or "h" in mode:
sam_hdr_write(self.htsfile, self.header)
-
+
elif mode[0] == "r":
# open file for reading
if (filename != b"-"
- and not self.isremote
+ and not self.is_remote
and not os.path.exists(filename)):
raise IOError("file `%s` not found" % filename)
- # try to detect errors
+ # open file (hts_open is synonym with sam_open)
self.htsfile = hts_open(filename, bmode)
if self.htsfile == NULL:
raise ValueError(
"could not open file (mode='%s') - "
"is it SAM/BAM format?" % mode)
- # get file pointer
- # TODO: this is specific to BAM files
- # refactor to make generalizable
- self.fp = self.htsfile.fp.bgzf
-
# bam files require a valid header
- if self.isbam:
+ if self.is_bam or self.is_cram:
self.header = sam_hdr_read(self.htsfile)
if self.header == NULL:
raise ValueError(
"file does not have valid header (mode='%s') "
"- is it BAM format?" % mode )
else:
- # in sam files it is optional (htsfile full of unmapped reads)
+ # in sam files it is optional (htsfile full of
+ # unmapped reads)
if check_header:
self.header = sam_hdr_read(self.htsfile)
if self.header == NULL:
@@ -490,24 +515,42 @@ cdef class AlignmentFile:
raise IOError("could not open file `%s`" % filename )
# check for index and open if present
- if mode[0] == "r" and self.isbam:
+ cdef int format_index = -1
+ if self.is_bam:
+ format_index = HTS_FMT_BAI
+ elif self.is_cram:
+ format_index = HTS_FMT_CRAI
+
+ if mode[0] == "r" and (self.is_bam or self.is_cram):
- if not self.isremote:
- if not os.path.exists(filename + b".bai") \
- and not os.path.exists( filename[:-4] + b".bai"):
+ # open index for remote files
+ if self.is_remote:
+ self.index = hts_idx_load(filename, format_index)
+ if self.index == NULL:
+ warnings.warn(
+ "unable to open remote index for '%s'" % filename)
+ else:
+ if self.is_bam \
+ and not os.path.exists(filename + b".bai") \
+ and not os.path.exists(filename[:-4] + b".bai"):
+ self.index = NULL
+ elif self.is_cram \
+ and not os.path.exists(filename + b".crai") \
+ and not os.path.exists(filename[:-4] + b".crai"):
self.index = NULL
else:
- # returns NULL if there is no index or index could not be opened
- self.index = hts_idx_load(filename, HTS_FMT_BAI)
+ # returns NULL if there is no index or index could
+ # not be opened
+ self.index = sam_index_load(self.htsfile,
+ filename)
if self.index == NULL:
- raise IOError("error while opening index `%s` " % filename )
- else:
- self.index = hts_idx_load(filename, HTS_FMT_BAI)
- if self.index == NULL:
- warnings.warn("unable to open index for `%s` " % filename)
+ raise IOError(
+ "error while opening index for '%s'" %
+ filename)
- if not self.isstream:
- self.start_offset = bgzf_tell(self.fp)
+ # save start of data section
+ if not self.is_stream:
+ self.start_offset = self.tell()
def gettid(self, reference):
'''
@@ -613,18 +656,19 @@ cdef class AlignmentFile:
return self.seek(self.start_offset, 0)
def seek(self, uint64_t offset, int where = 0):
- '''
- move file pointer to position *offset*, see :meth:`pysam.AlignmentFile.tell`.
+ '''move file pointer to position *offset*, see
+ :meth:`pysam.AlignmentFile.tell`.
'''
if not self._isOpen():
- raise ValueError( "I/O operation on closed file" )
- if not self.isbam:
- raise NotImplementedError("seek only available in bam files")
- if self.isstream:
+ raise ValueError("I/O operation on closed file")
+ if not self.is_bam:
+ raise NotImplementedError(
+ "seek only available in bam files")
+ if self.is_stream:
raise OSError("seek no available in streams")
- return bgzf_seek(self.fp, offset, where)
+ return bgzf_seek(hts_get_bgzfp(self.htsfile), offset, where)
def tell(self):
'''
@@ -632,10 +676,11 @@ cdef class AlignmentFile:
'''
if not self._isOpen():
raise ValueError("I/O operation on closed file")
- if not self.isbam:
- raise NotImplementedError("seek only available in bam files")
+ if not (self.is_bam or self.is_cram):
+ raise NotImplementedError(
+ "seek only available in bam files")
- return bgzf_tell(self.fp)
+ return bgzf_tell(hts_get_bgzfp(self.htsfile))
def fetch(self,
reference=None,
@@ -646,25 +691,26 @@ cdef class AlignmentFile:
callback=None,
until_eof=False,
multiple_iterators=False):
- '''fetch aligned reads in a :term:`region` using 0-based indexing. The
- region is specified by :term:`reference`, *start* and
- *end*. Alternatively, a samtools :term:`region` string can be
- supplied.
+ '''fetch aligned, i.e. mapped, reads in a :term:`region`
+ using 0-based
+ indexing. The region is specified by :term:`reference`,
+ *start* and *end*. Alternatively, a samtools :term:`region`
+ string can be supplied.
Without *reference* or *region* all mapped reads will be
fetched. The reads will be returned ordered by reference
sequence, which will not necessarily be the order within the
- file.
+ file.
If *until_eof* is given, all reads from the current file
position will be returned in order as they are within the
file. Using this option will also fetch unmapped reads.
Set *multiple_iterators* to true if you will be using multiple
- iterators on the same file at the same time. The iterator returned
- will receive its own copy of a filehandle to the file effectively
- re-opening the file. Re-opening a file creates some
- overhead, so beware.
+ iterators on the same file at the same time. The iterator
+ returned will receive its own copy of a filehandle to the file
+ effectively re-opening the file. Re-opening a file creates
+ some overhead, so beware.
If only *reference* is set, all reads aligned to *reference*
will be fetched.
@@ -685,12 +731,14 @@ cdef class AlignmentFile:
tid)
# Turn of re-opening if htsfile is a stream
- if self.isstream:
+ if self.is_stream:
multiple_iterators = False
- if self.isbam:
- if not until_eof and not self._hasIndex() and not self.isremote:
- raise ValueError("fetch called on bamfile without index")
+ if self.is_bam or self.is_cram:
+ if not until_eof and not self.is_remote:
+ if not self._hasIndex():
+ raise ValueError(
+ "fetch called on bamfile without index")
if has_coord:
return IteratorRowRegion(
@@ -698,22 +746,27 @@ cdef class AlignmentFile:
multiple_iterators=multiple_iterators)
else:
if until_eof:
- return IteratorRowAll(self,
- multiple_iterators=multiple_iterators)
+ return IteratorRowAll(
+ self,
+ multiple_iterators=multiple_iterators)
else:
- # AH: check - reason why no multiple_iterators for AllRefs?
- return IteratorRowAllRefs(self,
- multiple_iterators=multiple_iterators)
+ # AH: check - reason why no multiple_iterators for
+ # AllRefs?
+ return IteratorRowAllRefs(
+ self,
+ multiple_iterators=multiple_iterators)
else:
if has_coord:
raise ValueError(
"fetching by region is not available for sam files")
if callback:
- raise NotImplementedError("callback not implemented yet")
+ raise NotImplementedError(
+ "callback not implemented yet")
if self.header == NULL:
- raise ValueError("fetch called for htsfile without header")
+ raise ValueError(
+ "fetch called for htsfile without header")
# check if targets are defined
# give warning, sam_read1 segfaults
@@ -850,12 +903,13 @@ cdef class AlignmentFile:
Possible options for the stepper are
``all``
- use all reads for pileup.
-
- ``pass``
skip reads in which any of the following flags are set:
BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
+ ``nofilter``
+ uses every single read
+
+
``samtools``
same filter and read processing as in :term:`csamtools`
pileup. This requires a *fastafile* to be given.
@@ -894,7 +948,7 @@ cdef class AlignmentFile:
has_coord, rtid, rstart, rend = self._parseRegion(
reference, start, end, region )
- if self.isbam:
+ if self.is_bam or self.is_cram:
if not self._hasIndex():
raise ValueError("no index available for pileup")
@@ -910,7 +964,75 @@ cdef class AlignmentFile:
else:
raise NotImplementedError( "pileup of samfiles not implemented yet" )
- def close( self ):
+ @cython.boundscheck(False) # we do manual bounds checking
+ def count_coverage(self, chr, start, stop, quality_threshold = 15,
+ read_callback = 'all'):
+ """Count ACGT in a part of an AlignmentFile.
+ Return 4 array.arrays of length = stop - start,
+ in order A C G T.
+
+ @quality_threshold is the minimum quality score (in phred) a
+ base has to reach to be counted. Possible @read_callback
+ values are
+
+ ``all``
+ skip reads in which any of the following
+ flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
+ BAM_FDUP
+
+ ``nofilter``
+ uses every single read
+
+ Alternatively, @read_callback can be a function ``check_read(read)``
+ that should return True only for those reads that shall be included in
+ the counting.
+
+ """
+
+ cdef int _start = start
+ cdef int _stop = stop
+ cdef int length = _stop - _start
+ cdef array.array int_array_template = array.array('L', [])
+ cdef array.array count_a
+ cdef array.array count_c
+ cdef array.array count_g
+ cdef array.array count_t
+ count_a = array.clone(int_array_template, length, zero=True)
+ count_c = array.clone(int_array_template, length, zero=True)
+ count_g = array.clone(int_array_template, length, zero=True)
+ count_t = array.clone(int_array_template, length, zero=True)
+
+ cdef char * seq
+ cdef array.array quality
+ cdef int qpos
+ cdef int refpos
+ cdef int c = 0
+ cdef int _threshold = quality_threshold
+ for read in self.fetch(chr, start, stop):
+ if read_callback == 'all':
+ if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
+ continue
+ elif read_callback == 'nofilter':
+ pass
+ else:
+ if not read_callback(read):
+ continue
+ seq = read.seq
+ quality = read.query_qualities
+ for qpos, refpos in read.get_aligned_pairs(True):
+ if qpos is not None and refpos is not None and _start <= refpos < _stop:
+ if quality[qpos] > quality_threshold:
+ if seq[qpos] == 'A':
+ count_a.data.as_ulongs[refpos - _start] += 1
+ if seq[qpos] == 'C':
+ count_c.data.as_ulongs[refpos - _start] += 1
+ if seq[qpos] == 'G':
+ count_g.data.as_ulongs[refpos - _start] += 1
+ if seq[qpos] == 'T':
+ count_t.data.as_ulongs[refpos - _start] += 1
+ return count_a, count_c, count_g, count_t
+
+ def close(self):
'''
closes the :class:`pysam.AlignmentFile`.'''
if self.htsfile != NULL:
@@ -918,10 +1040,16 @@ cdef class AlignmentFile:
hts_idx_destroy(self.index);
self.htsfile = NULL
- def __dealloc__( self ):
+ def __dealloc__(self):
# remember: dealloc cannot call other methods
# note: no doc string
# note: __del__ is not called.
+
+ # FIXME[kbj]: isn't self.close a method? I've been duplicating
+ # close within __dealloc__ (see BCFFile.__dealloc__). Not a pretty
+ # solution and perhaps unnecessary given that calling self.close has
+ # been working for years.
+
self.close()
bam_destroy1(self.b)
if self.header != NULL:
@@ -936,11 +1064,17 @@ cdef class AlignmentFile:
if not self._isOpen():
return 0
- x = sam_write1(self.htsfile,
- self.header,
- read._delegate)
+ cdef int ret = sam_write1(self.htsfile,
+ self.header,
+ read._delegate)
- return x
+ # kbj: Still need to raise an exception with except -1. Otherwise
+ # when ret == -1 we get a "SystemError: error return without
+ # exception set".
+ if ret < 0:
+ raise ValueError('sam write failed')
+
+ return ret
def __enter__(self):
return self
@@ -955,7 +1089,7 @@ cdef class AlignmentFile:
## properties
###############################################################
property filename:
- '''number of :term:`filename` associated with this object.'''
+ '''filename associated with this object.'''
def __get__(self):
return self._filename
@@ -981,14 +1115,16 @@ cdef class AlignmentFile:
"""
def __get__(self):
- if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not self._isOpen():
+ raise ValueError("I/O operation on closed file")
t = []
for x from 0 <= x < self.header.n_targets:
- t.append( self.header.target_len[x] )
+ t.append(self.header.target_len[x])
return tuple(t)
property mapped:
- """total number of mapped alignments in file.
+ """total number of mapped alignments according
+ to the statistics recorded in the index.
"""
def __get__(self):
self._checkIndex()
@@ -1005,15 +1141,18 @@ cdef class AlignmentFile:
an error.'''
if not self._isOpen():
raise ValueError("I/O operation on closed file")
- if not self.isbam:
- raise AttributeError("AlignmentFile.mapped only available in bam files")
+ if not self.is_bam and not self.is_cram:
+ raise AttributeError(
+ "AlignmentFile.mapped only available in bam files")
if self.index == NULL:
- raise ValueError("mapping information not recorded in index "
- "or index not available")
+ raise ValueError(
+ "mapping information not recorded in index "
+ "or index not available")
property unmapped:
- """total number of unmapped reads in file.
+ """total number of unmapped reads according
+ to the statistics recorded in the index.
"""
def __get__(self):
self._checkIndex()
@@ -1026,7 +1165,8 @@ cdef class AlignmentFile:
return total
property nocoordinate:
- """total number of reads without coordinates
+ """total number of reads without coordinates according
+ to the statistics recorded in the index.
"""
def __get__(self):
self._checkIndex()
@@ -1253,7 +1393,7 @@ cdef class AlignmentFile:
if not self._isOpen():
raise ValueError( "I/O operation on closed file" )
- if not self.isbam and self.header.n_targets == 0:
+ if not self.is_bam and self.header.n_targets == 0:
raise NotImplementedError(
"can not iterate over samfile without header")
return self
@@ -1276,14 +1416,12 @@ cdef class AlignmentFile:
cdef int ret = self.cnext()
if (ret >= 0):
return makeAlignedSegment(self.b)
- elif (ret == -2):
+ elif ret == -2:
raise IOError('truncated file')
else:
raise StopIteration
-##-------------------------------------------------------------------
-##-------------------------------------------------------------------
-##-------------------------------------------------------------------
+
cdef class IteratorRow:
'''abstract base class for iterators over mapped reads.
@@ -1302,8 +1440,10 @@ cdef class IteratorRow:
The method :meth:`AlignmentFile.fetch` returns an IteratorRow.
.. note::
+
It is usually not necessary to create an object of this class
- explicitely. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
+ explicitly. It is returned as a result of a call to
+ :meth:`AlignmentFile.fetch`.
'''
@@ -1349,8 +1489,10 @@ cdef class IteratorRowRegion(IteratorRow):
iterate over mapped reads in a region.
.. note::
+
It is usually not necessary to create an object of this class
- explicitely. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
+ explicitly. It is returned as a result of a call to
+ :meth:`AlignmentFile.fetch`.
"""
@@ -1358,10 +1500,11 @@ cdef class IteratorRowRegion(IteratorRow):
int tid, int beg, int end,
int multiple_iterators=False):
- IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators)
+ IteratorRow.__init__(self, samfile,
+ multiple_iterators=multiple_iterators)
if not samfile._hasIndex():
- raise ValueError( "no index available for iteration" )
+ raise ValueError("no index available for iteration")
self.iter = sam_itr_queryi(
self.samfile.index,
@@ -1377,22 +1520,29 @@ cdef class IteratorRowRegion(IteratorRow):
cdef int cnext(self):
'''cversion of iterator. Used by IteratorColumn'''
- self.retval = hts_itr_next(self.htsfile.fp.bgzf,
+ self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
self.iter,
self.b,
- NULL)
+ self.htsfile)
def __next__(self):
"""python version of next().
"""
self.cnext()
- if self.retval < 0:
+ if self.retval >= 0:
+ return makeAlignedSegment(self.b)
+ elif self.retval == -2:
+ # Note: it is currently not the case that hts_iter_next
+ # returns -2 for a truncated file.
+ # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
+ raise IOError('truncated file')
+ else:
raise StopIteration
- return makeAlignedSegment(self.b)
def __dealloc__(self):
hts_itr_destroy(self.iter)
+
cdef class IteratorRowHead(IteratorRow):
"""*(AlignmentFile samfile, n, int multiple_iterators=False)*
@@ -1400,14 +1550,16 @@ cdef class IteratorRowHead(IteratorRow):
.. note::
It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a :meth:`AlignmentFile.head`.
-
+ explicitly. It is returned as a result of call to a
+ :meth:`AlignmentFile.head`.
"""
- def __init__(self, AlignmentFile samfile, int n, int multiple_iterators=False):
+ def __init__(self, AlignmentFile samfile, int n,
+ int multiple_iterators=False):
- IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators)
+ IteratorRow.__init__(self, samfile,
+ multiple_iterators=multiple_iterators)
self.max_rows = n
self.current_row = 0
@@ -1448,15 +1600,18 @@ cdef class IteratorRowAll(IteratorRow):
iterate over all reads in *samfile*
.. note::
+
It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
-
+ explicitly. It is returned as a result of call to a
+ :meth:`AlignmentFile.fetch`.
"""
- def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
+ def __init__(self, AlignmentFile samfile,
+ int multiple_iterators=False):
- IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators)
+ IteratorRow.__init__(self, samfile,
+ multiple_iterators=multiple_iterators)
def __iter__(self):
return self
@@ -1485,14 +1640,18 @@ cdef class IteratorRowAll(IteratorRow):
cdef class IteratorRowAllRefs(IteratorRow):
- """iterates over all mapped reads by chaining iterators over each reference
+ """iterates over all mapped reads by chaining iterators over each
+ reference
.. note::
It is usually not necessary to create an object of this class
- explicitely. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
+ explicitly. It is returned as a result of a call to
+ :meth:`AlignmentFile.fetch`.
+
"""
- def __init__(self, AlignmentFile samfile, multiple_iterators=False):
+ def __init__(self, AlignmentFile samfile,
+ multiple_iterators=False):
IteratorRow.__init__(self, samfile,
multiple_iterators=multiple_iterators)
@@ -1566,8 +1725,6 @@ cdef class IteratorRowSelection(IteratorRow):
self.positions = positions
self.current_pos = 0
- self.fp = self.htsfile.fp.bgzf
-
def __iter__(self):
return self
@@ -1580,7 +1737,7 @@ cdef class IteratorRowSelection(IteratorRow):
# end iteration if out of positions
if self.current_pos >= len(self.positions): return -1
- bgzf_seek(self.fp,
+ bgzf_seek(hts_get_bgzfp(self.htsfile),
self.positions[self.current_pos],
0)
self.current_pos += 1
@@ -1725,15 +1882,10 @@ cdef class IteratorColumn:
stepper
The stepper controls how the iterator advances.
- Valid values are None, "all" or "samtools".
-
- The default stepper "all" uses all reads for
- computing the pileup. This corresponds to the
- mpileup options "-B" and "-A".
-
- The stepper "samtools" uses the mpileup default
- parameterization to advance.
+ Valid values are None, "all" (default), "nofilter" or "samtools".
+ See AlignmentFile.pileup for description.
+
fastafile
A :class:`FastaFile` object
@@ -2066,7 +2218,7 @@ def fromQualityString(quality_string):
return array.array('B', [ord(x)-33 for x in quality_string])
-cdef inline uint8_t _getTypeCode(value, value_type = None):
+cdef inline uint8_t _get_value_code(value, value_type=None):
'''guess type code for a *value*. If *value_type* is None,
the type code will be inferred based on the Python type of
*value*'''
@@ -2087,92 +2239,134 @@ cdef inline uint8_t _getTypeCode(value, value_type = None):
else:
if value_type not in 'Zidf':
return 0
- value_type = _forceBytes( value_type )
+ value_type = _forceBytes(value_type)
_char_type = value_type
type_code = (<uint8_t*>_char_type)[0]
return type_code
-cdef inline convert_python_tag(pytag, value, fmts, args):
-
- if not type(pytag) is bytes:
- pytag = pytag.encode('ascii')
- t = type(value)
- if t is tuple or t is list:
- # binary tags - treat separately
- pytype = 'B'
- # get data type - first value determines type. If there is a
- # mix of types, the result is undefined.
- if type(value[0]) is float:
- datafmt, datatype = "f", "f"
- else:
- mi, ma = min(value), max(value)
- # signed ints
- if mi < 0:
- if mi >= -128 and ma < 128:
- datafmt, datatype = "b", 'c'
- elif mi >= -32768 and ma < 32768:
- datafmt, datatype = "h", 's'
- elif mi < -2147483648 or ma >= 2147483648:
- raise ValueError(
- "at least one signed integer out of range of "
- "BAM/SAM specification")
- else: datafmt, datatype = "i", 'i'
+cdef inline _get_value_type(value, maximum_value=None):
+ '''returns the value type of a value.
- # unsigned ints
- else:
- if ma < 256:
- datafmt, datatype = "B", 'C'
- elif ma < 65536:
- datafmt, datatype = "H", 'S'
- elif ma >= 4294967296:
- raise ValueError(
- "at least one integer out of range of BAM/SAM specification")
- else:
- datafmt, datatype = "I", 'I'
+ If maximum_value is specified, the appropriate type is
+ returned for a range where value is the minimum.
+ '''
+
+ if maximum_value is None:
+ maximum_value = value
- datafmt = "2sccI%i%s" % (len(value), datafmt)
- args.extend([pytag[:2],
- pytype.encode('ascii'),
- datatype.encode('ascii'),
- len(value)] + list(value))
- fmts.append( datafmt )
- return
+ t = type(value)
if t is float:
- fmt, pytype = "2scf", 'f'
+ valuetype = b'f'
elif t is int:
- # negative values
- if value < 0:
- if value >= -127: fmt, pytype = "2scb", 'c'
- elif value >= -32767: fmt, pytype = "2sch", 's'
- elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
- else: fmt, pytype = "2sci", 'i'
- # positive values
+ # signed ints
+ if value < 0:
+ if value >= -128 and maximum_value < 128:
+ valuetype = b'c'
+ elif value >= -32768 and maximum_value < 32768:
+ valuetype = b's'
+ elif value < -2147483648 or maximum_value >= 2147483648:
+ raise ValueError(
+ "at least one signed integer out of range of "
+ "BAM/SAM specification")
+ else:
+ valuetype = b'i'
+ # unsigned ints
else:
- if value <= 255: fmt, pytype = "2scB", 'C'
- elif value <= 65535: fmt, pytype = "2scH", 'S'
- elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
- else: fmt, pytype = "2scI", 'I'
+ if maximum_value < 256:
+ valuetype = b'C'
+ elif maximum_value < 65536:
+ valuetype = b'S'
+ elif maximum_value >= 4294967296:
+ raise ValueError(
+ "at least one integer out of range of BAM/SAM specification")
+ else:
+ valuetype = b'I'
else:
# Note: hex strings (H) are not supported yet
if t is not bytes:
value = value.encode('ascii')
if len(value) == 1:
- fmt, pytype = "2scc", 'A'
+ valuetype = b"A"
+ else:
+ valuetype = b'Z'
+
+ return valuetype
+
+
+cdef inline _pack_tags(tags):
+ """pack a list of tags. Each tag is a tuple of (tag, value).
+
+ Values are packed into the most space efficient data structure
+ possible unless the tag contains a third field with the type code.
+
+ Returns a fmt string and the associated list of arguments
+ to be used in a call to struct.pack_into.
+ """
+ fmts, args = ["<"], []
+
+ for tag in tags:
+
+ if len(tag) == 2:
+ pytag, value = tag
+ valuetype = None
+ elif len(tag) == 3:
+ pytag, value, valuetype = tag
+ else:
+ raise ValueError("malformatted tag: %s" % str(tag))
+
+ if not type(pytag) is bytes:
+ pytag = pytag.encode('ascii')
+
+ datatype2format = {'c': 'b',
+ 's': 'h',
+ 'i': 'i',
+ 'C': 'B',
+ 'S': 'H',
+ 'I': 'I',
+ 'f': 'f',
+ 'A': 'c',}
+
+ t = type(value)
+ if t is tuple or t is list:
+ # binary tags are treated separately
+ if valuetype is None:
+ # automatically determine value type - first value
+ # determines type. If there is a mix of types, the
+ # result is undefined.
+ valuetype = _get_value_type(min(value), max(value))
+
+ if valuetype not in datatype2format:
+ raise ValueError("invalid value type '%s'" % valuetype)
+ datafmt = "2sccI%i%s" % (len(value), datatype2format[valuetype])
+
+ args.extend([pytag[:2],
+ b"B",
+ valuetype,
+ len(value)] + list(value))
+ fmts.append(datafmt)
+
else:
- fmt, pytype = "2sc%is" % (len(value)+1), 'Z'
+
+ if valuetype is None:
+ valuetype = _get_value_type(value)
+
+ if valuetype == b"Z":
+ fmt = "2sc%is" % (len(value)+1)
+ else:
+ fmt = "2sc%s" % datatype2format[valuetype]
+
+ args.extend([pytag[:2],
+ valuetype,
+ value])
- args.extend([pytag[:2],
- pytype.encode('ascii'),
- value])
+ fmts.append(fmt)
- fmts.append(fmt)
+ return "".join(fmts), args
-###########################################################
-###########################################################
-###########################################################
+
cdef class AlignedSegment:
'''Class representing an aligned segment.
@@ -2233,7 +2427,10 @@ cdef class AlignedSegment:
self.tags)))
def compare(self, AlignedSegment other):
- '''return -1,0,1, if contents in this are binary <,=,> to *other*'''
+ '''return -1,0,1, if contents in this are binary
+ <,=,> to *other*
+
+ '''
cdef int retval, x
cdef bam1_t *t
@@ -2265,15 +2462,33 @@ cdef class AlignedSegment:
return retval
return memcmp(t.data, o.data, t.l_data)
+ def __richcmp__(self, AlignedSegment other, int op):
+ if op == 2: # == operator
+ return self.compare(other) == 0
+ elif op == 3: # != operator
+ return self.compare(other) != 0
+ else:
+ return NotImplemented
+
# Disabled so long as __cmp__ is a special method
def __hash__(self):
- return _Py_HashPointer(<void *>self)
-
+ cdef bam1_t * src
+ src = self._delegate
+ # shift and xor values in the core structure
+ # make sure tid and mtid are shifted by different amounts
+ # should variable length data be included?
+ cdef uint32_t hash_value = src.core.tid << 24 ^ \
+ src.core.pos << 16 ^ \
+ src.core.qual << 8 ^ \
+ src.core.flag ^ \
+ src.core.isize << 24 ^ \
+ src.core.mtid << 16 ^ \
+ src.core.mpos << 8
+
+ return hash_value
- #######################################################################
- #######################################################################
+ ########################################################
## Basic attributes in order of appearance in SAM format
- #######################################################################
property query_name:
"""the query template name (None if not present)"""
def __get__(self):
@@ -2420,7 +2635,14 @@ cdef class AlignedSegment:
can be inferred from the CIGAR alignment, see
:meth:`pysam.AlignmentFile.infer_query_length.`.
- This property can be set by providing a sequence.
+ The length includes soft-clipped bases and is equal to
+ ``len(query_sequence)``.
+
+ This property is read-only but can be set by providing a
+ sequence.
+
+ Returns 0 if not available.
+
"""
def __get__(self):
return self._delegate.core.l_qseq
@@ -2542,8 +2764,9 @@ cdef class AlignedSegment:
src = self._delegate
p = pysam_bam_get_qual(src)
if qual is None or len(qual) == 0:
- # if absent - set to 0xff
- p[0] = 0xff
+ # if absent and there is a sequence: set to 0xff
+ if src.core.l_qseq != 0:
+ p[0] = 0xff
return
# check for length match
@@ -2563,120 +2786,7 @@ cdef class AlignedSegment:
# copy data
memcpy(p, result.data.as_voidptr, l)
- # TODO: opts object with mapping-like interface
- property tags:
- """the tags in the AUX field.
-
- This property permits convenience access to
- the tags. Changes it the returned list will
- not update the tags automatically. Instead,
- the following is required for adding a
- new tag::
-
- read.tags = read.tags + [("RG",0)]
-
- This method will happily write the same tag
- multiple times.
- """
- def __get__(self):
- cdef char * ctag
- cdef bam1_t * src
- cdef uint8_t * s
- cdef char auxtag[3]
- cdef char auxtype
- cdef uint8_t byte_size
- cdef int32_t nvalues
-
- src = self._delegate
- if src.l_data == 0:
- return []
- s = pysam_bam_get_aux(src)
- result = []
- auxtag[2] = 0
- while s < (src.data + src.l_data):
- # get tag
- auxtag[0] = s[0]
- auxtag[1] = s[1]
- s += 2
- auxtype = s[0]
- if auxtype in ('c', 'C'):
- value = <int>bam_aux2i(s)
- s += 1
- elif auxtype in ('s', 'S'):
- value = <int>bam_aux2i(s)
- s += 2
- elif auxtype in ('i', 'I'):
- value = <int32_t>bam_aux2i(s)
- s += 4
- elif auxtype == 'f':
- value = <float>bam_aux2f(s)
- s += 4
- elif auxtype == 'd':
- value = <double>bam_aux2f(s)
- s += 8
- elif auxtype == 'A':
- value = "%c" % <char>bam_aux2A(s)
- s += 1
- elif auxtype in ('Z', 'H'):
- value = _charptr_to_str(<char*>bam_aux2Z(s))
- # +1 for NULL terminated string
- s += len(value) + 1
- elif auxtype == 'B':
- s += 1
- byte_size, nvalues, value = convertBinaryTagToList( s )
- # 5 for 1 char and 1 int
- s += 5 + ( nvalues * byte_size) - 1
- else:
- raise KeyError("unknown type '%s'" % auxtype)
-
- s += 1
-
- result.append((_charptr_to_str(auxtag), value))
-
- return result
-
- def __set__(self, tags):
- cdef bam1_t * src
- cdef uint8_t * s
- cdef char * temp
- cdef int new_size = 0
- cdef int old_size
- src = self._delegate
- fmts, args = ["<"], []
-
- if tags is not None and len(tags) > 0:
- for pytag, value in tags:
- convert_python_tag(pytag, value, fmts, args)
- fmt = "".join(fmts)
- new_size = struct.calcsize(fmt)
- buffer = ctypes.create_string_buffer(new_size)
- struct.pack_into(fmt,
- buffer,
- 0,
- *args)
-
- # delete the old data and allocate new space.
- # If total_size == 0, the aux field will be
- # empty
- old_size = pysam_bam_get_l_aux(src)
- pysam_bam_update(src,
- old_size,
- new_size,
- pysam_bam_get_aux(src))
- # copy data only if there is any
- if new_size > 0:
-
- # get location of new data
- s = pysam_bam_get_aux(src)
-
- # check if there is direct path from buffer.raw to tmp
- p = buffer.raw
- # create handle to make sure buffer stays alive long
- # enough for memcpy, see issue 129
- temp = p
- memcpy(s, temp, new_size)
-
property bin:
"""properties bin"""
def __get__(self):
@@ -2758,13 +2868,22 @@ cdef class AlignedSegment:
return (self.flag & BAM_FDUP) != 0
def __set__(self, val):
pysam_update_flag(self._delegate, val, BAM_FDUP)
+ property is_supplementary:
+ """true if this is a supplementary alignment"""
+ def __get__(self):
+ return (self.flag & BAM_FSUPPLEMENTARY) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY)
# 2. Coordinates and lengths
property reference_end:
- '''aligned reference position of the read on the reference genome.
+ '''aligned reference position of the read on the reference genome.
- aend points to one past the last aligned residue.
- Returns None if not available.'''
+ reference_end points to one past the last aligned residue.
+ Returns None if not available (read is unmapped or no cigar
+ alignment present).
+
+ '''
def __get__(self):
cdef bam1_t * src
src = self._delegate
@@ -2784,16 +2903,6 @@ cdef class AlignedSegment:
return bam_endpos(src) - \
self._delegate.core.pos
- property query_alignment_length:
- """length of the query template. This includes soft-clipped bases
- and is equal to ``len(seq)``.
-
- This property is read-only.
-
- Returns 0 if not available."""
- def __get__(self):
- return self._delegate.core.l_qseq
-
property query_alignment_sequence:
"""aligned portion of the read.
@@ -2870,7 +2979,7 @@ cdef class AlignedSegment:
def __get__(self):
return _getQueryEnd(self._delegate)
- property query_aligment_length:
+ property query_alignment_length:
"""length of the aligned query sequence.
This is equal to :attr:`qend` - :attr:`qstart`"""
@@ -2959,13 +3068,22 @@ cdef class AlignedSegment:
return qpos
- def get_aligned_pairs(self):
- """a list of aligned read and reference positions.
+ def get_aligned_pairs(self, matches_only = False):
+ """a list of aligned read (query) and reference positions.
+ For inserts, deletions, skipping either query or reference position may be None.
+
+ If @matches_only is True, only matched bases are returned - no None on either side.
+
+ Padding is currently not supported and leads to an exception
+
"""
cdef uint32_t k, i, pos, qpos
cdef int op
cdef uint32_t * cigar_p
cdef bam1_t * src
+ cdef int _matches_only
+
+ _matches_only = bool(matches_only)
src = self._delegate
if pysam_get_n_cigar(src) == 0:
@@ -2980,22 +3098,32 @@ cdef class AlignedSegment:
op = cigar_p[k] & BAM_CIGAR_MASK
l = cigar_p[k] >> BAM_CIGAR_SHIFT
- if op == BAM_CMATCH:
+ if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
for i from pos <= i < pos + l:
result.append((qpos, i))
qpos += 1
pos += l
- elif op == BAM_CINS:
- for i from pos <= i < pos + l:
- result.append((qpos, None))
- qpos += 1
+ elif op == BAM_CINS or op == BAM_CSOFT_CLIP:
+ if not _matches_only:
+ for i from pos <= i < pos + l:
+ result.append((qpos, None))
+ qpos += 1
+ else:
+ qpos += l
elif op == BAM_CDEL or op == BAM_CREF_SKIP:
- for i from pos <= i < pos + l:
- result.append((None, i))
+ if not _matches_only:
+ for i from pos <= i < pos + l:
+ result.append((None, i))
pos += l
+ elif op == BAM_CHARD_CLIP:
+ pass # advances neither
+
+ elif op == BAM_CPAD:
+ raise NotImplementedError("Padding (BAM_CPAD, 6) is currently not supported. Please implement. Sorry about that.")
+
return result
def get_blocks(self):
@@ -3071,8 +3199,6 @@ cdef class AlignedSegment:
#####################################################
## Unsorted as yet
-
-
# TODO: capture in CIGAR object
property cigartuples:
"""the :term:`cigar` alignment. The alignment
@@ -3176,17 +3302,25 @@ cdef class AlignedSegment:
5))
+ cpdef set_tag(self,
+ tag,
+ value,
+ value_type=None,
+ replace=True):
+ """sets a particular field *tag* to *value* in the optional alignment
+ section.
- cpdef setTag(self, tag, value,
- value_type = None,
- replace = True):
- '''
- Set optional field of alignment *tag* to *value*. *value_type* may be specified,
- but if not the type will be inferred based on the Python type of *value*
+ *value_type* describes the type of *value* that is to be entered
+ into the alignment record. It can be set explicitly to one
+ of the valid one-letter type codes. If unset, an appropriate
+ type will be chosen automatically.
- An existing value of the same tag will be overwritten unless
- *replace* is set to False.
- '''
+ An existing value of the same *tag* will be overwritten unless
+ replace is set to False. This is usually not recommended as a
+ tag may only appear once in the optional alignment section.
+
+ If *value* is None, the tag will be deleted.
+ """
cdef int value_size
cdef uint8_t * value_ptr
@@ -3200,14 +3334,24 @@ cdef class AlignedSegment:
if len(tag) != 2:
raise ValueError('Invalid tag: %s' % tag)
+
+ tag = _forceBytes(tag)
+ if replace:
+ existing_ptr = bam_aux_get(src, tag)
+ if existing_ptr:
+ bam_aux_del(src, existing_ptr)
+
+ # setting value to None deletes a tag
+ if value is None:
+ return
- type_code = _getTypeCode(value, value_type)
+ type_code = _get_value_code(value, value_type)
if type_code == 0:
raise ValueError("can't guess type or invalid type code specified")
# Not Endian-safe, but then again neither is samtools!
if type_code == 'Z':
- value = _forceBytes( value )
+ value = _forceBytes(value)
value_ptr = <uint8_t*><char*>value
value_size = len(value)+1
elif type_code == 'i':
@@ -3225,11 +3369,6 @@ cdef class AlignedSegment:
else:
raise ValueError('Unsupported value_type in set_option')
- tag = _forceBytes( tag )
- if replace:
- existing_ptr = bam_aux_get(src, tag)
- if existing_ptr:
- bam_aux_del(src, existing_ptr)
bam_aux_append(src,
tag,
@@ -3237,20 +3376,32 @@ cdef class AlignedSegment:
value_size,
value_ptr)
+ cpdef has_tag(self, tag):
+ """returns true if the optional alignment section
+ contains a given *tag*."""
+ cdef uint8_t * v
+ cdef int nvalues
+ btag = _forceBytes(tag)
+ v = bam_aux_get(self._delegate, btag)
+ return v != NULL
- #######################################################################
- #######################################################################
- ## Derived properties
- #######################################################################
+ cpdef get_tag(self, tag):
+ """retrieves data from the optional alignment section
+ given a two-letter *tag* denoting the field.
- def opt(self, tag):
- """retrieves optional data given a two-letter *tag*"""
- #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
+ If *tag* is not present, a KeyError is raised.
+
+ The returned value is cast into an appropriate python type.
+
+ This method is the fastest way to access the optional
+ alignment section if only few tags need to be retrieved.
+ """
cdef uint8_t * v
cdef int nvalues
btag = _forceBytes(tag)
v = bam_aux_get(self._delegate, btag)
- if v == NULL: raise KeyError( "tag '%s' not present" % tag )
+ if v == NULL:
+ raise KeyError("tag '%s' not present" % tag)
auxtype = chr(v[0])
if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S':
return <int>bam_aux2i(v)
@@ -3267,11 +3418,140 @@ cdef class AlignedSegment:
elif auxtype == 'Z':
return _charptr_to_str(<char*>bam_aux2Z(v))
elif auxtype == 'B':
- bytesize, nvalues, values = convertBinaryTagToList( v + 1 )
+ bytesize, nvalues, values = convertBinaryTagToList(v + 1)
return values
else:
raise ValueError("unknown auxilliary type '%s'" % auxtype)
+ def get_tags(self, with_value_type=False):
+ """the fields in the optional alignment section.
+
+ Returns a list of all fields in the optional
+ alignment section. Values are converted to appropriate python
+ values. For example:
+
+ [(NM, 2), (RG, "GJP00TM04")]
+
+ If *with_value_type* is set, the value type as encoded in
+ the AlignedSegment record will be returned as well:
+
+ [(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
+
+ This method will convert all values in the optional alignment
+ section. When getting only one or few tags, please see
+ :meth:`get_tag` for a quicker way to achieve this.
+
+ """
+
+ cdef char * ctag
+ cdef bam1_t * src
+ cdef uint8_t * s
+ cdef char auxtag[3]
+ cdef char auxtype
+ cdef uint8_t byte_size
+ cdef int32_t nvalues
+
+ src = self._delegate
+ if src.l_data == 0:
+ return []
+ s = pysam_bam_get_aux(src)
+ result = []
+ auxtag[2] = 0
+ while s < (src.data + src.l_data):
+ # get tag
+ auxtag[0] = s[0]
+ auxtag[1] = s[1]
+ s += 2
+ auxtype = s[0]
+ if auxtype in ('c', 'C'):
+ value = <int>bam_aux2i(s)
+ s += 1
+ elif auxtype in ('s', 'S'):
+ value = <int>bam_aux2i(s)
+ s += 2
+ elif auxtype in ('i', 'I'):
+ value = <int32_t>bam_aux2i(s)
+ s += 4
+ elif auxtype == 'f':
+ value = <float>bam_aux2f(s)
+ s += 4
+ elif auxtype == 'd':
+ value = <double>bam_aux2f(s)
+ s += 8
+ elif auxtype == 'A':
+ value = "%c" % <char>bam_aux2A(s)
+ s += 1
+ elif auxtype in ('Z', 'H'):
+ value = _charptr_to_str(<char*>bam_aux2Z(s))
+ # +1 for NULL terminated string
+ s += len(value) + 1
+ elif auxtype == 'B':
+ s += 1
+ byte_size, nvalues, value = convertBinaryTagToList(s)
+ # 5 for 1 char and 1 int
+ s += 5 + (nvalues * byte_size) - 1
+ else:
+ raise KeyError("unknown type '%s'" % auxtype)
+
+ s += 1
+
+ result.append((_charptr_to_str(auxtag), value))
+
+ return result
+
+ def set_tags(self, tags):
+ """sets the fields in the optional alignment section with
+ a list of (tag, value) tuples.
+
+ The :term:`value type` of the values is determined from the
+ python type. Optionally, a type may be given explicitly as
+ a third value in the tuple. For example:
+
+ x.set_tags([(NM, 2, "i"), (RG, "GJP00TM04", "Z")])
+
+ This method will not enforce the rule that the same tag may appear
+ only once in the optional alignment section.
+ """
+
+ cdef bam1_t * src
+ cdef uint8_t * s
+ cdef char * temp
+ cdef int new_size = 0
+ cdef int old_size
+ src = self._delegate
+
+ # convert and pack the data
+ if tags is not None and len(tags) > 0:
+ fmt, args =_pack_tags(tags)
+ new_size = struct.calcsize(fmt)
+ buffer = ctypes.create_string_buffer(new_size)
+ struct.pack_into(fmt,
+ buffer,
+ 0,
+ *args)
+
+ # delete the old data and allocate new space.
+ # If total_size == 0, the aux field will be
+ # empty
+ old_size = pysam_bam_get_l_aux(src)
+ pysam_bam_update(src,
+ old_size,
+ new_size,
+ pysam_bam_get_aux(src))
+
+ # copy data only if there is any
+ if new_size > 0:
+
+ # get location of new data
+ s = pysam_bam_get_aux(src)
+
+ # check if there is direct path from buffer.raw to tmp
+ p = buffer.raw
+ # create handle to make sure buffer stays alive long
+ # enough for memcpy, see issue 129
+ temp = p
+ memcpy(s, temp, new_size)
+
########################################################
# Compatibility Accessors
@@ -3398,9 +3678,18 @@ cdef class AlignedSegment:
property positions:
def __get__(self):
return self.get_reference_positions()
+ property tags:
+ def __get__(self):
+ return self.get_tags()
+ def __set__(self, tags):
+ self.set_tags(tags)
def overlap(self):
return self.get_overlap()
-
+ def opt(self, tag):
+ return self.get_tag(tag)
+ def setTag(self, tag, value, value_type=None, replace=True):
+ return self.set_tag(tag, value, value_type, replace)
+
cdef class PileupColumn:
'''A pileup of reads at a particular reference sequence postion
@@ -3417,8 +3706,9 @@ cdef class PileupColumn:
def __str__(self):
return "\t".join(map(str,
- (self.reference_id, self.reference_pos,
- self.nsegmentes))) +\
+ (self.reference_id,
+ self.reference_pos,
+ self.nsegments))) +\
"\n" +\
"\n".join(map(str, self.pileups))
@@ -3484,7 +3774,8 @@ cdef class PileupRead:
'''
def __init__(self):
- raise TypeError("this class cannot be instantiated from Python")
+ raise TypeError(
+ "this class cannot be instantiated from Python")
def __str__(self):
return "\t".join(
@@ -3500,28 +3791,39 @@ cdef class PileupRead:
return self._alignment
property query_position:
- """position of the read base at the pileup site, 0-based"""
+ """position of the read base at the pileup site, 0-based.
+ None if is_del or is_refskip is set.
+
+ """
def __get__(self):
- return self._qpos
+ if self.is_del or self.is_refskip:
+ return None
+ else:
+ return self._qpos
property indel:
"""indel length; 0 for no indel, positive for ins and negative for del"""
def __get__(self):
return self._indel
+
property level:
"""the level of the read in the "viewer" mode"""
def __get__(self):
return self._level
+
property is_del:
"""1 iff the base on the padded read is a deletion"""
def __get__(self):
return self._is_del
+
property is_head:
def __get__(self):
return self._is_head
+
property is_tail:
def __get__(self):
return self._is_tail
+
property is_refskip:
def __get__(self):
return self._is_refskip
@@ -3602,7 +3904,7 @@ cdef class IndexedReads:
# object is alive.
self.samfile = samfile
- assert samfile.isbam, "can only IndexReads on bam files"
+ assert samfile.is_bam, "can only IndexReads on bam files"
# multiple_iterators the file - note that this makes the iterator
# slow and causes pileup to slow down significantly.
@@ -3617,23 +3919,20 @@ cdef class IndexedReads:
self.header = self.samfile.header
self.owns_samfile = False
- # TODO: BAM file specific
- self.fp = self.htsfile.fp.bgzf
-
def build(self):
'''build index.'''
self.index = collections.defaultdict(list)
- # this method will start indexing from the current file position
- # if you decide
+ # this method will start indexing from the current file
+ # position if you decide
cdef int ret = 1
cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
cdef uint64_t pos
while ret > 0:
- pos = bgzf_tell(self.fp)
+ pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
ret = sam_read1(self.htsfile,
self.samfile.header,
b)
@@ -3663,6 +3962,16 @@ cdef class IndexedReads:
hts_close(self.htsfile)
bam_hdr_destroy(self.header)
+cpdef set_verbosity(int verbosity):
+ u"""Set htslib's hts_verbose global variable to the specified value.
+ """
+ return hts_set_verbosity(verbosity)
+
+cpdef get_verbosity():
+ u"""Return the value of htslib's hts_verbose global variable.
+ """
+ return hts_get_verbosity()
+
__all__ = ["AlignmentFile",
"IteratorRow",
"IteratorColumn",
@@ -3671,7 +3980,9 @@ __all__ = ["AlignmentFile",
"PileupRead",
"IndexedReads",
"toQualityString",
- "fromQualityString"]
+ "fromQualityString",
+ "get_verbosity",
+ "set_verbosity"]
# "IteratorSNPCalls",
# "SNPCaller",
# "IndelCaller",
diff --git a/pysam/cbcf.pxd b/pysam/cbcf.pxd
new file mode 100644
index 0000000..83e628a
--- /dev/null
+++ b/pysam/cbcf.pxd
@@ -0,0 +1,158 @@
+###############################################################################
+###############################################################################
+## Cython wrapper for htslib VCF/BCF reader/writer
+###############################################################################
+#
+# NOTICE: This code is incomplete and preliminary. It is nearly complete as
+# an immutable interface, but has no capability (yet) to mutate the
+# resulting data (beyond dropping all samples). Documentation still
+# needs to be written and a unit test suite is in the works. The
+# code is also specific to Python 2 and will require a bit of work
+# to properly adapt to Python 3.
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Kevin Jacobs (jacobs at bioinformed.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
+
+from chtslib cimport *
+
+
+cdef class VariantHeader(object):
+ cdef bcf_hdr_t *ptr
+
+ cdef _subset_samples(self, include_samples)
+
+
+cdef class VariantHeaderRecord(object):
+ cdef VariantHeader header
+ cdef bcf_hrec_t *ptr
+
+
+cdef class VariantHeaderRecords(object):
+ cdef VariantHeader header
+
+
+cdef class VariantHeaderContigs(object):
+ cdef VariantHeader header
+
+
+cdef class VariantHeaderSamples(object):
+ cdef VariantHeader header
+
+
+cdef class VariantContig(object):
+ cdef VariantHeader header
+ cdef int id
+
+
+cdef class VariantMetadata(object):
+ cdef VariantHeader header
+ cdef int type
+ cdef int id
+
+
+cdef class VariantHeaderMetadata(object):
+ cdef VariantHeader header
+ cdef int32_t type
+
+
+cdef class VariantRecord(object):
+ cdef VariantHeader header
+ cdef bcf1_t *ptr
+
+
+cdef class VariantRecordFilter(object):
+ cdef VariantRecord record
+
+
+cdef class VariantRecordFormat(object):
+ cdef VariantRecord record
+
+
+cdef class VariantRecordInfo(object):
+ cdef VariantRecord record
+
+
+cdef class VariantRecordSamples(object):
+ cdef VariantRecord record
+
+
+cdef class VariantRecordSample(object):
+ cdef VariantRecord record
+ cdef readonly int32_t index
+
+
+cdef class BaseIndex(object):
+ cdef tuple refs
+ cdef dict refmap
+
+
+cdef class BCFIndex(BaseIndex):
+ cdef VariantHeader header
+ cdef hts_idx_t *ptr
+
+
+cdef class TabixIndex(BaseIndex):
+ cdef tbx_t *ptr
+
+
+cdef class BaseIterator(object):
+ cdef VariantFile bcf
+ cdef hts_itr_t *iter
+
+
+cdef class BCFIterator(BaseIterator):
+ cdef BCFIndex index
+
+
+cdef class TabixIterator(BaseIterator):
+ cdef TabixIndex index
+ cdef kstring_t line_buffer
+
+
+cdef class VariantFile(object):
+ cdef htsFile *htsfile # pointer to htsFile structure
+ cdef int64_t start_offset # BGZF offset of first record
+
+ cdef readonly object filename # filename as supplied by user
+ cdef readonly object mode # file opening mode
+
+ cdef readonly VariantHeader header
+ cdef readonly BaseIndex index
+
+ cdef readonly bint drop_samples # true if sample information is to be ignored
+
+ # FIXME: Temporary, use htsFormat when it is available
+ cdef readonly bint is_bcf # true if file is a bcf file
+ cdef readonly bint is_stream # true if not a seekable file but a stream
+ cdef readonly bint is_remote # true if file is not on the local filesystem
+ cdef readonly bint is_reading # true if file has begun reading records
+
+ cpdef int write(self, VariantRecord record) except -1
diff --git a/pysam/cbcf.pyx b/pysam/cbcf.pyx
new file mode 100644
index 0000000..ae274d5
--- /dev/null
+++ b/pysam/cbcf.pyx
@@ -0,0 +1,2419 @@
+# cython: embedsignature=True
+# cython: profile=True
+###############################################################################
+###############################################################################
+## Cython wrapper for htslib VCF/BCF reader/writer
+###############################################################################
+#
+# NOTICE: This code is incomplete and preliminary. It does offer a nearly
+# complete immutable Pythonic interface to VCF/BCF metadata and data
+# with reading and writing capability, but has no capability (yet)
+# to mutate the resulting data (beyond dropping all samples).
+# Documentation still needs to be written and a unit test suite is
+# in the works. The code is also superficially specific to Python 2
+# and will require a bit of work to properly adapt to Python 3.
+#
+# Here is a minimal example of how to use the API:
+#
+# $ cat bcfview.py
+# import sys
+# from pysam import VariantFile
+#
+# bcf_in = VariantFile(sys.argv[1]) # auto-detect input format
+# bcf_out = VariantFile('-', 'w', header=bcf_in.header)
+#
+# for rec in bcf_in:
+# bcf_out.write(rec)
+#
+# Performance is fairly close to that of bcftools view. Here is an example
+# using some 1k Genomes data:
+#
+# $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
+# 1103799
+#
+# real 0m56.114s
+# user 1m4.489s
+# sys 0m3.102s
+#
+# $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
+# 1103800 # bcftools adds an extra header
+#
+# real 0m55.126s
+# user 1m3.502s
+# sys 0m3.459s
+#
+# Here is a quick tour through the API::
+#
+# VariantFile(filename, mode=None, header=None, drop_samples=False)
+#
+# Attributes / Properties
+#
+# htsfile: htsFile* [private]
+# start_offset: BGZF offset of first record [private]
+# filename: filename [read only]
+# mode: mode [read only]
+# header: VariantHeader object [read only]
+# index: TabixIndex, BCFIndex or None [read only]
+# drop_samples: sample information is to be ignored [read only]
+#
+# is_stream: file is stdin/stdout [read only]
+# is_remote: file is not on the local filesystem [read only]
+# is_reading: file has begun reading records [read only]
+# category: file format general category [read only]
+# format: file format [read only]
+# version: tuple of (major, minor) format version [read only]
+# compression: file compression
+# description: vaguely human readable description of [read only]
+# file format.
+#
+# Methods:
+# copy()
+# close()
+# open(filename, mode=None, header=None, drop_samples=False)
+# reset()
+# seek(offset)
+# tell()
+# fetch(contig=None, start=None, stop=None, region=None, reopen=False)
+# subset_samples(include_samples)
+#
+# VariantHeader(mode) # mode='r' for reading, mode='w' for writing
+#
+# version: VCF version
+# samples: sequence-like access to samples
+# records: sequence-like access to partially parsed headers
+# contigs: mapping-like object for contig name -> VariantContig
+#
+# filters: mapping-like object for filter name -> VariantMetadata
+# info: mapping-like object for info name -> VariantMetadata
+# formats: mapping-like object for format name -> VariantMetadata
+#
+# VariantRecord(...)
+#
+# header: VariantHeader object
+# rid: reference id (i.e. tid)
+# chrom: chromosome/contig string
+# contig: synonym for chrom
+# pos: 1-based start position (inclusive)
+# start: 0-based start position (inclusive)
+# stop: 0-based stop position (exclusive)
+# rlen: reference length (stop - start)
+# id: record identifier
+# ref: reference allele
+# alleles: alleles (ref followed by alts)
+# alts: alt alleles
+# qual: quality (float)
+# filter: mapping-like object for filter name -> type info
+# info: mapping-like object for info name -> value
+# format: mapping-like object for format name -> type info
+# samples: mapping-like object of sample genotypes & attrs
+#
+# VariantRecordSample(...)
+#
+# name: sample name
+# index: sample index
+# allele_indices: tuple of allele indices (ref=0, alt=1..len(alts), missing=-1)
+# alleles: tuple of alleles (missing=None)
+#
+# VariantRecordSample is also a mapping object from formats to values
+#
+# VariantContig(...)
+#
+# id: reference id (i.e. tid)
+# name: chromosome/contig string
+# length: contig length if provided, else None
+# header: defining VariantHeaderRecord
+#
+# VariantMetadata(...) # for FILTER, INFO and FORMAT metadata
+#
+# id: internal id
+# name: metadata name
+# type: value data type
+# number: number of values
+# header: defining VariantHeaderRecord
+#
+# VariantHeaderRecord(...) # replace with single tuple of key/value pairs?
+#
+# type: record type
+# key: first record key
+# value: first record value
+# attrs: remaining key/value pairs
+#
+###############################################################################
+#
+# TODO list for next major sprint:
+#
+# * more genotype methods
+# * unit test suite (perhaps py.test based)
+# * documentation
+# * htslib 1.2 format info
+#
+# For later sprints:
+#
+# * ability to create indices
+# * mutable header and record data
+# * pickle support
+# * Python 3 support
+# * left/right locus normalization
+# * parallel iteration (like synced_bcf_reader)
+# * fix reopen to re-use fd
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Kevin Jacobs (jacobs at bioinformed.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+
+from __future__ import division, print_function
+
+import os
+import sys
+
+from libc.string cimport strcmp
+
+cimport cython
+
+from cpython cimport PyBytes_Check, PyUnicode_Check
+from cpython.version cimport PY_MAJOR_VERSION
+
+__all__ = ['VariantFile', 'VariantHeader']
+
+########################################################################
+########################################################################
+## Constants
+########################################################################
+
+cdef int MAX_POS = 2 << 29  # 2 << 29 equals 2**30
+cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')  # indexed by the result of bcf_hdr_id2type()
+cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')  # indexed by a header record's BCF_HL_* type code
+cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')  # indexed by the result of bcf_hdr_id2length()
+
+cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')  # presumably indexed by an htslib format-category code - TODO confirm
+cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
+ 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED')  # presumably indexed by an htslib exact-format code - TODO confirm
+cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM')  # presumably indexed by an htslib compression code - TODO confirm
+
+########################################################################
+########################################################################
+## Python 3 compatibility functions
+########################################################################
+
+# True when the module is built for Python 3 or newer.
+IS_PYTHON3 = PY_MAJOR_VERSION >= 3
+
+
+# Encoding applied to unicode filenames (approach copied from lxml.etree.pyx):
+# the filesystem encoding first, then the interpreter default, then ASCII.
+cdef str FILENAME_ENCODING = sys.getfilesystemencoding()
+if FILENAME_ENCODING is None:
+    FILENAME_ENCODING = sys.getdefaultencoding()
+    if FILENAME_ENCODING is None:
+        FILENAME_ENCODING = 'ascii'
+
+
+cdef bytes encode_filename(object filename):
+    """Return *filename* as an 8-bit string, passing None through.
+
+    Bytes are returned unchanged and unicode is encoded with
+    FILENAME_ENCODING.  Any other type raises TypeError.
+    """
+    if filename is None or PyBytes_Check(filename):
+        return filename
+    if PyUnicode_Check(filename):
+        return filename.encode(FILENAME_ENCODING)
+    raise TypeError('Argument must be string or unicode.')
+
+
+cdef force_str(object s):
+    """Return s as the native str type (bytes in Py2, unicode in Py3).
+
+    Only bytes objects under Python 3 are converted (decoded as ASCII);
+    None and every other value are handed back untouched.
+    """
+    if s is not None and PY_MAJOR_VERSION >= 3 and PyBytes_Check(s):
+        return s.decode('ascii')
+    return s
+
+
+cdef bytes force_bytes(object s):
+    """Return s as bytes, encoding unicode as ASCII.
+
+    Under Python 2 the argument is returned as given.  Under Python 3,
+    None and bytes are returned as given, unicode is ASCII-encoded and
+    anything else raises TypeError.
+    """
+    if PY_MAJOR_VERSION < 3 or s is None or PyBytes_Check(s):
+        return s
+    if PyUnicode_Check(s):
+        return s.encode('ascii')
+    raise TypeError('Argument must be string, bytes or unicode.')
+
+
+cdef charptr_to_str(const char* s):
+    """Convert a C string to the native str type, decoding as ASCII on Python 3."""
+    if PY_MAJOR_VERSION >= 3:
+        return s.decode('ascii')
+    return s
+
+
+########################################################################
+########################################################################
+## Low level type conversion helpers
+########################################################################
+
+
+cdef tuple char_array_to_tuple(const char **a, int n, int free_after=0):
+    """Build a tuple of strings from the first n entries of a C string array.
+
+    Returns None when a is NULL.  When free_after is true the array itself
+    (not the strings it points to) is released with free(), whether or not
+    the conversion succeeded.
+    """
+    if a == NULL:
+        return None
+
+    converted = []
+    try:
+        for i in range(n):
+            converted.append(charptr_to_str(a[i]))
+        return tuple(converted)
+    finally:
+        # a is known to be non-NULL at this point
+        if free_after:
+            free(a)
+
+
+cdef bcf_array_to_object(void *data, int type, int n, int scalar=0):  # convert up to n raw values of the given BCF_BT_* type into a Python object
+ cdef char *datac
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+ cdef float *dataf
+ cdef int i
+
+ if not data or n <= 0:  # NULL pointer or non-positive count yields None
+ return None
+
+ if type == BCF_BT_CHAR:
+ datac = <char *>data
+ value = datac[:n] if datac[0] != bcf_str_missing else None  # a leading missing marker means no value
+ else:
+ value = []
+ if type == BCF_BT_INT8:
+ data8 = <int8_t *>data
+ for i in range(n):
+ if data8[i] == bcf_int8_vector_end:  # vector-end sentinel stops the scan early
+ break
+ value.append(data8[i] if data8[i] != bcf_int8_missing else None)  # missing sentinel becomes None
+ elif type == BCF_BT_INT16:
+ data16 = <int16_t *>data
+ for i in range(n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ value.append(data16[i] if data16[i] != bcf_int16_missing else None)
+ elif type == BCF_BT_INT32:
+ data32 = <int32_t *>data
+ for i in range(n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ value.append(data32[i] if data32[i] != bcf_int32_missing else None)
+ elif type == BCF_BT_FLOAT:
+ dataf = <float *>data
+ for i in range(n):
+ if bcf_float_is_vector_end(dataf[i]):
+ break
+ value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
+ else:
+ raise TypeError('unsupported info type code')  # any other type code is rejected
+
+ if not value:  # an empty result collapses to None
+ value = None
+ elif scalar and len(value) == 1:  # with scalar set, a single element is unwrapped
+ value = value[0]
+ else:
+ value = tuple(value)
+
+ return value
+
+
+cdef object bcf_info_value(const bcf_info_t *z):
+    """Convert one bcf_info_t entry into a Python value.
+
+    Returns None for a NULL pointer and True for a zero-length entry.
+    A length-one entry is read from z.v1 and mapped to None when it equals
+    the missing marker of its type; an unrecognised type code raises
+    TypeError.  Longer entries are delegated to bcf_array_to_object().
+    """
+    cdef char *s
+
+    if not z:
+        return None
+    if z.len == 0:
+        return True
+    if z.len != 1:
+        return bcf_array_to_object(z.vptr, z.type, z.len)
+
+    # Exactly one value: decode it according to the stored type code.
+    if z.type == BCF_BT_INT8:
+        return z.v1.i if z.v1.i != bcf_int8_missing else None
+    if z.type == BCF_BT_INT16:
+        return z.v1.i if z.v1.i != bcf_int16_missing else None
+    if z.type == BCF_BT_INT32:
+        return z.v1.i if z.v1.i != bcf_int32_missing else None
+    if z.type == BCF_BT_FLOAT:
+        return z.v1.f if not bcf_float_is_missing(z.v1.f) else None
+    if z.type == BCF_BT_CHAR:
+        # the character is read through a char pointer onto z.v1.i
+        s = <char *>&z.v1.i
+        return s if not s or s[0] != bcf_str_missing else None
+
+    raise TypeError('unsupported info type code')
+
+
+cdef inline int is_gt_fmt(bcf_hdr_t *h, bcf_fmt_t *fmt):
+    """Return 1 when the header name registered for fmt.id is "GT", else 0."""
+    cdef const char *name = bcf_hdr_int2id(h, BCF_DT_ID, fmt.id)
+    return strcmp(name, "GT") == 0
+
+
+########################################################################
+########################################################################
+## Variant Header objects
+########################################################################
+
+#FIXME: implement a full mapping interface
+#FIXME: passing bcf_hrec_t* may not be the safest approach once mutating
+# operations are allowed.
+cdef class VariantHeaderRecord(object):
+    """header record from a :class:`VariantHeader` object"""
+
+    property type:
+        """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
+        def __get__(self):
+            cdef bcf_hrec_t *rec = self.ptr
+            return METADATA_TYPES[rec.type]
+
+    property key:
+        """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
+        def __get__(self):
+            cdef bcf_hrec_t *rec = self.ptr
+            if rec.key:
+                return rec.key
+            return None
+
+    property value:
+        """header value.  Set only for generic lines, None for FILTER/INFO, etc."""
+        def __get__(self):
+            cdef bcf_hrec_t *rec = self.ptr
+            if rec.value:
+                return rec.value
+            return None
+
+    property attrs:
+        """sequence of additional header attributes"""
+        def __get__(self):
+            cdef bcf_hrec_t *rec = self.ptr
+            cdef int idx
+
+            # NULL keys or values are reported as None
+            pairs = []
+            for idx in range(rec.nkeys):
+                pairs.append((rec.keys[idx] if rec.keys[idx] else None,
+                              rec.vals[idx] if rec.vals[idx] else None))
+            return tuple(pairs)
+
+    def __str__(self):
+        cdef bcf_hrec_t *rec = self.ptr
+
+        if rec.type != BCF_HL_GEN:
+            # structured line: list every attribute except the IDX entry
+            fields = ['{}={}'.format(k, v) for k, v in self.attrs if k != 'IDX']
+            return '##{}=<{}>'.format(self.type, ','.join(fields))
+
+        return '##{}={}'.format(self.key, self.value)
+
+
+cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *h):
+    """Wrap the header record h in a VariantHeaderRecord bound to header.
+
+    Raises ValueError when header is falsy; returns None when h is NULL.
+    """
+    cdef VariantHeaderRecord wrapper
+
+    if not header:
+        raise ValueError('invalid VariantHeader')
+    if h == NULL:
+        return None
+
+    wrapper = VariantHeaderRecord.__new__(VariantHeaderRecord)
+    wrapper.header = header
+    wrapper.ptr = h
+    return wrapper
+
+
+cdef class VariantHeaderRecords(object):
+    """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
+
+    def __len__(self):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        return hdr.nhrec
+
+    def __bool__(self):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        return hdr.nhrec != 0
+
+    def __getitem__(self, index):
+        cdef int32_t pos = index
+
+        # only positions 0 .. nhrec-1 are accepted
+        if not (0 <= pos < self.header.ptr.nhrec):
+            raise IndexError('invalid header record index')
+
+        return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[pos])
+
+    def __iter__(self):
+        cdef int32_t pos
+
+        for pos in range(self.header.ptr.nhrec):
+            yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[pos])
+
+    __hash__ = None
+
+
+cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
+    """Create the VariantHeaderRecords view of header; a falsy header raises ValueError."""
+    cdef VariantHeaderRecords view
+
+    if not header:
+        raise ValueError('invalid VariantHeader')
+
+    view = VariantHeaderRecords.__new__(VariantHeaderRecords)
+    view.header = header
+    return view
+
+
+cdef class VariantMetadata(object):
+    """filter, info or format metadata record from a :class:`VariantHeader` object"""
+
+    property name:
+        """metadata name"""
+        def __get__(self):
+            cdef bcf_hdr_t *hdr = self.header.ptr
+            return hdr.id[BCF_DT_ID][self.id].key
+
+    # Q: Should this be exposed?
+    property id:
+        """metadata internal header id number"""
+        def __get__(self):
+            return self.id
+
+    property number:
+        """metadata number (i.e. cardinality)"""
+        def __get__(self):
+            cdef bcf_hdr_t *hdr = self.header.ptr
+            cdef int length_code
+
+            # no answer for ids without an entry of this type, nor for filters
+            if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or self.type == BCF_HL_FLT:
+                return None
+
+            length_code = bcf_hdr_id2length(hdr, self.type, self.id)
+            if length_code == BCF_VL_FIXED:
+                return bcf_hdr_id2number(hdr, self.type, self.id)
+            if length_code == BCF_VL_VAR:
+                return '.'
+            return METADATA_LENGTHS[length_code]
+
+    property type:
+        """metadata value type"""
+        def __get__(self):
+            cdef bcf_hdr_t *hdr = self.header.ptr
+
+            if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or self.type == BCF_HL_FLT:
+                return None
+
+            return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
+
+    property header:
+        """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
+        def __get__(self):
+            cdef bcf_hdr_t *hdr = self.header.ptr
+            cdef bcf_hrec_t *hrec
+
+            if not bcf_hdr_idinfo_exists(hdr, self.type, self.id):
+                return None
+
+            hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
+            if hrec == NULL:
+                return None
+
+            return makeVariantHeaderRecord(self.header, hrec)
+
+
+cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
+    """Create a VariantMetadata for the (type, id) pair of header.
+
+    Raises ValueError when header is falsy, when type is not one of
+    BCF_HL_FLT, BCF_HL_INFO or BCF_HL_FMT, or when id lies outside
+    0 .. header.ptr.n[BCF_DT_ID] - 1.
+    """
+    cdef VariantMetadata meta
+
+    if not header:
+        raise ValueError('invalid VariantHeader')
+
+    if type not in (BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT):
+        raise ValueError('invalid metadata type')
+
+    if not (0 <= id < header.ptr.n[BCF_DT_ID]):
+        raise ValueError('invalid metadata id')
+
+    meta = VariantMetadata.__new__(VariantMetadata)
+    meta.header = header
+    meta.type = type
+    meta.id = id
+    return meta
+
+
+cdef class VariantHeaderMetadata(object):
+    """mapping from filter, info or format name to :class:`VariantMetadata` object
+
+    An id belongs to the mapping when it has a key and a value and the low
+    four bits of its ``info[self.type]`` word are not all set.
+    """
+
+    def __len__(self):
+        cdef bcf_hdr_t *h = self.header.ptr
+        cdef bcf_idpair_t *idpair
+        cdef int32_t i, n = 0
+
+        for i in range(h.n[BCF_DT_ID]):
+            idpair = h.id[BCF_DT_ID] + i
+            if idpair.key and idpair.val and (idpair.val.info[self.type] & 0xF) != 0xF:
+                n += 1
+
+        return n
+
+    def __bool__(self):
+        cdef bcf_hdr_t *h = self.header.ptr
+        cdef bcf_idpair_t *idpair
+        cdef int32_t i
+
+        # stop at the first id that belongs to this mapping
+        for i in range(h.n[BCF_DT_ID]):
+            idpair = h.id[BCF_DT_ID] + i
+            if idpair.key and idpair.val and (idpair.val.info[self.type] & 0xF) != 0xF:
+                return True
+
+        return False
+
+    def __getitem__(self, key):
+        """Return the :class:`VariantMetadata` registered under key.
+
+        Raises KeyError when key is absent from the header dictionary or has
+        no entry of this mapping's type.
+        """
+        cdef bcf_hdr_t *h = self.header.ptr
+        cdef vdict_t *d = <vdict_t *>h.dict[BCF_DT_ID]
+        cdef khiter_t k = kh_get_vdict(d, key)
+
+        if k == kh_end(d) or (kh_val_vdict(d, k).info[self.type] & 0xF) == 0xF:
+            # This class also serves INFO and FORMAT lookups, so the message
+            # must not claim the key is a filter.
+            raise KeyError('invalid metadata key: {}'.format(key))
+
+        return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
+
+    def __iter__(self):
+        cdef bcf_hdr_t *h = self.header.ptr
+        cdef bcf_idpair_t *idpair
+        cdef int32_t i
+
+        for i in range(h.n[BCF_DT_ID]):
+            idpair = h.id[BCF_DT_ID] + i
+            if idpair.key and idpair.val and (idpair.val.info[self.type] & 0xF) != 0xF:
+                yield idpair.key
+
+    def get(self, key, default=None):
+        """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
+        try:
+            return self[key]
+        except KeyError:
+            return default
+
+    def __contains__(self, key):
+        try:
+            self[key]
+        except KeyError:
+            return False
+        else:
+            return True
+
+    def iterkeys(self):
+        """D.iterkeys() -> an iterator over the keys of D"""
+        return iter(self)
+
+    def itervalues(self):
+        """D.itervalues() -> an iterator over the values of D"""
+        for key in self:
+            yield self[key]
+
+    def iteritems(self):
+        """D.iteritems() -> an iterator over the (key, value) items of D"""
+        for key in self:
+            yield (key, self[key])
+
+    def keys(self):
+        """D.keys() -> list of D's keys"""
+        return list(self)
+
+    def items(self):
+        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+        return list(self.iteritems())
+
+    def values(self):
+        """D.values() -> list of D's values"""
+        return list(self.itervalues())
+
+    # Mappings are not hashable by default, but subclasses can change this
+    __hash__ = None
+
+    #TODO: implement __richcmp__
+
+
+cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type):
+    """Create the metadata mapping of header for the given type code.
+
+    Raises ValueError when header is falsy; type is stored as given.
+    """
+    cdef VariantHeaderMetadata mapping
+
+    if not header:
+        raise ValueError('invalid VariantHeader')
+
+    mapping = VariantHeaderMetadata.__new__(VariantHeaderMetadata)
+    mapping.header = header
+    mapping.type = type
+    return mapping
+
+
+cdef class VariantContig(object):
+    """contig metadata from a :class:`VariantHeader`"""
+
+    property name:
+        """contig name"""
+        def __get__(self):
+            cdef bcf_hdr_t *hdr = self.header.ptr
+            return hdr.id[BCF_DT_CTG][self.id].key
+
+    property id:
+        """contig internal id number"""
+        def __get__(self):
+            return self.id
+
+    property length:
+        """contig length or None if not available"""
+        def __get__(self):
+            cdef bcf_hdr_t *hdr = self.header.ptr
+            cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
+
+            # a stored length of zero is reported as None
+            if length:
+                return length
+            return None
+
+    property header:
+        """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
+        def __get__(self):
+            cdef bcf_hdr_t *hdr = self.header.ptr
+            cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
+
+            return makeVariantHeaderRecord(self.header, hrec)
+
+
+cdef VariantContig makeVariantContig(VariantHeader header, int id):
+    """Create a VariantContig for contig number id of header.
+
+    Raises ValueError when header is falsy or id lies outside
+    0 .. header.ptr.n[BCF_DT_CTG] - 1.
+    """
+    cdef VariantContig contig
+
+    if not header:
+        raise ValueError('invalid VariantHeader')
+
+    if not (0 <= id < header.ptr.n[BCF_DT_CTG]):
+        raise ValueError('invalid contig id')
+
+    contig = VariantContig.__new__(VariantContig)
+    contig.header = header
+    contig.id = id
+    return contig
+
+
+cdef class VariantHeaderContigs(object):
+    """mapping from contig name or index to :class:`VariantContig` object."""
+
+    def __len__(self):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+
+        # the contig dictionary and the contig id table must agree in size
+        assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
+        return hdr.n[BCF_DT_CTG]
+
+    def __bool__(self):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+
+        assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
+        return hdr.n[BCF_DT_CTG] != 0
+
+    def __getitem__(self, key):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        cdef int index
+        cdef vdict_t *d
+        cdef khiter_t k
+
+        # integer keys address contigs by position
+        if isinstance(key, int):
+            index = key
+            if not (0 <= index < hdr.n[BCF_DT_CTG]):
+                raise IndexError('invalid contig index')
+            return makeVariantContig(self.header, index)
+
+        # any other key is looked up by name in the contig dictionary
+        d = <vdict_t *>hdr.dict[BCF_DT_CTG]
+        k = kh_get_vdict(d, key)
+        if k == kh_end(d):
+            raise KeyError('invalid contig')
+
+        return makeVariantContig(self.header, kh_val_vdict(d, k).id)
+
+    def __iter__(self):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
+        cdef uint32_t n = kh_size(d)
+
+        assert n == hdr.n[BCF_DT_CTG]
+
+        for idx in range(n):
+            yield bcf_hdr_id2name(hdr, idx)
+
+    def get(self, key, default=None):
+        """Return self[key], or default when the lookup raises KeyError."""
+        try:
+            return self[key]
+        except KeyError:
+            return default
+
+    def __contains__(self, key):
+        try:
+            self[key]
+        except KeyError:
+            return False
+        return True
+
+    def iterkeys(self):
+        """Return an iterator over the contig names."""
+        return iter(self)
+
+    def itervalues(self):
+        """Yield the :class:`VariantContig` for each contig name."""
+        for name in self:
+            yield self[name]
+
+    def iteritems(self):
+        """Yield (name, :class:`VariantContig`) pairs."""
+        for name in self:
+            yield (name, self[name])
+
+    def keys(self):
+        """Return a list of the contig names."""
+        return list(self)
+
+    def items(self):
+        """Return a list of (name, :class:`VariantContig`) pairs."""
+        return list(self.iteritems())
+
+    def values(self):
+        """Return a list of the :class:`VariantContig` objects."""
+        return list(self.itervalues())
+
+    # Mappings are not hashable by default, but subclasses can change this
+    __hash__ = None
+
+    #TODO: implement __richcmp__
+
+
+cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
+    """Create the contig mapping of header; a falsy header raises ValueError."""
+    cdef VariantHeaderContigs mapping
+
+    if not header:
+        raise ValueError('invalid VariantHeader')
+
+    mapping = VariantHeaderContigs.__new__(VariantHeaderContigs)
+    mapping.header = header
+    return mapping
+
+
+cdef class VariantHeaderSamples(object):
+    """sequence of sample names from a :class:`VariantHeader` object"""
+
+    def __len__(self):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        return bcf_hdr_nsamples(hdr)
+
+    def __bool__(self):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        return bcf_hdr_nsamples(hdr) != 0
+
+    def __getitem__(self, index):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        cdef int32_t count = bcf_hdr_nsamples(hdr)
+        cdef int32_t pos = index
+
+        # only positions 0 .. count-1 are accepted
+        if not (0 <= pos < count):
+            raise IndexError('invalid sample index')
+
+        return hdr.samples[pos]
+
+    def __iter__(self):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        cdef int32_t count = bcf_hdr_nsamples(hdr)
+        cdef int32_t pos
+
+        for pos in range(count):
+            yield hdr.samples[pos]
+
+    def __contains__(self, key):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
+        cdef khiter_t slot = kh_get_vdict(d, key)
+
+        # kh_end(d) is what the lookup returns when key is absent
+        return slot != kh_end(d)
+
+    # Mappings are not hashable by default, but subclasses can change this
+    __hash__ = None
+
+    #TODO: implement __richcmp__
+
+
+cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header):
+    """Create the sample-name sequence of header; a falsy header raises ValueError."""
+    cdef VariantHeaderSamples view
+
+    if not header:
+        raise ValueError('invalid VariantHeader')
+
+    view = VariantHeaderSamples.__new__(VariantHeaderSamples)
+    view.header = header
+    return view
+
+
+cdef class VariantHeader(object):
+    """header information for a :class:`VariantFile` object"""
+
+    #FIXME: Add structured proxy
+    #FIXME: Add generic proxy
+    #FIXME: Add mutable methods
+
+    # See makeVariantHeader for C constructor
+    def __cinit__(self, mode):
+        self.ptr = NULL
+
+    # Python constructor
+    def __init__(self, mode):
+        """Create an empty header; mode must be 'r' or 'w'.
+
+        Raises ValueError for any other mode or when bcf_hdr_init() fails.
+        """
+        # Compare against the two legal values: a substring test against
+        # 'rw' would also accept '' and 'rw'.
+        if mode not in ('r', 'w'):
+            raise ValueError("invalid header mode specified '{}'".format(mode))
+
+        mode = force_bytes(mode)
+        self.ptr = bcf_hdr_init(mode)
+
+        if not self.ptr:
+            raise ValueError('cannot create VariantHeader')
+
+    def __dealloc__(self):
+        if self.ptr:
+            bcf_hdr_destroy(self.ptr)
+            self.ptr = NULL
+
+    def __bool__(self):
+        # self.ptr == NULL should be impossible
+        return self.ptr != NULL
+
+    def copy(self):
+        """Return a new :class:`VariantHeader` built from bcf_hdr_dup() of this one."""
+        return makeVariantHeader(bcf_hdr_dup(self.ptr))
+
+    property version:
+        """VCF version"""
+        def __get__(self):
+            return bcf_hdr_get_version(self.ptr)
+
+    property samples:
+        """samples (:class:`VariantHeaderSamples`)"""
+        def __get__(self):
+            return makeVariantHeaderSamples(self)
+
+    property records:
+        """header records (:class:`VariantHeaderRecords`)"""
+        def __get__(self):
+            return makeVariantHeaderRecords(self)
+
+    property contigs:
+        """contig information (:class:`VariantHeaderContigs`)"""
+        def __get__(self):
+            return makeVariantHeaderContigs(self)
+
+    property filters:
+        """filter metadata (:class:`VariantHeaderMetadata`)"""
+        def __get__(self):
+            return makeVariantHeaderMetadata(self, BCF_HL_FLT)
+
+    property info:
+        """info metadata (:class:`VariantHeaderMetadata`)"""
+        def __get__(self):
+            return makeVariantHeaderMetadata(self, BCF_HL_INFO)
+
+    property formats:
+        """format metadata (:class:`VariantHeaderMetadata`)"""
+        def __get__(self):
+            return makeVariantHeaderMetadata(self, BCF_HL_FMT)
+
+    # only safe to do when opening an htsfile
+    cdef _subset_samples(self, include_samples):
+        """Restrict the header to the samples named in include_samples.
+
+        Raises ValueError when a requested sample is not in the header or
+        when bcf_hdr_set_samples() returns a non-zero status.
+        """
+        cdef char *keep
+        cdef int ret
+
+        keep_samples = set(self.samples)
+        include_samples = set(include_samples)
+        missing_samples = include_samples - keep_samples
+        keep_samples &= include_samples
+
+        if missing_samples:
+            # FIXME: add specialized exception with payload
+            raise ValueError('missing {:d} requested samples'.format(len(missing_samples)))
+
+        # keep_samples stays referenced for the whole call, so the char
+        # pointer taken from it remains valid; an empty selection is
+        # passed as NULL.
+        keep_samples = ','.join(keep_samples)
+        keep = <char *>keep_samples if keep_samples else NULL
+        ret = bcf_hdr_set_samples(self.ptr, keep, 0)
+
+        if ret != 0:
+            raise ValueError('bcf_hdr_set_samples failed: ret = {}'.format(ret))
+
+    def __str__(self):
+        cdef int hlen
+        cdef char *hstr = bcf_hdr_fmt_text(self.ptr, 0, &hlen)
+
+        if not hstr:
+            raise ValueError('cannot format VariantHeader text')
+
+        # Copy the text into a Python object first, then release the C
+        # buffer; hstr must not be read again after free().
+        try:
+            text = hstr[:hlen]
+        finally:
+            free(hstr)
+
+        return force_str(text)
+
+
+cdef VariantHeader makeVariantHeader(bcf_hdr_t *h):
+    """Wrap the header pointer h in a new VariantHeader.
+
+    Raises ValueError when h is NULL.  The object is created through
+    __new__, so VariantHeader.__init__ is not run.
+    """
+    cdef VariantHeader header
+
+    if h == NULL:
+        raise ValueError('cannot create VariantHeader')
+
+    header = VariantHeader.__new__(VariantHeader, None)
+    header.ptr = h
+    return header
+
+
+########################################################################
+########################################################################
+## Variant Record objects
+########################################################################
+
+cdef class VariantRecordFilter(object):
+    """mapping from filter index or name to :class:`VariantMetadata` object for filters set on a :class:`VariantRecord` object."""
+
+    def __len__(self):
+        cdef bcf1_t *rec = self.record.ptr
+        return rec.d.n_flt
+
+    def __bool__(self):
+        cdef bcf1_t *rec = self.record.ptr
+        return rec.d.n_flt != 0
+
+    def __getitem__(self, key):
+        cdef bcf_hdr_t *hdr = self.record.header.ptr
+        cdef bcf1_t *rec = self.record.ptr
+        cdef int index, id
+        cdef int n = rec.d.n_flt
+
+        if not isinstance(key, int):
+            # name lookup; '.' is treated as an alias of 'PASS'
+            if key == '.':
+                key = 'PASS'
+
+            id = bcf_hdr_id2int(hdr, BCF_DT_ID, key)
+
+            if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, self.record.ptr, key):
+                raise KeyError('Invalid filter')
+        else:
+            # positional lookup into the record's filter id array
+            index = key
+
+            if index < 0 or index >= n:
+                raise IndexError('invalid filter index')
+
+            id = rec.d.flt[index]
+
+        return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
+
+    def __iter__(self):
+        cdef bcf_hdr_t *hdr = self.record.header.ptr
+        cdef bcf1_t *rec = self.record.ptr
+        cdef int pos, count = rec.d.n_flt
+
+        for pos in range(count):
+            yield bcf_hdr_int2id(hdr, BCF_DT_ID, rec.d.flt[pos])
+
+    def get(self, key, default=None):
+        """Return self[key], or default when the lookup raises KeyError."""
+        try:
+            return self[key]
+        except KeyError:
+            return default
+
+    def __contains__(self, key):
+        cdef bcf_hdr_t *hdr = self.record.header.ptr
+        cdef bcf1_t *rec = self.record.ptr
+
+        # only a return value of exactly 1 counts as present
+        return bcf_has_filter(hdr, rec, key) == 1
+
+    def iterkeys(self):
+        """Return an iterator over the filter names."""
+        return iter(self)
+
+    def itervalues(self):
+        """Yield the :class:`VariantMetadata` for each filter name."""
+        for name in self:
+            yield self[name]
+
+    def iteritems(self):
+        """Yield (name, :class:`VariantMetadata`) pairs."""
+        for name in self:
+            yield (name, self[name])
+
+    def keys(self):
+        """Return a list of the filter names."""
+        return list(self)
+
+    def items(self):
+        """Return a list of (name, :class:`VariantMetadata`) pairs."""
+        return list(self.iteritems())
+
+    def values(self):
+        """Return a list of the :class:`VariantMetadata` objects."""
+        return list(self.itervalues())
+
+    # Mappings are not hashable by default, but subclasses can change this
+    __hash__ = None
+
+    #TODO: implement __richcmp__
+
+
+cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
+    """Create the filter mapping of record; a falsy record raises ValueError."""
+    cdef VariantRecordFilter mapping
+
+    if not record:
+        raise ValueError('invalid VariantRecord')
+
+    mapping = VariantRecordFilter.__new__(VariantRecordFilter)
+    mapping.record = record
+    return mapping
+
+
+cdef class VariantRecordFormat(object):
+    """mapping from format name or index to :class:`VariantMetadata` object for formats present in a :class:`VariantRecord` object."""
+
+    def __len__(self):
+        cdef bcf1_t *rec = self.record.ptr
+        return rec.n_fmt
+
+    def __bool__(self):
+        cdef bcf1_t *rec = self.record.ptr
+        return rec.n_fmt != 0
+
+    def __getitem__(self, key):
+        cdef bcf_hdr_t *hdr = self.record.header.ptr
+        cdef bcf1_t *rec = self.record.ptr
+        cdef bcf_fmt_t *fmt
+        cdef int index
+        cdef int n = rec.n_fmt
+
+        if not isinstance(key, int):
+            # name lookup through bcf_get_fmt(); NULL means the record has no such field
+            fmt = bcf_get_fmt(hdr, rec, key)
+            if fmt == NULL:
+                raise KeyError('unknown format')
+        else:
+            # positional lookup into the record's format array
+            index = key
+            if not (0 <= index < n):
+                raise IndexError('invalid format index')
+            fmt = &rec.d.fmt[index]
+
+        return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
+
+    def __iter__(self):
+        cdef bcf_hdr_t *hdr = self.record.header.ptr
+        cdef bcf1_t *rec = self.record.ptr
+        cdef int pos, count = rec.n_fmt
+
+        for pos in range(count):
+            yield bcf_hdr_int2id(hdr, BCF_DT_ID, rec.d.fmt[pos].id)
+
+    def get(self, key, default=None):
+        """Return self[key], or default when the lookup raises KeyError."""
+        try:
+            return self[key]
+        except KeyError:
+            return default
+
+    def __contains__(self, key):
+        cdef bcf_hdr_t *hdr = self.record.header.ptr
+        cdef bcf1_t *rec = self.record.ptr
+
+        return bcf_get_fmt(hdr, rec, key) != NULL
+
+    def iterkeys(self):
+        """Return an iterator over the format names."""
+        return iter(self)
+
+    def itervalues(self):
+        """Yield the :class:`VariantMetadata` for each format name."""
+        for name in self:
+            yield self[name]
+
+    def iteritems(self):
+        """Yield (name, :class:`VariantMetadata`) pairs."""
+        for name in self:
+            yield (name, self[name])
+
+    def keys(self):
+        """Return a list of the format names."""
+        return list(self)
+
+    def items(self):
+        """Return a list of (name, :class:`VariantMetadata`) pairs."""
+        return list(self.iteritems())
+
+    def values(self):
+        """Return a list of the :class:`VariantMetadata` objects."""
+        return list(self.itervalues())
+
+    # Mappings are not hashable by default, but subclasses can change this
+    __hash__ = None
+
+    #TODO: implement __richcmp__
+
+
+cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
+    """Create the format mapping of record; a falsy record raises ValueError."""
+    cdef VariantRecordFormat mapping
+
+    if not record:
+        raise ValueError('invalid VariantRecord')
+
+    mapping = VariantRecordFormat.__new__(VariantRecordFormat)
+    mapping.record = record
+    return mapping
+
+
+#TODO: Add a getmeta method to return the corresponding VariantMetadata?
+cdef class VariantRecordInfo(object):
+    """Mapping from INFO metadata name to value for the INFO data present in a :class:`VariantRecord` object."""
+
+    def __len__(self):
+        """Return the record's INFO entry count (``n_info``)."""
+        cdef bcf1_t *rec = self.record.ptr
+        return rec.n_info
+
+    def __bool__(self):
+        """Return True when the record's INFO entry count is non-zero."""
+        cdef bcf1_t *rec = self.record.ptr
+        return rec.n_info != 0
+
+    def __getitem__(self, key):
+        """Return the value of INFO field *key*; raise KeyError if it is absent."""
+        cdef bcf_info_t *entry = bcf_get_info(self.record.header.ptr, self.record.ptr, key)
+
+        if entry == NULL:
+            raise KeyError('Unknown INFO field: {}'.format(key))
+
+        return bcf_info_value(entry)
+
+    def __iter__(self):
+        """Yield the name of each INFO entry in the order stored in the record."""
+        cdef bcf_hdr_t *hdr = self.record.header.ptr
+        cdef bcf1_t *rec = self.record.ptr
+        cdef int count = rec.n_info
+        cdef int pos = 0
+
+        while pos < count:
+            yield bcf_hdr_int2id(hdr, BCF_DT_ID, rec.d.info[pos].key)
+            pos += 1
+
+    def get(self, key, default=None):
+        """Return ``self[key]``, or *default* if the lookup raises KeyError."""
+        try:
+            found = self[key]
+        except KeyError:
+            return default
+        return found
+
+    def __contains__(self, key):
+        """Return True if ``bcf_get_info`` finds INFO field *key* in the record."""
+        cdef bcf_info_t *entry = bcf_get_info(self.record.header.ptr, self.record.ptr, key)
+        return entry != NULL
+
+    def iterkeys(self):
+        """Return an iterator over the INFO names (same as ``iter(self)``)."""
+        names = iter(self)
+        return names
+
+    def itervalues(self):
+        """Yield the value of each INFO entry, read directly from the record."""
+        cdef bcf1_t *rec = self.record.ptr
+        cdef int count = rec.n_info
+        cdef int pos
+
+        for pos in range(count):
+            yield bcf_info_value(&rec.d.info[pos])
+
+    def iteritems(self):
+        """Yield ``(name, value)`` pairs for each INFO entry in the record."""
+        cdef bcf_hdr_t *hdr = self.record.header.ptr
+        cdef bcf1_t *rec = self.record.ptr
+        cdef bcf_info_t *entry
+        cdef int count = rec.n_info
+        cdef int pos
+
+        for pos in range(count):
+            entry = &rec.d.info[pos]
+            name = bcf_hdr_int2id(hdr, BCF_DT_ID, entry.key)
+            yield name, bcf_info_value(entry)
+
+    def keys(self):
+        """Return a list of the INFO names."""
+        return [name for name in self]
+
+    def items(self):
+        """Return a list of ``(name, value)`` 2-tuples."""
+        return [pair for pair in self.iteritems()]
+
+    def values(self):
+        """Return a list of the INFO values."""
+        return [value for value in self.itervalues()]
+
+    # Mappings are not hashable by default, but subclasses can change this
+    __hash__ = None
+
+    #TODO: implement __richcmp__
+
+
+cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
+    """Create a :class:`VariantRecordInfo` bound to *record*.
+
+    Raises ValueError when *record* is falsy (e.g. None).
+    """
+    if not record:
+        raise ValueError('invalid VariantRecord')
+
+    # __new__ bypasses __init__; the only state is the back-reference
+    cdef VariantRecordInfo view = VariantRecordInfo.__new__(VariantRecordInfo)
+    view.record = record
+    return view
+
+
+cdef class VariantRecordSamples(object):
+    """mapping from sample index or name to :class:`VariantRecordSample` object."""
+
+    def __len__(self):
+        """Return the number of samples declared in the record's header."""
+        return bcf_hdr_nsamples(self.record.header.ptr)
+
+    def __bool__(self):
+        """Return True when the header declares at least one sample."""
+        return bcf_hdr_nsamples(self.record.header.ptr) != 0
+
+    def __getitem__(self, key):
+        """Return the :class:`VariantRecordSample` for *key*.
+
+        *key* is either an integer sample index or a sample name.  Raises
+        KeyError for an unknown name and IndexError for an out-of-range index.
+        """
+        cdef bcf_hdr_t *h = self.record.header.ptr
+        cdef int n = bcf_hdr_nsamples(h)
+        cdef int sample_index
+
+        if isinstance(key, int):
+            sample_index = key
+        else:
+            sample_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, key)
+            if sample_index < 0:
+                raise KeyError('invalid sample name')
+
+        if sample_index < 0 or sample_index >= n:
+            raise IndexError('invalid sample index')
+
+        return makeVariantRecordSample(self.record, sample_index)
+
+    def __iter__(self):
+        """Yield the sample names in header order."""
+        cdef bcf_hdr_t *h = self.record.header.ptr
+        cdef int32_t i, n = bcf_hdr_nsamples(h)
+
+        for i in range(n):
+            yield h.samples[i]
+
+    def get(self, key, default=None):
+        """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
+        try:
+            return self[key]
+        except KeyError:
+            return default
+
+    def __contains__(self, key):
+        """Return True if *key* is a valid sample index or a known sample name.
+
+        A membership test never raises for a missing key: an unknown name
+        makes ``bcf_hdr_id2int`` return a negative index, which is reported
+        as False.
+        """
+        cdef bcf_hdr_t *h = self.record.header.ptr
+        cdef int n = bcf_hdr_nsamples(h)
+        cdef int sample_index
+
+        if isinstance(key, int):
+            sample_index = key
+        else:
+            sample_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, key)
+
+        return 0 <= sample_index < n
+
+    def iterkeys(self):
+        """D.iterkeys() -> an iterator over the keys of D"""
+        return iter(self)
+
+    def itervalues(self):
+        """D.itervalues() -> an iterator over the values of D"""
+        cdef bcf_hdr_t *h = self.record.header.ptr
+        cdef int32_t i, n = bcf_hdr_nsamples(h)
+
+        for i in range(n):
+            yield makeVariantRecordSample(self.record, i)
+
+    def iteritems(self):
+        """D.iteritems() -> an iterator over the (key, value) items of D"""
+        cdef bcf_hdr_t *h = self.record.header.ptr
+        cdef int32_t i, n = bcf_hdr_nsamples(h)
+
+        for i in range(n):
+            yield h.samples[i], makeVariantRecordSample(self.record, i)
+
+    def keys(self):
+        """D.keys() -> list of D's keys"""
+        return list(self)
+
+    def items(self):
+        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+        return list(self.iteritems())
+
+    def values(self):
+        """D.values() -> list of D's values"""
+        return list(self.itervalues())
+
+    # Mappings are not hashable by default, but subclasses can change this
+    __hash__ = None
+
+    #TODO: implement __richcmp__
+
+
+cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
+    """Create a :class:`VariantRecordSamples` bound to *record*.
+
+    Raises ValueError when *record* is falsy (e.g. None).
+    """
+    if not record:
+        raise ValueError('invalid VariantRecord')
+
+    # __new__ bypasses __init__; the only state is the back-reference
+    cdef VariantRecordSamples view = VariantRecordSamples.__new__(VariantRecordSamples)
+    view.record = record
+    return view
+
+
+cdef class VariantRecord(object):
+    """Variant record"""
+
+    def __dealloc__(self):
+        # the wrapper releases the underlying bcf1_t when it is collected
+        if self.ptr:
+            bcf_destroy1(self.ptr)
+            self.ptr = NULL
+
+    property rid:
+        """internal reference id number"""
+        def __get__(self):
+            return self.ptr.rid
+
+    property chrom:
+        """chromosome/contig name"""
+        def __get__(self):
+            return bcf_hdr_id2name(self.header.ptr, self.ptr.rid)
+
+    property contig:
+        """chromosome/contig name"""
+        def __get__(self):
+            return bcf_hdr_id2name(self.header.ptr, self.ptr.rid)
+
+    property pos:
+        """record start position on chrom/contig (1-based inclusive)"""
+        def __get__(self):
+            return self.ptr.pos + 1
+
+    property start:
+        """record start position on chrom/contig (0-based inclusive)"""
+        def __get__(self):
+            return self.ptr.pos
+
+    property stop:
+        """record stop position on chrom/contig (0-based exclusive)"""
+        def __get__(self):
+            return self.ptr.pos + self.ptr.rlen
+
+    property rlen:
+        """record length on chrom/contig (rec.stop - rec.start)"""
+        def __get__(self):
+            return self.ptr.rlen
+
+    property qual:
+        """phred scaled quality score or None if not available"""
+        def __get__(self):
+            return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
+
+#    property n_info:
+#        def __get__(self):
+#            if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
+#                raise ValueError('Error unpacking BCFRecord')
+#            return self.ptr.n_info
+
+#    property n_allele:
+#        def __get__(self):
+#            return self.ptr.n_allele
+
+#    property n_fmt:
+#        def __get__(self):
+#            return self.ptr.n_fmt
+
+#    property n_sample:
+#        def __get__(self):
+#            return self.ptr.n_sample
+
+#    property shared:
+#        def __get__(self):
+#            return self.ptr.shared.s
+
+#    property indiv:
+#        def __get__(self):
+#            return self.ptr.indiv.s
+
+#    property n_flt:
+#        def __get__(self):
+#            if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
+#                raise ValueError('Error unpacking VariantRecord')
+#            return self.ptr.d.n_flt
+
+    property id:
+        """record identifier or None if not available"""
+        def __get__(self):
+            if bcf_unpack(self.ptr, BCF_UN_STR) < 0:
+                raise ValueError('Error unpacking VariantRecord')
+            # local name chosen so the builtin ``id`` is not shadowed
+            record_id = self.ptr.d.id
+            # a lone '.' is treated as "no identifier"
+            return record_id if record_id != b'.' else None
+
+    property ref:
+        """reference allele"""
+        def __get__(self):
+            if bcf_unpack(self.ptr, BCF_UN_STR) < 0:
+                raise ValueError('Error unpacking VariantRecord')
+            return self.ptr.d.allele[0] if self.ptr.d.allele else None
+
+    property alleles:
+        """tuple of reference allele followed by alt alleles"""
+        def __get__(self):
+            if bcf_unpack(self.ptr, BCF_UN_STR) < 0:
+                raise ValueError('Error unpacking VariantRecord')
+            if not self.ptr.d.allele:
+                return None
+            return tuple(self.ptr.d.allele[i] for i in range(self.ptr.n_allele))
+
+    property alts:
+        """tuple of alt alleles"""
+        def __get__(self):
+            if bcf_unpack(self.ptr, BCF_UN_STR) < 0:
+                raise ValueError('Error unpacking VariantRecord')
+            if self.ptr.n_allele < 2 or not self.ptr.d.allele:
+                return None
+            return tuple(self.ptr.d.allele[i] for i in range(1,self.ptr.n_allele))
+
+    property filter:
+        """filter information (see :class:`VariantRecordFilter`)"""
+        def __get__(self):
+            if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
+                raise ValueError('Error unpacking VariantRecord')
+            return makeVariantRecordFilter(self)
+
+    property info:
+        """info data (see :class:`VariantRecordInfo`)"""
+        def __get__(self):
+            if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
+                raise ValueError('Error unpacking VariantRecord')
+            return makeVariantRecordInfo(self)
+
+    property format:
+        """sample format metadata (see :class:`VariantRecordFormat`)"""
+        def __get__(self):
+            if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
+                raise ValueError('Error unpacking VariantRecord')
+            return makeVariantRecordFormat(self)
+
+    property samples:
+        """sample data (see :class:`VariantRecordSamples`)"""
+        def __get__(self):
+            if bcf_unpack(self.ptr, BCF_UN_IND) < 0:
+                raise ValueError('Error unpacking VariantRecord')
+            return makeVariantRecordSamples(self)
+
+    def __str__(self):
+        """Return the record formatted by ``vcf_format`` as a string.
+
+        Raises ValueError when ``vcf_format`` reports a failure.
+        """
+        cdef kstring_t line
+
+        line.l = line.m = 0
+        line.s = NULL
+
+        # try/finally guarantees the kstring buffer is freed even when the
+        # conversion to a Python string raises
+        try:
+            if vcf_format(self.header.ptr, self.ptr, &line) < 0:
+                raise ValueError('vcf_format failed')
+
+            # Strip CR/LF?
+            #while line.l:
+            #    c = line.s[line.l - 1]
+            #    if c != b'\n' and c != b'\r':
+            #        break
+            #    line.l -= 1
+
+            ret = line.s[:line.l]
+            return force_str(ret)
+        finally:
+            if line.m:
+                free(line.s)
+
+
+cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
+    """Wrap the C record *r* in a :class:`VariantRecord` tied to *header*.
+
+    Raises ValueError when *header* is falsy or *r* is NULL.
+    """
+    # NOTE(review): on either error path *r* is not released here; whether the
+    # caller is expected to free it cannot be seen from this function - confirm.
+    if not header:
+        raise ValueError('invalid VariantHeader')
+
+    if r == NULL:
+        raise ValueError('cannot create VariantRecord')
+
+    cdef VariantRecord wrapped = VariantRecord.__new__(VariantRecord)
+    wrapped.ptr = r
+    wrapped.header = header
+    return wrapped
+
+
+########################################################################
+########################################################################
+## Variant Sampletype object
+########################################################################
+
+
+cdef class VariantRecordSample(object):
+    """Data for a single sample from a :class:`VariantRecord` object.
+    Provides data accessors for genotypes and a mapping interface from format name to values.
+    """
+
+    property name:
+        """sample name"""
+        def __get__(self):
+            cdef bcf_hdr_t *h = self.record.header.ptr
+            cdef int32_t n = bcf_hdr_nsamples(h)
+
+            if self.index < 0 or self.index >= n:
+                raise ValueError('invalid sample index')
+
+            return h.samples[self.index]
+
+    property allele_indices:
+        """allele indices for called genotype, if present.  Otherwise None"""
+        def __get__(self):
+            cdef bcf_hdr_t *h = self.record.header.ptr
+            cdef bcf1_t *r = self.record.ptr
+            cdef int32_t n = bcf_hdr_nsamples(h)
+            cdef bcf_fmt_t *fmt0
+            cdef int i
+            cdef int8_t *data8
+            cdef int16_t *data16
+            cdef int32_t *data32
+
+            if self.index < 0 or self.index >= n or not r.n_fmt:
+                return None
+
+            # only the first format entry is inspected for a genotype
+            fmt0 = r.d.fmt
+            if not is_gt_fmt(h, fmt0) or not fmt0.n:
+                return None
+
+            alleles = []
+
+            # Each sample owns fmt0.size bytes; values are decoded as
+            # (value >> 1) - 1 until the vector-end sentinel is reached.
+            if fmt0.type == BCF_BT_INT8:
+                data8 = <int8_t *>(fmt0.p + self.index * fmt0.size)
+                for i in range(fmt0.n):
+                    if data8[i] == bcf_int8_vector_end:
+                        break
+                    alleles.append( (data8[i] >> 1) - 1 )
+            elif fmt0.type == BCF_BT_INT16:
+                data16 = <int16_t *>(fmt0.p + self.index * fmt0.size)
+                for i in range(fmt0.n):
+                    if data16[i] == bcf_int16_vector_end:
+                        break
+                    alleles.append( (data16[i] >> 1) - 1 )
+            elif fmt0.type == BCF_BT_INT32:
+                data32 = <int32_t *>(fmt0.p + self.index * fmt0.size)
+                for i in range(fmt0.n):
+                    if data32[i] == bcf_int32_vector_end:
+                        break
+                    alleles.append( (data32[i] >> 1) - 1 )
+
+            return tuple(alleles)
+
+    property alleles:
+        """alleles for called genotype, if present.  Otherwise None"""
+        def __get__(self):
+            cdef bcf_hdr_t *h = self.record.header.ptr
+            cdef bcf1_t *r = self.record.ptr
+            cdef int32_t nsamples = bcf_hdr_nsamples(h)
+            cdef int32_t nalleles = r.n_allele
+            cdef bcf_fmt_t *fmt0
+            cdef int32_t a
+            cdef int i
+            cdef int8_t *data8
+            cdef int16_t *data16
+            cdef int32_t *data32
+
+            if self.index < 0 or self.index >= nsamples or not r.n_fmt:
+                return None
+
+            # r.d.allele is read below.  VariantRecord.alleles unpacks
+            # BCF_UN_STR before touching it, so do the same here rather than
+            # rely on an earlier accessor having unpacked the string fields.
+            if bcf_unpack(r, BCF_UN_STR) < 0:
+                raise ValueError('Error unpacking VariantRecord')
+
+            # only the first format entry is inspected for a genotype
+            fmt0 = r.d.fmt
+            if not is_gt_fmt(h, fmt0) or not fmt0.n:
+                return None
+
+            alleles = []
+
+            # Decoded indices outside [0, nalleles) are reported as None.
+            if fmt0.type == BCF_BT_INT8:
+                data8 = <int8_t *>(fmt0.p + self.index * fmt0.size)
+                for i in range(fmt0.n):
+                    if data8[i] == bcf_int8_vector_end:
+                        break
+                    a = (data8[i] >> 1) - 1
+                    alleles.append(r.d.allele[a] if 0 <= a < nalleles else None)
+            elif fmt0.type == BCF_BT_INT16:
+                data16 = <int16_t *>(fmt0.p + self.index * fmt0.size)
+                for i in range(fmt0.n):
+                    if data16[i] == bcf_int16_vector_end:
+                        break
+                    a = (data16[i] >> 1) - 1
+                    alleles.append(r.d.allele[a] if 0 <= a < nalleles else None)
+            elif fmt0.type == BCF_BT_INT32:
+                data32 = <int32_t *>(fmt0.p + self.index * fmt0.size)
+                for i in range(fmt0.n):
+                    if data32[i] == bcf_int32_vector_end:
+                        break
+                    a = (data32[i] >> 1) - 1
+                    alleles.append(r.d.allele[a] if 0 <= a < nalleles else None)
+
+            return tuple(alleles)
+
+    def __len__(self):
+        """Return the record's format count (``n_fmt``)."""
+        return self.record.ptr.n_fmt
+
+    def __bool__(self):
+        """Return True when the record's format count is non-zero."""
+        return self.record.ptr.n_fmt != 0
+
+    def __getitem__(self, key):
+        """Return this sample's value for format *key*.
+
+        *key* is either an integer position among the record's formats or a
+        format name.  A genotype format yields :attr:`alleles`; other formats
+        are converted with ``bcf_array_to_object``; a format without data
+        yields None.  Raises IndexError for an out-of-range position and
+        KeyError when the name is not found.
+        """
+        cdef bcf_hdr_t *h = self.record.header.ptr
+        cdef bcf1_t *r = self.record.ptr
+        cdef bcf_fmt_t *fmt
+        cdef int index
+
+        if isinstance(key, int):
+            index = key
+            if index < 0 or index >= r.n_fmt:
+                raise IndexError('invalid format index')
+            fmt = r.d.fmt + index
+        else:
+            fmt = bcf_get_fmt(h, r, key)
+
+        if not fmt:
+            raise KeyError('invalid format requested')
+
+        if is_gt_fmt(h, fmt):
+            return self.alleles
+        elif fmt.p and fmt.n and fmt.size:
+            return bcf_array_to_object(fmt.p + self.index * fmt.size, fmt.type, fmt.n, scalar=1)
+        else:
+            return None
+
+    def __iter__(self):
+        """Yield the name of each format in the order stored in the record."""
+        cdef bcf_hdr_t *h = self.record.header.ptr
+        cdef bcf1_t *r = self.record.ptr
+        cdef int i, n = r.n_fmt
+
+        for i in range(n):
+            yield bcf_hdr_int2id(h, BCF_DT_ID, r.d.fmt[i].id)
+
+    def get(self, key, default=None):
+        """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
+        try:
+            return self[key]
+        except KeyError:
+            return default
+
+    def __contains__(self, key):
+        """Return True if ``bcf_get_fmt`` finds format *key* in the record."""
+        cdef bcf_hdr_t *h = self.record.header.ptr
+        cdef bcf1_t *r = self.record.ptr
+        cdef bcf_fmt_t *fmt = bcf_get_fmt(h, r, key)
+        return fmt != NULL
+
+    def iterkeys(self):
+        """D.iterkeys() -> an iterator over the keys of D"""
+        return iter(self)
+
+    def itervalues(self):
+        """D.itervalues() -> an iterator over the values of D"""
+        for key in self:
+            yield self[key]
+
+    def iteritems(self):
+        """D.iteritems() -> an iterator over the (key, value) items of D"""
+        for key in self:
+            yield (key, self[key])
+
+    def keys(self):
+        """D.keys() -> list of D's keys"""
+        return list(self)
+
+    def items(self):
+        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+        return list(self.iteritems())
+
+    def values(self):
+        """D.values() -> list of D's values"""
+        return list(self.itervalues())
+
+    # Mappings are not hashable by default, but subclasses can change this
+    __hash__ = None
+
+    #TODO: implement __richcmp__
+
+
+cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
+    """Create a :class:`VariantRecordSample` for *sample_index* of *record*.
+
+    Raises ValueError when *record* is falsy or *sample_index* is negative.
+    The upper bound of *sample_index* is not checked here.
+    """
+    if sample_index < 0 or not record:
+        raise ValueError('cannot create VariantRecordSample')
+
+    cdef VariantRecordSample view = VariantRecordSample.__new__(VariantRecordSample)
+    view.index = sample_index
+    view.record = record
+    return view
+
+
+########################################################################
+########################################################################
+## Index objects
+########################################################################
+
+
+cdef class BaseIndex(object):
+    """Mapping-style view of an index's reference names.
+
+    ``refs`` holds the reference names and ``refmap`` maps each name to its
+    position; integer keys index ``refs``, any other key is looked up in
+    ``refmap``.
+    """
+    def __init__(self):
+        self.refs = ()
+        # was ``self.remap``: every reader in this module uses ``refmap``
+        self.refmap = {}
+
+    def __len__(self):
+        """Return the number of reference names."""
+        return len(self.refs)
+
+    def __bool__(self):
+        """Return True when at least one reference name is known."""
+        return len(self.refs) != 0
+
+    def __getitem__(self, key):
+        """Return ``refs[key]`` for an integer key, else ``refmap[key]``."""
+        if isinstance(key, int):
+            return self.refs[key]
+        else:
+            return self.refmap[key]
+
+    def __iter__(self):
+        """Iterate over the reference names."""
+        return iter(self.refs)
+
+    def get(self, key, default=None):
+        """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
+        try:
+            return self[key]
+        except KeyError:
+            return default
+
+    def __contains__(self, key):
+        """Return True if ``self[key]`` succeeds.
+
+        An out-of-range integer key raises IndexError from ``refs``; that is
+        treated as "not present" so a membership test never raises for it.
+        """
+        try:
+            self[key]
+        except (KeyError, IndexError):
+            return False
+        else:
+            return True
+
+    def iterkeys(self):
+        """D.iterkeys() -> an iterator over the keys of D"""
+        return iter(self)
+
+    def itervalues(self):
+        """D.itervalues() -> an iterator over the values of D"""
+        for key in self:
+            yield self[key]
+
+    def iteritems(self):
+        """D.iteritems() -> an iterator over the (key, value) items of D"""
+        for key in self:
+            yield (key, self[key])
+
+    def keys(self):
+        """D.keys() -> list of D's keys"""
+        return list(self)
+
+    def items(self):
+        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+        return list(self.iteritems())
+
+    def values(self):
+        """D.values() -> list of D's values"""
+        return list(self.itervalues())
+
+    # Mappings are not hashable by default, but subclasses can change this
+    __hash__ = None
+
+    #TODO: implement __richcmp__
+
+
+# NOTE(review): unlike TabixIndex this class derives from ``object`` rather
+# than BaseIndex, so the mapping helpers defined on BaseIndex are not
+# inherited here - confirm whether that is intended.
+cdef class BCFIndex(object):
+    """CSI index data structure for BCF files"""
+    def __init__(self):
+        """Load the reference names from the already attached index pointer.
+
+        Raises ValueError when no index pointer is attached or the names
+        cannot be retrieved.
+        """
+        cdef int count
+        cdef const char **names
+
+        self.refs = ()
+        self.refmap = {}
+
+        if not self.ptr:
+            raise ValueError('Invalid index object')
+
+        names = bcf_index_seqnames(self.ptr, self.header.ptr, &count)
+        if not names:
+            raise ValueError('Cannot retrieve reference sequence names')
+
+        self.refs = char_array_to_tuple(names, count, free_after=1)
+        self.refmap = { name:pos for pos,name in enumerate(self.refs) }
+
+    def __dealloc__(self):
+        # release the htslib index attached to this object
+        if self.ptr:
+            hts_idx_destroy(self.ptr)
+            self.ptr = NULL
+
+    def fetch(self, bcf, contig, start, stop, region, reopen):
+        """Return a :class:`BCFIterator` built from the given arguments."""
+        return BCFIterator(bcf, contig, start, stop, region, reopen)
+
+
+cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
+    """Wrap the index pointer *idx* in a :class:`BCFIndex`.
+
+    Returns None when *idx* is NULL; raises ValueError when *header* is falsy.
+    """
+    if idx == NULL:
+        return None
+
+    if not header:
+        raise ValueError('invalid VariantHeader')
+
+    # attach the C state first, then run __init__ which reads it
+    cdef BCFIndex wrapped = BCFIndex.__new__(BCFIndex)
+    wrapped.ptr = idx
+    wrapped.header = header
+    wrapped.__init__()
+    return wrapped
+
+
+cdef class TabixIndex(BaseIndex):
+    """Tabix index data structure for VCF files"""
+    def __init__(self):
+        """Load the reference names from the already attached index pointer.
+
+        Raises ValueError when no index pointer is attached or the names
+        cannot be retrieved.
+        """
+        cdef int count
+        cdef const char **names
+
+        self.refs = ()
+        self.refmap = {}
+
+        if not self.ptr:
+            raise ValueError('Invalid index object')
+
+        names = tbx_seqnames(self.ptr, &count)
+        if not names:
+            raise ValueError('Cannot retrieve reference sequence names')
+
+        self.refs = char_array_to_tuple(names, count, free_after=1)
+        self.refmap = { name:pos for pos,name in enumerate(self.refs) }
+
+    def __dealloc__(self):
+        # release the tabix index attached to this object
+        if self.ptr:
+            tbx_destroy(self.ptr)
+            self.ptr = NULL
+
+    def fetch(self, bcf, contig, start, stop, region, reopen):
+        """Return a :class:`TabixIterator` built from the given arguments."""
+        return TabixIterator(bcf, contig, start, stop, region, reopen)
+
+
+cdef TabixIndex makeTabixIndex(tbx_t *idx):
+    """Wrap the tabix pointer *idx* in a :class:`TabixIndex`.
+
+    Returns None when *idx* is NULL.
+    """
+    if idx == NULL:
+        return None
+
+    # attach the C state first, then run __init__ which reads it
+    cdef TabixIndex wrapped = TabixIndex.__new__(TabixIndex)
+    wrapped.ptr = idx
+    wrapped.__init__()
+    return wrapped
+
+
+########################################################################
+########################################################################
+## Iterators
+########################################################################
+
+
+cdef class BaseIterator(object):
+ # Common base type of BCFIterator and TabixIterator; it defines no
+ # behaviour of its own.
+ pass
+
+
+# Internal function to clean up after iteration stop or failure.
+# This would be a nested function if it weren't a cdef function.
+cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
+    """Destroy *record* and the iterator state held by *self*."""
+    # destroy iter so future calls to __next__ raise StopIteration
+    bcf_itr_destroy(self.iter)
+    self.iter = NULL
+
+    # the record that was being filled is discarded as well
+    bcf_destroy1(record)
+
+
+cdef class BCFIterator(BaseIterator):
+    """Iterator over the records of an indexed BCF file within a region."""
+
+    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
+        """Set up a query on *bcf*.
+
+        The query is given either as a *region* string or as *contig* with
+        optional *start* / *stop*.  When *reopen* is true the iterator works
+        on ``bcf.copy()``.  Raises ValueError when *bcf* has no BCF index,
+        when *region* is combined with *contig*/*start*/*stop*, or when
+        neither *region* nor *contig* is given.
+        """
+        if not isinstance(bcf.index, BCFIndex):
+            raise ValueError('bcf index required')
+
+        cdef BCFIndex index = bcf.index
+
+        if not index:
+            raise ValueError('bcf index required')
+
+        if reopen:
+            bcf = bcf.copy()
+
+        if region is not None:
+            if contig is not None or start is not None or stop is not None:
+                raise ValueError('region cannot be combined with contig, start or stop')
+
+            self.iter = bcf_itr_querys(index.ptr, bcf.header.ptr, region)
+        else:
+            if contig is None:
+                raise ValueError('either contig or region must be specified')
+
+            # unknown contigs map to -1
+            rid = index.refmap.get(contig, -1)
+
+            if start is None:
+                start = 0
+            if stop is None:
+                stop = MAX_POS
+
+            self.iter = bcf_itr_queryi(index.ptr, rid, start, stop)
+
+        # Do not fail on self.iter == NULL, since it signifies a null query.
+
+        self.bcf = bcf
+        self.index = index
+
+    def __dealloc__(self):
+        if self.iter:
+            bcf_itr_destroy(self.iter)
+            self.iter = NULL
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        """Return the next :class:`VariantRecord` of the query.
+
+        Raises StopIteration at the end of the query (or for a null query),
+        ValueError on a read or format-subsetting error and MemoryError when
+        no record can be allocated.
+        """
+        cdef bcf1_t *record
+        cdef int ret
+
+        if not self.iter:
+            raise StopIteration
+
+        record = bcf_init1()
+
+        # bcf_init1 result is dereferenced below, so reject NULL explicitly
+        if not record:
+            raise MemoryError('unable to allocate BCF record')
+
+        record.pos = -1
+        if self.bcf.drop_samples:
+            record.max_unpack = BCF_UN_SHR
+
+        ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
+
+        if ret < 0:
+            _stop_BCFIterator(self, record)
+            if ret == -1:
+                raise StopIteration
+            else:
+                raise ValueError('error reading BCF file')
+
+        ret = bcf_subset_format(self.bcf.header.ptr, record)
+
+        if ret < 0:
+            _stop_BCFIterator(self, record)
+            raise ValueError('error in bcf_subset_format')
+
+        return makeVariantRecord(self.bcf.header, record)
+
+
+cdef class TabixIterator(BaseIterator):
+    """Iterator over the records of a tabix-indexed VCF file within a region."""
+
+    def __cinit__(self, *args, **kwargs):
+        # start with an empty line buffer; tbx_itr_next fills it in __next__
+        self.line_buffer.l = 0
+        self.line_buffer.m = 0
+        self.line_buffer.s = NULL
+
+    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
+        """Set up a query on *bcf*.
+
+        The query is given either as a *region* string or as *contig* with
+        optional *start* / *stop*.  When *reopen* is true the iterator works
+        on ``bcf.copy()``.  Raises ValueError when *bcf* has no usable tabix
+        index, when *region* is combined with *contig*/*start*/*stop*, or
+        when neither *region* nor *contig* is given.
+        """
+        if not isinstance(bcf.index, TabixIndex):
+            raise ValueError('tabix index required')
+
+        cdef TabixIndex index = bcf.index
+
+        if not index:
+            # message previously said 'bcf index required' for a tabix index
+            raise ValueError('tabix index required')
+
+        if reopen:
+            bcf = bcf.copy()
+
+        if region is not None:
+            if contig is not None or start is not None or stop is not None:
+                raise ValueError('region cannot be combined with contig, start or stop')
+
+            self.iter = tbx_itr_querys(index.ptr, region)
+        else:
+            if contig is None:
+                raise ValueError('either contig or region must be specified')
+
+            # unknown contigs map to -1
+            rid = index.refmap.get(contig, -1)
+
+            if start is None:
+                start = 0
+            if stop is None:
+                stop = MAX_POS
+
+            self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
+
+        # Do not fail on self.iter == NULL, since it signifies a null query.
+
+        self.bcf = bcf
+        self.index = index
+
+    def __dealloc__(self):
+        if self.iter:
+            tbx_itr_destroy(self.iter)
+            self.iter = NULL
+
+        if self.line_buffer.m:
+            free(self.line_buffer.s)
+
+        self.line_buffer.l = 0
+        self.line_buffer.m = 0
+        self.line_buffer.s = NULL
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        """Return the next :class:`VariantRecord` of the query.
+
+        Raises StopIteration at the end of the query (or for a null query),
+        ValueError on a read or parse error and MemoryError when no record
+        can be allocated.
+        """
+        cdef bcf1_t *record
+        cdef int ret
+
+        if not self.iter:
+            raise StopIteration
+
+        ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)
+
+        if ret < 0:
+            # destroy iter so future calls to __next__ raise StopIteration
+            tbx_itr_destroy(self.iter)
+            self.iter = NULL
+            if ret == -1:
+                raise StopIteration
+            else:
+                raise ValueError('error reading indexed VCF file')
+
+        record = bcf_init1()
+
+        # bcf_init1 result is dereferenced below, so reject NULL explicitly
+        if not record:
+            raise MemoryError('unable to allocate BCF record')
+
+        record.pos = -1
+        if self.bcf.drop_samples:
+            record.max_unpack = BCF_UN_SHR
+
+        ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)
+
+        # FIXME: stop iteration on parse failure?
+        if ret < 0:
+            bcf_destroy1(record)
+            raise ValueError('error in vcf_parse')
+
+        return makeVariantRecord(self.bcf.header, record)
+
+
+########################################################################
+########################################################################
+## Variant File
+########################################################################
+
+
+cdef class VariantFile(object):
+    """*(filename, mode=None, header=None, drop_samples=False)*
+
+    A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
+    opened.
+
+    *mode* should be ``r`` for reading or ``w`` for writing. The default is
+    text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
+    ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
+
+    If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
+    modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
+    For instance, to open a :term:`BCF` formatted file for reading, type::
+
+        f = pysam.VariantFile('ex1.bcf','rb')
+
+    If mode is not specified, we will try to auto-detect in the order 'rb',
+    'r', thus both the following should work::
+
+        f1 = pysam.VariantFile('ex1.bcf')
+        f2 = pysam.VariantFile('ex1.vcf')
+
+    If an index for a variant file exists (.csi or .tbi), it will be opened
+    automatically. Without an index random access to records via
+    :meth:`fetch` is disabled.
+
+    For writing, a :class:`VariantHeader` object must be provided, typically
+    obtained from another :term:`VCF` file/:term:`BCF` file.
+    """
+    def __cinit__(self, *args, **kwargs):
+        self.htsfile = NULL
+
+    def __init__(self, *args, **kwargs):
+        self.header = None
+        self.index = None
+        self.filename = None
+        self.mode = None
+        self.is_stream = False
+        self.is_remote = False
+        self.is_reading = False
+        self.drop_samples = False
+        self.start_offset = -1
+
+        self.open(*args, **kwargs)
+
+    def __dealloc__(self):
+        if self.htsfile:
+            hts_close(self.htsfile)
+            self.htsfile = NULL
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        # close the file; returning False lets any exception propagate
+        self.close()
+        return False
+
+    property category:
+        """General file format category.  One of UNKNOWN, ALIGNMENTS, VARIANTS, INDEX, REGIONS"""
+        def __get__(self):
+            if not self.htsfile:
+                raise ValueError('metadata not available on closed file')
+            return FORMAT_CATEGORIES[self.htsfile.format.category]
+
+    property format:
+        """File format.
+        One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM, BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
+        """
+        def __get__(self):
+            if not self.htsfile:
+                raise ValueError('metadata not available on closed file')
+            return FORMATS[self.htsfile.format.format]
+
+    property version:
+        """Tuple of file format version numbers (major, minor)"""
+        def __get__(self):
+            if not self.htsfile:
+                raise ValueError('metadata not available on closed file')
+            return self.htsfile.format.version.major, self.htsfile.format.version.minor
+
+    property compression:
+        """File compression.  One of NONE, GZIP, BGZF, CUSTOM."""
+        def __get__(self):
+            if not self.htsfile:
+                raise ValueError('metadata not available on closed file')
+            return COMPRESSION[self.htsfile.format.compression]
+
+    property description:
+        """Vaguely human readable description of the file format"""
+        def __get__(self):
+            if not self.htsfile:
+                raise ValueError('metadata not available on closed file')
+            cdef char *desc = hts_format_description(&self.htsfile.format)
+            # the C string is freed here once it has been converted
+            try:
+                return force_str(desc)
+            finally:
+                free(desc)
+
+    def close(self):
+        """closes the :class:`pysam.VariantFile`."""
+        if self.htsfile:
+            hts_close(self.htsfile)
+            self.htsfile = NULL
+        self.header = self.index = None
+
+    property is_open:
+        def __get__(self):
+            """return True if VariantFile is open and in a valid state."""
+            return self.htsfile != NULL
+
+    def __iter__(self):
+        """Return self as an iterator over the records of a file opened for reading."""
+        if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+
+        # startswith works for bytes on both Python 2 and 3, whereas
+        # indexing bytes yields an int on Python 3
+        if not self.mode.startswith(b'r'):
+            raise ValueError('cannot iterate over Variantfile opened for writing')
+
+        self.is_reading = 1
+        return self
+
+    def __next__(self):
+        """Read and return the next :class:`VariantRecord`.
+
+        Raises StopIteration at end of file, IOError for a truncated file,
+        ValueError for other read failures or a closed file, and MemoryError
+        when no record can be allocated.
+        """
+        cdef int ret
+        cdef bcf1_t *record
+
+        # close() clears htsfile and header, both of which are dereferenced below
+        if not self.htsfile:
+            raise ValueError('I/O operation on closed file')
+
+        record = bcf_init1()
+
+        if not record:
+            raise MemoryError('unable to allocate BCF record')
+
+        record.pos = -1
+        if self.drop_samples:
+            record.max_unpack = BCF_UN_SHR
+
+        ret = bcf_read1(self.htsfile, self.header.ptr, record)
+
+        if ret < 0:
+            bcf_destroy1(record)
+            if ret == -1:
+                raise StopIteration
+            elif ret == -2:
+                raise IOError('truncated file')
+            else:
+                raise ValueError('Variant read failed')
+
+        return makeVariantRecord(self.header, record)
+
+    def copy(self):
+        """Return a new :class:`VariantFile` on the same file.
+
+        The file is re-opened by name with the same mode; header and index
+        objects are shared with this instance.  Raises ValueError when this
+        file is closed or cannot be re-opened.
+        """
+        if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+
+        cdef VariantFile clone = VariantFile.__new__(VariantFile)
+
+        # FIXME: re-open using fd or else header and index could be invalid
+        clone.htsfile = hts_open(self.filename, self.mode)
+
+        if not clone.htsfile:
+            raise ValueError('Cannot re-open htsfile')
+
+        # minimize overhead by re-using header and index.  This approach is
+        # currently risky, but see above for how this can be mitigated.
+        clone.header = self.header
+        clone.index = self.index
+
+        clone.filename = self.filename
+        clone.mode = self.mode
+        clone.drop_samples = self.drop_samples
+        clone.is_stream = self.is_stream
+        clone.is_remote = self.is_remote
+        clone.is_reading = self.is_reading
+        clone.start_offset = self.start_offset
+
+        if self.htsfile.is_bin:
+            clone.seek(self.tell())
+        else:
+            # read the header from the new handle; the resulting header
+            # object is not kept (the shared one above is used)
+            makeVariantHeader(bcf_hdr_read(clone.htsfile))
+
+        return clone
+
+    def open(self, filename, mode=None, VariantHeader header=None, drop_samples=False):
+        """open a vcf/bcf file.
+
+        If open is called on an existing VariantFile, the current file will be
+        closed and a new file will be opened.
+
+        With *mode* None the modes 'rb' and 'r' are tried in that order.
+        Raises ValueError for an invalid mode, a missing header when writing,
+        or a file that cannot be opened or has no valid header, and IOError
+        when a local file to read does not exist.
+        """
+        # close a previously opened file
+        if self.is_open:
+            self.close()
+
+        # read mode autodetection
+        if mode is None:
+            try:
+                self.open(filename, 'rb', header=header, drop_samples=drop_samples)
+                return
+            except ValueError:
+                pass
+
+            self.open(filename, 'r', header=header, drop_samples=drop_samples)
+            return
+
+        if mode not in ('r','w','rb','wb', 'wh', 'wbu', 'rU', 'wb0'):
+            raise ValueError('invalid file opening mode `{}`'.format(mode))
+
+        mode = mode.encode('ascii')
+
+        # for htslib, wbu seems to not work
+        if mode == b'wbu':
+            mode = b'wb0'
+
+        self.mode = mode
+        self.filename = filename = encode_filename(filename)
+        self.drop_samples = bool(drop_samples)
+
+        # FIXME: Use htsFormat when it is available
+        self.is_remote = filename.startswith(b'http:') or filename.startswith(b'ftp:')
+        self.is_stream = filename == b'-'
+
+        if mode.startswith(b'w'):
+            # open file for writing
+
+            # header structure (used for writing)
+            if header:
+                self.header = header.copy()
+            else:
+                raise ValueError('a VariantHeader must be specified')
+
+            # open file. Header gets written to file at the same time for bam files
+            # and sam files (in the latter case, the mode needs to be wh)
+            self.htsfile = hts_open(filename, mode)
+
+            if not self.htsfile:
+                raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))
+
+            bcf_hdr_write(self.htsfile, self.header.ptr)
+
+        elif mode.startswith(b'r'):
+            # open file for reading
+            if filename != b'-' and not self.is_remote and not os.path.exists(filename):
+                raise IOError('file `{}` not found'.format(filename))
+
+            self.htsfile = hts_open(filename, mode)
+
+            if not self.htsfile:
+                raise ValueError("could not open file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+
+            self.header = makeVariantHeader(bcf_hdr_read(self.htsfile))
+
+            if not self.header:
+                raise ValueError("file `{}` does not have valid header (mode='{}') - is it BCF format?".format(filename, mode))
+
+            # check for index and open if present
+            if self.htsfile.format.format == bcf:
+                self.index = makeBCFIndex(self.header, bcf_index_load(filename))
+            else:
+                # filename is bytes at this point, so the suffix must be bytes too
+                self.index = makeTabixIndex(tbx_index_load(filename + b'.tbi'))
+
+            if not self.is_stream:
+                self.start_offset = self.tell()
+
+    def reset(self):
+        """reset file position to beginning of file just after the header."""
+        # seek() takes the offset only and always seeks from the file start
+        return self.seek(self.start_offset)
+
+    def seek(self, uint64_t offset):
+        """move file pointer to position *offset*, see :meth:`pysam.VariantFile.tell`."""
+        if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+        if self.is_stream:
+            raise OSError('seek not available in streams')
+
+        if self.htsfile.format.compression != no_compression:
+            return bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
+        else:
+            return hts_useek(self.htsfile, offset, SEEK_SET)
+
+    def tell(self):
+        """return current file position, see :meth:`pysam.VariantFile.seek`."""
+        if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+        if self.is_stream:
+            raise OSError('tell not available in streams')
+
+        if self.htsfile.format.compression != no_compression:
+            return bgzf_tell(hts_get_bgzfp(self.htsfile))
+        else:
+            return hts_utell(self.htsfile)
+
+    def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False):
+        """fetch records in a :term:`region` using 0-based indexing. The
+        region is specified by :term:`contig`, *start* and *stop*.
+        Alternatively, a samtools :term:`region` string can be supplied.
+
+        Without *contig* or *region* all mapped records will be fetched.  The
+        records will be returned ordered by contig, which will not necessarily
+        be the order within the file.
+
+        Set *reopen* to true if you will be using multiple iterators on the
+        same file at the same time.  The iterator returned will receive its
+        own copy of a filehandle to the file effectively re-opening the
+        file.  Re-opening a file incurs some overhead, so use with care.
+
+        If only *contig* is set, all records on *contig* will be fetched.
+        If both *region* and *contig* are given, an exception is raised.
+
+        Note that a :term:`VCF` file without a tabix index (.tbi) or a
+        :term:`BCF` file without a CSI index can only be read sequentially.
+        """
+        if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+
+        if not self.mode.startswith(b'r'):
+            raise ValueError('cannot fetch from Variantfile opened for writing')
+
+        if contig is None and region is None:
+            # sequential read of the whole file, starting after the header
+            self.is_reading = 1
+            bcf = self.copy() if reopen else self
+            bcf.seek(self.start_offset)
+            return iter(bcf)
+
+        if not self.index:
+            raise ValueError('fetch requires an index')
+
+        self.is_reading = 1
+        return self.index.fetch(self, contig, start, stop, region, reopen)
+
+    cpdef int write(self, VariantRecord record) except -1:
+        """
+        write a single :class:`pysam.VariantRecord` to disk.
+
+        Returns the non-negative value reported by ``bcf_write1``, or 0 when
+        the file is closed.  Raises ValueError when *record* is None or the
+        write fails.
+        """
+        if not self.is_open:
+            return 0
+
+        # a typed argument may still be None; record.ptr is dereferenced below
+        if record is None:
+            raise ValueError('record must not be None')
+
+        cdef int ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
+
+        if ret < 0:
+            raise ValueError('write failed')
+
+        return ret
+
+    def subset_samples(self, include_samples):
+        """
+        Read only a subset of samples to reduce processing time and memory.
+        Must be called prior to retrieving records.
+        """
+        if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+
+        if not self.mode.startswith(b'r'):
+            raise ValueError('cannot subset samples from Variantfile opened for writing')
+
+        if self.is_reading:
+            raise ValueError('cannot subset samples after fetching records')
+
+        self.header._subset_samples(include_samples)
+
+        # potentially unnecessary optimization that also sets max_unpack
+        if not include_samples:
+            self.drop_samples = True
diff --git a/pysam/cfaidx.pxd b/pysam/cfaidx.pxd
index 2539340..b7926df 100644
--- a/pysam/cfaidx.pxd
+++ b/pysam/cfaidx.pxd
@@ -22,18 +22,21 @@ cdef class FastqProxy:
cdef kseq_t * _delegate
-cdef class FastqFile:
+cdef class FastxFile:
cdef object _filename
cdef gzFile fastqfile
cdef kseq_t * entry
- cdef kseq_t * getCurrent( self )
+ cdef kseq_t * getCurrent(self)
cdef int cnext(self)
+# Compatibility Layer for pysam 0.8.1
+cdef class FastqFile(FastxFile):
+ pass
# Compatibility Layer for pysam < 0.8
cdef class Fastafile(FastaFile):
pass
-cdef class Fastqfile(FastqFile):
+cdef class Fastqfile(FastxFile):
pass
diff --git a/pysam/cfaidx.pyx b/pysam/cfaidx.pyx
index d5ce6a3..5338299 100644
--- a/pysam/cfaidx.pyx
+++ b/pysam/cfaidx.pyx
@@ -123,7 +123,7 @@ cdef class FastaFile:
self.close()
property filename:
- '''number of :term:`filename` associated with this object.'''
+ '''filename associated with this object.'''
def __get__(self):
return self._filename
@@ -237,11 +237,6 @@ cdef class FastaFile:
'''return true if reference in fasta file.'''
return reference in self.reference2length
-######################################################################
-######################################################################
-######################################################################
-## Fastq file
-######################################################################
cdef class FastqProxy:
def __init__(self): pass
@@ -266,13 +261,19 @@ cdef class FastqProxy:
return self._delegate.qual.s
else: return None
-cdef class FastqFile:
+
+cdef class FastxFile:
'''*(filename)*
- A *FASTQ* file. The file is automatically opened.
+ A :term:`fastq` or :term:`fasta` formatted file. The file
+ is automatically opened.
+
+ Entries in the file can be either fastq or fasta formatted
+ or even a mixture of the two.
This file object permits iterating over all entries in
- a fastq file. Random access is not implemented.
+ the file. Random access is not implemented. The iteration
+ returns objects of type :class:`FastqProxy`
'''
def __cinit__(self, *args, **kwargs):
@@ -286,14 +287,12 @@ cdef class FastqFile:
return self.entry != NULL
def _open(self, filename):
- '''open an indexed fasta file.
-
- This method expects an indexed fasta file.
+ '''open a fastq/fasta file.
'''
self.close()
if not os.path.exists(filename):
- raise IOError("No such file or directory: %s" % filename)
+ raise IOError("no such file or directory: %s" % filename)
filename = _encodeFilename(filename)
self.fastqfile = gzopen(filename, "r")
@@ -312,13 +311,13 @@ cdef class FastqFile:
self.close()
property filename:
- '''number of :term:`filename` associated with this object.'''
+ '''filename associated with this object.'''
def __get__(self):
return self._filename
def __iter__(self):
if not self._isOpen():
- raise ValueError( "I/O operation on closed file" )
+ raise ValueError("I/O operation on closed file")
return self
cdef kseq_t * getCurrent(self):
@@ -334,17 +333,21 @@ cdef class FastqFile:
python version of next().
"""
cdef int l
- l = kseq_read( self.entry)
+ l = kseq_read(self.entry)
if (l > 0):
- return makeFastqProxy( self.entry )
+ return makeFastqProxy(self.entry)
else:
raise StopIteration
+# Compatibility Layer for pysam 0.8.1
+cdef class FastqFile(FastxFile):
+ pass
+
# Compatibility Layer for pysam < 0.8
cdef class Fastafile(FastaFile):
pass
-cdef class Fastqfile(FastqFile):
+cdef class Fastqfile(FastxFile):
pass
__all__ = ["FastaFile",
diff --git a/pysam/chtslib.pxd b/pysam/chtslib.pxd
index d62e281..d714072 100644
--- a/pysam/chtslib.pxd
+++ b/pysam/chtslib.pxd
@@ -3,6 +3,7 @@ from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
from libc.stdlib cimport malloc, calloc, realloc, free
from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
from libc.stdio cimport FILE, printf
+from posix.types cimport off_t
cdef extern from "Python.h":
long _Py_HashPointer(void*)
@@ -20,16 +21,127 @@ cdef extern from "zlib.h" nogil:
gzFile gzopen( char *path, char *mode)
gzFile gzdopen (int fd, char *mode)
char * gzgets(gzFile file, char *buf, int len)
- int gzeof( gzFile file )
+ int gzeof(gzFile file)
cdef extern from "htslib/kstring.h" nogil:
ctypedef struct kstring_t:
size_t l, m
char *s
+cdef extern from "htslib_util.h" nogil:
+ ctypedef uint32_t khint32_t
+ ctypedef uint32_t khint_t
+ ctypedef khint_t khiter_t
+
+ # Used to manage BCF Header info
+ ctypedef struct vdict_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ khint32_t *flags
+ const char *keys
+ bcf_idinfo_t *vals
+
+ # Used to manage indexed contigs in Tabix
+ ctypedef struct s2i_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ khint32_t *flags
+ const char *keys
+ int64_t *vals
+
+ # Generic khash methods
+ khint_t kh_size(void *d)
+ khint_t kh_begin(void *d)
+ khint_t kh_end(void *d)
+ int kh_exist(void *d, khiter_t i)
+
+ # Specialized khash methods for vdict
+ khint_t kh_get_vdict(vdict_t *d, const char *key)
+ const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i)
+ bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i)
+
+
cdef extern from "htslib/hfile.h" nogil:
ctypedef struct hFILE
+ # @abstract Open the named file or URL as a stream
+ # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
+ hFILE *hopen(const char *filename, const char *mode)
+
+ # @abstract Associate a stream with an existing open file descriptor
+ # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
+ # @notes For socket descriptors (on Windows), mode should contain 's'.
+ hFILE *hdopen(int fd, const char *mode)
+
+ # @abstract Report whether the file name or URL denotes remote storage
+ # @return 0 if local, 1 if remote.
+ # @notes "Remote" means involving e.g. explicit network access, with the
+ # implication that callers may wish to cache such files' contents locally.
+ int hisremote(const char *filename)
+
+ # @abstract Flush (for output streams) and close the stream
+ # @return 0 if successful, or EOF (with errno set) if an error occurred.
+ int hclose(hFILE *fp)
+
+ # @abstract Close the stream, without flushing or propagating errors
+ # @notes For use while cleaning up after an error only. Preserves errno.
+ void hclose_abruptly(hFILE *fp)
+
+ # @abstract Return the stream's error indicator
+ # @return Non-zero (in fact, an errno value) if an error has occurred.
+ # @notes This would be called herror() and return true/false to parallel
+ # ferror(3), but a networking-related herror(3) function already exists.
+ int herrno(hFILE *fp)
+
+ # @abstract Clear the stream's error indicator
+ void hclearerr(hFILE *fp)
+
+ # @abstract Reposition the read/write stream offset
+ # @return The resulting offset within the stream (as per lseek(2)),
+ # or negative if an error occurred.
+ off_t hseek(hFILE *fp, off_t offset, int whence)
+
+ # @abstract Report the current stream offset
+ # @return The offset within the stream, starting from zero.
+ off_t htell(hFILE *fp)
+
+ # @abstract Read one character from the stream
+ # @return The character read, or EOF on end-of-file or error
+ int hgetc(hFILE *fp)
+
+ # @abstract Peek at characters to be read without removing them from buffers
+ # @param fp The file stream
+ # @param buffer The buffer to which the peeked bytes will be written
+ # @param nbytes The number of bytes to peek at; limited by the size of the
+ # internal buffer, which could be as small as 4K.
+ # @return The number of bytes peeked, which may be less than nbytes if EOF
+ # is encountered; or negative, if there was an I/O error.
+ # @notes The characters peeked at remain in the stream's internal buffer,
+ # and will be returned by later hread() etc calls.
+ ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
+
+ # @abstract Read a block of characters from the file
+ # @return The number of bytes read, or negative if an error occurred.
+ # @notes The full nbytes requested will be returned, except as limited
+ # by EOF or I/O errors.
+ ssize_t hread(hFILE *fp, void *buffer, size_t nbytes)
+
+ # @abstract Write a character to the stream
+ # @return The character written, or EOF if an error occurred.
+ int hputc(int c, hFILE *fp)
+
+ # @abstract Write a string to the stream
+ # @return 0 if successful, or EOF if an error occurred.
+ int hputs(const char *text, hFILE *fp)
+
+ # @abstract Write a block of characters to the file
+ # @return Either nbytes, or negative if an error occurred.
+ # @notes In the absence of I/O errors, the full nbytes will be written.
+ ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes)
+
+ # @abstract For writing streams, flush buffered output to the underlying stream
+ # @return 0 if successful, or EOF if an error occurred.
+ int hflush(hFILE *fp)
+
+
cdef extern from "htslib/bgzf.h" nogil:
ctypedef struct bgzf_mtaux_t
ctypedef struct bgzidx_t
@@ -61,9 +173,10 @@ cdef extern from "htslib/bgzf.h" nogil:
# Open an existing file descriptor for reading or writing.
#
# @param fd file descriptor
- # @param mode mode matching /[rwa][u0-9]+/: 'r' for reading, 'w' for
- # writing, or 'a' for appending, while a digit specifies
- # the zlib compression level.
+ # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
+ # writing, 'a' for appending, 'g' for gzip rather than BGZF
+ # compression (with 'w' only), and a digit specifies the zlib
+ # compression level.
# Note that there is a distinction between 'u' and '0': the
# first yields plain uncompressed output whereas the latter
# outputs uncompressed data wrapped in the zlib format.
@@ -124,12 +237,13 @@ cdef extern from "htslib/bgzf.h" nogil:
# Write the data in the buffer to the file.
int bgzf_flush(BGZF *fp)
+ int SEEK_SET
+
# Return a virtual file pointer to the current location in the file.
# No interpetation of the value should be made, other than a subsequent
# call to bgzf_seek can be used to position the file at the same point.
# Return value is non-negative on success.
- #define bgzf_tell(fp) (((fp)->block_address << 16) | ((fp)->block_offset & 0xFFFF))
- int64_t bgzf_tell(BGZF * fp)
+ int64_t bgzf_tell(BGZF *fp)
# Set the file to read from the location specified by _pos_.
#
@@ -244,33 +358,82 @@ cdef extern from "htslib/hts.h" nogil:
ctypedef struct cram_fd
ctypedef union FilePointerUnion:
- BGZF * bgzf
- cram_fd * cram
- hFILE * hfile
- void * voidp
+ BGZF *bgzf
+ cram_fd *cram
+ hFILE *hfile
+ void *voidp
+
+ ctypedef enum htsFormatCategory:
+ unknown_category
+ sequence_data # Sequence data -- SAM, BAM, CRAM, etc
+ variant_data # Variant calling data -- VCF, BCF, etc
+ index_file # Index file associated with some data file
+ region_list # Coordinate intervals or regions -- BED, etc
+ category_maximum
+
+ ctypedef enum htsExactFormat:
+ unknown_format
+ binary_format
+ text_format
+ sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed
+ format_maximum
+
+ ctypedef enum htsCompression:
+ no_compression, gzip, bgzf, custom
+ compression_maximum
+
+ cdef struct htsVersion:
+ short major, minor
+
+ ctypedef struct htsFormat:
+ htsFormatCategory category
+ htsExactFormat format
+ htsVersion version
+ htsCompression compression
ctypedef struct htsFile:
- # uint32_t is_bin:1, is_write:1, is_be:1, is_cram:1, is_compressed:2, is_kstream:1, dummy:25;
- uint32_t is_bin
+ uint8_t is_bin
+ uint8_t is_write
+ uint8_t is_be
+ uint8_t is_cram
int64_t lineno
kstring_t line
- char * fn
- char * fn_aux
+ char *fn
+ char *fn_aux
FilePointerUnion fp
+ htsFormat format
int hts_verbose
- # @abstract Table for converting a nucleotide character to the 4-bit encoding.
+ # @abstract Table for converting a nucleotide character to 4-bit encoding.
+ # The input character may be either an IUPAC ambiguity code, '=' for 0, or
+ # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8
+ # for A/C/G/T or combinations of these bits for ambiguous bases.
const unsigned char *seq_nt16_table
- # @abstract Table for converting a 4-bit encoded nucleotide to a letter.
+ # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
+ # ambiguity code letter (or '=' when given 0).
const char *seq_nt16_str
+ # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
+ # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
+ const int *seq_nt16_int
+
# @abstract Get the htslib version number
# @return For released versions, a string like "N.N[.N]"; or git describe
# output if using a library built within a Git repository.
const char *hts_version()
+ # @abstract Determine format by peeking at the start of a file
+ # @param fp File opened for reading, positioned at the beginning
+ # @param fmt Format structure that will be filled out on return
+ # @return 0 for success, or negative if an error occurred.
+ int hts_detect_format(hFILE *fp, htsFormat *fmt)
+
+ # @abstract Get a human-readable description of the file format
+ # @return Description string, to be freed by the caller after use.
+ char *hts_format_description(const htsFormat *format)
+
# @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
# @param fn The file name or "-" for stdin/stdout
# @param mode Mode matching /[rwa][bcuz0-9]+/
@@ -281,8 +444,9 @@ cdef extern from "htslib/hts.h" nogil:
# specifier letters:
# b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
# c CRAM format
+ # g gzip compressed
# u uncompressed
- # z compressed
+ # z bgzf compressed
# [0-9] zlib compression level
# Note that there is a distinction between 'u' and '0': the first yields
# plain uncompressed output whereas the latter outputs uncompressed data
@@ -294,11 +458,29 @@ cdef extern from "htslib/hts.h" nogil:
# [rw] .. uncompressed VCF
htsFile *hts_open(const char *fn, const char *mode)
+ # @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
+ # @param fp The already-open file handle
+ # @param fn The file name or "-" for stdin/stdout
+ # @param mode Open mode, as per hts_open()
+ htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode)
+
# @abstract Close a file handle, flushing buffered data for output streams
# @param fp The file handle to be closed
# @return 0 for success, or negative if an error occurred.
int hts_close(htsFile *fp)
+ # @abstract Returns the file's format information
+ # @param fp The file handle
+ # @return Read-only pointer to the file's htsFormat.
+ const htsFormat *hts_get_format(htsFile *fp)
+
+ # @abstract Sets a specified CRAM option on the open file handle.
+ # @param fp The file handle of the open file.
+ # @param opt The CRAM_OPT_* option.
+ # @param ... Optional arguments, dependent on the option used.
+ # @return 0 for success, or negative if an error occurred.
+ #int hts_set_opt(htsFile *fp, enum cram_option opt, ...)
+
int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
char **hts_readlines(const char *fn, int *_n)
@@ -334,6 +516,10 @@ cdef extern from "htslib/hts.h" nogil:
int8_t HTS_FMT_TBI
int8_t HTS_FMT_CRAI
+ BGZF *hts_get_bgzfp(htsFile *fp)
+ int hts_useek(htsFile *fp, long uoffset, int where)
+ long hts_utell(htsFile *fp)
+
ctypedef struct hts_idx_t
ctypedef struct hts_pair64_t:
@@ -349,6 +535,7 @@ cdef extern from "htslib/hts.h" nogil:
uint32_t read_rest
uint32_t finished
int tid, bed, end, n_off, i
+ int curr_tid, curr_beg, curr_end
uint64_t curr_off
hts_pair64_t *off
hts_readrec_func *readfunc
@@ -399,8 +586,8 @@ cdef extern from "htslib/hts.h" nogil:
#
# Returns one of the FT_* defines.
#
- # This function was added in order to avoid the need for excessive command
- # line switches.
+ # DEPRECATED: This function has been replaced by hts_detect_format().
+ # It and these FT_* macros will be removed in a future HTSlib release.
int FT_UNKN
int FT_GZ
int FT_VCF
@@ -711,7 +898,7 @@ cdef extern from "htslib/sam.h" nogil:
# set bam_pileup1_t::level, while the later does. Level helps the
# implementation of alignment viewers, but calculating this has some
# overhead.
- #
+ #
# is_del, is_head, etc are a bit field, declaring as below should
# work as expected, see
# https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J
@@ -763,6 +950,7 @@ cdef extern from "htslib/sam.h" nogil:
# Added by AH
# ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *"
+
cdef extern from "pysam_stream.h" nogil:
ctypedef struct kstream_t:
@@ -775,12 +963,12 @@ cdef extern from "pysam_stream.h" nogil:
kstring_t qual
gzFile gzopen(char *, char *)
- kseq_t * kseq_init(gzFile)
+ kseq_t *kseq_init(gzFile)
int kseq_read(kseq_t *)
void kseq_destroy(kseq_t *)
int gzclose(gzFile)
- kstream_t * ks_init(gzFile)
+ kstream_t *ks_init(gzFile)
void ks_destroy(kstream_t *)
# Retrieve characters from stream until delimiter
@@ -790,6 +978,7 @@ cdef extern from "pysam_stream.h" nogil:
kstring_t * str,
int * dret)
+
cdef extern from "htslib/faidx.h":
ctypedef struct faidx_t:
@@ -817,9 +1006,10 @@ cdef extern from "htslib/faidx.h":
int faidx_seq_len(faidx_t *fai, const char *seq)
+
# tabix support
cdef extern from "htslib/tbx.h" nogil:
-
+
# tbx.h definitions
int8_t TBX_MAX_SHIFT
int8_t TBX_GENERIC
@@ -842,7 +1032,7 @@ cdef extern from "htslib/tbx.h" nogil:
tbx_conf_t tbx_conf_psltbl
tbx_conf_t tbx_conf_sam
tbx_conf_t tbx_conf_vcf
-
+
void tbx_itr_destroy(hts_itr_t * iter)
hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int bed, int end)
hts_itr_t * tbx_itr_querys(tbx_t * t, char * s)
@@ -853,7 +1043,7 @@ cdef extern from "htslib/tbx.h" nogil:
int tbx_index_build(char *fn,
int min_shift,
tbx_conf_t *conf)
-
+
tbx_t * tbx_index_load(char *fn)
# free the array but not the values
@@ -861,3 +1051,654 @@ cdef extern from "htslib/tbx.h" nogil:
void tbx_destroy(tbx_t *tbx)
+
+# VCF/BCF API
+cdef extern from "htslib/vcf.h" nogil:
+
+ # Header struct
+
+ uint8_t BCF_HL_FLT # header line
+ uint8_t BCF_HL_INFO
+ uint8_t BCF_HL_FMT
+ uint8_t BCF_HL_CTG
+ uint8_t BCF_HL_STR # structured header line TAG=<A=..,B=..>
+ uint8_t BCF_HL_GEN # generic header line
+
+ uint8_t BCF_HT_FLAG # header type
+ uint8_t BCF_HT_INT
+ uint8_t BCF_HT_REAL
+ uint8_t BCF_HT_STR
+
+ uint8_t BCF_VL_FIXED # variable length
+ uint8_t BCF_VL_VAR
+ uint8_t BCF_VL_A
+ uint8_t BCF_VL_G
+ uint8_t BCF_VL_R
+
+ # === Dictionary ===
+ #
+ # The header keeps three dictionaries. The first keeps IDs in the
+ # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
+ # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
+ # is the actual hash table, which is opaque to the end users. In the hash
+ # table, the key is the ID or sample name as a C string and the value is a
+ # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
+ # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
+ # size of the hash table or, equivalently, the length of the id[] arrays.
+
+ uint8_t BCF_DT_ID # dictionary type
+ uint8_t BCF_DT_CTG
+ uint8_t BCF_DT_SAMPLE
+
+ # Complete textual representation of a header line
+ ctypedef struct bcf_hrec_t:
+ int type # One of the BCF_HL_* type
+ char *key # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
+ char *value # Set only for generic lines, NULL for FILTER/INFO, etc.
+ int nkeys # Number of structured fields
+ char **keys # The key=value pairs
+ char **vals
+
+ ctypedef struct bcf_idinfo_t:
+ uint32_t info[3] # stores Number:20, var:4, Type:4, ColType:4 in info[0..2]
+ bcf_hrec_t *hrec[3] # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG
+ int id
+
+ ctypedef struct bcf_idpair_t:
+ const char *key
+ const bcf_idinfo_t *val
+
+ ctypedef struct bcf_hdr_t:
+ int32_t n[3]
+ bcf_idpair_t *id[3]
+ void *dict[3] # ID dictionary, contig dict and sample dict
+ char **samples
+ bcf_hrec_t **hrec
+ int nhrec, dirty
+ int ntransl
+ int *transl[2] # for bcf_translate()
+ int nsamples_ori # for bcf_hdr_set_samples()
+ uint8_t *keep_samples
+ kstring_t mem
+
+ uint8_t bcf_type_shift[]
+
+ # * VCF record *
+
+ uint8_t BCF_BT_NULL
+ uint8_t BCF_BT_INT8
+ uint8_t BCF_BT_INT16
+ uint8_t BCF_BT_INT32
+ uint8_t BCF_BT_FLOAT
+ uint8_t BCF_BT_CHAR
+
+ uint8_t VCF_REF
+ uint8_t VCF_SNP
+ uint8_t VCF_MNP
+ uint8_t VCF_INDEL
+ uint8_t VCF_OTHER
+
+ ctypedef struct variant_t:
+ int type, n # variant type and the number of bases affected, negative for deletions
+
+ ctypedef struct bcf_fmt_t:
+ int id # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
+ int n, size, type # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
+ uint8_t *p # same as vptr and vptr_* in bcf_info_t below
+ uint32_t p_len
+ uint32_t p_off
+ uint8_t p_free
+
+ ctypedef union bcf_info_union_t:
+ int32_t i # integer value
+ float f # float value
+
+ ctypedef struct bcf_info_t:
+ int key # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
+ int type, len # type: one of BCF_BT_* types; len: vector length, 1 for scalars
+
+ # v1 union only set if $len==1; for easier access
+ bcf_info_union_t v1
+ uint8_t *vptr # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
+ uint32_t vptr_len # length of the vptr block or, when set, of the vptr_mod block, excluding offset
+ uint32_t vptr_off # vptr offset, i.e., the size of the INFO key plus size+type bytes
+ uint8_t vptr_free # indicates that vptr-vptr_off must be freed; set only when modified and the new
+ # data block is bigger than the original
+
+ uint8_t BCF1_DIRTY_ID
+ uint8_t BCF1_DIRTY_ALS
+ uint8_t BCF1_DIRTY_FLT
+ uint8_t BCF1_DIRTY_INF
+
+ ctypedef struct bcf_dec_t:
+ int m_fmt, m_info, m_id, m_als, m_allele, m_flt # allocated size (high-water mark); do not change
+ int n_flt # Number of FILTER fields
+ int *flt # FILTER keys in the dictionary
+ char *id # ID
+ char *als # REF+ALT block (\0-separated)
+ char **allele # allele[0] is the REF (allele[] pointers to the als block); all null terminated
+ bcf_info_t *info # INFO
+ bcf_fmt_t *fmt # FORMAT and individual sample
+ variant_t *var # $var and $var_type set only when set_variant_types called
+ int n_var, var_type
+ int shared_dirty # if set, shared.s must be recreated on BCF output
+ int indiv_dirty # if set, indiv.s must be recreated on BCF output
+
+ uint8_t BCF_ERR_CTG_UNDEF
+ uint8_t BCF_ERR_TAG_UNDEF
+ uint8_t BCF_ERR_NCOLS
+
+ # The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file
+ # is slower because the string must first be parsed, packed into BCF line
+ # (done in vcf_parse), then unpacked into internal bcf1_t structure. If it
+ # is known in advance that some of the fields will not be required (notably
+ # the sample columns), parsing of these can be skipped by setting max_unpack
+ # appropriately.
+ # Similarly, it is fast to output a BCF line because the columns (kept in
+ # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF
+ # line must be formatted in vcf_format.
+
+ ctypedef struct bcf1_t:
+ int32_t rid # CHROM
+ int32_t pos # POS
+ int32_t rlen # length of REF
+ float qual # QUAL
+ uint32_t n_info, n_allele
+ uint32_t n_fmt, n_sample
+ kstring_t shared, indiv
+ bcf_dec_t d # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
+ int max_unpack # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
+ int unpacked # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
+ int unpack_size[3] # the original block size of ID, REF+ALT and FILTER
+ int errcode # one of BCF_ERR_* codes
+
+ ####### API #######
+
+ # BCF and VCF I/O
+ #
+ # A note about naming conventions: htslib internally represents VCF
+ # records as bcf1_t data structures, therefore most functions are
+ # prefixed with bcf_. There are a few exceptions where the functions must
+ # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In
+ # these cases, functions prefixed with bcf_ are more general and work
+ # with both BCF and VCF.
+
+ # bcf_hdr_init() - create an empty BCF header.
+ # @param mode "r" or "w"
+ #
+ # When opened for writing, the mandatory fileformat and
+ # FILTER=PASS lines are added automatically.
+ bcf_hdr_t *bcf_hdr_init(const char *mode)
+
+ # Destroy a BCF header struct
+ void bcf_hdr_destroy(bcf_hdr_t *h)
+
+ # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t))
+ bcf1_t *bcf_init()
+
+ # Deallocate a bcf1_t object
+ void bcf_destroy(bcf1_t *v)
+
+ # Same as bcf_destroy() but frees only the memory allocated by bcf1_t,
+ # not the bcf1_t object itself.
+ void bcf_empty(bcf1_t *v)
+
+ # Make the bcf1_t object ready for next read. Intended mostly for
+ # internal use, the user should rarely need to call this function
+ # directly.
+ void bcf_clear(bcf1_t *v)
+
+ # Reads VCF or BCF header
+ bcf_hdr_t *bcf_hdr_read(htsFile *fp)
+
+ # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed
+ # @samples: samples to include or exclude from file or as a comma-separated string.
+ # LIST|FILE .. select samples in list/file
+ # ^LIST|FILE .. exclude samples from list/file
+ # - .. include all samples
+ # NULL .. exclude all samples
+ # @is_file: @samples is a file (1) or a comma-separated list (0)
+ #
+ # The bottleneck of VCF reading is parsing of genotype fields. If the
+ # reader knows in advance that only subset of samples is needed (possibly
+ # no samples at all), the performance of bcf_read() can be significantly
+ # improved by calling bcf_hdr_set_samples after bcf_hdr_read().
+ # The function bcf_read() will subset the VCF/BCF records automatically
+ # with the notable exception when reading records via bcf_itr_next().
+ # In this case, bcf_subset_format() must be called explicitly, because
+ # bcf_readrec() does not see the header.
+ #
+ # Returns 0 on success, -1 on error or a positive integer if the list
+ # contains samples not present in the VCF header. In such a case, the
+ # return value is the index of the offending sample.
+ #
+ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
+ int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
+
+ # Writes VCF or BCF header
+ int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h)
+
+ # Parse VCF line contained in kstring and populate the bcf1_t struct
+ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
+
+ # The opposite of vcf_parse. It should rarely be called directly, see vcf_write
+ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
+
+ # bcf_read() - read next VCF or BCF record
+ #
+ # Returns -1 on critical errors, 0 otherwise. On errors which are not
+ # critical for reading, such as missing header definitions, v->errcode is
+ # set to one of BCF_ERR* code and must be checked before calling
+ # vcf_write().
+ int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+
+ # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field)
+ #
+ # Note that bcf_unpack() must be called even when reading VCF. It is safe
+ # to call the function repeatedly, it will not unpack the same field
+ # twice.
+ uint8_t BCF_UN_STR # up to ALT inclusive
+ uint8_t BCF_UN_FLT # up to FILTER
+ uint8_t BCF_UN_INFO # up to INFO
+ uint8_t BCF_UN_SHR # all shared information
+ uint8_t BCF_UN_FMT # unpack format and each sample
+ uint8_t BCF_UN_IND # a synonym of BCF_UN_FMT
+ uint8_t BCF_UN_ALL # everything
+
+ int bcf_unpack(bcf1_t *b, int which)
+
+ # bcf_dup() - create a copy of BCF record.
+ #
+ # Note that bcf_unpack() must be called on the returned copy as if it was
+ # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
+ # internally to reflect any changes made by bcf_update_* functions.
+ bcf1_t *bcf_dup(bcf1_t *src)
+ bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
+
+ # bcf_write() - write one VCF or BCF record. The type is determined at the open() call.
+ int bcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+
+ # The following functions work only with VCFs and should rarely be called
+ # directly. Usually one wants to use their bcf_* alternatives, which work
+ # transparently with both VCFs and BCFs.
+ bcf_hdr_t *vcf_hdr_read(htsFile *fp)
+ int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
+ int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+
+ #************************************************************************
+ # Header querying and manipulation routines
+ #************************************************************************
+
+ # Create a new header using the supplied template
+ bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
+
+ # Copy header lines from src to dst if not already present in dst. See also bcf_translate().
+ # Returns 0 on success or sets a bit on error:
+ # 1 .. conflicting definitions of tag length
+ # # todo
+ int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
+
+ # bcf_hdr_add_sample() - add a new sample.
+ # @param sample: sample name to be added
+ int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample)
+
+ # Read VCF header from a file and update the header
+ int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
+
+ # Returns formatted header (newly allocated string) and its length,
+ # excluding the terminating \0. If is_bcf parameter is unset, IDX
+ # fields are discarded.
+ char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
+
+ # Append new VCF header line, returns 0 on success
+ int bcf_hdr_append(bcf_hdr_t *h, const char *line)
+ int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...)
+
+ # VCF version, e.g. VCFv4.2
+ const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
+ void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
+
+ # bcf_hdr_remove() - remove VCF header tag
+ # @param type: one of BCF_HL_*
+ # @param key: tag name
+ void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key)
+
+ # bcf_hdr_subset() - creates a new copy of the header removing unwanted samples
+ # @param n: number of samples to keep
+ # @param samples: names of the samples to keep
+ # @param imap: mapping from index in @samples to the sample index in the original file
+ #
+ # Sample names not present in h0 are ignored. The number of unmatched samples can be checked
+ # by comparing n and bcf_hdr_nsamples(out_hdr).
+ # This function can be used to reorder samples.
+ # See also bcf_subset() which subsets individual records.
+ #
+ bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
+
+ # Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names)
+ const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs)
+
+ # Get number of samples
+ int32_t bcf_hdr_nsamples(const bcf_hdr_t *h)
+
+ # The following functions are for internal use and should rarely be called directly
+ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
+ int bcf_hdr_sync(bcf_hdr_t *h)
+ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
+ void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
+ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
+
+ # bcf_hdr_get_hrec() - get header line info
+ # @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN
+ # @param key: the header key for generic lines (e.g. "fileformat"), any field
+ # for structured lines, typically "ID".
+ # @param value: the value which pairs with key. Can be NULL for BCF_HL_GEN
+ # @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL
+ #
+ bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
+ bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
+ void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
+ void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
+ int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
+ void hrec_add_idx(bcf_hrec_t *hrec, int idx)
+ void bcf_hrec_destroy(bcf_hrec_t *hrec)
+
+ #************************************************************************
+ # Individual record querying and manipulation routines
+ #************************************************************************
+
+ # See the description of bcf_hdr_subset()
+ int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
+
+ # bcf_translate() - translate tags ids to be consistent with different header. This function
+ # is useful when lines from multiple VCF need to be combined.
+ # @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine()
+ # @src_hdr: the source header, used in bcf_read()
+ # @src_line: line obtained by bcf_read()
+ int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line)
+
+ # bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc
+ int bcf_get_variant_types(bcf1_t *rec)
+ int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
+ int bcf_is_snp(bcf1_t *v)
+
+ # bcf_update_filter() - sets the FILTER column
+ # @flt_ids: The filter IDs to set, numeric IDs returned by bcf_id2int(hdr, BCF_DT_ID, "PASS")
+ # @n: Number of filters. If n==0, all filters are removed
+ int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
+
+ # bcf_add_filter() - adds to the FILTER column
+ # @flt_id: filter ID to add, numeric ID returned by bcf_id2int(hdr, BCF_DT_ID, "PASS")
+ #
+ # If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed.
+ int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
+
+ # bcf_remove_filter() - removes from the FILTER column
+ # @flt_id: filter ID to remove, numeric ID returned by bcf_id2int(hdr, BCF_DT_ID, "PASS")
+ # @set_pass: when set to 1 and no filters are present, set to PASS
+ int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int set_pass)
+
+ # Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably.
+ int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
+
+ # bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT column
+ # @alleles: Array of alleles
+ # @nals: Number of alleles
+ # @alleles_string: Comma-separated alleles, starting with the REF allele
+ #
+ # Note that in order for indexing to work correctly in presence of INFO/END tag,
+ # the length of reference allele (line->rlen) must be set explicitly by the caller,
+ # or otherwise, if rlen is zero, strlen(line->d.allele[0]) is used to set the length
+ # on bcf_write().
+ #
+ int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
+ int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
+ int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
+
+ # bcf_update_info_*() - functions for updating INFO fields
+ # @hdr: the BCF header
+ # @line: VCF line to be edited
+ # @key: the INFO tag to be updated
+ # @values: pointer to the array of values. Pass NULL to remove the tag.
+ # @n: number of values in the array. When set to 0, the INFO tag is removed
+ #
+ # The @string in bcf_update_info_flag() is optional, @n indicates whether
+ # the flag is set or removed.
+ #
+ # Returns 0 on success or negative value on error.
+ #
+ int bcf_update_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
+ int bcf_update_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
+ int bcf_update_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
+ int bcf_update_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
+ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
+
+ # bcf_update_format_*() - functions for updating FORMAT fields
+ # @values: pointer to the array of values, the same number of elements
+ # is expected for each sample. Missing values must be padded
+ # with bcf_*_missing or bcf_*_vector_end values.
+ # @n: number of values in the array. If n==0, existing tag is removed.
+ #
+ # The function bcf_update_format_string() is a higher-level (slower) variant of
+ # bcf_update_format_char(). The former accepts array of \0-terminated strings
+ # whereas the latter requires that the strings are collapsed into a single array
+ # of fixed-length strings. In case of strings with variable length, shorter strings
+ # can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char()
+ # are not \0-terminated.
+ #
+ # Returns 0 on success or negative value on error.
+ #
+ int bcf_update_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
+ int bcf_update_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
+ int bcf_update_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
+ int bcf_update_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, const int32_t *values, int n)
+ int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
+ int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
+
+ # Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds
+ # to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained
+ # from bcf_get_genotypes() below.
+ uint32_t bcf_gt_phased(uint32_t idx)
+ uint32_t bcf_gt_unphased(uint32_t idx)
+ uint32_t bcf_gt_missing
+ uint32_t bcf_gt_is_missing(uint32_t val)
+ uint32_t bcf_gt_is_phased(uint32_t idx)
+ uint32_t bcf_gt_allele(uint32_t val)
+
+ # Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based)
+ uint32_t bcf_alleles2gt(uint32_t a, uint32_t b)
+ void bcf_gt2alleles(int igt, int *a, int *b)
+
+ # bcf_get_fmt() - returns pointer to FORMAT's field data
+ # @header: for access to BCF_DT_ID dictionary
+ # @line: VCF line obtained from vcf_parse1
+ # @fmt: one of GT,PL,...
+ #
+ # Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field
+ # is not available.
+ #
+ bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
+ bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
+
+ # bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID
+ # @line: VCF line obtained from vcf_parse1
+ # @id: The header index for the tag, obtained from bcf_hdr_id2int()
+ #
+ # Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid
+ # as their goal is to avoid the header lookup.
+ #
+ bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
+ bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
+
+ # bcf_get_info_*() - get INFO values, integers or floats
+ # @hdr: BCF header
+ # @line: BCF record
+ # @tag: INFO tag to retrieve
+ # @dst: *dst is pointer to a memory location, can point to NULL
+ # @ndst: pointer to the size of allocated memory
+ #
+ # Returns negative value on error or the number of written values on
+ # success. bcf_get_info_string() returns on success the number of
+ # characters written excluding the null-terminating byte. bcf_get_info_flag()
+ # returns 1 when flag is set or 0 if not.
+ #
+ # List of return codes:
+ # -1 .. no such INFO tag defined in the header
+ # -2 .. clash between types defined in the header and encountered in the VCF record
+ # -3 .. tag is not present in the VCF record
+ #
+ int bcf_get_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
+ int bcf_get_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
+ int bcf_get_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
+ int bcf_get_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int **dst, int *ndst)
+ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
+
+ # bcf_get_format_*() - same as bcf_get_info*() above
+ #
+ # The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char().
+ # see the description of bcf_update_format_string() and bcf_update_format_char() above.
+ # Unlike other bcf_get_format_*() functions, bcf_get_format_string() allocates two arrays:
+ # a single block of \0-terminated strings collapsed into a single array and an array of pointers
+ # to these strings. Both arrays must be cleaned by the user.
+ #
+ # Returns negative value on error or the number of written values on success.
+ #
+ # Example:
+ # int ndst = 0; char **dst = NULL
+ # if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 )
+ # for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i])
+ # free(dst[0]); free(dst)
+ #
+ # Example:
+ # int ngt, *gt_arr = NULL, ngt_arr = 0
+ # ngt = bcf_get_genotypes(hdr, line, &gt_arr, &ngt_arr)
+ #
+ int bcf_get_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
+ int bcf_get_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
+ int bcf_get_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
+ int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int **dst, int *ndst)
+ int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
+ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
+
+ #************************************************************************
+ # Helper functions
+ #************************************************************************
+
+ #
+ # bcf_hdr_id2int() - Translates string into numeric ID
+ # bcf_hdr_int2id() - Translates numeric ID into string
+ # @type: one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE
+ # @id: tag name, such as: PL, DP, GT, etc.
+ #
+ # Returns -1 if string is not in dictionary, otherwise numeric ID which identifies
+ # fields in BCF records.
+ #
+ int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id)
+ const char *bcf_hdr_int2id(const bcf_hdr_t *hdr, int type, int int_id)
+
+ # bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID
+ # bcf_hdr_id2name() - Translates numeric ID to sequence name
+ #
+ int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id)
+ const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid)
+ const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec)
+
+ #
+ # bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t
+ # @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT
+ # @int_id: return value of bcf_hdr_id2int, must be >=0
+ #
+ # The returned values are:
+ # bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_*
+ # bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields
+ # bcf_hdr_id2type .. the field type, one of BCF_HT_*
+ # bcf_hdr_id2coltype .. the column type, one of BCF_HL_*
+ #
+ # Notes: Prior to using the macros, the presence of the info should be
+ # tested with bcf_hdr_idinfo_exists().
+ #
+ int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id)
+ int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id)
+ int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id)
+ int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id)
+ int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id)
+ bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id)
+
+ void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
+ uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
+
+ void bcf_enc_vchar(kstring_t *s, int l, const char *a)
+ void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
+ void bcf_enc_vfloat(kstring_t *s, int n, float *a)
+
+ #************************************************************************
+ # BCF index
+ #
+ # Note that these functions work with BCFs only. See synced_bcf_reader.h
+ # which provides (amongst other things) an API to work transparently with
+ # both indexed BCFs and VCFs.
+ #************************************************************************
+
+ int bcf_index_build(const char *fn, int min_shift)
+
+ #*******************
+ # Typed value I/O *
+ #******************
+
+ # Note that in contrast with BCFv2.1 specification, HTSlib implementation
+ # allows missing values in vectors. For integer types, the values 0x80,
+ # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
+ # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of
+ # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an
+ # end-of-vector indicator.
+ # Note that the end-of-vector byte is not part of the vector.
+
+ # This trial BCF version (v2.2) is compatible with the VCF specification and
+ # enables correct handling of vectors with different ploidy in presence of
+ # missing values.
+
+ int32_t bcf_int8_vector_end
+ int32_t bcf_int16_vector_end
+ int32_t bcf_int32_vector_end
+ int32_t bcf_str_vector_end
+ int32_t bcf_int8_missing
+ int32_t bcf_int16_missing
+ int32_t bcf_int32_missing
+ int32_t bcf_str_missing
+
+ uint32_t bcf_float_vector_end
+ uint32_t bcf_float_missing
+
+ void bcf_float_set(float *ptr, uint32_t value)
+ void bcf_float_set_vector_end(float *x)
+ void bcf_float_set_missing(float *x)
+
+ int bcf_float_is_missing(float f)
+ int bcf_float_is_vector_end(float f)
+ void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
+ void bcf_enc_size(kstring_t *s, int size, int type)
+ int bcf_enc_inttype(long x)
+ void bcf_enc_int1(kstring_t *s, int32_t x)
+ int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
+ int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q)
+ int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type)
+
+ # These trivial wrappers are defined only for consistency with other parts of htslib
+ bcf1_t *bcf_init1()
+ int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ void bcf_destroy1(bcf1_t *v)
+ int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
+ void bcf_clear1(bcf1_t *v)
+ int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
+
+ # Other nice wrappers
+ void bcf_itr_destroy(hts_itr_t *iter)
+ hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
+ hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s)
+ int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r)
+ hts_idx_t *bcf_index_load(const char *fn)
+ const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr)
diff --git a/pysam/chtslib.pyx b/pysam/chtslib.pyx
index 5bb2c5f..2f91396 100644
--- a/pysam/chtslib.pyx
+++ b/pysam/chtslib.pyx
@@ -90,5 +90,5 @@ cdef _charptr_to_str(char* s):
else:
return s.decode("ascii")
-
__all__ = []
+
diff --git a/pysam/csamtools.pyx b/pysam/csamtools.pyx
index 789284f..7a3dd1f 100644
--- a/pysam/csamtools.pyx
+++ b/pysam/csamtools.pyx
@@ -88,6 +88,8 @@ def _samtools_dispatch(method,
which are then read into memory in their entirety. This method
is slow and might cause large memory overhead.
+ Catching of stdout can be turned off by setting *catch_stdout* to False.
+
See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
on the topic of redirecting stderr/stdout.
'''
diff --git a/pysam/ctabix.pyx b/pysam/ctabix.pyx
index f712233..056c21e 100644
--- a/pysam/ctabix.pyx
+++ b/pysam/ctabix.pyx
@@ -709,12 +709,12 @@ def tabix_compress(filename_in,
fn = _encodeFilename(filename_out)
fp = bgzf_open( fn, "w")
if fp == NULL:
- raise IOError("could not open '%s' for writing")
+ raise IOError("could not open '%s' for writing" % (filename_out, ))
fn = _encodeFilename(filename_in)
fd_src = open(fn, O_RDONLY)
if fd_src == 0:
- raise IOError("could not open '%s' for reading")
+ raise IOError("could not open '%s' for reading" % (filename_in, ))
buffer = malloc(WINDOW_SIZE)
c = 1
diff --git a/pysam/cvcf.pyx b/pysam/cvcf.pyx
index 8a6708b..5feb2a6 100644
--- a/pysam/cvcf.pyx
+++ b/pysam/cvcf.pyx
@@ -16,7 +16,7 @@
# The sample keys are accessible through vcf.getsamples()
#
# A dictionary of values contains value keys (defined in ##INFO or
-# ##FORMAT lines) which map to a list, containign integers, floats,
+# ##FORMAT lines) which map to a list, containing integers, floats,
# strings, or characters. Missing values are replaced by a particular
# value, often -1 or .
#
diff --git a/pysam/htslib_util.c b/pysam/htslib_util.c
index ea048c0..28eeca2 100644
--- a/pysam/htslib_util.c
+++ b/pysam/htslib_util.c
@@ -13,6 +13,24 @@
#define inline __inline
#endif
+// set htslib verbosity level (the hts_verbose global); returns the previous level
+extern int hts_verbose;
+int hts_set_verbosity(int verbosity)
+{
+ int old_verbosity = hts_verbose;
+ hts_verbose = verbosity;
+ return old_verbosity;
+}
+
+int hts_get_verbosity()
+{
+ return hts_verbose; // current value of the hts_verbose global
+}
+
+
+int hts_get_hts_verbose();
+
+
// taken from samtools/bam_import.c
static inline uint8_t *alloc_data(bam1_t *b, size_t size)
{
diff --git a/pysam/htslib_util.h b/pysam/htslib_util.h
index 3614473..1f9d491 100644
--- a/pysam/htslib_util.h
+++ b/pysam/htslib_util.h
@@ -1,6 +1,22 @@
-#ifndef PYSAM_UTIL_H
-#define PYSAM_UTIL_H
+#ifndef HTSLIB_UTIL_H
+#define HTSLIB_UTIL_H
+#include "htslib/sam.h"
+#include "htslib/vcf.h"
+#include "htslib/khash.h"
+
+int hts_useek(htsFile *fp, long uoffset, int where);
+long hts_utell(htsFile *fp);
+
+int hts_set_verbosity(int verbosity);
+int hts_get_verbosity();
+
+KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
+typedef khash_t(vdict) vdict_t;
+
+KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
+typedef khash_t(s2i) s2i_t;
+
//////////////////////////////////////////////////////////////////
/*! set pysam standard error to point to file descriptor
diff --git a/pysam/version.py b/pysam/version.py
index 13749ed..02c7d45 100644
--- a/pysam/version.py
+++ b/pysam/version.py
@@ -1,7 +1,7 @@
# pysam versioning information
-__version__ = "0.8.1"
+__version__ = "0.8.3"
-__samtools_version__ = "1.1"
+__samtools_version__ = "1.2"
-__htslib_version__ = "1.1"
+__htslib_version__ = "1.2.1"
diff --git a/requires.txt b/requires.txt
index 743df07..687929a 100644
--- a/requires.txt
+++ b/requires.txt
@@ -1 +1 @@
-cython>=0.17
+cython>=0.22
diff --git a/samtools/bam.h b/samtools/bam.h
index e822331..b8f7bc1 100644
--- a/samtools/bam.h
+++ b/samtools/bam.h
@@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE. */
@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "1.1"
+#define BAM_VERSION "1.2"
#include <stdint.h>
#include <stdlib.h>
diff --git a/samtools/bam2bcf_indel.c b/samtools/bam2bcf_indel.c
index c8d061e..e80e4c2 100644
--- a/samtools/bam2bcf_indel.c
+++ b/samtools/bam2bcf_indel.c
@@ -1,7 +1,7 @@
/* bam2bcf_indel.c -- indel caller.
Copyright (C) 2010, 2011 Broad Institute.
- Copyright (C) 2012, 2013 Genome Research Ltd.
+ Copyright (C) 2012-2014 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -26,9 +26,8 @@ DEALINGS IN THE SOFTWARE. */
#include <assert.h>
#include <ctype.h>
#include <string.h>
-#include "bam.h"
+#include "htslib/sam.h"
#include "bam2bcf.h"
-#include "kaln.h"
#include "kprobaln.h"
#include "htslib/khash.h"
KHASH_SET_INIT_STR(rg)
@@ -197,7 +196,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
aux[m++] = MINUS_CONST + p->indel;
}
}
- j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b));
+ j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
if (j > max_rd_len) max_rd_len = j;
}
float frac = (float)na/nt;
@@ -224,7 +223,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
if (n_types >= 64) {
free(aux);
- if (bam_verbose >= 2)
+ // TODO revisit how/whether to control printing this warning
+ if (hts_verbose >= 2)
fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1);
return -1;
}
@@ -264,7 +264,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
cns = calloc(L, 4);
ref0 = calloc(L, 1);
for (i = 0; i < right - left; ++i)
- ref0[i] = bam_nt16_table[(int)ref[i+left]];
+ ref0[i] = seq_nt16_table[(int)ref[i+left]];
for (s = 0; s < n; ++s) {
r = ref_sample[s] = calloc(L, 1);
memset(cns, 0, sizeof(int) * L);
@@ -272,8 +272,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
for (i = 0; i < n_plp[s]; ++i) {
bam_pileup1_t *p = plp[s] + i;
bam1_t *b = p->b;
- uint32_t *cigar = bam1_cigar(b);
- uint8_t *seq = bam1_seq(b);
+ uint32_t *cigar = bam_get_cigar(b);
+ uint8_t *seq = bam_get_seq(b);
int x = b->core.pos, y = 0;
for (k = 0; k < b->core.n_cigar; ++k) {
int op = cigar[k]&0xf;
@@ -281,7 +281,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j)
if (x + j >= left && x + j < right)
- cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
+ cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
x += l; y += l;
} else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
@@ -303,14 +303,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
free(ref0); free(cns);
}
{ // the length of the homopolymer run around the current position
- int c = bam_nt16_table[(int)ref[pos + 1]];
+ int c = seq_nt16_table[(int)ref[pos + 1]];
if (c == 15) l_run = 1;
else {
for (i = pos + 2; ref[i]; ++i)
- if (bam_nt16_table[(int)ref[i]] != c) break;
+ if (seq_nt16_table[(int)ref[i]] != c) break;
l_run = i;
for (i = pos; i >= 0; --i)
- if (bam_nt16_table[(int)ref[i]] != c) break;
+ if (seq_nt16_table[(int)ref[i]] != c) break;
l_run -= i + 1;
}
}
@@ -325,9 +325,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
for (i = 0; i < n_plp[s]; ++i) {
bam_pileup1_t *p = plp[s] + i;
if (p->indel == types[t]) {
- uint8_t *seq = bam1_seq(p->b);
+ uint8_t *seq = bam_get_seq(p->b);
for (k = 1; k <= p->indel; ++k) {
- int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)];
+ int c = bam_nt16_nt4_table[bam_seqi(seq, p->qpos + k)];
assert(c<5);
++inscns_aux[(t*max_ins+(k-1))*5 + c];
}
@@ -383,8 +383,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
for (i = 0; i < n_plp[s]; ++i, ++K) {
bam_pileup1_t *p = plp[s] + i;
int qbeg, qend, tbeg, tend, sc, kk;
- uint8_t *seq = bam1_seq(p->b);
- uint32_t *cigar = bam1_cigar(p->b);
+ uint8_t *seq = bam_get_seq(p->b);
+ uint32_t *cigar = bam_get_cigar(p->b);
if (p->b->core.flag&4) continue; // unmapped reads
// FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
for (kk = 0; kk < p->b->core.n_cigar; ++kk)
@@ -392,17 +392,17 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
if (kk < p->b->core.n_cigar) continue;
// FIXME: the following skips soft clips, but using them may be more sensitive.
// determine the start and end of sequences for alignment
- qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg);
- qend = tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend);
+ qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg);
+ qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend);
if (types[t] < 0) {
int l = -types[t];
tbeg = tbeg - l > left? tbeg - l : left;
}
// write the query sequence
for (l = qbeg; l < qend; ++l)
- query[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(seq, l)];
+ query[l - qbeg] = bam_nt16_nt4_table[bam_seqi(seq, l)];
{ // do realignment; this is the bottleneck
- const uint8_t *qual = bam1_qual(p->b), *bq;
+ const uint8_t *qual = bam_get_qual(p->b), *bq;
uint8_t *qq;
qq = calloc(qend - qbeg, 1);
bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
diff --git a/samtools/bam2bcf_indel.c.pysam.c b/samtools/bam2bcf_indel.c.pysam.c
index 480481b..8a469ee 100644
--- a/samtools/bam2bcf_indel.c.pysam.c
+++ b/samtools/bam2bcf_indel.c.pysam.c
@@ -3,7 +3,7 @@
/* bam2bcf_indel.c -- indel caller.
Copyright (C) 2010, 2011 Broad Institute.
- Copyright (C) 2012, 2013 Genome Research Ltd.
+ Copyright (C) 2012-2014 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -28,9 +28,8 @@ DEALINGS IN THE SOFTWARE. */
#include <assert.h>
#include <ctype.h>
#include <string.h>
-#include "bam.h"
+#include "htslib/sam.h"
#include "bam2bcf.h"
-#include "kaln.h"
#include "kprobaln.h"
#include "htslib/khash.h"
KHASH_SET_INIT_STR(rg)
@@ -199,7 +198,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
aux[m++] = MINUS_CONST + p->indel;
}
}
- j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b));
+ j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
if (j > max_rd_len) max_rd_len = j;
}
float frac = (float)na/nt;
@@ -226,7 +225,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
}
if (n_types >= 64) {
free(aux);
- if (bam_verbose >= 2)
+ // TODO revisit how/whether to control printing this warning
+ if (hts_verbose >= 2)
fprintf(pysamerr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1);
return -1;
}
@@ -266,7 +266,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
cns = calloc(L, 4);
ref0 = calloc(L, 1);
for (i = 0; i < right - left; ++i)
- ref0[i] = bam_nt16_table[(int)ref[i+left]];
+ ref0[i] = seq_nt16_table[(int)ref[i+left]];
for (s = 0; s < n; ++s) {
r = ref_sample[s] = calloc(L, 1);
memset(cns, 0, sizeof(int) * L);
@@ -274,8 +274,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
for (i = 0; i < n_plp[s]; ++i) {
bam_pileup1_t *p = plp[s] + i;
bam1_t *b = p->b;
- uint32_t *cigar = bam1_cigar(b);
- uint8_t *seq = bam1_seq(b);
+ uint32_t *cigar = bam_get_cigar(b);
+ uint8_t *seq = bam_get_seq(b);
int x = b->core.pos, y = 0;
for (k = 0; k < b->core.n_cigar; ++k) {
int op = cigar[k]&0xf;
@@ -283,7 +283,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j)
if (x + j >= left && x + j < right)
- cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
+ cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
x += l; y += l;
} else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
@@ -305,14 +305,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
free(ref0); free(cns);
}
{ // the length of the homopolymer run around the current position
- int c = bam_nt16_table[(int)ref[pos + 1]];
+ int c = seq_nt16_table[(int)ref[pos + 1]];
if (c == 15) l_run = 1;
else {
for (i = pos + 2; ref[i]; ++i)
- if (bam_nt16_table[(int)ref[i]] != c) break;
+ if (seq_nt16_table[(int)ref[i]] != c) break;
l_run = i;
for (i = pos; i >= 0; --i)
- if (bam_nt16_table[(int)ref[i]] != c) break;
+ if (seq_nt16_table[(int)ref[i]] != c) break;
l_run -= i + 1;
}
}
@@ -327,9 +327,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
for (i = 0; i < n_plp[s]; ++i) {
bam_pileup1_t *p = plp[s] + i;
if (p->indel == types[t]) {
- uint8_t *seq = bam1_seq(p->b);
+ uint8_t *seq = bam_get_seq(p->b);
for (k = 1; k <= p->indel; ++k) {
- int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)];
+ int c = bam_nt16_nt4_table[bam_seqi(seq, p->qpos + k)];
assert(c<5);
++inscns_aux[(t*max_ins+(k-1))*5 + c];
}
@@ -385,8 +385,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
for (i = 0; i < n_plp[s]; ++i, ++K) {
bam_pileup1_t *p = plp[s] + i;
int qbeg, qend, tbeg, tend, sc, kk;
- uint8_t *seq = bam1_seq(p->b);
- uint32_t *cigar = bam1_cigar(p->b);
+ uint8_t *seq = bam_get_seq(p->b);
+ uint32_t *cigar = bam_get_cigar(p->b);
if (p->b->core.flag&4) continue; // unmapped reads
// FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
for (kk = 0; kk < p->b->core.n_cigar; ++kk)
@@ -394,17 +394,17 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
if (kk < p->b->core.n_cigar) continue;
// FIXME: the following skips soft clips, but using them may be more sensitive.
// determine the start and end of sequences for alignment
- qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg);
- qend = tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend);
+ qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg);
+ qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend);
if (types[t] < 0) {
int l = -types[t];
tbeg = tbeg - l > left? tbeg - l : left;
}
// write the query sequence
for (l = qbeg; l < qend; ++l)
- query[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(seq, l)];
+ query[l - qbeg] = bam_nt16_nt4_table[bam_seqi(seq, l)];
{ // do realignment; this is the bottleneck
- const uint8_t *qual = bam1_qual(p->b), *bq;
+ const uint8_t *qual = bam_get_qual(p->b), *bq;
uint8_t *qq;
qq = calloc(qend - qbeg, 1);
bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c
index 70882be..b749062 100644
--- a/samtools/bam2depth.c
+++ b/samtools/bam2depth.c
@@ -98,7 +98,7 @@ int main_depth(int argc, char *argv[])
fprintf(stderr, "Options:\n");
fprintf(stderr, " -b <bed> list of positions or regions\n");
fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
- fprintf(stderr, " -l <int> minQLen\n");
+ fprintf(stderr, " -l <int> read length threshold (ignore reads shorter than <int>)\n");
fprintf(stderr, " -q <int> base quality threshold\n");
fprintf(stderr, " -Q <int> mapping quality threshold\n");
fprintf(stderr, " -r <chr:from-to> region\n");
@@ -126,6 +126,16 @@ int main_depth(int argc, char *argv[])
status = EXIT_FAILURE;
goto depth_end;
}
+ if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS,
+ SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR |
+ SAM_SEQ)) {
+ fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
+ return 1;
+ }
+ if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
+ fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ return 1;
+ }
data[i]->min_mapQ = mapQ; // set the mapQ filter
data[i]->min_len = min_len; // set the qlen filter
data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header
@@ -177,8 +187,8 @@ int main_depth(int argc, char *argv[])
depth_end:
for (i = 0; i < n && data[i]; ++i) {
bam_hdr_destroy(data[i]->hdr);
- sam_close(data[i]->fp);
- if (data[i]->iter) hts_itr_destroy(data[i]->iter);
+ if (data[i]->fp) sam_close(data[i]->fp);
+ hts_itr_destroy(data[i]->iter);
free(data[i]);
}
free(data); free(reg);
diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c
index c991e08..5c588f9 100644
--- a/samtools/bam2depth.c.pysam.c
+++ b/samtools/bam2depth.c.pysam.c
@@ -100,7 +100,7 @@ int main_depth(int argc, char *argv[])
fprintf(pysamerr, "Options:\n");
fprintf(pysamerr, " -b <bed> list of positions or regions\n");
fprintf(pysamerr, " -f <list> list of input BAM filenames, one per line [null]\n");
- fprintf(pysamerr, " -l <int> minQLen\n");
+ fprintf(pysamerr, " -l <int> read length threshold (ignore reads shorter than <int>)\n");
fprintf(pysamerr, " -q <int> base quality threshold\n");
fprintf(pysamerr, " -Q <int> mapping quality threshold\n");
fprintf(pysamerr, " -r <chr:from-to> region\n");
@@ -128,6 +128,16 @@ int main_depth(int argc, char *argv[])
status = EXIT_FAILURE;
goto depth_end;
}
+ if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS,
+ SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR |
+ SAM_SEQ)) {
+ fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
+ return 1;
+ }
+ if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
+ fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ return 1;
+ }
data[i]->min_mapQ = mapQ; // set the mapQ filter
data[i]->min_len = min_len; // set the qlen filter
data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header
@@ -179,8 +189,8 @@ int main_depth(int argc, char *argv[])
depth_end:
for (i = 0; i < n && data[i]; ++i) {
bam_hdr_destroy(data[i]->hdr);
- sam_close(data[i]->fp);
- if (data[i]->iter) hts_itr_destroy(data[i]->iter);
+ if (data[i]->fp) sam_close(data[i]->fp);
+ hts_itr_destroy(data[i]->iter);
free(data[i]);
}
free(data); free(reg);
diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c
index 70ee18b..017d5e1 100644
--- a/samtools/bam_mate.c
+++ b/samtools/bam_mate.c
@@ -74,7 +74,7 @@ static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
*
* How We Handle Input
*
- * Secondary Reads:
+ * Secondary and supplementary Reads:
* -write to output unchanged
* All Reads:
* -if pos == 0 (1 based), tid == -1 set UNMAPPED flag
@@ -94,6 +94,7 @@ static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
* -write to output
* Limitations
* -Does not handle tandem reads
+ * -Should mark supplementary reads the same as primary.
* Notes
* -CT definition appears to be something else in spec, this was in here before
* I started tampering with it, anyone know what is going on here? To work
diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c
index 210e6ea..be0dc37 100644
--- a/samtools/bam_mate.c.pysam.c
+++ b/samtools/bam_mate.c.pysam.c
@@ -76,7 +76,7 @@ static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
*
* How We Handle Input
*
- * Secondary Reads:
+ * Secondary and supplementary Reads:
* -write to output unchanged
* All Reads:
* -if pos == 0 (1 based), tid == -1 set UNMAPPED flag
@@ -96,6 +96,7 @@ static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
* -write to output
* Limitations
* -Does not handle tandem reads
+ * -Should mark supplementary reads the same as primary.
* Notes
* -CT definition appears to be something else in spec, this was in here before
* I started tampering with it, anyone know what is going on here? To work
diff --git a/samtools/bam_md.c b/samtools/bam_md.c
index 93f64f5..7d1c6a7 100644
--- a/samtools/bam_md.c
+++ b/samtools/bam_md.c
@@ -30,7 +30,6 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/faidx.h"
#include "sam.h"
#include "htslib/kstring.h"
-#include "kaln.h"
#include "kprobaln.h"
#define USE_EQUAL 1
diff --git a/samtools/bam_md.c.pysam.c b/samtools/bam_md.c.pysam.c
index 840d774..5f5bb8a 100644
--- a/samtools/bam_md.c.pysam.c
+++ b/samtools/bam_md.c.pysam.c
@@ -32,7 +32,6 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/faidx.h"
#include "sam.h"
#include "htslib/kstring.h"
-#include "kaln.h"
#include "kprobaln.h"
#define USE_EQUAL 1
diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c
index a1b381d..d574cca 100644
--- a/samtools/bam_plcmd.c
+++ b/samtools/bam_plcmd.c
@@ -61,7 +61,9 @@ static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref
putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp);
}
if (!p->is_del) {
- int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)];
+ int c = p->qpos < p->b->core.l_qseq
+ ? seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)]
+ : 'N';
if (ref) {
int rb = pos < ref_len? ref[pos] : 'N';
if (c == '=' || seq_nt16_table[c] == seq_nt16_table[rb]) c = bam_is_rev(p->b)? ',' : '.';
@@ -264,6 +266,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
exit(1);
}
+ if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
+ fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ return 1;
+ }
hts_set_fai_filename(data[i]->fp, conf->fai_fname);
data[i]->conf = conf;
h_tmp = sam_hdr_read(data[i]->fp);
@@ -271,7 +277,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
exit(1);
}
- data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
// Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search)
rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
@@ -281,17 +286,24 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]);
exit(1);
}
- if ( (data[i]->iter=sam_itr_querys(idx, data[i]->h, conf->reg)) == 0) {
- fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, conf->reg);
+ if ( (data[i]->iter=sam_itr_querys(idx, h_tmp, conf->reg)) == 0) {
+ fprintf(stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]);
exit(1);
}
if (i == 0) tid0 = data[i]->iter->tid, beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
hts_idx_destroy(idx);
}
- if (i == 0) h = h_tmp; /* save the header of first file in list */
+ else
+ data[i]->iter = NULL;
+
+ if (i == 0) h = data[i]->h = h_tmp; // save the header of the first file
else {
- // FIXME: to check consistency
+ // FIXME: check consistency between h and h_tmp
bam_hdr_destroy(h_tmp);
+
+ // we store only the first file's header; it's (alleged to be)
+ // compatible with the i-th file's target_name lookup needs
+ data[i]->h = h;
}
}
// allocate data storage proportionate to number of samples being studied sm->n
@@ -316,6 +328,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
exit(1);
}
+ // BCF header creation
bcf_hdr = bcf_hdr_init("w");
kstring_t str = {0,0,0};
@@ -335,6 +348,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bcf_hdr_append(bcf_hdr, str.s);
}
+ // Translate BAM @SQ tags to BCF ##contig tags
// todo: use/write new BAM header manipulation routines, fill also UR, M5
for (i=0; i<h->n_targets; i++)
{
@@ -381,7 +395,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]);
bcf_hdr_add_sample(bcf_hdr, NULL);
bcf_hdr_write(bcf_fp, bcf_hdr);
+ // End of BCF header creation
+ // Initialise the calling algorithm
bca = bcf_call_init(-1., conf->min_baseQ);
bcr = calloc(sm->n, sizeof(bcf_callret1_t));
bca->rghash = rghash;
@@ -422,7 +438,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
} else ref_tid = -1, ref = 0;
- // begin pileup
+ // init pileup
iter = bam_mplp_init(n, mplp_func, (void**)data);
if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter);
max_depth = conf->max_depth;
@@ -436,6 +452,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bam_mplp_set_maxcnt(iter, max_depth);
bcf1_t *bcf_rec = bcf_init1();
int ret;
+ // begin pileup
while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
@@ -477,7 +494,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
int j, cnt;
for (j = cnt = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
- if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt;
+ int c = p->qpos < p->b->core.l_qseq
+ ? bam_get_qual(p->b)[p->qpos]
+ : 0;
+ if (c >= conf->min_baseQ) ++cnt;
}
fprintf(pileup_fp, "\t%d\t", cnt);
if (n_plp[i] == 0) {
@@ -487,13 +507,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
} else {
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
- if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ)
+ int c = p->qpos < p->b->core.l_qseq
+ ? bam_get_qual(p->b)[p->qpos]
+ : 0;
+ if (c >= conf->min_baseQ)
pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
}
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
- int c = bam_get_qual(p->b)[p->qpos];
+ int c = p->qpos < p->b->core.l_qseq
+ ? bam_get_qual(p->b)[p->qpos]
+ : 0;
if (c >= conf->min_baseQ) {
c = c + 33 < 126? c + 33 : 126;
putc(c, pileup_fp);
diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c
index 2151a1b..9d2c987 100644
--- a/samtools/bam_plcmd.c.pysam.c
+++ b/samtools/bam_plcmd.c.pysam.c
@@ -63,7 +63,9 @@ static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref
putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp);
}
if (!p->is_del) {
- int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)];
+ int c = p->qpos < p->b->core.l_qseq
+ ? seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)]
+ : 'N';
if (ref) {
int rb = pos < ref_len? ref[pos] : 'N';
if (c == '=' || seq_nt16_table[c] == seq_nt16_table[rb]) c = bam_is_rev(p->b)? ',' : '.';
@@ -266,6 +268,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
fprintf(pysamerr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
exit(1);
}
+ if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
+ fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ return 1;
+ }
hts_set_fai_filename(data[i]->fp, conf->fai_fname);
data[i]->conf = conf;
h_tmp = sam_hdr_read(data[i]->fp);
@@ -273,7 +279,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
fprintf(pysamerr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
exit(1);
}
- data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
// Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search)
rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
@@ -283,17 +288,24 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
fprintf(pysamerr, "[%s] fail to load index for %s\n", __func__, fn[i]);
exit(1);
}
- if ( (data[i]->iter=sam_itr_querys(idx, data[i]->h, conf->reg)) == 0) {
- fprintf(pysamerr, "[E::%s] fail to parse region '%s'\n", __func__, conf->reg);
+ if ( (data[i]->iter=sam_itr_querys(idx, h_tmp, conf->reg)) == 0) {
+ fprintf(pysamerr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]);
exit(1);
}
if (i == 0) tid0 = data[i]->iter->tid, beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
hts_idx_destroy(idx);
}
- if (i == 0) h = h_tmp; /* save the header of first file in list */
+ else
+ data[i]->iter = NULL;
+
+ if (i == 0) h = data[i]->h = h_tmp; // save the header of the first file
else {
- // FIXME: to check consistency
+ // FIXME: check consistency between h and h_tmp
bam_hdr_destroy(h_tmp);
+
+ // we store only the first file's header; it's (alleged to be)
+ // compatible with the i-th file's target_name lookup needs
+ data[i]->h = h;
}
}
// allocate data storage proportionate to number of samples being studied sm->n
@@ -318,6 +330,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
exit(1);
}
+ // BCF header creation
bcf_hdr = bcf_hdr_init("w");
kstring_t str = {0,0,0};
@@ -337,6 +350,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bcf_hdr_append(bcf_hdr, str.s);
}
+ // Translate BAM @SQ tags to BCF ##contig tags
// todo: use/write new BAM header manipulation routines, fill also UR, M5
for (i=0; i<h->n_targets; i++)
{
@@ -383,7 +397,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]);
bcf_hdr_add_sample(bcf_hdr, NULL);
bcf_hdr_write(bcf_fp, bcf_hdr);
+ // End of BCF header creation
+ // Initialise the calling algorithm
bca = bcf_call_init(-1., conf->min_baseQ);
bcr = calloc(sm->n, sizeof(bcf_callret1_t));
bca->rghash = rghash;
@@ -424,7 +440,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
} else ref_tid = -1, ref = 0;
- // begin pileup
+ // init pileup
iter = bam_mplp_init(n, mplp_func, (void**)data);
if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter);
max_depth = conf->max_depth;
@@ -438,6 +454,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bam_mplp_set_maxcnt(iter, max_depth);
bcf1_t *bcf_rec = bcf_init1();
int ret;
+ // begin pileup
while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
@@ -479,7 +496,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
int j, cnt;
for (j = cnt = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
- if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt;
+ int c = p->qpos < p->b->core.l_qseq
+ ? bam_get_qual(p->b)[p->qpos]
+ : 0;
+ if (c >= conf->min_baseQ) ++cnt;
}
fprintf(pileup_fp, "\t%d\t", cnt);
if (n_plp[i] == 0) {
@@ -489,13 +509,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
} else {
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
- if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ)
+ int c = p->qpos < p->b->core.l_qseq
+ ? bam_get_qual(p->b)[p->qpos]
+ : 0;
+ if (c >= conf->min_baseQ)
pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
}
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
- int c = bam_get_qual(p->b)[p->qpos];
+ int c = p->qpos < p->b->core.l_qseq
+ ? bam_get_qual(p->b)[p->qpos]
+ : 0;
if (c >= conf->min_baseQ) {
c = c + 33 < 126? c + 33 : 126;
putc(c, pileup_fp);
diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c
index c9c1af3..e721c59 100644
--- a/samtools/bam_sort.c
+++ b/samtools/bam_sort.c
@@ -230,9 +230,6 @@ static void pretty_header(char** text_in_out, int32_t text_len)
static void trans_tbl_init(bam_hdr_t* out, bam_hdr_t* translate, trans_tbl_t* tbl, bool merge_rg, bool merge_pg)
{
- // No need to translate header into itself
- if (out == translate) { merge_rg = merge_pg = true; }
-
tbl->n_targets = translate->n_targets;
tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int));
tbl->rg_trans = kh_init(c2c);
@@ -594,6 +591,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
uint64_t idx = 0;
char **RG = NULL;
hts_itr_t **iter = NULL;
+ bam_hdr_t **hdr = NULL;
trans_tbl_t *translation_tbl = NULL;
// Is there a specified pre-prepared header to use for output?
@@ -612,6 +610,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
fp = (samFile**)calloc(n, sizeof(samFile*));
heap = (heap1_t*)calloc(n, sizeof(heap1_t));
iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*));
+ hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*));
translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t));
// prepare RG tag from file names
if (flag & MERGE_RG) {
@@ -641,9 +640,20 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
return -1;
}
hin = sam_hdr_read(fp[i]);
- if (hout == NULL) hout = hin;
- trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG);
- if (hin != hout) bam_hdr_destroy(hin);
+ if (hout)
+ trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG);
+ else {
+ // As yet, no headers to merge into...
+ hout = bam_hdr_dup(hin);
+ // ...so no need to translate header into itself
+ trans_tbl_init(hout, hin, translation_tbl+i, true, true);
+ }
+
+ // TODO sam_itr_next() doesn't yet work for SAM files,
+ // so for those keep the headers around for use with sam_read1()
+ if (hts_get_format(fp[i])->format == sam) hdr[i] = hin;
+ else { bam_hdr_destroy(hin); hdr[i] = NULL; }
+
if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
}
@@ -677,30 +687,38 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0);
}
hts_idx_destroy(idx);
+ if (iter[i] == NULL) break;
}
free(rtrans);
} else {
for (i = 0; i < n; ++i) {
- iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0);
- if (iter[i] == NULL) {
- fprintf(stderr, "[%s] Memory allocation failed\n", __func__);
- return -1;
+ if (hdr[i] == NULL) {
+ iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0);
+ if (iter[i] == NULL) break;
}
+ else iter[i] = NULL;
}
}
+ if (i < n) {
+ fprintf(stderr, "[%s] Memory allocation failed\n", __func__);
+ return -1;
+ }
+
// Load the first read from each file into the heap
for (i = 0; i < n; ++i) {
heap1_t *h = heap + i;
h->i = i;
- h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
- if (sam_itr_next(fp[i], iter[i], h->b) >= 0) {
+ h->b = bam_init1();
+ if ((iter[i]? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) {
bam_translate(h->b, translation_tbl + i);
h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b);
h->idx = idx++;
}
else {
h->pos = HEAP_EMPTY;
+ bam_destroy1(h->b);
+ h->b = NULL;
}
}
@@ -722,13 +740,13 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
}
sam_write1(fpout, hout, b);
- if ((j = sam_itr_next(fp[heap->i], iter[heap->i], b)) >= 0) {
+ if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) {
bam_translate(b, translation_tbl + heap->i);
heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b);
heap->idx = idx++;
} else if (j == -1) {
heap->pos = HEAP_EMPTY;
- free(heap->b->data); free(heap->b);
+ bam_destroy1(heap->b);
heap->b = NULL;
} else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
ks_heapadjust(heap, 0, n, heap);
@@ -742,11 +760,12 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
for (i = 0; i < n; ++i) {
trans_tbl_destroy(translation_tbl + i);
hts_itr_destroy(iter[i]);
+ bam_hdr_destroy(hdr[i]);
sam_close(fp[i]);
}
bam_hdr_destroy(hout);
sam_close(fpout);
- free(translation_tbl); free(fp); free(heap); free(iter);
+ free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
return 0;
}
@@ -1020,12 +1039,12 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const
// write sub files
for (;;) {
if (k == max_k) {
- size_t old_max = max_k;
+ size_t kk, old_max = max_k;
max_k = max_k? max_k<<1 : 0x10000;
buf = (bam1_t**)realloc(buf, max_k * sizeof(bam1_t*));
- memset(buf + old_max, 0, sizeof(bam1_t*) * (max_k - old_max));
+ for (kk = old_max; kk < max_k; ++kk) buf[kk] = NULL;
}
- if (buf[k] == NULL) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t));
+ if (buf[k] == NULL) buf[k] = bam_init1();
b = buf[k];
if ((ret = sam_read1(fp, header, b)) < 0) break;
if (b->l_data < b->m_data>>2) { // shrink
@@ -1067,11 +1086,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const
free(fns);
}
// free
- for (k = 0; k < max_k; ++k) {
- if (!buf[k]) continue;
- free(buf[k]->data);
- free(buf[k]);
- }
+ for (k = 0; k < max_k; ++k) bam_destroy1(buf[k]);
free(buf);
bam_hdr_destroy(header);
sam_close(fp);
diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c
index 630eb26..33d7f5c 100644
--- a/samtools/bam_sort.c.pysam.c
+++ b/samtools/bam_sort.c.pysam.c
@@ -232,9 +232,6 @@ static void pretty_header(char** text_in_out, int32_t text_len)
static void trans_tbl_init(bam_hdr_t* out, bam_hdr_t* translate, trans_tbl_t* tbl, bool merge_rg, bool merge_pg)
{
- // No need to translate header into itself
- if (out == translate) { merge_rg = merge_pg = true; }
-
tbl->n_targets = translate->n_targets;
tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int));
tbl->rg_trans = kh_init(c2c);
@@ -596,6 +593,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
uint64_t idx = 0;
char **RG = NULL;
hts_itr_t **iter = NULL;
+ bam_hdr_t **hdr = NULL;
trans_tbl_t *translation_tbl = NULL;
// Is there a specified pre-prepared header to use for output?
@@ -614,6 +612,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
fp = (samFile**)calloc(n, sizeof(samFile*));
heap = (heap1_t*)calloc(n, sizeof(heap1_t));
iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*));
+ hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*));
translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t));
// prepare RG tag from file names
if (flag & MERGE_RG) {
@@ -643,9 +642,20 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
return -1;
}
hin = sam_hdr_read(fp[i]);
- if (hout == NULL) hout = hin;
- trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG);
- if (hin != hout) bam_hdr_destroy(hin);
+ if (hout)
+ trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG);
+ else {
+ // As yet, no headers to merge into...
+ hout = bam_hdr_dup(hin);
+ // ...so no need to translate header into itself
+ trans_tbl_init(hout, hin, translation_tbl+i, true, true);
+ }
+
+ // TODO sam_itr_next() doesn't yet work for SAM files,
+ // so for those keep the headers around for use with sam_read1()
+ if (hts_get_format(fp[i])->format == sam) hdr[i] = hin;
+ else { bam_hdr_destroy(hin); hdr[i] = NULL; }
+
if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
fprintf(pysamerr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
}
@@ -679,30 +689,38 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0);
}
hts_idx_destroy(idx);
+ if (iter[i] == NULL) break;
}
free(rtrans);
} else {
for (i = 0; i < n; ++i) {
- iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0);
- if (iter[i] == NULL) {
- fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__);
- return -1;
+ if (hdr[i] == NULL) {
+ iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0);
+ if (iter[i] == NULL) break;
}
+ else iter[i] = NULL;
}
}
+ if (i < n) {
+ fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__);
+ return -1;
+ }
+
// Load the first read from each file into the heap
for (i = 0; i < n; ++i) {
heap1_t *h = heap + i;
h->i = i;
- h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
- if (sam_itr_next(fp[i], iter[i], h->b) >= 0) {
+ h->b = bam_init1();
+ if ((iter[i]? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) {
bam_translate(h->b, translation_tbl + i);
h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b);
h->idx = idx++;
}
else {
h->pos = HEAP_EMPTY;
+ bam_destroy1(h->b);
+ h->b = NULL;
}
}
@@ -724,13 +742,13 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
}
sam_write1(fpout, hout, b);
- if ((j = sam_itr_next(fp[heap->i], iter[heap->i], b)) >= 0) {
+ if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) {
bam_translate(b, translation_tbl + heap->i);
heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b);
heap->idx = idx++;
} else if (j == -1) {
heap->pos = HEAP_EMPTY;
- free(heap->b->data); free(heap->b);
+ bam_destroy1(heap->b);
heap->b = NULL;
} else fprintf(pysamerr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
ks_heapadjust(heap, 0, n, heap);
@@ -744,11 +762,12 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char
for (i = 0; i < n; ++i) {
trans_tbl_destroy(translation_tbl + i);
hts_itr_destroy(iter[i]);
+ bam_hdr_destroy(hdr[i]);
sam_close(fp[i]);
}
bam_hdr_destroy(hout);
sam_close(fpout);
- free(translation_tbl); free(fp); free(heap); free(iter);
+ free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
return 0;
}
@@ -1022,12 +1041,12 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const
// write sub files
for (;;) {
if (k == max_k) {
- size_t old_max = max_k;
+ size_t kk, old_max = max_k;
max_k = max_k? max_k<<1 : 0x10000;
buf = (bam1_t**)realloc(buf, max_k * sizeof(bam1_t*));
- memset(buf + old_max, 0, sizeof(bam1_t*) * (max_k - old_max));
+ for (kk = old_max; kk < max_k; ++kk) buf[kk] = NULL;
}
- if (buf[k] == NULL) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t));
+ if (buf[k] == NULL) buf[k] = bam_init1();
b = buf[k];
if ((ret = sam_read1(fp, header, b)) < 0) break;
if (b->l_data < b->m_data>>2) { // shrink
@@ -1069,11 +1088,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const
free(fns);
}
// free
- for (k = 0; k < max_k; ++k) {
- if (!buf[k]) continue;
- free(buf[k]->data);
- free(buf[k]);
- }
+ for (k = 0; k < max_k; ++k) bam_destroy1(buf[k]);
free(buf);
bam_hdr_destroy(header);
sam_close(fp);
diff --git a/samtools/bam_stat.c b/samtools/bam_stat.c
index 5dbe04f..1bbebdb 100644
--- a/samtools/bam_stat.c
+++ b/samtools/bam_stat.c
@@ -23,7 +23,13 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
-#include "bam.h"
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "htslib/sam.h"
+//#include "bam.h"
#include "samtools.h"
typedef struct {
@@ -59,7 +65,7 @@ typedef struct {
if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \
} while (0)
-bam_flagstat_t *bam_flagstat_core(bamFile fp)
+bam_flagstat_t *bam_flagstat_core(samFile *fp, bam_hdr_t *h)
{
bam_flagstat_t *s;
bam1_t *b;
@@ -68,7 +74,7 @@ bam_flagstat_t *bam_flagstat_core(bamFile fp)
s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
b = bam_init1();
c = &b->core;
- while ((ret = bam_read1(fp, b)) >= 0)
+ while ((ret = sam_read1(fp, h, b)) >= 0)
flagstat_loop(s, c);
bam_destroy1(b);
if (ret != -1)
@@ -77,23 +83,35 @@ bam_flagstat_t *bam_flagstat_core(bamFile fp)
}
int bam_flagstat(int argc, char *argv[])
{
- bamFile fp;
- bam_header_t *header;
+ samFile *fp;
+ bam_hdr_t *header;
bam_flagstat_t *s;
if (argc == optind) {
fprintf(stderr, "Usage: samtools flagstat <in.bam>\n");
return 1;
}
- fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(STDIN_FILENO, "r");
+ fp = sam_open(argv[optind], "r");
if (fp == NULL) {
print_error_errno("Cannot open input file \"%s\"", argv[optind]);
return 1;
}
- header = bam_header_read(fp);
- s = bam_flagstat_core(fp);
+
+ if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS,
+ SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) {
+ fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
+ return 1;
+ }
+
+ if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) {
+ fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ return 1;
+ }
+
+ header = sam_hdr_read(fp);
+ s = bam_flagstat_core(fp, header);
printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]);
- printf("%lld + %lld supplimentary\n", s->n_supp[0], s->n_supp[1]);
+ printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]);
printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0);
printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
@@ -105,7 +123,7 @@ int bam_flagstat(int argc, char *argv[])
printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]);
printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]);
free(s);
- bam_header_destroy(header);
- bam_close(fp);
+ bam_hdr_destroy(header);
+ sam_close(fp);
return 0;
}
diff --git a/samtools/bam_stat.c.pysam.c b/samtools/bam_stat.c.pysam.c
index a07d32d..15a1242 100644
--- a/samtools/bam_stat.c.pysam.c
+++ b/samtools/bam_stat.c.pysam.c
@@ -25,7 +25,13 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
#include <unistd.h>
-#include "bam.h"
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "htslib/sam.h"
+//#include "bam.h"
#include "samtools.h"
typedef struct {
@@ -61,7 +67,7 @@ typedef struct {
if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \
} while (0)
-bam_flagstat_t *bam_flagstat_core(bamFile fp)
+bam_flagstat_t *bam_flagstat_core(samFile *fp, bam_hdr_t *h)
{
bam_flagstat_t *s;
bam1_t *b;
@@ -70,7 +76,7 @@ bam_flagstat_t *bam_flagstat_core(bamFile fp)
s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
b = bam_init1();
c = &b->core;
- while ((ret = bam_read1(fp, b)) >= 0)
+ while ((ret = sam_read1(fp, h, b)) >= 0)
flagstat_loop(s, c);
bam_destroy1(b);
if (ret != -1)
@@ -79,23 +85,35 @@ bam_flagstat_t *bam_flagstat_core(bamFile fp)
}
int bam_flagstat(int argc, char *argv[])
{
- bamFile fp;
- bam_header_t *header;
+ samFile *fp;
+ bam_hdr_t *header;
bam_flagstat_t *s;
if (argc == optind) {
fprintf(pysamerr, "Usage: samtools flagstat <in.bam>\n");
return 1;
}
- fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(STDIN_FILENO, "r");
+ fp = sam_open(argv[optind], "r");
if (fp == NULL) {
print_error_errno("Cannot open input file \"%s\"", argv[optind]);
return 1;
}
- header = bam_header_read(fp);
- s = bam_flagstat_core(fp);
+
+ if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS,
+ SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) {
+ fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
+ return 1;
+ }
+
+ if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) {
+ fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ return 1;
+ }
+
+ header = sam_hdr_read(fp);
+ s = bam_flagstat_core(fp, header);
printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]);
- printf("%lld + %lld supplimentary\n", s->n_supp[0], s->n_supp[1]);
+ printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]);
printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0);
printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
@@ -107,7 +125,7 @@ int bam_flagstat(int argc, char *argv[])
printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]);
printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]);
free(s);
- bam_header_destroy(header);
- bam_close(fp);
+ bam_hdr_destroy(header);
+ sam_close(fp);
return 0;
}
diff --git a/samtools/errmod.c b/samtools/errmod.c
index 9f5740b..e7759a0 100644
--- a/samtools/errmod.c
+++ b/samtools/errmod.c
@@ -134,11 +134,10 @@ int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
// The total count of each base observed per strand
int w[32];
- /* zero out q */
- memset(q, 0, m * m * sizeof(float));
+ memset(q, 0, m * m * sizeof(float)); // initialise q to 0
if (n == 0) return 0;
- // calculate aux.esum and aux.fsum
- if (n > 255) { // then sample 255 bases
+ // This section randomly downsamples to 255 depth so as not to go beyond our precalculated matrix
+ if (n > 255) { // if we exceed 255 bases observed then shuffle them to sample and only keep the first 255
ks_shuffle(uint16_t, n, bases);
n = 255;
}
diff --git a/samtools/errmod.c.pysam.c b/samtools/errmod.c.pysam.c
index c19a201..db57001 100644
--- a/samtools/errmod.c.pysam.c
+++ b/samtools/errmod.c.pysam.c
@@ -136,11 +136,10 @@ int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
// The total count of each base observed per strand
int w[32];
- /* zero out q */
- memset(q, 0, m * m * sizeof(float));
+ memset(q, 0, m * m * sizeof(float)); // initialise q to 0
if (n == 0) return 0;
- // calculate aux.esum and aux.fsum
- if (n > 255) { // then sample 255 bases
+ // This section randomly downsamples to 255 depth so as not to go beyond our precalculated matrix
+ if (n > 255) { // if we exceed 255 bases observed then shuffle them to sample and only keep the first 255
ks_shuffle(uint16_t, n, bases);
n = 255;
}
diff --git a/samtools/kaln.c b/samtools/kaln.c
deleted file mode 100644
index cd4826e..0000000
--- a/samtools/kaln.c
+++ /dev/null
@@ -1,486 +0,0 @@
-/* The MIT License
-
- Copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3lh3 at gmail.com>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-#include <math.h>
-#include "kaln.h"
-
-#define FROM_M 0
-#define FROM_I 1
-#define FROM_D 2
-
-typedef struct {
- int i, j;
- unsigned char ctype;
-} path_t;
-
-int aln_sm_blosum62[] = {
-/* A R N D C Q E G H I L K M F P S T W Y V * X */
- 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
- -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
- -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
- -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
- 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
- -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
- -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
- 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
- -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
- -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
- -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
- -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
- -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
- -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
- -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
- 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
- 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
- -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
- -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
- 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
- -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
- 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
-};
-
-int aln_sm_blast[] = {
- 1, -3, -3, -3, -2,
- -3, 1, -3, -3, -2,
- -3, -3, 1, -3, -2,
- -3, -3, -3, 1, -2,
- -2, -2, -2, -2, -2
-};
-
-int aln_sm_qual[] = {
- 0, -23, -23, -23, 0,
- -23, 0, -23, -23, 0,
- -23, -23, 0, -23, 0,
- -23, -23, -23, 0, 0,
- 0, 0, 0, 0, 0
-};
-
-ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 };
-ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 };
-
-ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 };
-
-static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
-{
- int i, n;
- uint32_t *cigar;
- unsigned char last_type;
-
- if (path_len == 0 || path == 0) {
- *n_cigar = 0;
- return 0;
- }
-
- last_type = path->ctype;
- for (i = n = 1; i < path_len; ++i) {
- if (last_type != path[i].ctype) ++n;
- last_type = path[i].ctype;
- }
- *n_cigar = n;
- cigar = (uint32_t*)calloc(*n_cigar, 4);
-
- cigar[0] = 1u << 4 | path[path_len-1].ctype;
- last_type = path[path_len-1].ctype;
- for (i = path_len - 2, n = 0; i >= 0; --i) {
- if (path[i].ctype == last_type) cigar[n] += 1u << 4;
- else {
- cigar[++n] = 1u << 4 | path[i].ctype;
- last_type = path[i].ctype;
- }
- }
-
- return cigar;
-}
-
-/***************************/
-/* START OF common_align.c */
-/***************************/
-
-#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
-
-#define set_M(MM, cur, p, sc) \
-{ \
- if ((p)->M >= (p)->I) { \
- if ((p)->M >= (p)->D) { \
- (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
- } else { \
- (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
- } \
- } else { \
- if ((p)->I > (p)->D) { \
- (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
- } else { \
- (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
- } \
- } \
-}
-#define set_I(II, cur, p) \
-{ \
- if ((p)->M - gap_open > (p)->I) { \
- (cur)->It = FROM_M; \
- (II) = (p)->M - gap_open - gap_ext; \
- } else { \
- (cur)->It = FROM_I; \
- (II) = (p)->I - gap_ext; \
- } \
-}
-#define set_end_I(II, cur, p) \
-{ \
- if (gap_end_ext >= 0) { \
- if ((p)->M - gap_end_open > (p)->I) { \
- (cur)->It = FROM_M; \
- (II) = (p)->M - gap_end_open - gap_end_ext; \
- } else { \
- (cur)->It = FROM_I; \
- (II) = (p)->I - gap_end_ext; \
- } \
- } else set_I(II, cur, p); \
-}
-#define set_D(DD, cur, p) \
-{ \
- if ((p)->M - gap_open > (p)->D) { \
- (cur)->Dt = FROM_M; \
- (DD) = (p)->M - gap_open - gap_ext; \
- } else { \
- (cur)->Dt = FROM_D; \
- (DD) = (p)->D - gap_ext; \
- } \
-}
-#define set_end_D(DD, cur, p) \
-{ \
- if (gap_end_ext >= 0) { \
- if ((p)->M - gap_end_open > (p)->D) { \
- (cur)->Dt = FROM_M; \
- (DD) = (p)->M - gap_end_open - gap_end_ext; \
- } else { \
- (cur)->Dt = FROM_D; \
- (DD) = (p)->D - gap_end_ext; \
- } \
- } else set_D(DD, cur, p); \
-}
-
-typedef struct {
- uint8_t Mt:3, It:2, Dt:3;
-} dpcell_t;
-
-typedef struct {
- int M, I, D;
-} dpscore_t;
-
-/***************************
- * banded global alignment *
- ***************************/
-uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
-{
- int i, j;
- dpcell_t **dpcell, *q;
- dpscore_t *curr, *last, *s;
- int b1, b2, tmp_end;
- int *mat, end, max = 0;
- uint8_t type, ctype;
- uint32_t *cigar = 0;
-
- int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
- int *score_matrix, N_MATRIX_ROW;
-
- /* initialize some align-related parameters. just for compatibility */
- gap_open = ap->gap_open;
- gap_ext = ap->gap_ext;
- gap_end_open = ap->gap_end_open;
- gap_end_ext = ap->gap_end_ext;
- b = ap->band_width;
- score_matrix = ap->matrix;
- N_MATRIX_ROW = ap->row;
-
- if (n_cigar) *n_cigar = 0;
- if (len1 == 0 || len2 == 0) return 0;
-
- /* calculate b1 and b2 */
- if (len1 > len2) {
- b1 = len1 - len2 + b;
- b2 = b;
- } else {
- b1 = b;
- b2 = len2 - len1 + b;
- }
- if (b1 > len1) b1 = len1;
- if (b2 > len2) b2 = len2;
- --seq1; --seq2;
-
- /* allocate memory */
- end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
- dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
- for (j = 0; j <= len2; ++j)
- dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
- for (j = b2 + 1; j <= len2; ++j)
- dpcell[j] -= j - b2;
- curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
- last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
-
- /* set first row */
- SET_INF(*curr); curr->M = 0;
- for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
- SET_INF(*s);
- set_end_D(s->D, dpcell[0] + i, s - 1);
- }
- s = curr; curr = last; last = s;
-
- /* core dynamic programming, part 1 */
- tmp_end = (b2 < len2)? b2 : len2 - 1;
- for (j = 1; j <= tmp_end; ++j) {
- q = dpcell[j]; s = curr; SET_INF(*s);
- set_end_I(s->I, q, last);
- end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- ++s; ++q;
- for (i = 1; i != end; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
- set_I(s->I, q, last + i);
- set_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_D(s->D, q, s - 1);
- if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
- set_end_I(s->I, q, last + i);
- } else s->I = MINOR_INF;
- s = curr; curr = last; last = s;
- }
- /* last row for part 1, use set_end_D() instead of set_D() */
- if (j == len2 && b2 != len2 - 1) {
- q = dpcell[j]; s = curr; SET_INF(*s);
- set_end_I(s->I, q, last);
- end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- ++s; ++q;
- for (i = 1; i != end; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
- set_I(s->I, q, last + i);
- set_end_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_end_D(s->D, q, s - 1);
- if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
- set_end_I(s->I, q, last + i);
- } else s->I = MINOR_INF;
- s = curr; curr = last; last = s;
- ++j;
- }
-
- /* core dynamic programming, part 2 */
- for (; j <= len2 - b2 + 1; ++j) {
- SET_INF(curr[j - b2]);
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- end = j + b1 - 1;
- for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_I(s->I, q, last + i);
- set_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_D(s->D, q, s - 1);
- s->I = MINOR_INF;
- s = curr; curr = last; last = s;
- }
-
- /* core dynamic programming, part 3 */
- for (; j < len2; ++j) {
- SET_INF(curr[j - b2]);
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_I(s->I, q, last + i);
- set_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
- set_end_I(s->I, q, last + i);
- set_D(s->D, q, s - 1);
- s = curr; curr = last; last = s;
- }
- /* last row */
- if (j == len2) {
- SET_INF(curr[j - b2]);
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_I(s->I, q, last + i);
- set_end_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
- set_end_I(s->I, q, last + i);
- set_end_D(s->D, q, s - 1);
- s = curr; curr = last; last = s;
- }
-
- *_score = last[len1].M;
- if (n_cigar) { /* backtrace */
- path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
- i = len1; j = len2;
- q = dpcell[j] + i;
- s = last + len1;
- max = s->M; type = q->Mt; ctype = FROM_M;
- if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
- if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
-
- p = path;
- p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
- ++p;
- do {
- switch (ctype) {
- case FROM_M: --i; --j; break;
- case FROM_I: --j; break;
- case FROM_D: --i; break;
- }
- q = dpcell[j] + i;
- ctype = type;
- switch (type) {
- case FROM_M: type = q->Mt; break;
- case FROM_I: type = q->It; break;
- case FROM_D: type = q->Dt; break;
- }
- p->ctype = ctype; p->i = i; p->j = j;
- ++p;
- } while (i || j);
- cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
- free(path);
- }
-
- /* free memory */
- for (j = b2 + 1; j <= len2; ++j)
- dpcell[j] += j - b2;
- for (j = 0; j <= len2; ++j)
- free(dpcell[j]);
- free(dpcell);
- free(curr); free(last);
-
- return cigar;
-}
-
-typedef struct {
- int M, I, D;
-} score_aux_t;
-
-#define MINUS_INF -0x40000000
-
-// matrix: len2 rows and len1 columns
-int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap)
-{
-
-#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \
- int t1, t2; \
- score_aux_t *_q; \
- _q = _q0; \
- _p->M = _q->M >= _q->I? _q->M : _q->I; \
- _p->M = _p->M >= _q->D? _p->M : _q->D; \
- _p->M += (_sc); \
- ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \
- _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \
- }
-
- int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret;
- const uint8_t *seq1, *seq2;
- score_aux_t *curr, *last, *swap;
- bw = abs(len1 - len2) + ap->band_width;
- i = len1 > len2? len1 : len2;
- if (bw > i + 1) bw = i + 1;
- seq1 = _seq1 - 1; seq2 = _seq2 - 1;
- curr = calloc(len1 + 2, sizeof(score_aux_t));
- last = calloc(len1 + 2, sizeof(score_aux_t));
- { // the zero-th row
- int x, end = len1;
- score_aux_t *p;
- j = 0;
- x = j + bw; end = len1 < x? len1 : x; // band end
- p = curr;
- p->M = 0; p->I = p->D = MINUS_INF;
- for (i = 1, p = &curr[1]; i <= end; ++i, ++p)
- p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i);
- p->M = p->I = p->D = MINUS_INF;
- swap = curr; curr = last; last = swap;
- }
- for (j = 1; j < len2; ++j) {
- int x, beg = 0, end = len1, *scrow, col_end;
- score_aux_t *p;
- x = j - bw; beg = 0 > x? 0 : x; // band start
- x = j + bw; end = len1 < x? len1 : x; // band end
- if (beg == 0) { // from zero-th column
- p = curr;
- p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
- ++beg; // then beg = 1
- }
- scrow = scmat + seq2[j] * scmat_size;
- if (end == len1) col_end = 1, --end;
- else col_end = 0;
- for (i = beg, p = &curr[beg]; i <= end; ++i, ++p)
- __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide);
- if (col_end) {
- __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide);
- ++p;
- }
- p->M = p->I = p->D = MINUS_INF;
-// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
- swap = curr; curr = last; last = swap;
- }
- { // the last row
- int x, beg = 0, *scrow;
- score_aux_t *p;
- j = len2;
- x = j - bw; beg = 0 > x? 0 : x; // band start
- if (beg == 0) { // from zero-th column
- p = curr;
- p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
- ++beg; // then beg = 1
- }
- scrow = scmat + seq2[j] * scmat_size;
- for (i = beg, p = &curr[beg]; i < len1; ++i, ++p)
- __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede);
- __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede);
-// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
- }
- ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I;
- ret = ret >= curr[len1].D? ret : curr[len1].D;
- free(curr); free(last);
- return ret;
-}
-
-#ifdef _MAIN
-int main(int argc, char *argv[])
-{
-// int len1 = 35, len2 = 35;
-// uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1";
-// uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0";
- int len1 = 4, len2 = 4;
- uint8_t *seq1 = (uint8_t*)"\1\0\0\1";
- uint8_t *seq2 = (uint8_t*)"\1\0\1\0";
- int sc;
-// ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0);
- sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual);
- printf("%d\n", sc);
- return 0;
-}
-#endif
diff --git a/samtools/kaln.c.pysam.c b/samtools/kaln.c.pysam.c
deleted file mode 100644
index 1922cc1..0000000
--- a/samtools/kaln.c.pysam.c
+++ /dev/null
@@ -1,488 +0,0 @@
-#include "pysam.h"
-
-/* The MIT License
-
- Copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3lh3 at gmail.com>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-#include <math.h>
-#include "kaln.h"
-
-#define FROM_M 0
-#define FROM_I 1
-#define FROM_D 2
-
-typedef struct {
- int i, j;
- unsigned char ctype;
-} path_t;
-
-int aln_sm_blosum62[] = {
-/* A R N D C Q E G H I L K M F P S T W Y V * X */
- 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
- -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
- -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
- -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
- 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
- -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
- -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
- 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
- -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
- -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
- -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
- -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
- -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
- -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
- -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
- 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
- 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
- -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
- -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
- 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
- -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
- 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
-};
-
-int aln_sm_blast[] = {
- 1, -3, -3, -3, -2,
- -3, 1, -3, -3, -2,
- -3, -3, 1, -3, -2,
- -3, -3, -3, 1, -2,
- -2, -2, -2, -2, -2
-};
-
-int aln_sm_qual[] = {
- 0, -23, -23, -23, 0,
- -23, 0, -23, -23, 0,
- -23, -23, 0, -23, 0,
- -23, -23, -23, 0, 0,
- 0, 0, 0, 0, 0
-};
-
-ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 };
-ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 };
-
-ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 };
-
-static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
-{
- int i, n;
- uint32_t *cigar;
- unsigned char last_type;
-
- if (path_len == 0 || path == 0) {
- *n_cigar = 0;
- return 0;
- }
-
- last_type = path->ctype;
- for (i = n = 1; i < path_len; ++i) {
- if (last_type != path[i].ctype) ++n;
- last_type = path[i].ctype;
- }
- *n_cigar = n;
- cigar = (uint32_t*)calloc(*n_cigar, 4);
-
- cigar[0] = 1u << 4 | path[path_len-1].ctype;
- last_type = path[path_len-1].ctype;
- for (i = path_len - 2, n = 0; i >= 0; --i) {
- if (path[i].ctype == last_type) cigar[n] += 1u << 4;
- else {
- cigar[++n] = 1u << 4 | path[i].ctype;
- last_type = path[i].ctype;
- }
- }
-
- return cigar;
-}
-
-/***************************/
-/* START OF common_align.c */
-/***************************/
-
-#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
-
-#define set_M(MM, cur, p, sc) \
-{ \
- if ((p)->M >= (p)->I) { \
- if ((p)->M >= (p)->D) { \
- (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
- } else { \
- (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
- } \
- } else { \
- if ((p)->I > (p)->D) { \
- (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
- } else { \
- (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
- } \
- } \
-}
-#define set_I(II, cur, p) \
-{ \
- if ((p)->M - gap_open > (p)->I) { \
- (cur)->It = FROM_M; \
- (II) = (p)->M - gap_open - gap_ext; \
- } else { \
- (cur)->It = FROM_I; \
- (II) = (p)->I - gap_ext; \
- } \
-}
-#define set_end_I(II, cur, p) \
-{ \
- if (gap_end_ext >= 0) { \
- if ((p)->M - gap_end_open > (p)->I) { \
- (cur)->It = FROM_M; \
- (II) = (p)->M - gap_end_open - gap_end_ext; \
- } else { \
- (cur)->It = FROM_I; \
- (II) = (p)->I - gap_end_ext; \
- } \
- } else set_I(II, cur, p); \
-}
-#define set_D(DD, cur, p) \
-{ \
- if ((p)->M - gap_open > (p)->D) { \
- (cur)->Dt = FROM_M; \
- (DD) = (p)->M - gap_open - gap_ext; \
- } else { \
- (cur)->Dt = FROM_D; \
- (DD) = (p)->D - gap_ext; \
- } \
-}
-#define set_end_D(DD, cur, p) \
-{ \
- if (gap_end_ext >= 0) { \
- if ((p)->M - gap_end_open > (p)->D) { \
- (cur)->Dt = FROM_M; \
- (DD) = (p)->M - gap_end_open - gap_end_ext; \
- } else { \
- (cur)->Dt = FROM_D; \
- (DD) = (p)->D - gap_end_ext; \
- } \
- } else set_D(DD, cur, p); \
-}
-
-typedef struct {
- uint8_t Mt:3, It:2, Dt:3;
-} dpcell_t;
-
-typedef struct {
- int M, I, D;
-} dpscore_t;
-
-/***************************
- * banded global alignment *
- ***************************/
-uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
-{
- int i, j;
- dpcell_t **dpcell, *q;
- dpscore_t *curr, *last, *s;
- int b1, b2, tmp_end;
- int *mat, end, max = 0;
- uint8_t type, ctype;
- uint32_t *cigar = 0;
-
- int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
- int *score_matrix, N_MATRIX_ROW;
-
- /* initialize some align-related parameters. just for compatibility */
- gap_open = ap->gap_open;
- gap_ext = ap->gap_ext;
- gap_end_open = ap->gap_end_open;
- gap_end_ext = ap->gap_end_ext;
- b = ap->band_width;
- score_matrix = ap->matrix;
- N_MATRIX_ROW = ap->row;
-
- if (n_cigar) *n_cigar = 0;
- if (len1 == 0 || len2 == 0) return 0;
-
- /* calculate b1 and b2 */
- if (len1 > len2) {
- b1 = len1 - len2 + b;
- b2 = b;
- } else {
- b1 = b;
- b2 = len2 - len1 + b;
- }
- if (b1 > len1) b1 = len1;
- if (b2 > len2) b2 = len2;
- --seq1; --seq2;
-
- /* allocate memory */
- end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
- dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
- for (j = 0; j <= len2; ++j)
- dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
- for (j = b2 + 1; j <= len2; ++j)
- dpcell[j] -= j - b2;
- curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
- last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
-
- /* set first row */
- SET_INF(*curr); curr->M = 0;
- for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
- SET_INF(*s);
- set_end_D(s->D, dpcell[0] + i, s - 1);
- }
- s = curr; curr = last; last = s;
-
- /* core dynamic programming, part 1 */
- tmp_end = (b2 < len2)? b2 : len2 - 1;
- for (j = 1; j <= tmp_end; ++j) {
- q = dpcell[j]; s = curr; SET_INF(*s);
- set_end_I(s->I, q, last);
- end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- ++s; ++q;
- for (i = 1; i != end; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
- set_I(s->I, q, last + i);
- set_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_D(s->D, q, s - 1);
- if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
- set_end_I(s->I, q, last + i);
- } else s->I = MINOR_INF;
- s = curr; curr = last; last = s;
- }
- /* last row for part 1, use set_end_D() instead of set_D() */
- if (j == len2 && b2 != len2 - 1) {
- q = dpcell[j]; s = curr; SET_INF(*s);
- set_end_I(s->I, q, last);
- end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- ++s; ++q;
- for (i = 1; i != end; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
- set_I(s->I, q, last + i);
- set_end_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_end_D(s->D, q, s - 1);
- if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
- set_end_I(s->I, q, last + i);
- } else s->I = MINOR_INF;
- s = curr; curr = last; last = s;
- ++j;
- }
-
- /* core dynamic programming, part 2 */
- for (; j <= len2 - b2 + 1; ++j) {
- SET_INF(curr[j - b2]);
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- end = j + b1 - 1;
- for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_I(s->I, q, last + i);
- set_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_D(s->D, q, s - 1);
- s->I = MINOR_INF;
- s = curr; curr = last; last = s;
- }
-
- /* core dynamic programming, part 3 */
- for (; j < len2; ++j) {
- SET_INF(curr[j - b2]);
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_I(s->I, q, last + i);
- set_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
- set_end_I(s->I, q, last + i);
- set_D(s->D, q, s - 1);
- s = curr; curr = last; last = s;
- }
- /* last row */
- if (j == len2) {
- SET_INF(curr[j - b2]);
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_I(s->I, q, last + i);
- set_end_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
- set_end_I(s->I, q, last + i);
- set_end_D(s->D, q, s - 1);
- s = curr; curr = last; last = s;
- }
-
- *_score = last[len1].M;
- if (n_cigar) { /* backtrace */
- path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
- i = len1; j = len2;
- q = dpcell[j] + i;
- s = last + len1;
- max = s->M; type = q->Mt; ctype = FROM_M;
- if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
- if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
-
- p = path;
- p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
- ++p;
- do {
- switch (ctype) {
- case FROM_M: --i; --j; break;
- case FROM_I: --j; break;
- case FROM_D: --i; break;
- }
- q = dpcell[j] + i;
- ctype = type;
- switch (type) {
- case FROM_M: type = q->Mt; break;
- case FROM_I: type = q->It; break;
- case FROM_D: type = q->Dt; break;
- }
- p->ctype = ctype; p->i = i; p->j = j;
- ++p;
- } while (i || j);
- cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
- free(path);
- }
-
- /* free memory */
- for (j = b2 + 1; j <= len2; ++j)
- dpcell[j] += j - b2;
- for (j = 0; j <= len2; ++j)
- free(dpcell[j]);
- free(dpcell);
- free(curr); free(last);
-
- return cigar;
-}
-
-typedef struct {
- int M, I, D;
-} score_aux_t;
-
-#define MINUS_INF -0x40000000
-
-// matrix: len2 rows and len1 columns
-int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap)
-{
-
-#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \
- int t1, t2; \
- score_aux_t *_q; \
- _q = _q0; \
- _p->M = _q->M >= _q->I? _q->M : _q->I; \
- _p->M = _p->M >= _q->D? _p->M : _q->D; \
- _p->M += (_sc); \
- ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \
- _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \
- }
-
- int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret;
- const uint8_t *seq1, *seq2;
- score_aux_t *curr, *last, *swap;
- bw = abs(len1 - len2) + ap->band_width;
- i = len1 > len2? len1 : len2;
- if (bw > i + 1) bw = i + 1;
- seq1 = _seq1 - 1; seq2 = _seq2 - 1;
- curr = calloc(len1 + 2, sizeof(score_aux_t));
- last = calloc(len1 + 2, sizeof(score_aux_t));
- { // the zero-th row
- int x, end = len1;
- score_aux_t *p;
- j = 0;
- x = j + bw; end = len1 < x? len1 : x; // band end
- p = curr;
- p->M = 0; p->I = p->D = MINUS_INF;
- for (i = 1, p = &curr[1]; i <= end; ++i, ++p)
- p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i);
- p->M = p->I = p->D = MINUS_INF;
- swap = curr; curr = last; last = swap;
- }
- for (j = 1; j < len2; ++j) {
- int x, beg = 0, end = len1, *scrow, col_end;
- score_aux_t *p;
- x = j - bw; beg = 0 > x? 0 : x; // band start
- x = j + bw; end = len1 < x? len1 : x; // band end
- if (beg == 0) { // from zero-th column
- p = curr;
- p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
- ++beg; // then beg = 1
- }
- scrow = scmat + seq2[j] * scmat_size;
- if (end == len1) col_end = 1, --end;
- else col_end = 0;
- for (i = beg, p = &curr[beg]; i <= end; ++i, ++p)
- __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide);
- if (col_end) {
- __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide);
- ++p;
- }
- p->M = p->I = p->D = MINUS_INF;
-// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
- swap = curr; curr = last; last = swap;
- }
- { // the last row
- int x, beg = 0, *scrow;
- score_aux_t *p;
- j = len2;
- x = j - bw; beg = 0 > x? 0 : x; // band start
- if (beg == 0) { // from zero-th column
- p = curr;
- p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
- ++beg; // then beg = 1
- }
- scrow = scmat + seq2[j] * scmat_size;
- for (i = beg, p = &curr[beg]; i < len1; ++i, ++p)
- __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede);
- __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede);
-// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
- }
- ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I;
- ret = ret >= curr[len1].D? ret : curr[len1].D;
- free(curr); free(last);
- return ret;
-}
-
-#ifdef _MAIN
-int main(int argc, char *argv[])
-{
-// int len1 = 35, len2 = 35;
-// uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1";
-// uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0";
- int len1 = 4, len2 = 4;
- uint8_t *seq1 = (uint8_t*)"\1\0\0\1";
- uint8_t *seq2 = (uint8_t*)"\1\0\1\0";
- int sc;
-// ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0);
- sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual);
- printf("%d\n", sc);
- return 0;
-}
-#endif
diff --git a/samtools/kaln.h b/samtools/kaln.h
deleted file mode 100644
index 8f4a2c6..0000000
--- a/samtools/kaln.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* The MIT License
-
- Copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3 at live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#ifndef LH3_KALN_H_
-#define LH3_KALN_H_
-
-#include <stdint.h>
-
-#define MINOR_INF -1073741823
-
-typedef struct {
- int gap_open;
- int gap_ext;
- int gap_end_open;
- int gap_end_ext;
-
- int *matrix;
- int row;
- int band_width;
-} ka_param_t;
-
-typedef struct {
- int iio, iie, ido, ide;
- int eio, eie, edo, ede;
- int *matrix;
- int row;
- int band_width;
-} ka_param2_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap,
- int *_score, int *n_cigar);
- int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap);
-#ifdef __cplusplus
-}
-#endif
-
-extern ka_param_t ka_param_blast; /* = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; */
-extern ka_param_t ka_param_qual; // only use this for global alignment!!!
-extern ka_param2_t ka_param2_qual; // only use this for global alignment!!!
-
-#endif
diff --git a/samtools/misc/ace2sam.c b/samtools/misc/ace2sam.c
index 078830a..24b6933 100644
--- a/samtools/misc/ace2sam.c
+++ b/samtools/misc/ace2sam.c
@@ -109,7 +109,7 @@ int main(int argc, char *argv[])
if (t[1].s[i] != '*') ++k;
}
// write out the SAM header and contig sequences
- fprintf(stderr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[is_padded?1:2].l); // The SAM header line
+ fprintf(stderr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line
cns = &t[is_padded?1:2];
fprintf(stderr, "S >%s\n", t[0].s);
for (i = 0; i < cns->l; i += LINE_LEN) {
diff --git a/samtools/misc/ace2sam.c.pysam.c b/samtools/misc/ace2sam.c.pysam.c
index 53c82dc..a7f92e2 100644
--- a/samtools/misc/ace2sam.c.pysam.c
+++ b/samtools/misc/ace2sam.c.pysam.c
@@ -111,7 +111,7 @@ int main(int argc, char *argv[])
if (t[1].s[i] != '*') ++k;
}
// write out the SAM header and contig sequences
- fprintf(pysamerr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[is_padded?1:2].l); // The SAM header line
+ fprintf(pysamerr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line
cns = &t[is_padded?1:2];
fprintf(pysamerr, "S >%s\n", t[0].s);
for (i = 0; i < cns->l; i += LINE_LEN) {
diff --git a/samtools/padding.c b/samtools/padding.c
index 89916ed..ea1c933 100644
--- a/samtools/padding.c
+++ b/samtools/padding.c
@@ -196,7 +196,7 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
r_tid = b->core.tid;
unpad_seq(b, &r);
if (h->target_len[r_tid] != r.l) {
- fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %d in BAM header, but %ld in embedded reference\n", bam1_qname(b), h->target_len[r_tid], r.l);
+ fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam1_qname(b), h->target_len[r_tid], (unsigned long long)(r.l));
return -1;
}
if (fai) {
diff --git a/samtools/padding.c.pysam.c b/samtools/padding.c.pysam.c
index 25ed0f4..562ceba 100644
--- a/samtools/padding.c.pysam.c
+++ b/samtools/padding.c.pysam.c
@@ -198,7 +198,7 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
r_tid = b->core.tid;
unpad_seq(b, &r);
if (h->target_len[r_tid] != r.l) {
- fprintf(pysamerr, "[depad] ERROR: (Padded) length of '%s' is %d in BAM header, but %ld in embedded reference\n", bam1_qname(b), h->target_len[r_tid], r.l);
+ fprintf(pysamerr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam1_qname(b), h->target_len[r_tid], (unsigned long long)(r.l));
return -1;
}
if (fai) {
diff --git a/samtools/sam.c b/samtools/sam.c
index 61c7b3e..9f5f6a0 100644
--- a/samtools/sam.c
+++ b/samtools/sam.c
@@ -30,7 +30,7 @@ DEALINGS IN THE SOFTWARE. */
int samthreads(samfile_t *fp, int n_threads, int n_sub_blks)
{
- if (!fp->file->is_bin || !fp->file->is_write) return -1;
+ if (hts_get_format(fp->file)->format != bam || !fp->is_write) return -1;
bgzf_mt(fp->x.bam, n_threads, n_sub_blks);
return 0;
}
@@ -47,12 +47,14 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux)
if (strchr(mode, 'r')) {
if (aux) hts_set_fai_filename(fp->file, aux);
fp->header = sam_hdr_read(fp->file); // samclose() will free this
+ fp->is_write = 0;
if (fp->header->n_targets == 0 && bam_verbose >= 1)
fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
}
else {
fp->header = (bam_hdr_t *)aux; // For writing, we won't free it
- if (fp->file->is_bin || fp->file->is_cram || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header);
+ fp->is_write = 1;
+ if (hts_get_format(fp->file)->format != sam || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header);
}
return fp;
@@ -61,7 +63,7 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux)
void samclose(samfile_t *fp)
{
if (fp) {
- if (!fp->file->is_write && fp->header) bam_hdr_destroy(fp->header);
+ if (!fp->is_write && fp->header) bam_hdr_destroy(fp->header);
sam_close(fp->file);
free(fp);
}
diff --git a/samtools/sam.c.pysam.c b/samtools/sam.c.pysam.c
index bfa4fc0..3a2d860 100644
--- a/samtools/sam.c.pysam.c
+++ b/samtools/sam.c.pysam.c
@@ -32,7 +32,7 @@ DEALINGS IN THE SOFTWARE. */
int samthreads(samfile_t *fp, int n_threads, int n_sub_blks)
{
- if (!fp->file->is_bin || !fp->file->is_write) return -1;
+ if (hts_get_format(fp->file)->format != bam || !fp->is_write) return -1;
bgzf_mt(fp->x.bam, n_threads, n_sub_blks);
return 0;
}
@@ -49,12 +49,14 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux)
if (strchr(mode, 'r')) {
if (aux) hts_set_fai_filename(fp->file, aux);
fp->header = sam_hdr_read(fp->file); // samclose() will free this
+ fp->is_write = 0;
if (fp->header->n_targets == 0 && bam_verbose >= 1)
fprintf(pysamerr, "[samopen] no @SQ lines in the header.\n");
}
else {
fp->header = (bam_hdr_t *)aux; // For writing, we won't free it
- if (fp->file->is_bin || fp->file->is_cram || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header);
+ fp->is_write = 1;
+ if (hts_get_format(fp->file)->format != sam || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header);
}
return fp;
@@ -63,7 +65,7 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux)
void samclose(samfile_t *fp)
{
if (fp) {
- if (!fp->file->is_write && fp->header) bam_hdr_destroy(fp->header);
+ if (!fp->is_write && fp->header) bam_hdr_destroy(fp->header);
sam_close(fp->file);
free(fp);
}
diff --git a/samtools/sam.h b/samtools/sam.h
index 39da006..e642920 100644
--- a/samtools/sam.h
+++ b/samtools/sam.h
@@ -1,6 +1,6 @@
/* sam.h -- format-neutral SAM/BAM API.
- Copyright (C) 2009, 2013 Genome Research Ltd.
+ Copyright (C) 2009, 2013, 2014 Genome Research Ltd.
Author: Heng Li <lh3 at sanger.ac.uk>
@@ -50,6 +50,7 @@ typedef struct {
samFile *file;
struct { BGZF *bam; } x; // Hack so that fp->x.bam still works
bam_hdr_t *header;
+ short is_write:1;
} samfile_t;
#ifdef __cplusplus
diff --git a/samtools/sam_view.c b/samtools/sam_view.c
index e2a4420..55e7e3d 100644
--- a/samtools/sam_view.c
+++ b/samtools/sam_view.c
@@ -353,17 +353,29 @@ int main_samview(int argc, char *argv[])
goto view_end;
}
if (fn_list) hts_set_fai_filename(out, fn_list);
- if (*out_format || is_header) sam_hdr_write(out, header);
+ if (*out_format || is_header) {
+ if (sam_hdr_write(out, header) != 0) {
+ fprintf(stderr, "[main_samview] failed to write the SAM header\n");
+ ret = 1;
+ goto view_end;
+ }
+ }
if (fn_un_out) {
if ((un_out = sam_open(fn_un_out, out_mode)) == 0) {
print_error_errno("failed to open \"%s\" for writing", fn_un_out);
ret = 1;
goto view_end;
}
- if (*out_format || is_header) sam_hdr_write(un_out, header);
+ if (*out_format || is_header) {
+ if (sam_hdr_write(un_out, header) != 0) {
+ fprintf(stderr, "[main_samview] failed to write the SAM header\n");
+ ret = 1;
+ goto view_end;
+ }
+ }
}
}
- if (n_threads > 1) { hts_set_threads(out, n_threads); }
+ if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); }
if (is_header_only) goto view_end; // no need to print alignments
if (argc == optind + 1) { // convert/print the entire file
@@ -572,6 +584,15 @@ int main_bam2fq(int argc, char *argv[])
print_error_errno("Cannot read file \"%s\"", argv[optind]);
return 1;
}
+ if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS,
+ SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL)) {
+ fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
+ return 1;
+ }
+ if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) {
+ fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ return 1;
+ }
fpse = NULL;
if (fnse) {
fpse = fopen(fnse,"w");
diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c
index 9ae5ed6..34840b9 100644
--- a/samtools/sam_view.c.pysam.c
+++ b/samtools/sam_view.c.pysam.c
@@ -355,17 +355,29 @@ int main_samview(int argc, char *argv[])
goto view_end;
}
if (fn_list) hts_set_fai_filename(out, fn_list);
- if (*out_format || is_header) sam_hdr_write(out, header);
+ if (*out_format || is_header) {
+ if (sam_hdr_write(out, header) != 0) {
+ fprintf(pysamerr, "[main_samview] failed to write the SAM header\n");
+ ret = 1;
+ goto view_end;
+ }
+ }
if (fn_un_out) {
if ((un_out = sam_open(fn_un_out, out_mode)) == 0) {
print_error_errno("failed to open \"%s\" for writing", fn_un_out);
ret = 1;
goto view_end;
}
- if (*out_format || is_header) sam_hdr_write(un_out, header);
+ if (*out_format || is_header) {
+ if (sam_hdr_write(un_out, header) != 0) {
+ fprintf(pysamerr, "[main_samview] failed to write the SAM header\n");
+ ret = 1;
+ goto view_end;
+ }
+ }
}
}
- if (n_threads > 1) { hts_set_threads(out, n_threads); }
+ if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); }
if (is_header_only) goto view_end; // no need to print alignments
if (argc == optind + 1) { // convert/print the entire file
@@ -574,6 +586,15 @@ int main_bam2fq(int argc, char *argv[])
print_error_errno("Cannot read file \"%s\"", argv[optind]);
return 1;
}
+ if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS,
+ SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL)) {
+ fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
+ return 1;
+ }
+ if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) {
+ fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ return 1;
+ }
fpse = NULL;
if (fnse) {
fpse = fopen(fnse,"w");
diff --git a/samtools/stats.c b/samtools/stats.c
index 2eab477..fe43e71 100644
--- a/samtools/stats.c
+++ b/samtools/stats.c
@@ -355,10 +355,8 @@ int unclipped_length(bam1_t *bam_line)
return read_len;
}
-void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
+void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len)
{
- int read_len = unclipped_length(bam_line);
- if ( read_len >= stats->nbases ) realloc_buffers(stats,read_len);
int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
int icig,iread=0,icycle=0;
int iref = bam_line->core.pos - stats->rseq_pos;
@@ -428,7 +426,7 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
int idx = is_fwd ? icycle : read_len-icycle-1;
if ( idx>stats->max_len )
- error("mpc: %d>%d\n",idx,stats->max_len);
+ error("mpc: %d>%d (%s %d %s)\n",idx,stats->max_len,stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
idx = idx*stats->nquals + qual;
if ( idx>=stats->nquals*stats->nbases )
@@ -645,12 +643,13 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
int seq_len = bam_line->core.l_qseq;
if ( !seq_len ) return;
- if ( seq_len >= stats->nbases )
- realloc_buffers(stats,seq_len);
- if ( stats->max_len<seq_len )
- stats->max_len = seq_len;
+ int read_len = unclipped_length(bam_line);
+ if ( read_len >= stats->nbases )
+ realloc_buffers(stats,read_len);
+ if ( stats->max_len<read_len )
+ stats->max_len = read_len;
- stats->read_lengths[seq_len]++;
+ stats->read_lengths[read_len]++;
// Count GC and ACGT per cycle. Note that cycle is approximate, clipping is ignored
uint8_t base, *seq = bam_get_seq(bam_line);
@@ -850,7 +849,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
}
- count_mismatches_per_cycle(stats,bam_line);
+ count_mismatches_per_cycle(stats,bam_line,read_len);
}
// No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
@@ -1053,7 +1052,7 @@ void output_stats(stats_t *stats, int sparse)
if ( ! sum ) continue;
printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
}
- printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
+ printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
for (isize=0; isize<ibulk; isize++) {
long in = (long)(stats->isize->inward(stats->isize->data, isize));
long out = (long)(stats->isize->outward(stats->isize->data, isize));
diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c
index 507c604..a7ea9e0 100644
--- a/samtools/stats.c.pysam.c
+++ b/samtools/stats.c.pysam.c
@@ -357,10 +357,8 @@ int unclipped_length(bam1_t *bam_line)
return read_len;
}
-void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
+void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len)
{
- int read_len = unclipped_length(bam_line);
- if ( read_len >= stats->nbases ) realloc_buffers(stats,read_len);
int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
int icig,iread=0,icycle=0;
int iref = bam_line->core.pos - stats->rseq_pos;
@@ -430,7 +428,7 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
int idx = is_fwd ? icycle : read_len-icycle-1;
if ( idx>stats->max_len )
- error("mpc: %d>%d\n",idx,stats->max_len);
+ error("mpc: %d>%d (%s %d %s)\n",idx,stats->max_len,stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line));
idx = idx*stats->nquals + qual;
if ( idx>=stats->nquals*stats->nbases )
@@ -647,12 +645,13 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
int seq_len = bam_line->core.l_qseq;
if ( !seq_len ) return;
- if ( seq_len >= stats->nbases )
- realloc_buffers(stats,seq_len);
- if ( stats->max_len<seq_len )
- stats->max_len = seq_len;
+ int read_len = unclipped_length(bam_line);
+ if ( read_len >= stats->nbases )
+ realloc_buffers(stats,read_len);
+ if ( stats->max_len<read_len )
+ stats->max_len = read_len;
- stats->read_lengths[seq_len]++;
+ stats->read_lengths[read_len]++;
// Count GC and ACGT per cycle. Note that cycle is approximate, clipping is ignored
uint8_t base, *seq = bam_get_seq(bam_line);
@@ -852,7 +851,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
}
- count_mismatches_per_cycle(stats,bam_line);
+ count_mismatches_per_cycle(stats,bam_line,read_len);
}
// No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
@@ -1055,7 +1054,7 @@ void output_stats(stats_t *stats, int sparse)
if ( ! sum ) continue;
printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
}
- printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
+ printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
for (isize=0; isize<ibulk; isize++) {
long in = (long)(stats->isize->inward(stats->isize->data, isize));
long out = (long)(stats->isize->outward(stats->isize->data, isize));
diff --git a/samtools/test/merge/test_rtrans_build.c b/samtools/test/merge/test_rtrans_build.c
index d3fbbb3..df50921 100644
--- a/samtools/test/merge/test_rtrans_build.c
+++ b/samtools/test/merge/test_rtrans_build.c
@@ -76,8 +76,8 @@ int main(int argc, char**argv)
break;
}
}
- const long GIMIC_SEED = 0x1234abcd330e;
- srand48(GIMIC_SEED);
+ const long GIMMICK_SEED = 0x1234330e;
+ srand48(GIMMICK_SEED);
if (verbose) printf("BEGIN test 1\n");
// setup
diff --git a/samtools/test/merge/test_rtrans_build.c.pysam.c b/samtools/test/merge/test_rtrans_build.c.pysam.c
index ad7f36a..fcbc458 100644
--- a/samtools/test/merge/test_rtrans_build.c.pysam.c
+++ b/samtools/test/merge/test_rtrans_build.c.pysam.c
@@ -78,8 +78,8 @@ int main(int argc, char**argv)
break;
}
}
- const long GIMIC_SEED = 0x1234abcd330e;
- srand48(GIMIC_SEED);
+ const long GIMMICK_SEED = 0x1234330e;
+ srand48(GIMMICK_SEED);
if (verbose) printf("BEGIN test 1\n");
// setup
diff --git a/samtools/test/merge/test_trans_tbl_init.c b/samtools/test/merge/test_trans_tbl_init.c
index 2a18e2f..64b9786 100644
--- a/samtools/test/merge/test_trans_tbl_init.c
+++ b/samtools/test/merge/test_trans_tbl_init.c
@@ -320,7 +320,7 @@ int main(int argc, char**argv)
}
// Set the seed to a fixed value so that calls to lrand48 within functions return predictable values
- const long GIMMICK_SEED = 0x1234abcd330e;
+ const long GIMMICK_SEED = 0x1234330e;
srand48(GIMMICK_SEED);
bam_hdr_t* out;
diff --git a/samtools/test/merge/test_trans_tbl_init.c.pysam.c b/samtools/test/merge/test_trans_tbl_init.c.pysam.c
index 2c69e21..594bf2c 100644
--- a/samtools/test/merge/test_trans_tbl_init.c.pysam.c
+++ b/samtools/test/merge/test_trans_tbl_init.c.pysam.c
@@ -322,7 +322,7 @@ int main(int argc, char**argv)
}
// Set the seed to a fixed value so that calls to lrand48 within functions return predictable values
- const long GIMMICK_SEED = 0x1234abcd330e;
+ const long GIMMICK_SEED = 0x1234330e;
srand48(GIMMICK_SEED);
bam_hdr_t* out;
diff --git a/samtools/version.h b/samtools/version.h
index 4558007..64eb542 100644
--- a/samtools/version.h
+++ b/samtools/version.h
@@ -1 +1 @@
-#define SAMTOOLS_VERSION "1.1"
+#define SAMTOOLS_VERSION "1.2"
diff --git a/setup.py b/setup.py
index 45fb8bc..8c0a132 100644
--- a/setup.py
+++ b/setup.py
@@ -40,18 +40,29 @@ import version
version = version.__version__
# exclude sources that contains a main function
-samtools_exclude = ("bamtk.c", "razip.c", "bgzip.c",
- "main.c", "calDepth.c", "bam2bed.c",
- "wgsim.c", "md5fa.c", "maq2sam.c",
+samtools_exclude = ("bamtk.c",
+ "razip.c",
+ "bgzip.c",
+ "main.c",
+ "calDepth.c",
+ "bam2bed.c",
+ "wgsim.c",
+ "md5fa.c",
+ "maq2sam.c",
"bamcheck.c",
"chk_indel.c",
- "vcf-miniview.c")
-htslib_exclude = ('htslib/tabix.c', 'htslib/bgzip.c')
-tabix_exclude = ("main.c",)
+ "vcf-miniview.c",
+ "htslib-1.2.1", # do not import twice
+ "hfile_irods.c", # requires irods library
+ )
+
+htslib_exclude = ('htslib/tabix.c',
+ 'htslib/bgzip.c',
+ 'htslib/htsfile.c',
+ 'htslib/hfile_irods.c')
# destination directories for import of samtools and tabix
samtools_dest = os.path.abspath("samtools")
-tabix_dest = os.path.abspath("tabix")
if HTSLIB_LIBRARY_DIR:
# linking against a shared, externally installed htslib version, no
@@ -93,7 +104,8 @@ else:
def locate(pattern, root=os.curdir):
'''Locate all files matching supplied filename pattern in and below
- supplied root directory.'''
+ supplied root directory.
+ '''
for path, dirs, files in os.walk(os.path.abspath(root)):
for filename in fnmatch.filter(files, pattern):
yield os.path.join(path, filename)
@@ -137,58 +149,68 @@ if len(sys.argv) >= 2 and sys.argv[1] == "import":
if len(sys.argv) < 3:
raise ValueError("missing PATH to samtools source directory")
- for destdir, srcdir, exclude in zip(
- (samtools_dest,),
- sys.argv[2:3],
- (samtools_exclude,)):
-
- srcdir = os.path.abspath(srcdir)
- if not os.path.exists(srcdir):
- raise IOError("source directory `%s` does not exist." % srcdir)
-
- cfiles = locate("*.c", srcdir)
- hfiles = locate("*.h", srcdir)
- ncopied = 0
-
- def _compareAndCopy(src, srcdir, destdir, exclude):
-
- d, f = os.path.split(src)
- if f in exclude:
- return None
- common_prefix = os.path.commonprefix((d, srcdir))
- subdir = re.sub(common_prefix, "", d)[1:]
- targetdir = os.path.join(destdir, subdir)
- if not os.path.exists(targetdir):
- os.makedirs(targetdir)
- old_file = os.path.join(targetdir, f)
- if os.path.exists(old_file):
- md5_old = hashlib.md5(
- "".join(open(old_file, "r").readlines())).digest()
- md5_new = hashlib.md5(
- "".join(open(src, "r").readlines())).digest()
- if md5_old != md5_new:
- raise ValueError(
- "incompatible files for %s and %s" % (old_file, src))
-
- shutil.copy(src, targetdir)
- return old_file
-
- for src_file in hfiles:
- _compareAndCopy(src_file, srcdir, destdir, exclude)
- ncopied += 1
-
- cf = []
- for src_file in cfiles:
- cf.append(_compareAndCopy(src_file, srcdir, destdir, exclude))
- ncopied += 1
-
- sys.stdout.write(
- "installed latest source code from %s: "
- "%i files copied\n" % (srcdir, ncopied))
- # redirect stderr to pysamerr and replace bam.h with a stub.
- sys.stdout.write("applying stderr redirection\n")
-
- _update_pysam_files(cf, destdir)
+ destdir = samtools_dest
+ srcdir = sys.argv[2]
+ exclude = samtools_exclude
+
+ srcdir = os.path.abspath(srcdir)
+ if not os.path.exists(srcdir):
+ raise IOError(
+ "source directory `%s` does not exist." % srcdir)
+
+ cfiles = locate("*.c", srcdir)
+ hfiles = locate("*.h", srcdir)
+
+ # remove unwanted files and htslib subdirectory.
+ cfiles = [x for x in cfiles if os.path.basename(x) not in exclude
+ and not re.search("htslib-", x)]
+
+ hfiles = [x for x in hfiles if os.path.basename(x) not in exclude
+ and not re.search("htslib-", x)]
+
+ ncopied = 0
+
+ def _compareAndCopy(src, srcdir, destdir, exclude):
+
+ d, f = os.path.split(src)
+ common_prefix = os.path.commonprefix((d, srcdir))
+ subdir = re.sub(common_prefix, "", d)[1:]
+ targetdir = os.path.join(destdir, subdir)
+ if not os.path.exists(targetdir):
+ os.makedirs(targetdir)
+ old_file = os.path.join(targetdir, f)
+ if os.path.exists(old_file):
+ md5_old = hashlib.md5(
+ "".join(open(old_file, "r").readlines())).digest()
+ md5_new = hashlib.md5(
+ "".join(open(src, "r").readlines())).digest()
+ if md5_old != md5_new:
+ raise ValueError(
+ "incompatible files for %s and %s" %
+ (old_file, src))
+
+ shutil.copy(src, targetdir)
+ return old_file
+
+ for src_file in hfiles:
+ _compareAndCopy(src_file, srcdir, destdir, exclude)
+ ncopied += 1
+
+ cf = []
+ for src_file in cfiles:
+ cf.append(_compareAndCopy(src_file,
+ srcdir,
+ destdir,
+ exclude))
+ ncopied += 1
+
+ sys.stdout.write(
+ "installed latest source code from %s: "
+ "%i files copied\n" % (srcdir, ncopied))
+ # redirect stderr to pysamerr and replace bam.h with a stub.
+ sys.stdout.write("applying stderr redirection\n")
+
+ _update_pysam_files(cf, destdir)
sys.exit(0)
@@ -197,7 +219,7 @@ if len(sys.argv) >= 2 and sys.argv[1] == "refresh":
sys.stdout.write("refreshing latest source code from .c to .pysam.c")
# redirect stderr to pysamerr and replace bam.h with a stub.
sys.stdout.write("applying stderr redirection")
- for destdir in ('samtools', 'tabix'):
+ for destdir in ('samtools', ):
pysamcfiles = locate("*.pysam.c", destdir)
for f in pysamcfiles:
os.remove(f)
@@ -231,6 +253,7 @@ except ImportError:
calignmentfile_sources = ["pysam/calignmentfile.c"]
tabproxies_sources = ["pysam/TabProxies.c"]
cvcf_sources = ["pysam/cvcf.c"]
+ cbcf_sources = ["pysam/cbcf.c"]
else:
# remove existing files to recompute
# necessary to be both compatible for python 2.7 and 3.3
@@ -241,7 +264,9 @@ else:
"pysam/cfaidx.c",
"pysam/csamfile.c",
"pysam/TabProxies.c",
- "pysam/cvcf.c"):
+ "pysam/cvcf.c",
+ "pysam/bvcf.c",
+ ):
try:
os.unlink(f)
except:
@@ -256,6 +281,7 @@ else:
faidx_sources = ["pysam/cfaidx.pyx"]
tabproxies_sources = ["pysam/TabProxies.pyx"]
cvcf_sources = ["pysam/cvcf.pyx"]
+ cbcf_sources = ["pysam/cbcf.pyx"]
#######################################################
@@ -415,6 +441,22 @@ cvcf = Extension(
extra_compile_args=["-Wno-error=declaration-after-statement"],
)
+cbcf = Extension(
+ "pysam.cbcf",
+ cbcf_sources +
+ htslib_sources +
+ os_c_files,
+ library_dirs=htslib_library_dirs,
+ include_dirs=["htslib"] + include_os + htslib_include_dirs,
+ libraries=["z"] + htslib_libraries,
+ language="c",
+ extra_compile_args=[
+ "-Wno-error=declaration-after-statement",
+ "-DSAMTOOLS=1"],
+ define_macros=[('_FILE_OFFSET_BITS', '64'),
+ ('_USE_KNETFILE', '')]
+)
+
metadata = {
'name': name,
'version': version,
@@ -432,7 +474,7 @@ metadata = {
'pysam.include.samtools',
# 'pysam.include.samtools.bcftools',
'pysam.include.samtools.win32'],
- 'requires': ['cython (>=0.20.1)'],
+ 'requires': ['cython (>=0.21)'],
'ext_modules': [samtools,
htslib,
samfile,
@@ -440,6 +482,7 @@ metadata = {
tabix,
tabproxies,
cvcf,
+ cbcf,
faidx],
'cmdclass': cmdclass,
'package_dir': {'pysam': 'pysam',
diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py
new file mode 100644
index 0000000..2f096c2
--- /dev/null
+++ b/tests/AlignedSegment_test.py
@@ -0,0 +1,447 @@
+import os
+import pysam
+import unittest
+from TestUtils import checkFieldEqual
+
+SAMTOOLS = "samtools"
+WORKDIR = "pysam_test_work"
+DATADIR = "pysam_data"
+
+
+class ReadTest(unittest.TestCase):
+
+ def buildRead(self):
+ '''build an example read.'''
+
+ a = pysam.AlignedSegment()
+ a.query_name = "read_12345"
+ a.query_sequence = "ACGT" * 10
+ a.flag = 0
+ a.reference_id = 0
+ a.reference_start = 20
+ a.mapping_quality = 20
+ a.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
+ a.next_reference_id = 0
+ a.next_reference_start = 200
+ a.template_length = 167
+ a.query_qualities = pysam.fromQualityString("1234") * 10
+ # todo: create tags
+ return a
+
+
+class TestAlignedSegment(ReadTest):
+
+ '''tests to check if aligned read can be constructed
+ and manipulated.
+ '''
+
+ def testEmpty(self):
+ a = pysam.AlignedSegment()
+ self.assertEqual(a.query_name, None)
+ self.assertEqual(a.query_sequence, None)
+ self.assertEqual(pysam.toQualityString(a.query_qualities), None)
+ self.assertEqual(a.flag, 0)
+ self.assertEqual(a.reference_id, 0)
+ self.assertEqual(a.mapping_quality, 0)
+ self.assertEqual(a.cigartuples, None)
+ self.assertEqual(a.tags, [])
+ self.assertEqual(a.next_reference_id, 0)
+ self.assertEqual(a.next_reference_start, 0)
+ self.assertEqual(a.template_length, 0)
+
+ def testStrOfEmptyRead(self):
+ a = pysam.AlignedSegment()
+ s = str(a)
+ self.assertEqual(
+ "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]",
+ s)
+
+ def testSettingTagInEmptyRead(self):
+ '''see issue 62'''
+ a = pysam.AlignedSegment()
+ a.tags = (("NM", 1),)
+ a.query_qualities = None
+ self.assertEqual(a.tags, [("NM", 1), ])
+
+ def testCompare(self):
+ '''check comparison functions.'''
+ a = self.buildRead()
+ b = self.buildRead()
+
+ self.assertEqual(0, a.compare(b))
+ self.assertEqual(0, b.compare(a))
+ self.assertTrue(a == b)
+ self.assertTrue(b == a)
+ self.assertFalse(a != b)
+ self.assertFalse(b != a)
+
+ b.tid = 2
+ self.assertFalse(a == b)
+ self.assertFalse(b == a)
+ self.assertTrue(a != b)
+ self.assertTrue(b != a)
+
+ def testHashing(self):
+ a = self.buildRead()
+ b = self.buildRead()
+ self.assertEqual(hash(a), hash(b))
+ b.tid = 2
+ self.assertNotEqual(hash(a), hash(b))
+
+ def testUpdate(self):
+ '''check if updating fields affects other variable length data
+ '''
+ a = self.buildRead()
+ b = self.buildRead()
+
+ # check qname
+ b.query_name = "read_123"
+ checkFieldEqual(self, a, b, "query_name")
+ b.query_name = "read_12345678"
+ checkFieldEqual(self, a, b, "query_name")
+ b.query_name = "read_12345"
+ checkFieldEqual(self, a, b)
+
+ # check cigar
+ b.cigartuples = ((0, 10), )
+ checkFieldEqual(self, a, b, "cigartuples")
+ b.cigartuples = ((0, 10), (2, 1), (0, 10))
+ checkFieldEqual(self, a, b, "cigartuples")
+ b.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
+ checkFieldEqual(self, a, b)
+
+ # check seq
+ b.query_sequence = "ACGT"
+ checkFieldEqual(self,
+ a, b,
+ ("query_sequence", "query_qualities", "query_length"))
+ b.query_sequence = "ACGT" * 3
+ checkFieldEqual(self,
+ a, b,
+ ("query_sequence", "query_qualities", "query_length"))
+ b.query_sequence = "ACGT" * 10
+ checkFieldEqual(self, a, b, ("query_qualities",))
+
+ # reset qual
+ b = self.buildRead()
+
+ # check flags:
+ for x in (
+ "is_paired", "is_proper_pair",
+ "is_unmapped", "mate_is_unmapped",
+ "is_reverse", "mate_is_reverse",
+ "is_read1", "is_read2",
+ "is_secondary", "is_qcfail",
+ "is_duplicate", "is_supplementary"):
+ setattr(b, x, True)
+ self.assertEqual(getattr(b, x), True)
+ checkFieldEqual(self, a, b, ("flag", x,))
+ setattr(b, x, False)
+ self.assertEqual(getattr(b, x), False)
+ checkFieldEqual(self, a, b)
+
+ def testUpdate2(self):
+ '''issue 135: inplace update of sequence and quality score.
+
+ This does not work as setting the sequence will erase
+ the quality scores.
+ '''
+ a = self.buildRead()
+ a.query_sequence = a.query_sequence[5:10]
+ self.assertEqual(pysam.toQualityString(a.query_qualities), None)
+
+ a = self.buildRead()
+ s = pysam.toQualityString(a.query_qualities)
+ a.query_sequence = a.query_sequence[5:10]
+ a.query_qualities = pysam.fromQualityString(s[5:10])
+
+ self.assertEqual(pysam.toQualityString(a.query_qualities), s[5:10])
+
+ def testLargeRead(self):
+ '''build a large example read (800 bases).'''
+
+ a = pysam.AlignedSegment()
+ a.query_name = "read_12345"
+ a.query_sequence = "ACGT" * 200
+ a.flag = 0
+ a.reference_id = 0
+ a.reference_start = 20
+ a.mapping_quality = 20
+ a.cigartuples = ((0, 4 * 200), )
+ a.next_reference_id = 0
+ a.next_reference_start = 200
+ a.template_length = 167
+ a.query_qualities = pysam.fromQualityString("1234") * 200
+
+ return a
+
+ def testUpdateTlen(self):
+ '''check if updating tlen works'''
+ a = self.buildRead()
+ oldlen = a.template_length
+ oldlen *= 2
+ a.template_length = oldlen
+ self.assertEqual(a.template_length, oldlen)
+
+ def testPositions(self):
+ a = self.buildRead()
+ self.assertEqual(a.get_reference_positions(),
+ [20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59])
+
+ self.assertEqual(a.get_aligned_pairs(),
+ [(0, 20), (1, 21), (2, 22), (3, 23), (4, 24),
+ (5, 25), (6, 26), (7, 27), (8, 28), (9, 29),
+ (None, 30),
+ (10, 31), (11, 32), (12, 33), (13, 34), (14, 35),
+ (15, 36), (16, 37), (17, 38), (18, 39), (19, None),
+ (20, 40), (21, 41), (22, 42), (23, 43), (24, 44),
+ (25, 45), (26, 46), (27, 47), (28, 48), (29, 49),
+ (30, 50), (31, 51), (32, 52), (33, 53), (34, 54),
+ (35, 55), (36, 56), (37, 57), (38, 58), (39, 59)])
+
+ self.assertEqual(
+ a.get_reference_positions(),
+ [x[1] for x in a.get_aligned_pairs()
+ if x[0] is not None and x[1] is not None])
+ # alen is the length of the aligned read in genome
+ self.assertEqual(a.reference_length,
+ a.get_aligned_pairs()[-1][0] + 1)
+ # aend points to one beyond last aligned base in ref
+ self.assertEqual(a.get_reference_positions()[-1],
+ a.reference_end - 1)
+
+ def testFullReferencePositions(self):
+ '''see issue 26'''
+ a = self.buildRead()
+ a.cigar = [(4, 30), (0, 20), (1, 3), (0, 47)]
+
+ self.assertEqual(100,
+ len(a.get_reference_positions(full_length=True)))
+
+ def testBlocks(self):
+ a = self.buildRead()
+ self.assertEqual(a.get_blocks(),
+ [(20, 30), (31, 40), (40, 60)])
+
+ def test_get_aligned_pairs_soft_clipping(self):
+ a = pysam.AlignedSegment()
+ a.query_name = "read_12345"
+ a.query_sequence = "ACGT" * 10
+ a.flag = 0
+ a.reference_id = 0
+ a.reference_start = 20
+ a.mapping_quality = 20
+ a.cigartuples = ((4, 2), (0, 35), (4, 3))
+ a.query_qualities = pysam.fromQualityString("1234") * 10
+ self.assertEqual(a.get_aligned_pairs(),
+ [(0, None), (1, None)] +
+ [(qpos, refpos) for (qpos, refpos) in zip(
+ range(2, 2 + 35), range(20, 20 + 35))] +
+ [(37, None), (38, None), (39, None)]
+ )
+ self.assertEqual(a.get_aligned_pairs(True),
+ # [(0, None), (1, None)] +
+ [(qpos, refpos) for (qpos, refpos) in zip(
+ range(2, 2 + 35), range(20, 20 + 35))]
+ # [(37, None), (38, None), (39, None)]
+ )
+
+ def test_get_aligned_pairs_hard_clipping(self):
+ a = pysam.AlignedSegment()
+ a.query_name = "read_12345"
+ a.query_sequence = "ACGT" * 10
+ a.flag = 0
+ a.reference_id = 0
+ a.reference_start = 20
+ a.mapping_quality = 20
+ a.cigartuples = ((5, 2), (0, 35), (5, 3))
+ a.query_qualities = pysam.fromQualityString("1234") * 10
+ self.assertEqual(a.get_aligned_pairs(),
+ # No seq, no seq pos
+ [(qpos, refpos) for (qpos, refpos) in zip(
+ range(0, 0 + 35), range(20, 20 + 35))])
+ self.assertEqual(a.get_aligned_pairs(True),
+ [(qpos, refpos) for (qpos, refpos) in zip(
+ range(0, 0 + 35), range(20, 20 + 35))])
+
+ def test_get_aligned_pairs_skip(self):
+ a = pysam.AlignedSegment()
+ a.query_name = "read_12345"
+ a.query_sequence = "ACGT" * 10
+ a.flag = 0
+ a.reference_id = 0
+ a.reference_start = 20
+ a.mapping_quality = 20
+ a.cigartuples = ((0, 2), (3, 100), (0, 38))
+ a.query_qualities = pysam.fromQualityString("1234") * 10
+ self.assertEqual(a.get_aligned_pairs(),
+ [(0, 20), (1, 21)] +
+ [(None, refpos) for refpos in range(22, 22 + 100)] +
+ [(qpos, refpos) for (qpos, refpos) in zip(
+ range(2, 2 + 38),
+ range(20 + 2 + 100, 20 + 2 + 100 + 38))])
+ self.assertEqual(a.get_aligned_pairs(True),
+ [(0, 20), (1, 21)] +
+ # [(None, refpos) for refpos in range(21, 21+100)] +
+ [(qpos, refpos) for (qpos, refpos) in zip(
+ range(2, 2 + 38),
+ range(20 + 2 + 100, 20 + 2 + 100 + 38))])
+
+ def test_get_aligned_pairs_match_mismatch(self):
+ a = pysam.AlignedSegment()
+ a.query_name = "read_12345"
+ a.query_sequence = "ACGT" * 10
+ a.flag = 0
+ a.reference_id = 0
+ a.reference_start = 20
+ a.mapping_quality = 20
+ a.cigartuples = ((7, 20), (8, 20))
+ a.query_qualities = pysam.fromQualityString("1234") * 10
+ self.assertEqual(a.get_aligned_pairs(),
+ [(qpos, refpos) for (qpos, refpos) in zip(
+ range(0, 0 + 40), range(20, 20 + 40))])
+ self.assertEqual(a.get_aligned_pairs(True),
+ [(qpos, refpos) for (qpos, refpos) in zip(
+ range(0, 0 + 40), range(20, 20 + 40))])
+
+ def test_get_aligned_pairs_padding(self):
+ a = pysam.AlignedSegment()
+ a.query_name = "read_12345"
+ a.query_sequence = "ACGT" * 10
+ a.flag = 0
+ a.reference_id = 0
+ a.reference_start = 20
+ a.mapping_quality = 20
+ a.cigartuples = ((7, 20), (6, 1), (8, 19))
+ a.query_qualities = pysam.fromQualityString("1234") * 10
+
+ def inner():
+ a.get_aligned_pairs()
+ # padding is not being handled right now
+ self.assertRaises(NotImplementedError, inner)
+
+
+class TestTags(ReadTest):
+
+ def testMissingTag(self):
+ a = self.buildRead()
+ self.assertRaises(KeyError, a.get_tag, "XP")
+
+ def testEmptyTag(self):
+ a = self.buildRead()
+ self.assertRaises(KeyError, a.get_tag, "XT")
+
+ def testSetTag(self):
+ a = self.buildRead()
+ self.assertEqual(False, a.has_tag("NM"))
+ a.set_tag("NM", 2)
+ self.assertEqual(True, a.has_tag("NM"))
+ self.assertEqual(a.get_tag("NM"), 2)
+ a.set_tag("NM", 3)
+ self.assertEqual(a.get_tag("NM"), 3)
+ a.set_tag("NM", None)
+ self.assertEqual(False, a.has_tag("NM"))
+ # check if deleting a non-existing tag is fine
+ a.set_tag("NM", None)
+
+ def testAddTagsType(self):
+ a = self.buildRead()
+ a.tags = None
+ self.assertEqual(a.tags, [])
+
+ a.setTag('X1', 5.0)
+ a.setTag('X2', "5.0")
+ a.setTag('X3', 5)
+
+ self.assertEqual(sorted(a.tags),
+ sorted([('X1', 5.0),
+ ('X2', "5.0"),
+ ('X3', 5)]))
+
+ # test setting float for int value
+ a.setTag('X4', 5, value_type='d')
+ self.assertEqual(sorted(a.tags),
+ sorted([('X1', 5.0),
+ ('X2', "5.0"),
+ ('X3', 5),
+ ('X4', 5.0)]))
+
+ # test setting int for float value - the
+ # value will be rounded.
+ a.setTag('X5', 5.2, value_type='i')
+ self.assertEqual(sorted(a.tags),
+ sorted([('X1', 5.0),
+ ('X2', "5.0"),
+ ('X3', 5),
+ ('X4', 5.0),
+ ('X5', 5)]))
+
+ # test setting invalid type code
+ self.assertRaises(ValueError, a.setTag, 'X6', 5.2, 'g')
+
+ def testTagsUpdatingFloat(self):
+ a = self.buildRead()
+ a.tags = [('NM', 1), ('RG', 'L1'),
+ ('PG', 'P1'), ('XT', 'U')]
+
+ self.assertEqual(a.tags,
+ [('NM', 1), ('RG', 'L1'),
+ ('PG', 'P1'), ('XT', 'U')])
+ a.tags += [('XC', 5.0)]
+ self.assertEqual(a.tags,
+ [('NM', 1), ('RG', 'L1'),
+ ('PG', 'P1'), ('XT', 'U'), ('XC', 5.0)])
+
+ def testAddTags(self):
+ a = self.buildRead()
+ a.tags = [('NM', 1), ('RG', 'L1'),
+ ('PG', 'P1'), ('XT', 'U')]
+
+ self.assertEqual(sorted(a.tags),
+ sorted([('NM', 1), ('RG', 'L1'),
+ ('PG', 'P1'), ('XT', 'U')]))
+
+ a.setTag('X1', 'C')
+ self.assertEqual(sorted(a.tags),
+ sorted([('X1', 'C'), ('NM', 1), ('RG', 'L1'),
+ ('PG', 'P1'), ('XT', 'U'), ]))
+ a.setTag('X2', 5)
+ self.assertEqual(sorted(a.tags),
+ sorted([('X2', 5), ('X1', 'C'),
+ ('NM', 1), ('RG', 'L1'),
+ ('PG', 'P1'), ('XT', 'U'), ]))
+ # add with replacement
+ a.setTag('X2', 10)
+ self.assertEqual(sorted(a.tags),
+ sorted([('X2', 10), ('X1', 'C'),
+ ('NM', 1), ('RG', 'L1'),
+ ('PG', 'P1'), ('XT', 'U'), ]))
+
+ # add without replacement
+ a.setTag('X2', 5, replace=False)
+ self.assertEqual(sorted(a.tags),
+ sorted([('X2', 10), ('X1', 'C'),
+ ('X2', 5),
+ ('NM', 1), ('RG', 'L1'),
+ ('PG', 'P1'), ('XT', 'U'), ]))
+
+ def testTagParsing(self):
+ '''test for tag parsing
+
+ see http://groups.google.com/group/pysam-user-group/browse_thread/thread/67ca204059ea465a
+ '''
+ samfile = pysam.AlignmentFile(
+ os.path.join(DATADIR, "ex8.bam"),
+ "rb")
+
+ for entry in samfile:
+ before = entry.get_tags()
+ entry.set_tags(before)
+ after = entry.get_tags()
+ self.assertEqual(after, before)
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py
index 785e508..751fad0 100644
--- a/tests/AlignmentFile_test.py
+++ b/tests/AlignmentFile_test.py
@@ -13,7 +13,9 @@ import sys
import collections
import subprocess
import logging
-from TestUtils import checkBinaryEqual, checkURL
+from functools import partial
+from TestUtils import checkBinaryEqual, checkURL, checkSamtoolsViewEqual, checkFieldEqual
+import array
IS_PYTHON3 = sys.version_info[0] >= 3
@@ -22,7 +24,15 @@ WORKDIR = "pysam_test_work"
DATADIR = "pysam_data"
-class BasicTestBAMFetch(unittest.TestCase):
+##################################################
+#
+# Detailed test of file contents
+#
+# Data are read either through file based iterator
+# access (BasicTestBAMFromFile) or by calling fetch
+# without coordinates (BasicTestBAMFromFetch)
+##################################################
+class BasicTestBAMFromFetch(unittest.TestCase):
'''basic first test - detailed testing
if information in file is consistent
@@ -164,30 +174,62 @@ class BasicTestBAMFetch(unittest.TestCase):
"quality string mismatch in read 3: %s != %s" % (pysam.toQualityString(self.reads[3].query_qualities), "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"))
def testARquery(self):
- self.assertEqual(self.reads[0].query_alignment_sequence, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "query mismatch in read 1: %s != %s" % (
- self.reads[0].query_alignment_sequence, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"))
- self.assertEqual(self.reads[1].query_alignment_sequence, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", "query size mismatch in read 2: %s != %s" % (
- self.reads[1].query_alignment_sequence, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA"))
- self.assertEqual(self.reads[3].query_alignment_sequence, "TAGCTAGCTACCTATATCTTGGTCTT", "query mismatch in read 4: %s != %s" % (
- self.reads[3].query_alignment_sequence, "TAGCTAGCTACCTATATCTTGGTCTT"))
+ self.assertEqual(
+ self.reads[0].query_alignment_sequence,
+ "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG",
+ "query mismatch in read 1: %s != %s" %
+ (self.reads[0].query_alignment_sequence,
+ "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"))
+ self.assertEqual(
+ self.reads[1].query_alignment_sequence,
+ "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA",
+ "query size mismatch in read 2: %s != %s" %
+ (self.reads[1].query_alignment_sequence,
+ "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA"))
+ self.assertEqual(
+ self.reads[3].query_alignment_sequence,
+ "TAGCTAGCTACCTATATCTTGGTCTT",
+ "query mismatch in read 4: %s != %s" %
+ (self.reads[3].query_alignment_sequence,
+ "TAGCTAGCTACCTATATCTTGGTCTT"))
def testARqqual(self):
- self.assertEqual(pysam.toQualityString(self.reads[0].query_alignment_qualities), "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<",
- "qquality string mismatch in read 1: %s != %s" % (pysam.toQualityString(self.reads[0].query_alignment_qualities), "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"))
- self.assertEqual(pysam.toQualityString(self.reads[1].query_alignment_qualities), "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<", "qquality string mismatch in read 2: %s != %s" % (
- pysam.toQualityString(self.reads[1].query_alignment_qualities), "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<"))
- self.assertEqual(pysam.toQualityString(self.reads[3].query_alignment_qualities), "<<<<<<<<<<<<<<<<<:<9/,&,22",
- "qquality string mismatch in read 3: %s != %s" % (pysam.toQualityString(self.reads[3].query_alignment_qualities), "<<<<<<<<<<<<<<<<<:<9/,&,22"))
+ self.assertEqual(
+ pysam.toQualityString(self.reads[0].query_alignment_qualities),
+ "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<",
+ "qquality string mismatch in read 1: %s != %s" %
+ (pysam.toQualityString(self.reads[0].query_alignment_qualities),
+ "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"))
+ self.assertEqual(
+ pysam.toQualityString(self.reads[1].query_alignment_qualities),
+ "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<",
+ "qquality string mismatch in read 2: %s != %s" %
+ (pysam.toQualityString(self.reads[1].query_alignment_qualities),
+ "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<"))
+ self.assertEqual(
+ pysam.toQualityString(self.reads[3].query_alignment_qualities),
+ "<<<<<<<<<<<<<<<<<:<9/,&,22",
+ "qquality string mismatch in read 3: %s != %s" %
+ (pysam.toQualityString(self.reads[3].query_alignment_qualities),
+ "<<<<<<<<<<<<<<<<<:<9/,&,22"))
def testPresentOptionalFields(self):
- self.assertEqual(self.reads[0].opt(
- 'NM'), 1, "optional field mismatch in read 1, NM: %s != %s" % (self.reads[0].opt('NM'), 1))
- self.assertEqual(self.reads[0].opt(
- 'RG'), 'L1', "optional field mismatch in read 1, RG: %s != %s" % (self.reads[0].opt('RG'), 'L1'))
- self.assertEqual(self.reads[1].opt(
- 'RG'), 'L2', "optional field mismatch in read 2, RG: %s != %s" % (self.reads[1].opt('RG'), 'L2'))
- self.assertEqual(self.reads[1].opt(
- 'MF'), 18, "optional field mismatch in read 2, MF: %s != %s" % (self.reads[1].opt('MF'), 18))
+ self.assertEqual(
+ self.reads[0].opt('NM'), 1,
+ "optional field mismatch in read 1, NM: %s != %s" %
+ (self.reads[0].opt('NM'), 1))
+ self.assertEqual(
+ self.reads[0].opt('RG'), 'L1',
+ "optional field mismatch in read 1, RG: %s != %s" %
+ (self.reads[0].opt('RG'), 'L1'))
+ self.assertEqual(
+ self.reads[1].opt('RG'), 'L2',
+ "optional field mismatch in read 2, RG: %s != %s" %
+ (self.reads[1].opt('RG'), 'L2'))
+ self.assertEqual(
+ self.reads[1].opt('MF'), 18,
+ "optional field mismatch in read 2, MF: %s != %s" %
+ (self.reads[1].opt('MF'), 18))
def testPairedBools(self):
self.assertEqual(self.reads[0].is_paired, True, "is paired mismatch in read 1: %s != %s" % (
@@ -207,102 +249,70 @@ class BasicTestBAMFetch(unittest.TestCase):
[('MF', 18), ('RG', 'L2'),
('PG', 'P2'), ('XT', 'R')])
- def testAddTags(self):
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U')]))
-
- self.reads[0].setTag('X1', 'C')
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X1', 'C'), ('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ]))
- self.reads[0].setTag('X2', 5)
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X2', 5), ('X1', 'C'),
- ('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ]))
- # add with replacement
- self.reads[0].setTag('X2', 10)
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X2', 10), ('X1', 'C'),
- ('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ]))
-
- # add without replacement
- self.reads[0].setTag('X2', 5, replace=False)
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X2', 10), ('X1', 'C'),
- ('X2', 5),
- ('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ]))
-
- def testAddTagsType(self):
- self.reads[0].tags = None
- self.assertEqual(self.reads[0].tags, [])
-
- self.reads[0].setTag('X1', 5.0)
- self.reads[0].setTag('X2', "5.0")
- self.reads[0].setTag('X3', 5)
-
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X1', 5.0),
- ('X2', "5.0"),
- ('X3', 5)]))
-
- # test setting float for int value
- self.reads[0].setTag('X4', 5, value_type='d')
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X1', 5.0),
- ('X2', "5.0"),
- ('X3', 5),
- ('X4', 5.0)]))
-
- # test setting int for float value - the
- # value will be rounded.
- self.reads[0].setTag('X5', 5.2, value_type='i')
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X1', 5.0),
- ('X2', "5.0"),
- ('X3', 5),
- ('X4', 5.0),
- ('X5', 5)]))
-
- # test setting invalid type code
- self.assertRaises(ValueError, self.reads[0].setTag, 'X6', 5.2, 'g')
-
- def testTagsUpdatingFloat(self):
- self.assertEqual(self.reads[0].tags,
- [('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U')])
- self.reads[0].tags += [('XC', 5.0)]
- self.assertEqual(self.reads[0].tags,
- [('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ('XC', 5.0)])
-
def testOpt(self):
self.assertEqual(self.reads[0].opt("XT"), "U")
self.assertEqual(self.reads[1].opt("XT"), "R")
- def testMissingOpt(self):
- self.assertRaises(KeyError, self.reads[0].opt, "XP")
-
- def testEmptyOpt(self):
- self.assertRaises(KeyError, self.reads[2].opt, "XT")
-
def tearDown(self):
self.samfile.close()
-class BasicTestBAMFile(BasicTestBAMFetch):
+class BasicTestSAMFromFetch(BasicTestBAMFromFetch):
def setUp(self):
self.samfile = pysam.AlignmentFile(
os.path.join(DATADIR, "ex3.sam"),
"r")
- self.reads = [r for r in self.samfile]
+ self.reads = list(self.samfile.fetch())
-class BasicTestSAMFile(BasicTestBAMFetch):
+class BasicTestCRAMFromFetch(BasicTestBAMFromFetch):
+
+ def setUp(self):
+ self.samfile = pysam.AlignmentFile(
+ os.path.join(DATADIR, "ex3.cram"),
+ "rc")
+ self.reads = list(self.samfile.fetch())
+
+ def testTags(self):
+ self.assertEqual(
+ sorted(self.reads[0].tags),
+ sorted([('RG', 'L1'),
+ ('NM', 22),
+ ('MD', '0C0T1G1C0C0A1G0^G0C1C1G1A0T2G0G0G0A1C1G1G1A2C0'),
+ ('PG', 'P1'),
+ ('XT', 'U'),
+ ]))
+ self.assertEqual(
+ sorted(self.reads[1].tags),
+ sorted([('RG', 'L2'),
+ ('NM', 26),
+ ('MD',
+ '1G0A0A1G1G0G2C0A0G0A0A0C0T0T0G0A0A0G0A0C0A0A1T2C0T0T1'),
+ ('MF', 18),
+ ('PG', 'P2'),
+ ('XT', 'R')]))
+
+ def testPresentOptionalFields(self):
+ self.assertEqual(
+ self.reads[0].opt('NM'), 22,
+ "optional field mismatch in read 1, NM: %s != %s" %
+ (self.reads[0].opt('NM'), 22))
+ self.assertEqual(
+ self.reads[0].opt('RG'), 'L1',
+ "optional field mismatch in read 1, RG: %s != %s" %
+ (self.reads[0].opt('RG'), 'L1'))
+ self.assertEqual(
+ self.reads[1].opt('RG'), 'L2',
+ "optional field mismatch in read 2, RG: %s != %s" %
+ (self.reads[1].opt('RG'), 'L2'))
+ self.assertEqual(
+ self.reads[1].opt('MF'), 18,
+ "optional field mismatch in read 2, MF: %s != %s" %
+ (self.reads[1].opt('MF'), 18))
+
+
+class BasicTestSAMFromFile(BasicTestBAMFromFetch):
def setUp(self):
self.samfile = pysam.AlignmentFile(
@@ -311,77 +321,168 @@ class BasicTestSAMFile(BasicTestBAMFetch):
self.reads = [r for r in self.samfile]
-class BasicTestSAMFetch(BasicTestBAMFetch):
+class BasicTestCRAMFromFile(BasicTestCRAMFromFetch):
+
def setUp(self):
self.samfile = pysam.AlignmentFile(
- os.path.join(DATADIR, "ex3.sam"),
- "r")
- self.reads = list(self.samfile.fetch())
+ os.path.join(DATADIR, "ex3.cram"),
+ "rc")
+ self.reads = [r for r in self.samfile]
-# needs to be implemented
-# class TestAlignedSegmentFromSamWithoutHeader(TestAlignedSegmentFromBam):
-#
-# def setUp(self):
-# self.samfile=pysam.AlignmentFile( "ex7.sam","r" )
-# self.reads=list(self.samfile.fetch())
+class BasicTestBAMFromFile(BasicTestBAMFromFetch):
+ def setUp(self):
+ self.samfile = pysam.AlignmentFile(
+ os.path.join(DATADIR, "ex3.bam"),
+ "rb")
+ self.reads = [r for r in self.samfile]
+
+##################################################
+#
+# Test of basic File I/O
+#
+# * format conversions
+# * reading with/without index
+# * reading from closed files
+#
+##################################################
class TestIO(unittest.TestCase):
- '''check if reading samfile and writing a samfile are consistent.'''
+ '''check if reading samfile and writing a samfile
+ are consistent.'''
def checkEcho(self,
input_filename,
reference_filename,
output_filename,
input_mode, output_mode,
- use_template=True):
- '''iterate through *input_filename* writing to *output_filename* and
- comparing the output to *reference_filename*.
+ sequence_filename=None,
+ use_template=True,
+ checkf=checkBinaryEqual):
+ '''iterate through *input_filename* writing to
+ *output_filename* and comparing the output to
+ *reference_filename*.
- The files are opened according to the *input_mode* and *output_mode*.
+ The files are opened according to the *input_mode* and
+ *output_mode*.
If *use_template* is set, the header is copied from infile
using the template mechanism, otherwise target names and
lengths are passed explicitely.
+ The *checkf* is used to determine if the files are
+ equal.
'''
-
- infile = pysam.AlignmentFile(os.path.join(DATADIR, input_filename),
- input_mode)
+ infile = pysam.AlignmentFile(
+ os.path.join(DATADIR, input_filename),
+ input_mode)
if use_template:
- outfile = pysam.AlignmentFile(output_filename,
- output_mode,
- template=infile)
+ outfile = pysam.AlignmentFile(
+ output_filename,
+ output_mode,
+ reference_filename=sequence_filename,
+ template=infile)
else:
- outfile = pysam.AlignmentFile(output_filename,
- output_mode,
- referencenames=infile.references,
- referencelengths=infile.lengths,
- add_sq_text=False)
+ outfile = pysam.AlignmentFile(
+ output_filename,
+ output_mode,
+ reference_names=infile.references,
+ reference_lengths=infile.lengths,
+ reference_filename=sequence_filename,
+ add_sq_text=False)
iter = infile.fetch()
for x in iter:
outfile.write(x)
+
infile.close()
outfile.close()
- self.assertTrue(
- checkBinaryEqual(os.path.join(DATADIR, reference_filename),
- output_filename),
- "files %s and %s are not the same" % (reference_filename,
- output_filename))
+ self.assertTrue(checkf(
+ os.path.join(DATADIR, reference_filename),
+ output_filename),
+ "files %s and %s are not the same" %
+ (reference_filename,
+ output_filename))
- def testReadWriteBam(self):
+ os.unlink(output_filename)
- input_filename = "ex1.bam"
- output_filename = "pysam_ex1.bam"
- reference_filename = "ex1.bam"
+ def testSAM2SAM(self):
+ self.checkEcho("ex2.sam",
+ "ex2.sam",
+ "tmp_ex2.sam",
+ "r", "wh")
- self.checkEcho(input_filename, reference_filename, output_filename,
- "rb", "wb", use_template=True)
+ def testBAM2BAM(self):
+ self.checkEcho("ex2.bam",
+ "ex2.bam",
+ "tmp_ex2.bam",
+ "rb", "wb")
+
+ def testCRAM2CRAM(self):
+ self.checkEcho("ex2.cram",
+ "ex2.cram",
+ "tmp_ex2.cram",
+ "rc", "wc",
+ sequence_filename="pysam_data/ex1.fa",
+ checkf=checkSamtoolsViewEqual)
+
+ def testSAM2BAM(self):
+ self.checkEcho("ex2.sam",
+ "ex2.bam",
+ "tmp_ex2.bam",
+ "r", "wb")
+
+ def testBAM2SAM(self):
+ self.checkEcho("ex2.bam",
+ "ex2.sam",
+ "tmp_ex2.sam",
+ "rb", "wh")
+
+ def testBAM2CRAM(self):
+ # ignore header (md5 sum)
+ self.checkEcho("ex2.bam",
+ "ex2.cram",
+ "tmp_ex2.cram",
+ "rb", "wc",
+ sequence_filename="pysam_data/ex1.fa",
+ checkf=partial(
+ checkSamtoolsViewEqual,
+ without_header=True))
+
+ def testCRAM2BAM(self):
+ # ignore header (md5 sum)
+ self.checkEcho("ex2.cram",
+ "ex2.bam",
+ "tmp_ex2.bam",
+ "rc", "wb",
+ sequence_filename="pysam_data/ex1.fa",
+ checkf=partial(
+ checkSamtoolsViewEqual,
+ without_header=True))
+
+ def testSAM2CRAM(self):
+ self.checkEcho("ex2.sam",
+ "ex2.cram",
+ "tmp_ex2.cram",
+ "r", "wc",
+ sequence_filename="pysam_data/ex1.fa",
+ checkf=partial(
+ checkSamtoolsViewEqual,
+ without_header=True))
+
+ def testCRAM2SAM(self):
+ self.checkEcho("ex2.cram",
+ "ex2.sam",
+ "tmp_ex2.sam",
+ "rc", "wh",
+ sequence_filename="pysam_data/ex1.fa",
+ checkf=partial(
+ checkSamtoolsViewEqual,
+ without_header=True))
# Disabled - should work, files are not binary equal, but are
# non-binary equal:
@@ -394,40 +495,29 @@ class TestIO(unittest.TestCase):
# self.checkEcho(input_filename, reference_filename, output_filename,
# "rb", "wb", use_template=False)
- def testReadWriteSamWithHeader(self):
-
- input_filename = "ex2.sam"
- output_filename = "pysam_ex2.sam"
- reference_filename = "ex2.sam"
-
- self.checkEcho(input_filename,
- reference_filename,
- output_filename,
- "r", "wh")
-
# Release 0.8.0
# no samfiles without header
- def testReadWriteSamWithoutHeader(self):
-
- input_filename = "ex2.sam"
- output_filename = "pysam_ex2.sam"
- reference_filename = "ex1.sam"
-
- self.checkEcho(input_filename,
- reference_filename,
- output_filename,
+ def testSAM2SAMWithoutHeader(self):
+ self.checkEcho("ex2.sam",
+ "ex1.sam",
+ "tmp_ex2.sam",
"r", "w")
def testReadSamWithoutTargetNames(self):
'''see issue 104.'''
- input_filename = os.path.join(DATADIR,
- "example_unmapped_reads_no_sq.sam")
+ input_filename = os.path.join(
+ DATADIR,
+ "example_unmapped_reads_no_sq.sam")
# raise exception in default mode
- self.assertRaises(ValueError, pysam.AlignmentFile, input_filename, "r")
+ self.assertRaises(ValueError,
+ pysam.AlignmentFile,
+ input_filename,
+ "r")
# raise exception if no SQ files
- self.assertRaises(ValueError, pysam.AlignmentFile,
+ self.assertRaises(ValueError,
+ pysam.AlignmentFile,
input_filename, "r",
check_header=True)
@@ -435,7 +525,7 @@ class TestIO(unittest.TestCase):
input_filename,
check_header=False,
check_sq=False)
-
+
# TODO
# result = list(infile.fetch(until_eof=True))
# self.assertEqual(2, len(result))
@@ -446,10 +536,16 @@ class TestIO(unittest.TestCase):
DATADIR, "example_unmapped_reads_no_sq.bam")
# raise exception in default mode
- self.assertRaises(ValueError, pysam.AlignmentFile, input_filename, "r")
+ self.assertRaises(ValueError,
+ pysam.AlignmentFile,
+ input_filename,
+ "r")
# raise exception if no SQ files
- self.assertRaises(ValueError, pysam.AlignmentFile, input_filename, "r",
+ self.assertRaises(ValueError,
+ pysam.AlignmentFile,
+ input_filename,
+ "r",
check_header=True)
infile = pysam.AlignmentFile(
@@ -460,7 +556,7 @@ class TestIO(unittest.TestCase):
def testReadSamWithoutHeader(self):
input_filename = os.path.join(DATADIR, "ex1.sam")
- # reading from a samfile without header is not
+ # reading from a samfile without header is not
# implemented
self.assertRaises(ValueError,
pysam.AlignmentFile,
@@ -502,19 +598,24 @@ class TestIO(unittest.TestCase):
def testBAMWithoutAlignedSegments(self):
'''see issue 117'''
input_filename = os.path.join(DATADIR, "test_unaligned.bam")
- samfile = pysam.AlignmentFile(input_filename, "rb", check_sq=False)
+ samfile = pysam.AlignmentFile(input_filename,
+ "rb",
+ check_sq=False)
samfile.fetch(until_eof=True)
def testBAMWithShortBAI(self):
'''see issue 116'''
input_filename = os.path.join(DATADIR, "example_bai.bam")
- samfile = pysam.AlignmentFile(input_filename, "rb", check_sq=False)
+ samfile = pysam.AlignmentFile(input_filename,
+ "rb",
+ check_sq=False)
samfile.fetch('chr2')
def testFetchFromClosedFile(self):
- samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
+ samfile = pysam.AlignmentFile(
+ os.path.join(DATADIR, "ex1.bam"),
+ "rb")
samfile.close()
self.assertRaises(ValueError, samfile.fetch, 'chr1', 100, 120)
@@ -522,7 +623,7 @@ class TestIO(unittest.TestCase):
'''test that access to a closed samfile raises ValueError.'''
samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
+ "rb")
samfile.close()
self.assertRaises(ValueError, samfile.fetch, 'chr1', 100, 120)
self.assertRaises(ValueError, samfile.pileup, 'chr1', 100, 120)
@@ -563,12 +664,14 @@ class TestIO(unittest.TestCase):
def testReadingFromFileWithoutIndex(self):
'''read from bam file without index.'''
- shutil.copyfile(os.path.join(DATADIR, "ex2.bam"), 'tmp_ex2.bam')
+ shutil.copyfile(os.path.join(DATADIR, "ex2.bam"),
+ 'tmp_ex2.bam')
samfile = pysam.AlignmentFile('tmp_ex2.bam',
"rb")
self.assertRaises(ValueError, samfile.fetch)
- self.assertEqual(len(list(samfile.fetch(until_eof=True))),
- 3270)
+ self.assertEqual(
+ len(list(samfile.fetch(until_eof=True))),
+ 3270)
os.unlink('tmp_ex2.bam')
# def testReadingUniversalFileMode(self):
@@ -612,12 +715,254 @@ class TestIO(unittest.TestCase):
output_filename,
"r", "wbu")
- def testEmptyBAM(self):
- samfile = pysam.Samfile(os.path.join(DATADIR, "empty.bam"),
- "rb")
- self.assertEqual(samfile.mapped, 0)
- self.assertEqual(samfile.unmapped, 0)
- self.assertEqual(samfile.nocoordinate, 0)
+ def testEmptyBAM(self):
+ samfile = pysam.Samfile(os.path.join(DATADIR, "empty.bam"),
+ "rb")
+ self.assertEqual(samfile.mapped, 0)
+ self.assertEqual(samfile.unmapped, 0)
+ self.assertEqual(samfile.nocoordinate, 0)
+
+
+##################################################
+#
+# Random access iterator tests
+#
+##################################################
+class TestIteratorRowBAM(unittest.TestCase):
+
+ filename = os.path.join(DATADIR, "ex2.bam")
+ mode = "rb"
+
+ def setUp(self):
+ self.samfile = pysam.AlignmentFile(
+ self.filename, self.mode,
+ )
+
+ def checkRange(self, rnge):
+ '''compare results from iterator with those from samtools.'''
+ ps = list(self.samfile.fetch(region=rnge))
+ sa = list(pysam.view(self.filename,
+ rnge,
+ raw=True))
+ self.assertEqual(
+ len(ps), len(sa),
+ "unequal number of results for range %s: %i != %i" %
+ (rnge, len(ps), len(sa)))
+ # check if the same reads are returned and in the same order
+ for line, (a, b) in enumerate(list(zip(ps, sa))):
+ d = b.split("\t")
+ self.assertEqual(
+ a.query_name, d[0],
+ "line %i: read id mismatch: %s != %s" %
+ (line, a.reference_id, d[0]))
+ self.assertEqual(
+ a.reference_start,
+ int(d[3]) - 1,
+ "line %i: read position mismatch: %s != %s, \n%s\n%s\n" %
+ (line, a.reference_start, int(d[3]) - 1,
+ str(a), str(d)))
+ qual = d[10]
+ self.assertEqual(
+ pysam.toQualityString(a.query_qualities),
+ qual,
+ "line %i: quality mismatch: %s != %s, \n%s\n%s\n" %
+ (line, pysam.toQualityString(a.query_qualities), qual,
+ str(a), str(d)))
+
+ def testIteratePerContig(self):
+ '''check random access per contig'''
+ for contig in self.samfile.references:
+ self.checkRange(contig)
+
+ def testIterateRanges(self):
+ '''check random access per range'''
+ for contig, length in zip(self.samfile.references,
+ self.samfile.lengths):
+ for start in range(1, length, 90):
+ # this includes empty ranges
+ self.checkRange("%s:%i-%i" %
+ (contig, start, start + 90))
+
+ def tearDown(self):
+ self.samfile.close()
+
+
+class TestIteratorRowAllBAM(unittest.TestCase):
+
+ filename = os.path.join(DATADIR, "ex2.bam")
+ mode = "rb"
+
+ def setUp(self):
+ self.samfile = pysam.AlignmentFile(
+ self.filename,
+ self.mode)
+
+ def testIterate(self):
+ '''compare results from iterator with those from samtools.'''
+ ps = list(self.samfile.fetch())
+ sa = list(pysam.view(self.filename,
+ raw=True))
+ self.assertEqual(
+ len(ps), len(sa),
+ "unequal number of results: %i != %i" %
+ (len(ps), len(sa)))
+ # check if the same reads are returned
+ for line, pair in enumerate(list(zip(ps, sa))):
+ data = pair[1].split("\t")
+ self.assertEqual(
+ pair[0].query_name,
+ data[0],
+ "read id mismatch in line %i: %s != %s" %
+ (line, pair[0].reference_id, data[0]))
+
+ def tearDown(self):
+ self.samfile.close()
+
+
+class TestIteratorColumnBAM(unittest.TestCase):
+
+ '''test iterator column against contents of ex4.bam.'''
+
+ # note that samfile contains 1-based coordinates
+ # 1D means deletion with respect to reference sequence
+ #
+ mCoverages = {'chr1': [0] * 20 + [1] * 36 + [0] * (100 - 20 - 35),
+ 'chr2': [0] * 20 + [1] * 35 + [0] * (100 - 20 - 35),
+ }
+
+ def setUp(self):
+ self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex4.bam"),
+ "rb")
+
+ def checkRange(self, contig, start=None, end=None, truncate=False):
+ '''compare results from iterator with those from samtools.'''
+ # check if the same reads are returned and in the same order
+ for column in self.samfile.pileup(
+ contig, start, end, truncate=truncate):
+ if truncate:
+ self.assertGreaterEqual(column.reference_pos, start)
+ self.assertLess(column.reference_pos, end)
+ thiscov = len(column.pileups)
+ refcov = self.mCoverages[
+ self.samfile.getrname(column.reference_id)][column.reference_pos]
+ self.assertEqual(thiscov, refcov,
+ "wrong coverage at pos %s:%i %i should be %i" % (
+ self.samfile.getrname(column.reference_id),
+ column.reference_pos, thiscov, refcov))
+
+ def testIterateAll(self):
+ '''check pileup without restricting to a contig'''
+ self.checkRange(None)
+
+ def testIteratePerContig(self):
+ '''check random access per contig'''
+ for contig in self.samfile.references:
+ self.checkRange(contig)
+
+ def testIterateRanges(self):
+ '''check random access per range'''
+ for contig, length in zip(
+ self.samfile.references, self.samfile.lengths):
+ for start in range(1, length, 90):
+ # this includes empty ranges
+ self.checkRange(contig, start, start + 90)
+
+ def testInverse(self):
+ '''test the inverse, is point-wise pileup accurate.'''
+ for contig, refseq in list(self.mCoverages.items()):
+ refcolumns = sum(refseq)
+ for pos, refcov in enumerate(refseq):
+ columns = list(self.samfile.pileup(contig, pos, pos + 1))
+ if refcov == 0:
+ # if no read, no coverage
+ self.assertEqual(
+ len(columns),
+ refcov,
+ "wrong number of pileup columns returned for position %s:%i, %i should be %i" % (
+ contig, pos,
+ len(columns), refcov))
+ elif refcov == 1:
+ # one read, all columns of the read are returned
+ self.assertEqual(
+ len(columns),
+ refcolumns,
+ "pileup incomplete at position %i: got %i, expected %i " %
+ (pos, len(columns), refcolumns))
+
+ def testIterateTruncate(self):
+ '''check random access per range'''
+ for contig, length in zip(self.samfile.references,
+ self.samfile.lengths):
+ for start in range(1, length, 90):
+ # this includes empty ranges
+ self.checkRange(contig, start, start + 90, truncate=True)
+
+ def tearDown(self):
+ self.samfile.close()
+
+
+class TestIteratorRowCRAM(TestIteratorRowBAM):
+ filename = os.path.join(DATADIR, "ex2.cram")
+ mode = "rc"
+
+
+class TestIteratorRowCRAM(TestIteratorRowBAM):
+ filename = os.path.join(DATADIR, "ex2.cram")
+ mode = "rc"
+
+##########################################################
+##########################################################
+##########################################################
+# needs to be implemented
+# class TestAlignedSegmentFromSamWithoutHeader(TestAlignedSegmentFromBam):
+#
+# def setUp(self):
+# self.samfile=pysam.AlignmentFile( "ex7.sam","r" )
+# self.reads=list(self.samfile.fetch())
+
+
+class TestIteratorColumn2(unittest.TestCase):
+
+ '''test iterator column against contents of ex1.bam.'''
+
+ def setUp(self):
+ self.samfile = pysam.AlignmentFile(
+ os.path.join(DATADIR, "ex1.bam"),
+ "rb")
+
+ def testStart(self):
+ # print self.samfile.fetch().next().reference_start
+ # print self.samfile.pileup().next().reference_start
+ pass
+
+ def testTruncate(self):
+ '''see issue 107.'''
+ # note that ranges in regions start from 1
+ p = self.samfile.pileup(region='chr1:170:172', truncate=True)
+ columns = [x.reference_pos for x in p]
+ self.assertEqual(len(columns), 3)
+ self.assertEqual(columns, [169, 170, 171])
+
+ p = self.samfile.pileup('chr1', 169, 172, truncate=True)
+ columns = [x.reference_pos for x in p]
+
+ self.assertEqual(len(columns), 3)
+ self.assertEqual(columns, [169, 170, 171])
+
+ def testAccessOnClosedIterator(self):
+ '''see issue 131
+
+ Accessing pileup data after iterator has closed.
+ '''
+ pcolumn = self.samfile.pileup('chr1', 170, 180).__next__()
+ self.assertRaises(ValueError, getattr, pcolumn, "pileups")
+
+ def testStr(self):
+ '''test if PileupRead can be printed.'''
+ iter = self.samfile.pileup('chr1', 170, 180)
+ pcolumn = iter.__next__()
+ s = str(pcolumn)
+ self.assertEqual(len(s.split("\n")), 2)
class TestFloatTagBug(unittest.TestCase):
@@ -808,189 +1153,9 @@ class TestClipping(unittest.TestCase):
'01234')
-class TestIteratorRow(unittest.TestCase):
-
- def setUp(self):
- self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- def checkRange(self, rnge):
- '''compare results from iterator with those from samtools.'''
- ps = list(self.samfile.fetch(region=rnge))
- sa = list(pysam.view(os.path.join(DATADIR, "ex1.bam"),
- rnge,
- raw=True))
- self.assertEqual(len(ps), len(
- sa), "unequal number of results for range %s: %i != %i" % (rnge, len(ps), len(sa)))
- # check if the same reads are returned and in the same order
- for line, (a, b) in enumerate(list(zip(ps, sa))):
- d = b.split("\t")
- self.assertEqual(
- a.query_name, d[0], "line %i: read id mismatch: %s != %s" % (line, a.reference_id, d[0]))
- self.assertEqual(a.reference_start, int(d[3]) - 1, "line %i: read position mismatch: %s != %s, \n%s\n%s\n" %
- (line, a.reference_start, int(d[3]) - 1,
- str(a), str(d)))
- qual = d[10]
- self.assertEqual(pysam.toQualityString(a.query_qualities), qual, "line %i: quality mismatch: %s != %s, \n%s\n%s\n" %
- (line, pysam.toQualityString(a.query_qualities), qual,
- str(a), str(d)))
-
- def testIteratePerContig(self):
- '''check random access per contig'''
- for contig in self.samfile.references:
- self.checkRange(contig)
-
- def testIterateRanges(self):
- '''check random access per range'''
- for contig, length in zip(self.samfile.references,
- self.samfile.lengths):
- for start in range(1, length, 90):
- # this includes empty ranges
- self.checkRange("%s:%i-%i" % (contig, start, start + 90))
-
- def tearDown(self):
- self.samfile.close()
-
-
-class TestIteratorRowAll(unittest.TestCase):
-
- def setUp(self):
- self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- def testIterate(self):
- '''compare results from iterator with those from samtools.'''
- ps = list(self.samfile.fetch())
- sa = list(pysam.view(os.path.join(DATADIR, "ex1.bam"),
- raw=True))
- self.assertEqual(
- len(ps), len(sa), "unequal number of results: %i != %i" % (len(ps), len(sa)))
- # check if the same reads are returned
- for line, pair in enumerate(list(zip(ps, sa))):
- data = pair[1].split("\t")
- self.assertEqual(pair[0].query_name, data[
- 0], "read id mismatch in line %i: %s != %s" % (line, pair[0].reference_id, data[0]))
-
- def tearDown(self):
- self.samfile.close()
-
-
-class TestIteratorColumn(unittest.TestCase):
-
- '''test iterator column against contents of ex4.bam.'''
-
- # note that samfile contains 1-based coordinates
- # 1D means deletion with respect to reference sequence
- #
- mCoverages = {'chr1': [0] * 20 + [1] * 36 + [0] * (100 - 20 - 35),
- 'chr2': [0] * 20 + [1] * 35 + [0] * (100 - 20 - 35),
- }
-
- def setUp(self):
- self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex4.bam"),
- "rb")
-
- def checkRange(self, contig, start=None, end=None, truncate=False):
- '''compare results from iterator with those from samtools.'''
- # check if the same reads are returned and in the same order
- for column in self.samfile.pileup(
- contig, start, end, truncate=truncate):
- if truncate:
- self.assertGreaterEqual(column.reference_pos, start)
- self.assertLess(column.reference_pos, end)
- thiscov = len(column.pileups)
- refcov = self.mCoverages[
- self.samfile.getrname(column.reference_id)][column.reference_pos]
- self.assertEqual(thiscov, refcov, "wrong coverage at pos %s:%i %i should be %i" % (
- self.samfile.getrname(column.reference_id), column.reference_pos, thiscov, refcov))
-
- def testIterateAll(self):
- '''check random access per contig'''
- self.checkRange(None)
-
- def testIteratePerContig(self):
- '''check random access per contig'''
- for contig in self.samfile.references:
- self.checkRange(contig)
-
- def testIterateRanges(self):
- '''check random access per range'''
- for contig, length in zip(
- self.samfile.references, self.samfile.lengths):
- for start in range(1, length, 90):
- # this includes empty ranges
- self.checkRange(contig, start, start + 90)
-
- def testInverse(self):
- '''test the inverse, is point-wise pileup accurate.'''
- for contig, refseq in list(self.mCoverages.items()):
- refcolumns = sum(refseq)
- for pos, refcov in enumerate(refseq):
- columns = list(self.samfile.pileup(contig, pos, pos + 1))
- if refcov == 0:
- # if no read, no coverage
- self.assertEqual(
- len(columns),
- refcov,
- "wrong number of pileup columns returned for position %s:%i, %i should be %i" % (
- contig, pos,
- len(columns), refcov))
- elif refcov == 1:
- # one read, all columns of the read are returned
- self.assertEqual(
- len(columns),
- refcolumns,
- "pileup incomplete at position %i: got %i, expected %i " %
- (pos, len(columns), refcolumns))
-
- def testIterateTruncate(self):
- '''check random access per range'''
- for contig, length in zip(self.samfile.references, self.samfile.lengths):
- for start in range(1, length, 90):
- # this includes empty ranges
- self.checkRange(contig, start, start + 90, truncate=True)
-
- def tearDown(self):
- self.samfile.close()
-
-
-class TestIteratorColumn2(unittest.TestCase):
-
- '''test iterator column against contents of ex1.bam.'''
-
- def setUp(self):
- self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- def testStart(self):
- # print self.samfile.fetch().next().reference_start
- # print self.samfile.pileup().next().reference_start
- pass
-
- def testTruncate(self):
- '''see issue 107.'''
- # note that ranges in regions start from 1
- p = self.samfile.pileup(region='chr1:170:172', truncate=True)
- columns = [x.reference_pos for x in p]
- self.assertEqual(len(columns), 3)
- self.assertEqual(columns, [169, 170, 171])
-
- p = self.samfile.pileup('chr1', 169, 172, truncate=True)
- columns = [x.reference_pos for x in p]
-
- self.assertEqual(len(columns), 3)
- self.assertEqual(columns, [169, 170, 171])
-
- def testAccessOnClosedIterator(self):
- '''see issue 131
-
- Accessing pileup data after iterator has closed.
- '''
- pcolumn = self.samfile.pileup('chr1', 170, 180).__next__()
- self.assertRaises(ValueError, getattr, pcolumn, "pileups")
+class TestHeaderSAM(unittest.TestCase):
-
-class TestHeaderSam(unittest.TestCase):
+ """testing header manipulation"""
header = {'SQ': [{'LN': 1575, 'SN': 'chr1'},
{'LN': 1584, 'SN': 'chr2'}],
@@ -1010,8 +1175,9 @@ class TestHeaderSam(unittest.TestCase):
self.assertEqual(av, b[ak])
def setUp(self):
- self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex3.sam"),
- "r")
+ self.samfile = pysam.AlignmentFile(
+ os.path.join(DATADIR, "ex3.sam"),
+ "r")
def testHeaders(self):
self.compareHeaders(self.header, self.samfile.header)
@@ -1031,11 +1197,35 @@ class TestHeaderSam(unittest.TestCase):
self.samfile.close()
-class TestHeaderBam(TestHeaderSam):
+class TestHeaderBAM(TestHeaderSAM):
def setUp(self):
- self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex3.bam"),
- "rb")
+ self.samfile = pysam.AlignmentFile(
+ os.path.join(DATADIR, "ex3.bam"),
+ "rb")
+
+
+class TestHeaderCRAM(TestHeaderSAM):
+
+ def setUp(self):
+ self.samfile = pysam.AlignmentFile(
+ os.path.join(DATADIR, "ex3.cram"),
+ "rc")
+
+ def compareHeaders(self, a, b):
+ '''compare two headers a and b.'''
+ def _strip(dd):
+ for x in dd:
+ for y in ("M5", "UR"):
+ if y in x:
+ del x[y]
+
+ for ak, av in a.items():
+ _strip(av)
+ self.assertTrue(ak in b, "key '%s' not in '%s' " % (ak, b))
+ _strip(b[ak])
+
+ self.assertEqual(sorted(av), sorted(b[ak]))
class TestHeaderFromRefs(unittest.TestCase):
@@ -1059,6 +1249,7 @@ class TestHeaderFromRefs(unittest.TestCase):
class TestHeader1000Genomes(unittest.TestCase):
+
'''see issue 110'''
# bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase2b_alignment/data/NA07048/exome_alignment/NA07048.unmapped.ILLUMINA.bwa.CEU.exome.20120522_p2b.bam"
bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase3_EX_or_LC_only_alignment/data/HG00104/alignment/HG00104.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam"
@@ -1084,7 +1275,7 @@ class TestUnmappedReads(unittest.TestCase):
def testBAM(self):
samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex5.bam"),
- "rb")
+ "rb")
self.assertEqual(len(list(samfile.fetch(until_eof=True))), 2)
samfile.close()
@@ -1093,7 +1284,7 @@ class TestPileupObjects(unittest.TestCase):
def setUp(self):
self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
+ "rb")
def testPileupColumn(self):
for pcolumn1 in self.samfile.pileup(region="chr1:105"):
@@ -1134,9 +1325,8 @@ class TestPileupObjects(unittest.TestCase):
": %s != %s" %
(len(pcolumn1.pileups), 2))
-
-# self.assertEqual( pcolumn1.pileups[0] # need to test additional
-# properties here
+ # self.assertEqual( pcolumn1.pileups[0] # need to test additional
+ # properties here
def tearDown(self):
self.samfile.close()
@@ -1155,7 +1345,7 @@ class TestContextManager(unittest.TestCase):
def testManager(self):
with pysam.AlignmentFile(os.path.join(DATADIR, 'ex1.bam'),
- 'rb') as samfile:
+ 'rb') as samfile:
samfile.fetch()
self.assertEqual(samfile._isOpen(), False)
@@ -1164,14 +1354,18 @@ class TestExceptions(unittest.TestCase):
def setUp(self):
self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
+ "rb")
def testMissingFile(self):
- self.assertRaises(IOError, pysam.AlignmentFile, "exdoesntexist.bam", "rb")
- self.assertRaises(IOError, pysam.AlignmentFile, "exdoesntexist.sam", "r")
- self.assertRaises(IOError, pysam.AlignmentFile, "exdoesntexist.bam", "r")
- self.assertRaises(IOError, pysam.AlignmentFile, "exdoesntexist.sam", "rb")
+ self.assertRaises(
+ IOError, pysam.AlignmentFile, "exdoesntexist.bam", "rb")
+ self.assertRaises(
+ IOError, pysam.AlignmentFile, "exdoesntexist.sam", "r")
+ self.assertRaises(
+ IOError, pysam.AlignmentFile, "exdoesntexist.bam", "r")
+ self.assertRaises(
+ IOError, pysam.AlignmentFile, "exdoesntexist.sam", "rb")
def testBadContig(self):
self.assertRaises(ValueError, self.samfile.fetch, "chr88")
@@ -1258,244 +1452,7 @@ class TestWrongFormat(unittest.TestCase):
'rb')
-class ReadTest(unittest.TestCase):
-
- def checkFieldEqual(self, read1, read2, exclude=[]):
- '''check if two reads are equal by comparing each field.'''
-
- # add the . for refactoring purposes.
- for x in (".query_name",
- ".query_sequence",
- ".flag",
- ".reference_id",
- ".reference_start",
- ".mapping_quality",
- ".cigartuples",
- ".next_reference_id",
- ".next_reference_start",
- ".template_length",
- ".query_length",
- ".query_qualities",
- ".bin",
- ".is_paired", ".is_proper_pair",
- ".is_unmapped", ".mate_is_unmapped",
- ".is_reverse", ".mate_is_reverse",
- ".is_read1", ".is_read2",
- ".is_secondary", ".is_qcfail",
- ".is_duplicate"):
- n = x[1:]
- if n in exclude:
- continue
- self.assertEqual(getattr(read1, n), getattr(read2, n),
- "attribute mismatch for %s: %s != %s" %
- (n, getattr(read1, n), getattr(read2, n)))
-
-
-class TestAlignedSegment(ReadTest):
-
- '''tests to check if aligned read can be constructed
- and manipulated.
- '''
-
- def testEmpty(self):
- a = pysam.AlignedSegment()
- self.assertEqual(a.query_name, None)
- self.assertEqual(a.query_sequence, None)
- self.assertEqual(pysam.toQualityString(a.query_qualities), None)
- self.assertEqual(a.flag, 0)
- self.assertEqual(a.reference_id, 0)
- self.assertEqual(a.mapping_quality, 0)
- self.assertEqual(a.cigartuples, None)
- self.assertEqual(a.tags, [])
- self.assertEqual(a.next_reference_id, 0)
- self.assertEqual(a.next_reference_start, 0)
- self.assertEqual(a.template_length, 0)
-
- def testStrOfEmptyRead(self):
- a = pysam.AlignedSegment()
- s = str(a)
- self.assertEqual(
- "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]",
- s)
-
- def buildRead(self):
- '''build an example read.'''
-
- a = pysam.AlignedSegment()
- a.query_name = "read_12345"
- a.query_sequence = "ACGT" * 10
- a.flag = 0
- a.reference_id = 0
- a.reference_start = 20
- a.mapping_quality = 20
- a.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
- a.next_reference_id = 0
- a.next_reference_start = 200
- a.template_length = 167
- a.query_qualities = pysam.fromQualityString("1234") * 10
- # todo: create tags
- return a
-
- def testUpdate(self):
- '''check if updating fields affects other variable length data
- '''
- a = self.buildRead()
- b = self.buildRead()
-
- # check qname
- b.query_name = "read_123"
- self.checkFieldEqual(a, b, "query_name")
- b.query_name = "read_12345678"
- self.checkFieldEqual(a, b, "query_name")
- b.query_name = "read_12345"
- self.checkFieldEqual(a, b)
-
- # check cigar
- b.cigartuples = ((0, 10), )
- self.checkFieldEqual(a, b, "cigartuples")
- b.cigartuples = ((0, 10), (2, 1), (0, 10))
- self.checkFieldEqual(a, b, "cigartuples")
- b.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
- self.checkFieldEqual(a, b)
-
- # check seq
- b.query_sequence = "ACGT"
- self.checkFieldEqual(
- a, b,
- ("query_sequence", "query_qualities", "query_length"))
- b.query_sequence = "ACGT" * 3
- self.checkFieldEqual(
- a, b,
- ("query_sequence", "query_qualities", "query_length"))
- b.query_sequence = "ACGT" * 10
- self.checkFieldEqual(a, b, ("query_qualities",))
-
- # reset qual
- b = self.buildRead()
-
- # check flags:
- for x in (
- "is_paired", "is_proper_pair",
- "is_unmapped", "mate_is_unmapped",
- "is_reverse", "mate_is_reverse",
- "is_read1", "is_read2",
- "is_secondary", "is_qcfail",
- "is_duplicate"):
- setattr(b, x, True)
- self.assertEqual(getattr(b, x), True)
- self.checkFieldEqual(a, b, ("flag", x,))
- setattr(b, x, False)
- self.assertEqual(getattr(b, x), False)
- self.checkFieldEqual(a, b)
-
- def testUpdate2(self):
- '''issue 135: inplace update of sequence and quality score.
-
- This does not work as setting the sequence will erase
- the quality scores.
- '''
- a = self.buildRead()
- a.query_sequence = a.query_sequence[5:10]
- self.assertEqual(pysam.toQualityString(a.query_qualities), None)
-
- a = self.buildRead()
- s = pysam.toQualityString(a.query_qualities)
- a.query_sequence = a.query_sequence[5:10]
- a.query_qualities = pysam.fromQualityString(s[5:10])
-
- self.assertEqual(pysam.toQualityString(a.query_qualities), s[5:10])
-
- def testLargeRead(self):
- '''build an example read.'''
-
- a = pysam.AlignedSegment()
- a.query_name = "read_12345"
- a.query_sequence = "ACGT" * 200
- a.flag = 0
- a.reference_id = 0
- a.reference_start = 20
- a.mapping_quality = 20
- a.cigartuples = ((0, 4 * 200), )
- a.next_reference_id = 0
- a.next_reference_start = 200
- a.template_length = 167
- a.query_qualities = pysam.fromQualityString("1234") * 200
-
- return a
-
- def testTagParsing(self):
- '''test for tag parsing
-
- see http://groups.google.com/group/pysam-user-group/browse_thread/thread/67ca204059ea465a
- '''
- samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex8.bam"),
- "rb")
-
- for entry in samfile:
- before = entry.tags
- entry.tags = entry.tags
- after = entry.tags
- self.assertEqual(after, before)
-
- def testUpdateTlen(self):
- '''check if updating tlen works'''
- a = self.buildRead()
- oldlen = a.template_length
- oldlen *= 2
- a.template_length = oldlen
- self.assertEqual(a.template_length, oldlen)
-
- def testPositions(self):
- a = self.buildRead()
- self.assertEqual(a.get_reference_positions(),
- [20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
- 31, 32, 33, 34, 35, 36, 37, 38, 39,
- 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
- 50, 51, 52, 53, 54, 55, 56, 57, 58, 59])
-
- self.assertEqual(a.get_aligned_pairs(),
- [(0, 20), (1, 21), (2, 22), (3, 23), (4, 24),
- (5, 25), (6, 26), (7, 27), (8, 28), (9, 29),
- (None, 30),
- (10, 31), (11, 32), (12, 33), (13, 34), (14, 35),
- (15, 36), (16, 37), (17, 38), (18, 39), (19, None),
- (20, 40), (21, 41), (22, 42), (23, 43), (24, 44),
- (25, 45), (26, 46), (27, 47), (28, 48), (29, 49),
- (30, 50), (31, 51), (32, 52), (33, 53), (34, 54),
- (35, 55), (36, 56), (37, 57), (38, 58), (39, 59)])
-
- self.assertEqual(
- a.get_reference_positions(),
- [x[1] for x in a.get_aligned_pairs()
- if x[0] is not None and x[1] is not None])
- # alen is the length of the aligned read in genome
- self.assertEqual(a.reference_length,
- a.get_aligned_pairs()[-1][0] + 1)
- # aend points to one beyond last aligned base in ref
- self.assertEqual(a.get_reference_positions()[-1],
- a.reference_end - 1)
-
- def testFullReferencePositions(self):
- '''see issue 26'''
- a = self.buildRead()
- a.cigar = [(4, 30), (0, 20), (1, 3), (0, 47)]
-
- self.assertEqual(100,
- len(a.get_reference_positions(full_length=True)))
-
- def testBlocks(self):
- a = self.buildRead()
- self.assertEqual(a.get_blocks(),
- [(20, 30), (31, 40), (40, 60)])
-
- # Disabled as not backwards compatible
- # def testFancyStr(self):
- # a = self.buildRead()
- # output = a.fancy_str()
- # self.assertEqual(len(output), 9)
-
-
-class TestDeNovoConstruction(ReadTest):
+class TestDeNovoConstruction(unittest.TestCase):
'''check BAM/SAM file construction using ex6.sam
@@ -1525,7 +1482,8 @@ class TestDeNovoConstruction(ReadTest):
a.next_reference_id = 0
a.next_reference_start = 199
a.template_length = 167
- a.query_qualities = pysam.fromQualityString("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
+ a.query_qualities = pysam.fromQualityString(
+ "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
a.tags = (("NM", 1),
("RG", "L1"))
@@ -1540,7 +1498,8 @@ class TestDeNovoConstruction(ReadTest):
b.next_reference_id = 1
b.next_reference_start = 499
b.template_length = 412
- b.query_qualities = pysam.fromQualityString("<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<")
+ b.query_qualities = pysam.fromQualityString(
+ "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<")
b.tags = (("MF", 18),
("RG", "L2"))
@@ -1569,7 +1528,7 @@ class TestDeNovoConstruction(ReadTest):
others = list(infile)
for denovo, other in zip(others, self.reads):
- self.checkFieldEqual(other, denovo)
+ checkFieldEqual(self, other, denovo)
self.assertEqual(other.compare(denovo), 0)
# TODO
@@ -1579,7 +1538,7 @@ class TestDeNovoConstruction(ReadTest):
# others = list(infile)
# for denovo, other in zip(others, self.reads):
- # self.checkFieldEqual(other, denovo)
+ # checkFieldEqual(self, other, denovo)
# self.assertEqual(other.compare(denovo), 0)
def testBAMWholeFile(self):
@@ -1624,6 +1583,7 @@ class TestEmptyHeader(unittest.TestCase):
class TestHeaderWithProgramOptions(unittest.TestCase):
+
'''see issue 39.'''
def testHeader(self):
@@ -1647,10 +1607,24 @@ class TestTruncatedBAM(unittest.TestCase):
def testTruncatedBam(self):
- s = pysam.AlignmentFile(os.path.join(DATADIR, 'ex2_truncated.bam'))
+ s = pysam.AlignmentFile(
+ os.path.join(DATADIR, 'ex2_truncated.bam'))
iterall = lambda x: len([a for a in x])
self.assertRaises(IOError, iterall, s)
+ def testTruncatedBamFetch(self):
+ '''See comments for pull request at
+ https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
+ '''
+ # Currently there is no way to detect truncated
+ # files through hts_iter_fetch, so this test is
+ # disabled
+ return
+ s = pysam.AlignmentFile(
+ os.path.join(DATADIR, 'ex2_truncated.bam'))
+ iterall = lambda x: len([a for a in x])
+ self.assertRaises(IOError, iterall, s.fetch())
+
class TestBTagSam(unittest.TestCase):
@@ -1691,6 +1665,7 @@ class TestBTagBam(TestBTagSam):
class TestDoubleFetch(unittest.TestCase):
+
'''check if two iterators on the same bamfile are independent.'''
filename = os.path.join(DATADIR, 'ex1.bam')
@@ -1847,6 +1822,7 @@ class TestLargeOptValues(unittest.TestCase):
class TestPileup(unittest.TestCase):
+
'''test pileup functionality.'''
samfilename = "pysam_data/ex1.bam"
@@ -1885,6 +1861,156 @@ class TestPileup(unittest.TestCase):
fastafile=self.fastafile)
self.checkEqual(refs, iterator)
+ def count_coverage_python(self, bam, chr, start, stop, read_callback, quality_threshold=15):
+ l = stop - start
+ count_a = array.array('L', [0] * l)
+ count_c = array.array('L', [0] * l)
+ count_g = array.array('L', [0] * l)
+ count_t = array.array('L', [0] * l)
+ for p in bam.pileup(chr, start, stop, truncate=True, stepper='nofilter'):
+ rpos = p.reference_pos - start
+ for read in p.pileups:
+ if not read.is_del and not read.is_refskip and read_callback(read.alignment):
+ try:
+ if read.alignment.query_qualities[read.query_position] > quality_threshold:
+ letter = read.alignment.query[read.query_position]
+ if letter == 'A':
+ count_a[rpos] += 1
+ elif letter == 'C':
+ count_c[rpos] += 1
+ elif letter == 'G':
+ count_g[rpos] += 1
+ elif letter == 'T':
+ count_t[rpos] += 1
+ except IndexError:
+ pass
+ return count_a, count_c, count_g, count_t
+
+ def test_count_coverage(self):
+ chr = 'chr1'
+ start = 0
+ stop = 2000
+ manual_counts = self.count_coverage_python(self.samfile, chr, start, stop,
+ lambda read: True,
+ quality_threshold=0)
+ fast_counts = self.samfile.count_coverage(chr, start, stop,
+ read_callback=lambda read: True,
+ quality_threshold=0)
+ self.assertEqual(fast_counts[0], manual_counts[0])
+ self.assertEqual(fast_counts[1], manual_counts[1])
+ self.assertEqual(fast_counts[2], manual_counts[2])
+ self.assertEqual(fast_counts[3], manual_counts[3])
+
+ def test_count_coverage_quality_filter(self):
+ chr = 'chr1'
+ start = 0
+ stop = 2000
+ manual_counts = self.count_coverage_python(self.samfile, chr, start, stop,
+ lambda read: True,
+ quality_threshold=0)
+ fast_counts = self.samfile.count_coverage(chr, start, stop,
+ read_callback=lambda read: True,
+ quality_threshold=15)
+ # we filtered harder, should be less
+ for i in range(4):
+ for r in range(start, stop):
+ self.assertTrue(fast_counts[i][r] <= manual_counts[i][r])
+
+ def test_count_coverage_read_callback(self):
+ chr = 'chr1'
+ start = 0
+ stop = 2000
+ manual_counts = self.count_coverage_python(self.samfile, chr, start, stop,
+ lambda read: read.flag & 0x10,
+ quality_threshold=0)
+ fast_counts = self.samfile.count_coverage(chr, start, stop,
+ read_callback=lambda read: True,
+ quality_threshold=0)
+ for i in range(4):
+ for r in range(start, stop):
+ self.assertTrue(fast_counts[i][r] >= manual_counts[i][r])
+ fast_counts = self.samfile.count_coverage(chr, start, stop,
+ read_callback=lambda read: read.flag & 0x10,
+ quality_threshold=0)
+
+ self.assertEqual(fast_counts[0], manual_counts[0])
+ self.assertEqual(fast_counts[1], manual_counts[1])
+ self.assertEqual(fast_counts[2], manual_counts[2])
+ self.assertEqual(fast_counts[3], manual_counts[3])
+
+ def test_count_coverage_read_all(self):
+ samfile = pysam.AlignmentFile(
+ "test_count_coverage_read_all.bam", 'wb', template=self.samfile)
+ for ii, read in enumerate(self.samfile.fetch()):
+ # if ii % 2 == 0: # setting BFUNMAP makes no sense...
+ #read.flag = read.flag | 0x4
+ if ii % 3 == 0:
+ read.flag = read.flag | 0x100
+ if ii % 5 == 0:
+ read.flag = read.flag | 0x200
+ if ii % 7 == 0:
+ read.flag = read.flag | 0x400
+ samfile.write(read)
+ samfile.close()
+ pysam.index("test_count_coverage_read_all.bam")
+ samfile = pysam.AlignmentFile("test_count_coverage_read_all.bam")
+ chr = 'chr1'
+ start = 0
+ stop = 2000
+
+ def filter(read):
+ return not (read.flag & (0x4 | 0x100 | 0x200 | 0x400))
+ fast_counts = samfile.count_coverage(chr, start, stop,
+ read_callback='all',
+ #read_callback = lambda read: ~(read.flag & (0x4 | 0x100 | 0x200 | 0x400)),
+ quality_threshold=0)
+ manual_counts = samfile.count_coverage(chr, start, stop,
+ read_callback=lambda read: not(
+ read.flag & (0x4 | 0x100 | 0x200 | 0x400)),
+ quality_threshold=0)
+
+ os.unlink("test_count_coverage_read_all.bam")
+ os.unlink("test_count_coverage_read_all.bam.bai")
+
+ self.assertEqual(fast_counts[0], manual_counts[0])
+ self.assertEqual(fast_counts[1], manual_counts[1])
+ self.assertEqual(fast_counts[2], manual_counts[2])
+ self.assertEqual(fast_counts[3], manual_counts[3])
+
+ def test_count_coverage_nofilter(self):
+ samfile = pysam.AlignmentFile(
+ "test_count_coverage_nofilter.bam", 'wb', template=self.samfile)
+ for ii, read in enumerate(self.samfile.fetch()):
+ # if ii % 2 == 0: # setting BFUNMAP makes no sense...
+ #read.flag = read.flag | 0x4
+ if ii % 3 == 0:
+ read.flag = read.flag | 0x100
+ if ii % 5 == 0:
+ read.flag = read.flag | 0x200
+ if ii % 7 == 0:
+ read.flag = read.flag | 0x400
+ samfile.write(read)
+ samfile.close()
+ pysam.index("test_count_coverage_nofilter.bam")
+ samfile = pysam.AlignmentFile("test_count_coverage_nofilter.bam")
+ chr = 'chr1'
+ start = 0
+ stop = 2000
+ fast_counts = samfile.count_coverage(chr, start, stop,
+ read_callback='nofilter',
+ quality_threshold=0)
+
+ manual_counts = self.count_coverage_python(samfile, chr, start, stop,
+ read_callback=lambda x: True,
+ quality_threshold=0)
+ samfile.close()
+ os.unlink("test_count_coverage_nofilter.bam")
+ os.unlink("test_count_coverage_nofilter.bam.bai")
+ self.assertEqual(fast_counts[0], manual_counts[0])
+ self.assertEqual(fast_counts[1], manual_counts[1])
+ self.assertEqual(fast_counts[2], manual_counts[2])
+ self.assertEqual(fast_counts[3], manual_counts[3])
+
class TestLogging(unittest.TestCase):
@@ -2033,7 +2159,7 @@ class TestAlignmentFileIndex(unittest.TestCase):
def testIndex(self):
samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
+ "rb")
index = pysam.IndexedReads(samfile)
index.build()
reads = collections.defaultdict(int)
@@ -2048,6 +2174,19 @@ class TestAlignmentFileIndex(unittest.TestCase):
self.assertEqual(x.query_name, qname)
+class TestVerbosity(unittest.TestCase):
+
+ '''test if setting/getting of verbosity works.'''
+
+ def testVerbosity(self):
+ self.assertEqual(pysam.get_verbosity(), 3)
+ old = pysam.set_verbosity(0)
+ self.assertEqual(pysam.get_verbosity(), 0)
+ new = pysam.set_verbosity(old)
+ self.assertEqual(new, 0)
+ self.assertEqual(pysam.get_verbosity(), 3)
+
+
if __name__ == "__main__":
# build data files
print ("building data files")
diff --git a/tests/SamFile_test.py b/tests/SamFile_test.py
index d714a64..889ff96 100644
--- a/tests/SamFile_test.py
+++ b/tests/SamFile_test.py
@@ -322,6 +322,7 @@ class BasicTestSAMFile(BasicTestBAMFetch):
class BasicTestSAMFetch(BasicTestBAMFetch):
+
def setUp(self):
self.samfile = pysam.Samfile(
os.path.join(DATADIR, "ex3.sam"),
@@ -445,7 +446,7 @@ class TestIO(unittest.TestCase):
input_filename,
check_header=False,
check_sq=False)
-
+
# TODO
# result = list(infile.fetch(until_eof=True))
# self.assertEqual(2, len(result))
@@ -470,7 +471,7 @@ class TestIO(unittest.TestCase):
def testReadSamWithoutHeader(self):
input_filename = os.path.join(DATADIR, "ex1.sam")
- # reading from a samfile without header is not
+ # reading from a samfile without header is not
# implemented
self.assertRaises(ValueError,
pysam.Samfile,
@@ -838,7 +839,7 @@ class TestIteratorColumn(unittest.TestCase):
self.samfile.getrname(column.tid)][column.pos]
self.assertEqual(
thiscov, refcov, "wrong coverage at pos %s:%i %i should be %i" % (
- self.samfile.getrname(column.tid), column.pos, thiscov, refcov))
+ self.samfile.getrname(column.tid), column.pos, thiscov, refcov))
def testIterateAll(self):
'''check random access per contig'''
@@ -992,6 +993,7 @@ class TestHeaderFromRefs(unittest.TestCase):
class TestHeader1000Genomes(unittest.TestCase):
+
'''see issue 110'''
# bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase2b_alignment/data/NA07048/exome_alignment/NA07048.unmapped.ILLUMINA.bwa.CEU.exome.20120522_p2b.bam"
bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase3_EX_or_LC_only_alignment/data/HG00104/alignment/HG00104.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam"
@@ -1187,7 +1189,7 @@ class ReadTest(unittest.TestCase):
# add the . for refactoring purposes.
for x in (".qname", ".seq", ".flag",
".rname", ".pos", ".mapq", ".cigar",
- ".mrnm", ".mpos", ".isize",
+ ".mrnm", ".mpos", ".isize",
".qual",
".bin",
".is_paired", ".is_proper_pair",
@@ -1224,7 +1226,6 @@ class TestAlignedRead(ReadTest):
self.assertEqual(a.mpos, 0)
self.assertEqual(a.isize, 0)
-
def testStrOfEmptyRead(self):
a = pysam.AlignedRead()
s = str(a)
@@ -1232,7 +1233,6 @@ class TestAlignedRead(ReadTest):
"None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]",
s)
-
def buildRead(self):
'''build an example read.'''
@@ -1563,6 +1563,7 @@ class TestBTagBam(TestBTagSam):
class TestDoubleFetch(unittest.TestCase):
+
'''check if two iterators on the same bamfile are independent.'''
filename = os.path.join(DATADIR, 'ex1.bam')
@@ -1719,6 +1720,7 @@ class TestLargeOptValues(unittest.TestCase):
class TestPileup(unittest.TestCase):
+
'''test pileup functionality.'''
samfilename = "pysam_data/ex1.bam"
diff --git a/tests/TestUtils.py b/tests/TestUtils.py
index d005cd9..3533f00 100644
--- a/tests/TestUtils.py
+++ b/tests/TestUtils.py
@@ -1,5 +1,8 @@
import sys
import os
+import pysam
+import difflib
+
IS_PYTHON3 = sys.version_info[0] >= 3
@@ -12,7 +15,8 @@ else:
def checkBinaryEqual(filename1, filename2):
- '''return true if the two files are binary equal.'''
+ '''return true if the two files are binary equal.
+ '''
if os.path.getsize(filename1) != os.path.getsize(filename2):
return False
@@ -38,6 +42,42 @@ def checkBinaryEqual(filename1, filename2):
return found
+def checkSamtoolsViewEqual(filename1, filename2,
+ without_header=False):
+ '''return true if the two files are equal in their
+ content through samtools view.
+ '''
+
+ # strip MD and NM tags, as not preserved in CRAM files
+ args = ["-x", "MD", "-x", "NM"]
+ if not without_header:
+ args.append("-h")
+
+ lines1 = pysam.view(*(args + [filename1]))
+ lines2 = pysam.view(*(args + [filename2]))
+
+ if len(lines1) != len(lines2):
+ return False
+
+ if lines1 != lines2:
+ # line by line comparison
+ # sort each line, as tags get rearranged between
+ # BAM/CRAM
+ for n, pair in enumerate(zip(lines1, lines2)):
+ l1, l2 = pair
+ l1 = sorted(l1[:-1].split("\t"))
+ l2 = sorted(l2[:-1].split("\t"))
+ if l1 != l2:
+ print "mismatch in line %i" % n
+ print l1
+ print l2
+ return False
+ else:
+ return False
+
+ return True
+
+
def checkURL(url):
'''return True if URL is available.
@@ -50,3 +90,33 @@ def checkURL(url):
except:
return False
+
+def checkFieldEqual(cls, read1, read2, exclude=[]):
+ '''check if two reads are equal by comparing each field.'''
+
+ # add the . for refactoring purposes.
+ for x in (".query_name",
+ ".query_sequence",
+ ".flag",
+ ".reference_id",
+ ".reference_start",
+ ".mapping_quality",
+ ".cigartuples",
+ ".next_reference_id",
+ ".next_reference_start",
+ ".template_length",
+ ".query_length",
+ ".query_qualities",
+ ".bin",
+ ".is_paired", ".is_proper_pair",
+ ".is_unmapped", ".mate_is_unmapped",
+ ".is_reverse", ".mate_is_reverse",
+ ".is_read1", ".is_read2",
+ ".is_secondary", ".is_qcfail",
+ ".is_duplicate"):
+ n = x[1:]
+ if n in exclude:
+ continue
+ cls.assertEqual(getattr(read1, n), getattr(read2, n),
+ "attribute mismatch for %s: %s != %s" %
+ (n, getattr(read1, n), getattr(read2, n)))
diff --git a/tests/cython_flagstat.py b/tests/cython_flagstat.py
index 33b56f7..851157a 100644
--- a/tests/cython_flagstat.py
+++ b/tests/cython_flagstat.py
@@ -9,4 +9,3 @@ is_paired, is_proper = _cython_flagstat.count(
print ("there are alignments of %i paired reads" % is_paired)
print ("there are %i proper paired alignments" % is_proper)
-
diff --git a/tests/pysam_data/Makefile b/tests/pysam_data/Makefile
index eb6c1d8..8b0964a 100644
--- a/tests/pysam_data/Makefile
+++ b/tests/pysam_data/Makefile
@@ -1,17 +1,20 @@
SAM=$(wildcard *.sam)
BAM=$(SAM:%.sam=%.bam)
BAI=$(BAM:%.bam=%.bam.bai)
+CRAM=ex1.cram ex2.cram ex3.cram
+CRAI=$(CRAM:%.cram=%.cram.crai)
# ex2.bam - bam file without index
all: ex1.pileup.gz \
ex1.sam ex1.bam \
- ex2.sam.gz ex2.sam ex2.bam \
+ ex2.sam.gz ex2.sam ex2.bam ex2.bam.bai \
uncompressed.bam \
$(BAM) $(BAI) \
+ $(CRAM) $(CRAI) \
example_bai.bam \
rg_with_tab.bam \
- ex2_truncated.bam \
+ ex2_truncated.bam ex2_truncated.bam.bai \
empty.bam empty.bam.bai
# ex2.sam - as ex1.sam, but with header
@@ -27,6 +30,12 @@ uncompressed.bam: ex2.sam
%.bam: %.sam
samtools view -bS $< > $@
+%.cram: %.sam
+ samtools view -bC -T ex1.fa $< > $@
+
+%.cram.crai: %.cram
+ samtools index $<
+
%.sam: %.sam.gz
gunzip < $< > $@
diff --git a/tests/python_flagstat.py b/tests/python_flagstat.py
index e9ec971..b14e52d 100644
--- a/tests/python_flagstat.py
+++ b/tests/python_flagstat.py
@@ -9,5 +9,3 @@ for read in pysam.AlignmentFile("ex1.bam", "rb"):
print ("there are alignments of %i paired reads" % is_paired)
print ("there are %i proper paired alignments" % is_proper)
-
-
diff --git a/tests/samtools_test.py b/tests/samtools_test.py
index 203c0a7..f247373 100644
--- a/tests/samtools_test.py
+++ b/tests/samtools_test.py
@@ -23,7 +23,6 @@ DATADIR = "pysam_data"
def runSamtools(cmd):
'''run a samtools command'''
-
try:
retcode = subprocess.call(cmd, shell=True,
stderr=subprocess.PIPE)
@@ -36,7 +35,8 @@ def runSamtools(cmd):
def getSamtoolsVersion():
'''return samtools version'''
- with subprocess.Popen(SAMTOOLS, shell=True, stderr=subprocess.PIPE).stderr as pipe:
+ with subprocess.Popen(SAMTOOLS, shell=True,
+ stderr=subprocess.PIPE).stderr as pipe:
lines = b"".join(pipe.readlines())
if IS_PYTHON3:
@@ -224,16 +224,11 @@ class BinaryTest(unittest.TestCase):
# copy the source files to WORKDIR
os.makedirs(WORKDIR)
- shutil.copy(os.path.join(DATADIR, "ex1.fa"),
- os.path.join(WORKDIR, "pysam_ex1.fa"))
- shutil.copy(os.path.join(DATADIR, "ex1.fa"),
- os.path.join(WORKDIR, "ex1.fa"))
- shutil.copy(os.path.join(DATADIR, "ex1.sam.gz"),
- os.path.join(WORKDIR, "ex1.sam.gz"))
- shutil.copy(os.path.join(DATADIR, "ex1.sam"),
- os.path.join(WORKDIR, "ex1.sam"))
- shutil.copy(os.path.join(DATADIR, "ex2.bam"),
- os.path.join(WORKDIR, "ex2.bam"))
+ for f in ("ex1.fa", "ex1.sam.gz",
+ "ex1.sam", "ex2.bam",
+ "ex1.bed"):
+ shutil.copy(os.path.join(DATADIR, f),
+ os.path.join(WORKDIR, f))
# cd to workdir
savedir = os.getcwd()
@@ -286,9 +281,10 @@ class BinaryTest(unittest.TestCase):
return re.sub("[^0-9.]", "", s)
if _r(samtools_version) != _r(pysam.__samtools_version__):
- raise ValueError("versions of pysam/samtools and samtools differ: %s != %s" %
- (pysam.__samtools_version__,
- samtools_version))
+ raise ValueError(
+ "versions of pysam/samtools and samtools differ: %s != %s" %
+ (pysam.__samtools_version__,
+ samtools_version))
def checkCommand(self, command):
if command:
@@ -296,8 +292,10 @@ class BinaryTest(unittest.TestCase):
command][0][0], self.commands[command][1][0]
samtools_target = os.path.join(WORKDIR, samtools_target)
pysam_target = os.path.join(WORKDIR, pysam_target)
- self.assertTrue(checkBinaryEqual(samtools_target, pysam_target),
- "%s failed: files %s and %s are not the same" % (command, samtools_target, pysam_target))
+ self.assertTrue(
+ checkBinaryEqual(samtools_target, pysam_target),
+ "%s failed: files %s and %s are not the same" %
+ (command, samtools_target, pysam_target))
def testImport(self):
self.checkCommand("import")
@@ -374,6 +372,19 @@ class BinaryTest(unittest.TestCase):
pass
# shutil.rmtree( WORKDIR )
+
+class StdoutTest(unittest.TestCase):
+ '''test if stdout can be redirected.'''
+
+ def testWithRedirectedStdout(self):
+ r = pysam.flagstat(os.path.join(DATADIR, "ex1.bam"))
+ self.assertTrue(len(r) > 0)
+
+ def testWithoutRedirectedStdout(self):
+ r = pysam.flagstat(os.path.join(DATADIR, "ex1.bam"),
+ catch_stdout=False)
+ self.assertTrue(len(r) == 0)
+
if __name__ == "__main__":
# build data files
print ("building data files")
diff --git a/tests/tabix_test.py b/tests/tabix_test.py
index bc3c80a..1ad48ba 100644
--- a/tests/tabix_test.py
+++ b/tests/tabix_test.py
@@ -260,6 +260,7 @@ class TestGZFile(IterationTest):
class TestIterationWithoutComments(IterationTest):
+
'''test iterating with TabixFile.fetch() when
there are no comments in the file.'''
@@ -350,7 +351,6 @@ class TestIterationWithoutComments(IterationTest):
# raise no error for invalid intervals
self.tabix.fetch("chr1", 100, 100)
-
def testGetContigs(self):
self.assertEqual(sorted(self.tabix.contigs), [b"chr1", b"chr2"])
# check that contigs is read-only
@@ -382,6 +382,7 @@ class TestIterationWithoutComments(IterationTest):
class TestIterationWithComments(TestIterationWithoutComments):
+
'''test iterating with TabixFile.fetch() when
there are comments in the file.
@@ -591,6 +592,7 @@ class TestGTF(TestParser):
class TestIterationMalformattedGTFFiles(unittest.TestCase):
+
'''test reading from malformatted gtf files.'''
parser = pysam.asGTF
@@ -670,6 +672,7 @@ class TestVCF(unittest.TestCase):
if IS_PYTHON3:
class TestUnicode(unittest.TestCase):
+
'''test reading from a file with non-ascii characters.'''
filename = os.path.join(DATADIR, "example_unicode.vcf")
@@ -860,7 +863,7 @@ class TestVCFFromVCF(TestVCF):
c[y].split(","), val,
"mismatch in field %s: expected %s, got %s" %
(field, c[y], val))
-
+
elif field == "filter":
if c[y] == "PASS" or c[y] == ".":
# convert PASS to empty list
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pysam.git
More information about the debian-med-commit
mailing list